#### Multivariable regression

Load Boston Dataset from original source

In [114]:
import pandas as pd
import numpy as np

# Each record in the raw table occupies two consecutive rows: the even rows
# are joined with the first two columns of the odd rows to form the feature
# matrix, and the third column of the odd rows is the target.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regex, so it is written as a raw string: "\s" inside a
# plain string literal is an invalid escape sequence and warns on modern Python.
# skiprows=22 drops the leading lines of the file (presumably the textual
# description that precedes the numbers -- verify against the source file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

Load Boston Dataset from file

In [115]:
import pandas as pd
import numpy as np

# NOTE(review): despite the section heading, this cell still reads from the
# remote URL. pd.read_csv also accepts a local path, so point `data_url` at a
# downloaded copy to actually load from disk.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string for the regex separator: "\s" in a plain string literal is an
# invalid escape sequence and warns on modern Python.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Even rows plus the first two columns of the odd rows form the features;
# the third column of the odd rows is the target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

Create boston object

In [116]:
from sklearn.utils import Bunch

# Labels for the 13 feature columns; they are later assigned positionally to
# the columns of `data`.
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Bundle the arrays and metadata into one attribute-accessible container.
boston = Bunch(
    data=data,
    target=target,
    feature_names=feature_names,
    DESCR='Boston dataset homemade',
)

In [117]:
# Pull the feature matrix and target vector out of the bundle and confirm
# that their dimensions line up.
X, y = boston.data, boston.target
print(X.shape, y.shape)

(506, 13) (506,)


In [118]:
import pandas as pd

print(boston.feature_names)
# Build the frame with its column labels in a single step, then summarise
# every column.
df = pd.DataFrame(X, columns=boston.feature_names)
df.describe()

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [119]:
print(X.shape, y.shape)

from sklearn import model_selection
# A fixed random_state keeps the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 0)
# Show the dimensions of each of the four pieces, one per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)


from sklearn.linear_model import LinearRegression
# Baseline estimator; it is fitted in the following cell.
alg1 = LinearRegression()

(506, 13) (506,)
(379, 13)
(127, 13)
(379,)
(127,)


In [120]:
# Fit the baseline linear model on the training split only; the test split is
# held back for scoring.
alg1.fit(X_train, y_train)

In [121]:
Y_pred = alg1.predict(X_test)
# score() is evaluated on both splits so they can be compared side by side.
train_score, test_score = alg1.score(X_train, y_train), alg1.score(X_test, y_test)
for label, value in (("Train Score ", train_score), ("Test Score ", test_score)):
    print(label, value)

Train Score  0.7697699488741148
Test Score  0.6354638433202113


Add one more feature

In [122]:
# Append the square of AGE as an additional feature column.
df["age_age"] = df["AGE"] ** 2
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,age_age
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,5493.309545
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,3449.600164
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,8.41
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,2027.2525
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,6006.29
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,8850.1075
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,10000.0


In [123]:
# Convert the augmented frame (original features plus the squared column)
# back to a plain array for scikit-learn, and show its dimensions.
X2 = df.to_numpy()
X2.shape

(506, 14)

In [124]:
from sklearn import model_selection
# Same random_state as the first split, so both models see the same rows.
X2_train, X2_test, y2_train, y2_test = model_selection.train_test_split(X2, y,random_state = 0)
for part in (X2_train, X2_test, y2_train, y2_test):
    print(part.shape)


from sklearn.linear_model import LinearRegression
alg2 = LinearRegression()

# Kept as the last expression so the fitted estimator is the cell's output.
alg2.fit(X2_train, y2_train)

(379, 14)
(127, 14)
(379,)
(127,)


In [126]:
Y_pred2 = alg2.predict(X2_test)
train_score2, test_score2 = alg2.score(X2_train, y2_train), alg2.score(X2_test, y2_test)
# Print the baseline scores next to the scores of the augmented model.
score_table = (
    ("Train Score 1\t", train_score),
    ("Test Score 1\t", test_score),
    ("Train Score 2\t", train_score2),
    ("Test Score 2\t", test_score2),
)
for label, value in score_table:
    print(label, value)

Train Score 1	 0.7697699488741148
Test Score 1	 0.6354638433202113
Train Score 2	 0.770724544991109
Test Score 2	 0.6433109272342967


Add Features to Boston Dataset

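In this problem you need to load the Boston dataset from sklearn toy datasets. After loading, you need to split the dataset into testing and training datasets. Now, fit the linear regression model on the training dataset and store the training and testing scores. (Note: `load_boston` was removed in scikit-learn 1.2, so the code below rebuilds the same dataset from its original source instead.)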
After this add two more columns to the dataset. These columns are the squared values of the 'AGE' and the 'RM' columns respectively.
Again split the dataset into testing and training datasets. Now, fit the linear regression model on the training dataset and store the training and testing scores.
Compare the scores of both the models and print "Score improved" if both training and testing scores improved after squaring two columns. Otherwise, print "Score not improved".
Output
If both training and testing scores improved after squaring two columns print: "Score improved"

Else print: "Score not improved"


In [4]:
import pandas as pd
import numpy as np

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string for the regex separator: "\s" in a plain string literal is an
# invalid escape sequence and warns on modern Python.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive rows: even rows plus the first two columns
# of the odd rows form the features; the third column of the odd rows is the
# target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

from sklearn.utils import Bunch
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
# Bundle the arrays and metadata into one attribute-accessible container.
boston = Bunch(data=data, target=target, feature_names=feature_names, DESCR='Boston dataset homemade')

X= boston.data
y = boston.target
print(X.shape, y.shape)

(506, 13) (506,)


Split into train/test sets, fit the baseline model and score it

In [5]:
from sklearn import model_selection
# A fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 0)
named_parts = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for label, part in named_parts:
    print(label, part.shape)


from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting are chained.
alg1 = LinearRegression().fit(X_train, y_train)

Y_pred = alg1.predict(X_test)
train_score, test_score = alg1.score(X_train, y_train), alg1.score(X_test, y_test)
for label, value in (("Train Score ", train_score), ("Test Score ", test_score)):
    print(label, value)

X_train (379, 13)
X_test (127, 13)
y_train (379,)
y_test (127,)
Train Score  0.7697699488741148
Test Score  0.6354638433202113


Add two more columns

In [6]:
import pandas as pd

print(boston.feature_names)
df = pd.DataFrame(X, columns=boston.feature_names)

# Append the squares of AGE and RM as two extra feature columns.
for new_col, base_col in (("age_age", "AGE"), ("RM_RM", "RM")):
    df[new_col] = df[base_col] ** 2
df.describe()

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,age_age,RM_RM
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,5493.309545,39.989325
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,3449.600164,9.079531
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,8.41,12.680721
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,2027.2525,34.639111
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,6006.29,38.545473
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,8850.1075,43.870759
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,10000.0,77.0884


Split the augmented dataset into train/test sets and fit the second model

In [8]:
# Plain-array view of the augmented frame (13 original + 2 squared columns).
X2 = df.to_numpy()
print(X2.shape)

from sklearn import model_selection
# Same random_state as the baseline split, so both models see the same rows.
X2_train, X2_test, y2_train, y2_test = model_selection.train_test_split(X2, y,random_state = 0)
for part in (X2_train, X2_test, y2_train, y2_test):
    print(part.shape)


from sklearn.linear_model import LinearRegression
alg2 = LinearRegression()

# Kept as the last expression so the fitted estimator is the cell's output.
alg2.fit(X2_train, y2_train)

(506, 15)
(379, 15)
(127, 15)
(379,)
(127,)


Predict

In [12]:
Y_pred2 = alg2.predict(X2_test)
# Score the augmented model on both splits.
train_score2, test_score2 = alg2.score(X2_train, y2_train), alg2.score(X2_test, y2_test)
for label, value in (("Train Score 2\t", train_score2), ("Test Score 2\t", test_score2)):
    print(label, value)

Train Score 2	 0.8273821084820213
Test Score 2	 0.7233010061059814


Compare Scores

In [13]:
# "Improved" requires BOTH the training and the testing score to be strictly
# higher for the model trained with the extra squared columns.
improved = train_score2 > train_score and test_score2 > test_score
print('Score improved' if improved else 'Score not improved')

Score improved


### Code Gradient Descent

In [24]:
import numpy as np

# Forward slashes resolve on Windows as well as on POSIX systems, whereas a
# backslash-separated literal only resolves on Windows.
data = np.loadtxt("DataSets/data.csv", delimiter = ",")
print("data.shape: ", data.shape)

# update m' , c'
def step_gradient(points, learning_rate, m, c):
    """Perform one gradient-descent update for the line ``y = m * x + c``.

    The slopes are the partial derivatives of the mean squared error
    ``(1/N) * sum((y - m*x - c)**2)`` with respect to ``m`` and ``c``.

    Parameters
    ----------
    points : array-like of shape (N, 2)
        Column 0 holds the x values, column 1 the y values.
    learning_rate : float
        Step size applied to both slopes.
    m, c : float
        Current slope and intercept.

    Returns
    -------
    (new_m, new_c) : tuple
        Parameters after one step. With no points there is no gradient, so
        ``(m, c)`` is returned unchanged.
    """
    points = np.asarray(points, dtype=float)
    N = len(points)
    if N == 0:
        # Nothing to learn from; also avoids dividing by zero below.
        return m, c

    x = points[:, 0]
    y = points[:, 1]
    # Residuals for every point at once instead of a Python-level loop.
    residual = y - (m * x + c)
    m_slope = (-2 / N) * (residual * x).sum()
    c_slope = (-2 / N) * residual.sum()

    new_m = m - learning_rate * m_slope
    new_c = c - learning_rate * c_slope
    return new_m, new_c

# Gradient Descent
def gd(points, learning_rate, num_iterations, verbose=True):
    """Fit ``y = m * x + c`` by repeated gradient steps starting from (0, 0).

    Parameters
    ----------
    points : array of shape (N, 2)
        Passed through unchanged to ``step_gradient`` and ``cost``.
    learning_rate : float
        Step size for every update.
    num_iterations : int
        Number of update steps to perform.
    verbose : bool, default True
        When True (the original behaviour) the iteration index and current
        cost are printed after every step. Pass False to skip both the output
        and the extra cost evaluation it requires.

    Returns
    -------
    (m, c) : tuple
        Slope and intercept after the final iteration.
    """
    m = 0
    c = 0
    for i in range(num_iterations):
        m, c = step_gradient(points, learning_rate, m, c)
        if verbose:
            # The cost is only computed for reporting; it does not affect the update.
            print(i, "Cost: ", cost(points, m, c))
    return m, c

def cost(points, m, c):
    """Return the mean squared error of the line ``y = m * x + c``.

    Parameters
    ----------
    points : array-like of shape (N, 2)
        Column 0 holds the x values, column 1 the y values.
    m, c : float
        Slope and intercept of the line being evaluated.

    Returns
    -------
    float
        ``(1/N) * sum((y - m*x - c)**2)``; ``0`` when there are no points.
    """
    points = np.asarray(points, dtype=float)
    N = len(points)
    if N == 0:
        # An empty sum is zero; also avoids taking the mean of an empty array.
        return 0

    # Residuals for every point at once instead of a Python-level loop.
    residual = points[:, 1] - (m * points[:, 0] + c)
    return (residual ** 2).mean()

def run(path="DataSets/data.csv", learning_rate=0.0001, num_iterations=100):
    """Load the points from a CSV file, run gradient descent and print (m, c).

    Parameters
    ----------
    path : str, default "DataSets/data.csv"
        Comma-separated file of points. Forward slashes are used so the
        default resolves on Windows and POSIX systems alike.
    learning_rate : float, default 0.0001
        Step size handed to ``gd``.
    num_iterations : int, default 100
        Number of gradient steps handed to ``gd``.
    """
    data = np.loadtxt(path, delimiter = ",")
    m, c = gd(data, learning_rate, num_iterations)
    print(m, c)

run()



data.shape:  (100, 2)
0 Cost:  1484.5865574086486
1 Cost:  457.8542575737672
2 Cost:  199.5099857255389
3 Cost:  134.50591058200533
4 Cost:  118.1496934223995
5 Cost:  114.0341490603815
6 Cost:  112.99857731713657
7 Cost:  112.73798187568467
8 Cost:  112.6723843590911
9 Cost:  112.65585181499745
10 Cost:  112.65166489759581
11 Cost:  112.6505843615011
12 Cost:  112.65028544701502
13 Cost:  112.65018320293967
14 Cost:  112.650130445072
15 Cost:  112.65009013922885
16 Cost:  112.6500529669463
17 Cost:  112.65001658353178
18 Cost:  112.64998039901865
19 Cost:  112.64994426496071
20 Cost:  112.64990814400622
21 Cost:  112.64987202675677
22 Cost:  112.64983591084761
23 Cost:  112.64979979568368
24 Cost:  112.64976368111523
25 Cost:  112.64972756710469
26 Cost:  112.64969145364236
27 Cost:  112.64965534072611
28 Cost:  112.64961922835512
29 Cost:  112.64958311652944
30 Cost:  112.64954700524868
31 Cost:  112.64951089451318
32 Cost:  112.64947478432279
33 Cost:  112.64943867467744
34 Cost:  1

### Generic Gradient Descent