In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from pandas_profiling import ProfileReport
import pickle

In [2]:
boston = load_boston()
#print(boston)
bos = pd.DataFrame(boston.data,columns=boston.feature_names)
bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
bos['MEDV'] = boston.target
bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
#ProfileReport(bos)

In [6]:
x = bos.drop(columns="MEDV")
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
y = bos.MEDV
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

In [8]:
scaler = StandardScaler()

In [9]:
scaler.fit(x)
scaled_x = scaler.transform(x)
scaled_x

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

In [10]:
pickle.dump(scaler,open('scaler_trans.pkl','wb'))

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_df = pd.DataFrame()

In [12]:
vif_df['vif'] = [variance_inflation_factor(scaled_x,i) for i in range(scaled_x.shape[1])]

In [13]:
vif_df['feature']  = x.columns
vif_df

Unnamed: 0,vif,feature
0,1.792192,CRIM
1,2.298758,ZN
2,3.991596,INDUS
3,1.073995,CHAS
4,4.39372,NOX
5,1.933744,RM
6,3.100826,AGE
7,3.955945,DIS
8,7.484496,RAD
9,9.008554,TAX


In [14]:
x_train,x_test,y_train,y_test = train_test_split(scaled_x,y,test_size=0.25,random_state=100)

In [15]:
x_train

array([[ 0.37095672, -0.48772236,  1.01599907, ...,  0.80657583,
         0.44105193,  0.28692687],
       [-0.39859736, -0.48772236, -0.61672651, ..., -0.2568579 ,
         0.44105193,  0.20983129],
       [-0.41019162,  0.41358857, -0.80203138, ..., -0.76545664,
         0.42635969,  0.11170964],
       ...,
       [-0.38747673, -0.48772236, -0.54814912, ...,  0.52915834,
         0.37778758, -0.12798753],
       [-0.41635358,  0.370669  , -1.13908197, ..., -1.64394538,
         0.33590374, -1.24657432],
       [-0.39593474,  0.04877224, -0.47665354, ..., -1.50523663,
         0.32844797,  2.42177359]])

In [16]:
x_test

array([[-0.41613247,  2.94584308, -1.40317788, ..., -2.70737911,
         0.38951945, -0.8456773 ],
       [-0.36910605, -0.48772236, -0.72032214, ..., -0.48803915,
         0.25948216, -1.24657432],
       [-0.4152492 , -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.50084979],
       ...,
       [-0.40427171, -0.48772236, -0.37597609, ...,  1.13022958,
         0.39489198, -0.045285  ],
       [-0.03403609, -0.48772236,  1.2319449 , ..., -1.73641788,
         0.44105193,  1.98583312],
       [-0.28941366, -0.48772236,  1.2319449 , ..., -1.73641788,
        -0.14685696, -0.07472149]])

In [17]:
y_train

459    20.0
46     20.0
50     19.7
308    22.8
19     18.2
       ... 
343    23.9
359    22.6
323    18.5
280    45.4
8      16.5
Name: MEDV, Length: 379, dtype: float64

In [18]:
lr = LinearRegression()

In [19]:
lr.fit(x_train,y_train)

LinearRegression()

In [20]:
lr.coef_

array([-0.68051889,  1.06559073, -0.13132709,  0.61513632, -1.59251134,
        2.70447325, -0.37128206, -3.11845333,  2.58341035, -2.21688902,
       -1.99023689,  0.99325067, -3.23137278])

In [21]:
lr.intercept_

22.502051393875607

In [22]:
train_score = lr.score(x_train,y_train)
train_score

0.7421573768304615

In [23]:
test_score = lr.score(x_test,y_test)
test_score

0.7246154314616728

In [24]:
def adj_r2(x,y):
    r2 = lr.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [25]:
adj_r2(x_test,y_test)

0.6929340209218653

In [26]:
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
# model evaluation for training set
y_train_predict = lr.predict(x_train)
rmse = (np.sqrt(mean_squared_error(y_train, y_train_predict)))
r2 = r2_score(y_train, y_train_predict)
print("RMSE: ",rmse)
print("r2: ",r2)

RMSE:  4.528441417091423
r2:  0.7421573768304615


In [28]:
# model evaluation for testing set

y_test_predict = lr.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_test_predict)))
r2 = r2_score(y_test, y_test_predict)
print("RMSE: ",rmse)
print("r2: ",r2)

RMSE:  5.2127866034438535
r2:  0.7246154314616728


In [29]:
pickle.dump(lr,open("linear_challenge_assignment.pkl",'wb'))

In [30]:
saved_model = pickle.load(open('linear_challenge_assignment.pkl','rb'))

In [31]:
saved_model.predict(x_test)

array([34.32560931, 31.2633507 , 22.28003335, 17.86975069, 20.40606949,
       25.91303352, 26.15242179, 23.66375418, 22.30169002, 19.59234366,
       26.75489062, 17.29420545, 20.80817579, 15.46781924, 41.69247941,
       20.16746428, 28.77029186, 19.02021072, 32.38988683, 40.43872985,
       35.04799251, 16.83853764, 20.33053373, 18.09797446, 13.80644424,
       12.41884725, 27.47062888, 20.40507455, 18.75428942, 20.25662126,
       15.46268828, 24.40432932, 39.15237132, 24.76295642, 31.78210045,
       28.38346572, 15.00749275, 14.45446654, 16.73702223, 23.29876692,
       22.95875889, 23.59744316, 13.77412361, 21.39735646, 31.29998733,
       26.66234266, 19.32157368, 15.87491845, 17.2294449 , 12.65256962,
       21.71569722, 20.1686083 , 23.61536492, 24.10587542, 11.97825659,
       14.74341261, 24.86442941, 34.10640748, 10.12033231, 20.89367197,
       17.54551737, 19.5997927 , 17.74536973, 30.04265302, 21.0171677 ,
       25.35489057, 15.73563341, 25.06394771, 22.10298887, 20.65

In [32]:
saved_scaler = pickle.load(open('scaler_trans.pkl','rb'))

In [35]:
saved_scaler.transform([x.iloc[0]])

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 ]])

In [37]:
scaled_x[0]

array([-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
        0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
       -1.45900038,  0.44105193, -1.0755623 ])

In [39]:
saved_model.predict(saved_scaler.transform([x.iloc[1]]))

array([24.58633728])