In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.datasets import load_boston

In [4]:
boston=load_boston()
X=pd.DataFrame(boston.data,columns=boston.feature_names)

In [5]:
y=boston.target
y=pd.DataFrame(y)

In [6]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
X['RAD'].value_counts()

24.0    132
5.0     115
4.0     110
3.0      38
6.0      26
8.0      24
2.0      24
1.0      20
7.0      17
Name: RAD, dtype: int64

In [8]:
X=pd.get_dummies(X,columns=['RAD'])
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,...,LSTAT,RAD_1.0,RAD_2.0,RAD_3.0,RAD_4.0,RAD_5.0,RAD_6.0,RAD_7.0,RAD_8.0,RAD_24.0
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,296.0,15.3,...,4.98,1,0,0,0,0,0,0,0,0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,242.0,17.8,...,9.14,0,1,0,0,0,0,0,0,0
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,242.0,17.8,...,4.03,0,1,0,0,0,0,0,0,0
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,222.0,18.7,...,2.94,0,0,1,0,0,0,0,0,0
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,222.0,18.7,...,5.33,0,0,1,0,0,0,0,0,0


### Feature Selection 

In [9]:
import statsmodels.api as sm

In [10]:
xc=sm.add_constant(X)
model=sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,0,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.74
Method:,Least Squares,F-statistic:,72.7
Date:,"Sun, 24 Nov 2019",Prob (F-statistic):,8.290000000000001e-132
Time:,00:32:20,Log-Likelihood:,-1489.6
No. Observations:,506,AIC:,3021.0
Df Residuals:,485,BIC:,3110.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,34.7368,4.825,7.200,0.000,25.257,44.217
CRIM,-0.1088,0.033,-3.329,0.001,-0.173,-0.045
ZN,0.0549,0.014,3.880,0.000,0.027,0.083
INDUS,0.0238,0.064,0.373,0.709,-0.101,0.149
CHAS,2.5242,0.863,2.924,0.004,0.828,4.220
NOX,-17.5731,3.896,-4.510,0.000,-25.229,-9.917
RM,3.6655,0.421,8.703,0.000,2.838,4.493
AGE,0.0005,0.013,0.035,0.972,-0.026,0.026
DIS,-1.5545,0.202,-7.699,0.000,-1.951,-1.158

0,1,2,3
Omnibus:,183.89,Durbin-Watson:,1.089
Prob(Omnibus):,0.0,Jarque-Bera (JB):,858.805
Skew:,1.554,Prob(JB):,3.2599999999999997e-187
Kurtosis:,8.575,Cond. No.,5.04e+18


In [11]:
cols=list(X.columns)
pmax=1

while(len(cols)>0):
    p=[]
    x=X[cols]
    xc=sm.add_constant(x)
    model=sm.OLS(y,xc).fit()
    p=pd.Series(model.pvalues.values[1:],index=cols)
    pmax=max(p)
    feature_with_p_max = p.idxmax()
    if(pmax> 0.05):
        cols.remove(feature_with_p_max)
    else:
        break

selected_features = cols
print(selected_features)

['CRIM', 'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'RAD_3.0', 'RAD_4.0', 'RAD_5.0', 'RAD_7.0', 'RAD_8.0', 'RAD_24.0']


In [12]:
X=X[selected_features]

### Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [15]:
kf=KFold(n_splits=5,shuffle=True,random_state=0)
for model,name in zip([lr],['MVLR']):
    rmse=[]
    for train,test in kf.split(X,y):
        Xtrain,Xtest=X.iloc[train,:],X.iloc[test,:]
        ytrain,ytest=y.iloc[train],y.iloc[test]
        model.fit(Xtrain,ytrain)
        ypred=model.predict(Xtest)
        rmse.append(np.sqrt(mean_squared_error(ytest,ypred)))
print(rmse)

[5.769126047411817, 4.419731846450076, 4.520796391669855, 5.372681696637538, 3.975538229257434]


### Lasso,Ridge

In [16]:
from sklearn.linear_model import Ridge,Lasso

In [17]:
m1=LinearRegression()
m2=Ridge(alpha=0.5,normalize=True)
m3=Lasso(alpha=0.1, normalize=True)

In [18]:
kf=KFold(n_splits=5,shuffle=True,random_state=0)
for model,name in zip([m1,m2,m3],['MVLR','Ridge','Lasso']):
    rmse=[]
    for train,test in kf.split(X,y):
        Xtrain,Xtest=X.iloc[train,:],X.iloc[test,:]
        ytrain,ytest=y.iloc[train],y.iloc[test]
        model.fit(Xtrain,ytrain)
        ypred=model.predict(Xtest)
        rmse.append(np.sqrt(mean_squared_error(ytest,ypred)))
    print('Cross Validated score %0.03f (+/- %0.05f) [%s] ' % (np.mean(rmse),np.var(rmse,ddof=1),name))

Cross Validated score 4.812 (+/- 0.54220) [MVLR] 
Cross Validated score 5.215 (+/- 0.76793) [Ridge] 
Cross Validated score 5.875 (+/- 0.41470) [Lasso] 


### Polynomial Features

In [19]:
from sklearn.preprocessing import PolynomialFeatures

In [20]:
lstat=X[['LSTAT','PTRATIO']]

In [21]:
qr = PolynomialFeatures(degree=2)
qr_lstat=qr.fit_transform(lstat)

In [22]:
type(qr_lstat)

numpy.ndarray

In [23]:
df_qr_lstat = pd.DataFrame(qr_lstat)

In [24]:
df_qr_lstat.head()

Unnamed: 0,0,1,2,3,4,5
0,1.0,4.98,15.3,24.8004,76.194,234.09
1,1.0,9.14,17.8,83.5396,162.692,316.84
2,1.0,4.03,17.8,16.2409,71.734,316.84
3,1.0,2.94,18.7,8.6436,54.978,349.69
4,1.0,5.33,18.7,28.4089,99.671,349.69


In [25]:
df_qr_lstat.drop(0,axis=1,inplace=True)
df_qr_lstat.head()

Unnamed: 0,1,2,3,4,5
0,4.98,15.3,24.8004,76.194,234.09
1,9.14,17.8,83.5396,162.692,316.84
2,4.03,17.8,16.2409,71.734,316.84
3,2.94,18.7,8.6436,54.978,349.69
4,5.33,18.7,28.4089,99.671,349.69


In [26]:
from sklearn.linear_model import LinearRegression
qlr=LinearRegression()

In [27]:
kf=KFold(n_splits=5,shuffle=True,random_state=0)
for model,name in zip([qlr],['Quadratic Regression']):
    rmse=[]
    for train,test in kf.split(df_qr_lstat,y):
        Xtrain,Xtest=df_qr_lstat.iloc[train,:],df_qr_lstat.iloc[test,:]
        ytrain,ytest=y.iloc[train],y.iloc[test]
        model.fit(Xtrain,ytrain)
        ypred=model.predict(Xtest)
        rmse.append(np.sqrt(mean_squared_error(ytest,ypred)))
    print('Cross Validated score %0.03f (+/- %0.05f) [%s] ' % (np.mean(rmse),np.var(rmse,ddof=1),name))

Cross Validated score 5.224 (+/- 0.77648) [Quadratic Regression] 
