In [139]:
import time
import random
from math import *
import operator

import pandas as pd
import numpy as np
np.set_printoptions(precision=4)

# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

from matplotlib import style
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.datasets import load_boston

from scipy.stats import shapiro

# import the ML algorithm
from sklearn.linear_model import LinearRegression

# import libraries for model validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut 

# import libraries for metrics and reporting
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import adjusted_rand_score

#### data set
- crim    - per capita crime rate by town.
- zn      - proportion of residential land zoned for lots over 25,000 sq.ft.
- indus   - proportion of non-retail business acres per town.
- chas    - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
- nox     - nitrogen oxides concentration (parts per 10 million).
- rm      - average number of rooms per dwelling.
- age     - proportion of owner-occupied units built prior to 1940.
- dis     - weighted mean of distances to five Boston employment centres.
- rad     - index of accessibility to radial highways.
- tax     - full-value property-tax rate per (dollars) 10,000.
- ptratio - pupil-teacher ratio by town.
- black   - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
- lstat   - lower status of the population (percent).

- __medv    - median value of owner-occupied homes in $1000s.__

In [186]:
# load data set
boston = load_boston()

print (boston.feature_names)
print (boston.data.shape)
print (boston.target.shape)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']
(506, 13)
(506,)


In [187]:
np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=4)

In [188]:
X      = boston.data
X_orig = boston.data

y      = boston.target

print (X)

[[  0.01  18.     2.31   0.   ... 296.    15.3  396.9    4.98]
 [  0.03   0.     7.07   0.   ... 242.    17.8  396.9    9.14]
 [  0.03   0.     7.07   0.   ... 242.    17.8  392.83   4.03]
 [  0.03   0.     2.18   0.   ... 222.    18.7  394.63   2.94]
 ...
 [  0.05   0.    11.93   0.   ... 273.    21.   396.9    9.08]
 [  0.06   0.    11.93   0.   ... 273.    21.   396.9    5.64]
 [  0.11   0.    11.93   0.   ... 273.    21.   393.45   6.48]
 [  0.05   0.    11.93   0.   ... 273.    21.   396.9    7.88]]


In [189]:
df = pd.DataFrame(X_orig, columns=boston.feature_names)

In [190]:
def run_model(df_selected_cols, y):
    X_train, X_test, y_train, y_test = train_test_split(df_selected_cols, y, random_state=1, test_size=0.3)
    
    # instantiate
    linreg = LinearRegression()

    # fit the model to the training data (learn the coefficients)
    linreg.fit(X_train, y_train)
    
    y_pred = linreg.predict(X_test)
    
    # Model evaluation metrics for regression
    print('y-intercept             : ', linreg.intercept_)
    print('beta coefficients       : ', linreg.coef_)
    print('Mean Abs Error   MAE    : ', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Sq  Error MSE      : ', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Sq Error RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('r2 value                : ', metrics.r2_score(y_test, y_pred))
    print('MAPE                    : ', np.mean(np.abs((y_test - y_pred) / y_test)) * 100)
    
    stat, p = shapiro(y_test - y_pred)
    
    print('\nStatistics=%.3f, p=%.3f' % (stat, p))

    # interpret
    alpha = 0.05
    if p > alpha:
        print('residuals look Gaussian (fail to reject H0)')
    else:
        print('residuals does not look Gaussian (reject H0)')

In [175]:
def model_coeff(df_selected_cols, y):
    X_train, X_test, y_train, y_test = train_test_split(df_selected_cols, y, random_state=1, test_size=0.3)
    
    # instantiate
    linreg = LinearRegression()

    # fit the model to the training data (learn the coefficients)
    linreg.fit(X_train, y_train)
    
    y_pred = linreg.predict(X_test)
    
    # Model evaluation metrics for regression
    for col, coeff in zip(df_selected_cols.columns, linreg.coef_):
        print('{:15s} : {:8.6f}'.format(col, coeff))

## Check for VIF


In [146]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [147]:
# check VIF for all columns
df_selected_cols = df

pd.Series([variance_inflation_factor(df.values, i) for i in range(df.shape[1])], 
           index=df.columns)

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM        2.100373
ZN          2.844013
INDUS      14.485758
CHAS        1.152952
NOX        73.894947
RM         77.948283
AGE        21.386850
DIS        14.699652
RAD        15.167725
TAX        61.227274
PTRATIO    85.029547
B          20.104943
LSTAT      11.102025
dtype: float64

Model stats ...
y-intercept             :  46.396493871823736
beta coefficients       :  [ -0.1    0.06   0.06   2.44 -21.47   2.8    0.    -1.52   0.31  -0.01  -1.01   0.01  -0.57]
Mean Abs Error   MAE    :  3.344665503598751
Mean Sq  Error MSE      :  19.83132367206325
Root Mean Sq Error RMSE :  4.453237437198162
r2 value                :  0.783629538507628
MAPE                    :  16.207536032281407

Statistics=0.945, p=0.000
residuals does not look Gaussian (reject H0)


... start with 2 columns

In [148]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM    1.000015
ZN      1.000015
dtype: float64

Model stats ...
y-intercept             :  22.31477270824998
beta coefficients       :  [-0.36  0.12]
Mean Abs Error   MAE    :  6.085426399467737
Mean Sq  Error MSE      :  75.17595407157832
Root Mean Sq Error RMSE :  8.670406799659306
r2 value                :  0.17978970316990084
MAPE                    :  29.920016910694986

Statistics=0.864, p=0.000
residuals does not look Gaussian (reject H0)


In [149]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'DIS']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM    1.053573
ZN      1.870212
DIS     1.925411
dtype: float64

Model stats ...
y-intercept             :  24.24163742922906
beta coefficients       :  [-0.4   0.15 -0.57]
Mean Abs Error   MAE    :  6.029241015886931
Mean Sq  Error MSE      :  73.89731566116691
Root Mean Sq Error RMSE :  8.596354789163074
r2 value                :  0.1937403394217979
MAPE                    :  30.332081228212402

Statistics=0.878, p=0.000
residuals does not look Gaussian (reject H0)


... add more columns

In [150]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM     1.390754
ZN       1.020292
INDUS    1.411699
dtype: float64

Model stats ...
y-intercept             :  26.520193932111578
beta coefficients       :  [-0.27  0.07 -0.35]
Mean Abs Error   MAE    :  5.779155628120575
Mean Sq  Error MSE      :  66.84675514381013
Root Mean Sq Error RMSE :  8.175986493617154
r2 value                :  0.2706657659917566
MAPE                    :  27.632322711758416

Statistics=0.858, p=0.000
residuals does not look Gaussian (reject H0)


... add more columns

In [151]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM     1.881346
ZN       1.033431
INDUS    3.424151
RAD      4.602849
dtype: float64

Model stats ...
y-intercept             :  26.463738175274095
beta coefficients       :  [-0.28  0.07 -0.36  0.03]
Mean Abs Error   MAE    :  5.793572841743056
Mean Sq  Error MSE      :  67.05613018617353
Root Mean Sq Error RMSE :  8.188780750891645
r2 value                :  0.26838137109758353
MAPE                    :  27.814585400311742

Statistics=0.860, p=0.000
residuals does not look Gaussian (reject H0)


In [152]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM     1.964606
ZN       2.247364
INDUS    4.253120
RAD      4.632232
DIS      3.367350
dtype: float64

Model stats ...
y-intercept             :  34.69061821444144
beta coefficients       :  [-0.32  0.14 -0.57  0.   -1.63]
Mean Abs Error   MAE    :  5.518400891654619
Mean Sq  Error MSE      :  58.18864729142609
Root Mean Sq Error RMSE :  7.628148352741056
r2 value                :  0.3651304029796597
MAPE                    :  27.775337285607034

Statistics=0.885, p=0.000
residuals does not look Gaussian (reject H0)


... add more columns

In [153]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM     1.967048
ZN       2.313830
INDUS    7.992402
RAD      4.701892
DIS      4.157358
AGE      8.379308
dtype: float64

Model stats ...
y-intercept             :  42.37898507868461
beta coefficients       :  [-0.31  0.12 -0.5   0.01 -2.23 -0.09]
Mean Abs Error   MAE    :  5.428357967241754
Mean Sq  Error MSE      :  59.05892043791564
Root Mean Sq Error RMSE :  7.684980184614378
r2 value                :  0.35563525250739825
MAPE                    :  27.064316292016766

Statistics=0.873, p=0.000
residuals does not look Gaussian (reject H0)


... add more columns

In [154]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE', 'B']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM      2.003631
ZN        2.313974
INDUS     8.189024
RAD       4.764478
DIS       7.965842
AGE      11.452106
B        13.297363
dtype: float64

Model stats ...
y-intercept             :  37.72810870934521
beta coefficients       :  [-0.28  0.13 -0.48  0.03 -2.23 -0.09  0.01]
Mean Abs Error   MAE    :  5.277366216299243
Mean Sq  Error MSE      :  56.624547783191716
Root Mean Sq Error RMSE :  7.524928423791931
r2 value                :  0.38219557412067684
MAPE                    :  26.005385263897363

Statistics=0.870, p=0.000
residuals does not look Gaussian (reject H0)


In [155]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE', 'B', 'LSTAT']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM      2.089933
ZN        2.331347
INDUS     8.985794
RAD       4.771754
DIS       8.414378
AGE      13.876848
B        13.475807
LSTAT     8.260737
dtype: float64

Model stats ...
y-intercept             :  39.22587698660257
beta coefficients       :  [-0.11  0.11 -0.23  0.02 -1.68  0.01  0.   -0.84]
Mean Abs Error   MAE    :  4.181873234352762
Mean Sq  Error MSE      :  33.15347161967372
Root Mean Sq Error RMSE :  5.757905141600868
r2 value                :  0.6382777028379397
MAPE                    :  19.12888073245159

Statistics=0.880, p=0.000
residuals does not look Gaussian (reject H0)


In [156]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE', 'B', 'LSTAT', 'NOX']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM      2.090079
ZN        2.346363
INDUS    11.002035
RAD       5.166290
DIS       9.034371
AGE      19.854107
B        16.412430
LSTAT     8.358940
NOX      46.706142
dtype: float64

Model stats ...
y-intercept             :  46.1401550539357
beta coefficients       :  [ -0.12   0.11  -0.16   0.05  -1.94   0.02   0.    -0.82 -13.76]
Mean Abs Error   MAE    :  4.189929665643575
Mean Sq  Error MSE      :  33.7999973666896
Root Mean Sq Error RMSE :  5.81377651502787
r2 value                :  0.6312237574451975
MAPE                    :  19.123798952158904

Statistics=0.885, p=0.000
residuals does not look Gaussian (reject H0)


In [157]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE', 'B', 'LSTAT', 'NOX', 'RM']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM      2.093849
ZN        2.372815
INDUS    11.148754
RAD       5.173441
DIS      11.877719
AGE      21.307113
B        18.538400
LSTAT     9.984013
NOX      71.828036
RM       59.955121
dtype: float64

Model stats ...
y-intercept             :  19.65042741277636
beta coefficients       :  [ -0.1    0.09  -0.1    0.01  -1.73  -0.     0.01  -0.6  -11.93   3.56]
Mean Abs Error   MAE    :  3.778205827866129
Mean Sq  Error MSE      :  23.913197757762486
Root Mean Sq Error RMSE :  4.890112243881779
r2 value                :  0.7390940856916026
MAPE                    :  18.50351783804392

Statistics=0.924, p=0.000
residuals does not look Gaussian (reject H0)


In [158]:
# check VIF
df_selected_cols = df[['CRIM', 'ZN', 'INDUS', 'RAD', 'DIS', 'AGE', 'B', 'LSTAT', 'NOX', 'RM', 'PTRATIO']]

print(pd.Series([variance_inflation_factor(df_selected_cols.values, i) for i in range(df_selected_cols.shape[1])], 
           index=df_selected_cols.columns))

print('\nModel stats ...')
run_model(df_selected_cols, y)

CRIM        2.095450
ZN          2.696840
INDUS      11.706718
RAD         5.598028
DIS        14.638654
AGE        21.331218
B          20.027726
LSTAT      11.063005
NOX        71.861373
RM         77.547333
PTRATIO    81.157915
dtype: float64

Model stats ...
y-intercept             :  45.03691879058434
beta coefficients       :  [ -0.1    0.05  -0.02   0.15  -1.51   0.     0.01  -0.57 -21.79   2.95  -1.09]
Mean Abs Error   MAE    :  3.4512729822191
Mean Sq  Error MSE      :  21.047199038809648
Root Mean Sq Error RMSE :  4.587722641879045
r2 value                :  0.7703636809899711
MAPE                    :  16.774926862617438

Statistics=0.937, p=0.000
residuals does not look Gaussian (reject H0)


## Check multi collinearity using eigen things

In [159]:
from numpy.linalg import inv

import scipy 
import scipy.linalg as la

In [160]:
def multicoll_eigen(df_selected_cols, y):

    X = df_selected_cols.values
    
    # get the corr
    scipy.corrcoef(X, rowvar=False)

    # check the multi collinearity
    corr = scipy.corrcoef(X, rowvar=False)

    eigvals, eigvecs = la.eig(corr)
    eigvals = eigvals.real

    print(eigvals)
    print(eigvecs)

In [161]:
# check VIF for all columns
df_selected_cols = df

multicoll_eigen(df_selected_cols, y)

[6.13 1.43 1.24 0.86 0.83 0.66 0.54 0.4  0.06 0.28 0.17 0.19 0.22]
[[-0.25  0.32 -0.25 -0.06  0.08  0.22 -0.78 -0.15 -0.05 -0.26  0.09  0.11 -0.02]
 [ 0.26  0.32 -0.3  -0.13  0.32  0.32  0.27  0.4   0.08 -0.36 -0.07 -0.26 -0.27]
 [-0.35 -0.11  0.02 -0.02 -0.01  0.08  0.34 -0.17  0.25 -0.64 -0.11  0.3   0.36]
 [-0.01 -0.45 -0.29 -0.82  0.09 -0.17 -0.07  0.02 -0.04  0.01 -0.   -0.01  0.01]
 [-0.34 -0.22 -0.12  0.13  0.14  0.15  0.2  -0.08 -0.04  0.02  0.8  -0.11 -0.23]
 [ 0.19 -0.15 -0.59  0.28 -0.42 -0.06 -0.06  0.33 -0.05 -0.05  0.15 -0.05  0.43]
 [-0.31 -0.31  0.02  0.18  0.02  0.07 -0.12  0.6   0.04  0.07 -0.21  0.46 -0.36]
 [ 0.32  0.35  0.05 -0.22  0.1  -0.02  0.1   0.12  0.02  0.15  0.39  0.7   0.17]
 [-0.32  0.27 -0.29 -0.13 -0.2   0.14  0.14 -0.08  0.63  0.47 -0.11 -0.04 -0.02]
 [-0.34  0.24 -0.22 -0.1  -0.13  0.19  0.31 -0.08 -0.72  0.18 -0.22  0.1   0.04]
 [-0.2   0.31  0.32 -0.28 -0.58 -0.27 -0.    0.32 -0.02 -0.25  0.21 -0.17 -0.15]
 [ 0.2  -0.24  0.3  -0.17 -0.35  0.8  -0.0

there seems to __no multi-collinearity__, as there no eigen value = 0

In [162]:
print('\nModel coeff ...')
run_model(df_selected_cols, y)


Model coeff ...
y-intercept             :  46.396493871823736
beta coefficients       :  [ -0.1    0.06   0.06   2.44 -21.47   2.8    0.    -1.52   0.31  -0.01  -1.01   0.01  -0.57]
Mean Abs Error   MAE    :  3.344665503598751
Mean Sq  Error MSE      :  19.83132367206325
Root Mean Sq Error RMSE :  4.453237437198162
r2 value                :  0.783629538507628
MAPE                    :  16.207536032281407

Statistics=0.945, p=0.000
residuals does not look Gaussian (reject H0)


## let us try adding collinear columns

In [191]:
df_orig = df

In [193]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [194]:
# check VIF
df_selected_cols = df

model_coeff(df_selected_cols, y)

CRIM            : -0.098542
ZN              : 0.060784
INDUS           : 0.059172
CHAS            : 2.439560
NOX             : -21.469965
RM              : 2.795814
AGE             : 0.003575
DIS             : -1.516272
RAD             : 0.307542
TAX             : -0.011280
PTRATIO         : -1.005466
B               : 0.006450
LSTAT           : -0.568835


In [195]:
df['AGE_IN_DAYS'] = df['AGE'] * 365 +10

# check VIF
df_selected_cols = df

model_coeff(df_selected_cols, y)

CRIM            : -0.098542
ZN              : 0.060784
INDUS           : 0.059172
CHAS            : 2.439560
NOX             : -21.469965
RM              : 2.795814
AGE             : 0.000000
DIS             : -1.516272
RAD             : 0.307542
TAX             : -0.011280
PTRATIO         : -1.005466
B               : 0.006450
LSTAT           : -0.568835
AGE_IN_DAYS     : 0.000010


In [196]:
# check VIF for all columns
df_selected_cols = df

multicoll_eigen(df_selected_cols, y)

[6.77 1.59 1.24 0.83 0.89 0.66 0.56 0.5  0.06 0.28 0.24 0.17 0.2  0.  ]
[[-0.23 -0.33  0.24  0.09  0.   -0.26  0.55  0.56 -0.05 -0.25 -0.09 -0.11  0.09 -0.  ]
 [ 0.25 -0.25  0.28  0.33  0.   -0.38  0.   -0.5   0.08 -0.37 -0.11  0.11 -0.35  0.  ]
 [-0.33  0.04 -0.01 -0.01  0.08 -0.02 -0.42 -0.01  0.25 -0.64  0.08  0.07  0.48  0.  ]
 [-0.01  0.36  0.31  0.13  0.85  0.12  0.15 -0.03 -0.04  0.01  0.01  0.01 -0.01 -0.  ]
 [-0.33  0.15  0.13  0.13 -0.06 -0.09 -0.32  0.05 -0.04  0.01 -0.05 -0.77 -0.34  0.  ]
 [ 0.17  0.18  0.6  -0.44 -0.26  0.05  0.13 -0.11 -0.04 -0.11  0.51 -0.12  0.08  0.  ]
 [-0.32  0.32 -0.01  0.01 -0.18 -0.11  0.3  -0.31  0.02  0.04 -0.21  0.04  0.13 -0.71]
 [ 0.31 -0.28 -0.06  0.11  0.11 -0.04  0.12 -0.29  0.02  0.17 -0.06 -0.52  0.62 -0.  ]
 [-0.29 -0.31  0.28 -0.2   0.12 -0.15 -0.12 -0.06  0.63  0.48 -0.01  0.11 -0.02  0.  ]
 [-0.31 -0.29  0.22 -0.12  0.1  -0.18 -0.28 -0.15 -0.72  0.19 -0.05  0.19  0.15 -0.  ]
 [-0.19 -0.32 -0.33 -0.57  0.26  0.2   0.25 -0.29 -0.02 -0

In [198]:
df['AGE_IN_DAYS'] = df['AGE'] * 365 / 10
df['RM_SQUARED']  = df['RM'] ** 2

# check VIF
df_selected_cols = df

model_coeff(df_selected_cols, y)

CRIM            : -0.124955
ZN              : 0.045138
INDUS           : 0.087996
CHAS            : 1.802381
NOX             : -20.797713
RM              : -28.059592
AGE             : -0.000000
DIS             : -1.235400
RAD             : 0.280806
TAX             : -0.011199
PTRATIO         : -0.847498
B               : 0.005826
LSTAT           : -0.556937
AGE_IN_DAYS     : -0.000015
RM_SQUARED      : 2.448717


In [199]:
# check VIF for all columns
df_selected_cols = df

multicoll_eigen(df_selected_cols, y)

[ 6.99  1.95  1.53  0.94  0.86  0.66  0.56  0.51  0.28  0.27  0.2   0.17  0.06  0.   -0.  ]
[[-0.22  0.01 -0.39 -0.11 -0.08  0.25 -0.52  0.6  -0.16 -0.2   0.09  0.11 -0.05 -0.01  0.  ]
 [ 0.24 -0.04 -0.31 -0.29 -0.27  0.37 -0.04 -0.5  -0.24 -0.31 -0.35 -0.1   0.08 -0.01 -0.  ]
 [-0.32  0.04  0.03 -0.05  0.06  0.02  0.4  -0.05 -0.61 -0.23  0.47 -0.09  0.25  0.01 -0.  ]
 [-0.01  0.23  0.24 -0.81  0.43 -0.13 -0.17 -0.03 -0.    0.02 -0.01 -0.01 -0.04 -0.   -0.  ]
 [-0.32  0.17  0.08 -0.09 -0.16  0.09  0.32  0.03  0.01 -0.   -0.33  0.78 -0.04 -0.    0.  ]
 [ 0.19  0.57 -0.17  0.18  0.07 -0.03 -0.06 -0.06 -0.17  0.17  0.04  0.06 -0.03 -0.71 -0.  ]
 [-0.31  0.21  0.26  0.12 -0.13  0.1  -0.3  -0.28  0.17 -0.18  0.13 -0.03  0.02 -0.   -0.71]
 [ 0.3  -0.24 -0.19 -0.1   0.01  0.03 -0.16 -0.3   0.13  0.09  0.63  0.52  0.02  0.02  0.  ]
 [-0.28  0.08 -0.42 -0.07  0.19  0.16  0.14 -0.05  0.43  0.22 -0.02 -0.11  0.63 -0.01 -0.  ]
 [-0.3   0.04 -0.36 -0.07  0.13  0.19  0.28 -0.16  0.19  0.06  0.15 -0.