In [24]:
import pandas as pd
import numpy as np
#from pylab import *
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import linear_model,feature_selection
import statsmodels.graphics.api as smg
from statsmodels.stats.outliers_influence import variance_inflation_factor,OLSInfluence
import statsmodels.api as sm
import statsmodels.stats.diagnostic as ssd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

seed = 2017
##### READ DATA

dataframe=pd.read_csv('BostonHausing.csv')

## FEATURE SELECTION WITH VIF
###### REMOVE MULTICOLLINEARITY
listnames = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
#listnames = ['CRIM', 'ZN', 'AGE', 'DIS', 'RAD', 'LSTAT']
## calc VIF

# use the list to select a subset from original DataFrame
X = dataframe[listnames]
Y = dataframe['MEDV']


for i in np.arange(0,len(listnames)):
    vif = [variance_inflation_factor(X[listnames].values, ix) for ix in range(X[listnames].shape[1])]
    maxloc = vif.index(max(vif))
    if max(vif) > 10:
        #print('vif :', vif)
        #print('dropping' + X[listnames].columns[maxloc] + 'at index: ' + str(maxloc))
        del listnames[maxloc]
    else:
        break
print('Final variables:', listnames)

X = dataframe[listnames]
Y = dataframe['MEDV']

# Normalize Data
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
# evaluate the model by splitting into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)
'''
### CREATE FITTED MODEL
lm=sm.OLS(Y_train,X_train)
lmf = lm.fit()
### PRINT SUMMARY
print(lmf.summary())  # check p-values
'''

## CREAT FITTED MODEL
model = linear_model.LinearRegression()
modelf=model.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print('\nScore without k-fold, train',modelf.score(X_train, Y_train))
print('\nScore without k-fold, test',metrics.r2_score(Y_test,y_pred))

# evaluate the model using 10-fold cross-validation
train_scores = cross_val_score(model, X_train, Y_train, cv=5)
test_scores = cross_val_score(model, X_test, Y_test, cv=5)
print ("\nTrain Fold Scores: ", train_scores)
print ("Train CV Score: ", train_scores.mean())
print ("Test Fold Scores: ", test_scores)
print ("Test CV Score: ", test_scores.mean())
'''
# Using Bagging 

model_Bag = BaggingRegressor(base_estimator=model, n_estimators=70, random_state=0).fit(X_train,Y_train)
results = model_selection.cross_val_score(model_Bag, X_train, Y_train,cv=5)
print('~~~~~~~~~~~')
print ("Linear regression (Bagging) - Train : ", results.mean())
print ("Linear regression  (Bagging) - Test : ", metrics.r2_score(model_Bag.predict(X_test), Y_test))


# Using RandomForestRegressor
model_rfr=RandomForestRegressor(random_state=seed)
model_rfr.fit(X_train,Y_train)
results = model_selection.cross_val_score(model_rfr, X_train, Y_train,cv=5)
print('~~~~~~~~~~~')
print ("Linear regression (RandomForestRegressor) - Train : ", results.mean())
print ("Linear regression  (RandomForestRegressor) - Test : ", metrics.r2_score(model_rfr.predict(X_test), Y_test))

# specify parameters and distributions to sample from
param_dist = {'n_estimators':sp_randint(100,1000),
'criterion': ['mse', 'mae'],
'max_features': ['auto', 'sqrt', 'log2']
}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(model_rfr, param_distributions=param_dist,cv=None, n_iter=n_iter_search,
verbose=5, n_jobs=-1, random_state=seed)
random_search.fit(X_train, Y_train)
# report(random_search.cv_results_)
print ('Best Parameters: ', random_search.best_params_)
results = model_selection.cross_val_score(random_search.best_estimator_,X_train,Y_train, cv=None)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.r2_score(random_search.best_estimator_.predict(X_train), Y_train))
print ("Accuracy - Test : ", metrics.r2_score(random_search.best_estimator_.predict(X_test), Y_test))

#Best Parameters:  {'criterion': 'mse', 'max_features': 'log2', 'n_estimators': 312}
#Accuracy - Train CV:  0.7952687143532291
#Accuracy - Train :  0.9737119717756719
#Accuracy - Test :  0.4946292870977391

'''

# Using GradientBoostingRegressor
model_gbr=GradientBoostingRegressor(random_state=seed)
# specify parameters and distributions to sample from
param_dist = {'n_estimators':sp_randint(100,500),
'learning_rate': [0.1,5],
'loss': ['ls', 'lad', 'huber', 'quantile'],
'criterion':['friedman_mse', 'mse', 'mae'],
'max_depth': [1, 3, 5, 7, 9],
'max_features':['auto', 'sqrt', 'log2']
}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(model_gbr, param_distributions=param_dist,cv=None, n_iter=n_iter_search,
verbose=5, n_jobs=-1, random_state=seed)
random_search.fit(X_train, Y_train)
# report(random_search.cv_results_)
print ('Best Parameters: ', random_search.best_params_)
results = model_selection.cross_val_score(random_search.best_estimator_,X_train,Y_train, cv=None)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.r2_score(random_search.best_estimator_.predict(X_train), Y_train))
print ("Accuracy - Test : ", metrics.r2_score(random_search.best_estimator_.predict(X_test), Y_test))

#Best Parameters:  {'criterion': 'mae', 'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 369}
#Accuracy - Train CV:  0.791102433867587
#Accuracy - Train :  0.9999624822674393
#Accuracy - Test :  0.5717709438572165



Final variables: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'DIS', 'RAD', 'LSTAT']

Score without k-fold, train 0.6614040986850935

Score without k-fold, test 0.5624711065581123

Train Fold Scores:  [0.62255679 0.56031802 0.58291084 0.6616309  0.70104557]
Train CV Score:  0.6256924244196325
Test Fold Scores:  [0.54563004 0.41087265 0.49748181 0.32003529 0.79723598]
Test CV Score:  0.5142511517118004
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] criterion=mse, learning_rate=0.1, loss=lad, max_depth=5, max_features=auto, n_estimators=278 
[CV] criterion=mse, learning_rate=0.1, loss=lad, max_depth=5, max_features=auto, n_estimators=278 
[CV]  criterion=mse, learning_rate=0.1, loss=lad, max_depth=5, max_features=auto, n_estimators=278, score=0.8407000950752024, total=   0.6s
[CV] criterion=mse, learning_rate=0.1, loss=lad, max_depth=5, max_features=auto, n_estimators=278 
[CV]  criterion=mse, learning_rate=0.1, loss=lad, max_depth=5, max_features=auto, n_estimators=278, score=0.71

  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=huber, max_depth=7, max_features=sqrt, n_estimators=293, score=-inf, total=   5.1s
[CV] criterion=mse, learning_rate=5, loss=huber, max_depth=7, max_features=sqrt, n_estimators=293 
[CV]  criterion=mae, learning_rate=0.1, loss=huber, max_depth=7, max_features=log2, n_estimators=369, score=0.8023297004837613, total=  11.4s
[CV] criterion=mse, learning_rate=5, loss=huber, max_depth=7, max_features=sqrt, n_estimators=293 


  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  sq_loss = np.sum(0.5 * sample_weight[gamma_mask] * diff[gamma_mask] ** 2.0)
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=huber, max_depth=7, max_features=sqrt, n_estimators=293, score=-inf, total=   8.9s
[CV] criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493 


  return umr_sum(a, axis, dtype, out, keepdims)
  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   25.5s
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493, score=-inf, total=   0.4s
[CV] criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493 


  return umr_sum(a, axis, dtype, out, keepdims)
  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493, score=-inf, total=   0.7s
[CV] criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493 


  return umr_sum(a, axis, dtype, out, keepdims)
  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=1, max_features=log2, n_estimators=493, score=-inf, total=   0.3s
[CV] criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384 


  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=huber, max_depth=7, max_features=sqrt, n_estimators=293, score=-inf, total=   5.7s
[CV] criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384 
[CV]  criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384, score=0.7060131608427861, total=   1.7s
[CV] criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384 
[CV]  criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384, score=0.7820058347232338, total=   1.9s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341 
[CV]  criterion=mae, learning_rate=0.1, loss=ls, max_depth=3, max_features=auto, n_estimators=384, score=0.8135128712715812, total=   2.0s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341, score=-inf, total=   1.9s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341, score=-inf, total=   1.6s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=7, max_features=auto, n_estimators=341, score=-inf, total=   1.8s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187, score=0.5826660896060565, total=   1.2s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187, score=0.6846026768867333, total=   2.2s
[CV] criterion=friedman_mse, learning_rate=0.1, loss=huber, max_depth=5, max_features=log2, n_estimators=195 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=9, max_features=sqrt, n_estimators=187, score=0.7093730422909977, total=   2.7s
[CV] criterion=friedman_mse, learning_rate=0.1, loss=huber, max_depth=5, max_features=log2, n_estimators=195 
[CV]  criteri

  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
  return umr_sum(a, axis, dtype, out, keepdims)
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=7, max_features=auto, n_estimators=472, score=-inf, total=   0.6s
[CV] criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=7, max_features=auto, n_estimators=472 


  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
  return umr_sum(a, axis, dtype, out, keepdims)
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=0.1, loss=huber, max_depth=5, max_features=log2, n_estimators=195, score=0.7876985325831637, total=   1.1s
[CV] criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=7, max_features=auto, n_estimators=472 
[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=7, max_features=auto, n_estimators=472, score=-inf, total=   0.4s
[CV] criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299 


  return umr_sum(a, axis, dtype, out, keepdims)
  np.sum(sample_weight * ((y - pred.ravel()) ** 2.0)))
  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299, score=-inf, total=   0.3s
[CV] criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=ls, max_depth=7, max_features=auto, n_estimators=472, score=-inf, total=   0.4s
[CV] criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299 


  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299, score=-inf, total=   0.3s
[CV] criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421 


  return umr_sum(a, axis, dtype, out, keepdims)
  (np.abs(diff[~gamma_mask]) - gamma / 2.0))
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=huber, max_depth=1, max_features=log2, n_estimators=299, score=-inf, total=   0.5s
[CV] criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421, score=-inf, total=   0.4s
[CV] criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421, score=-inf, total=   0.4s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=1, max_features=auto, n_estimators=421, score=-inf, total=   0.5s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128, score=0.6308247230889, total=   0.7s
[CV] criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128, score=0.4350641179858209, total=   0.9s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378, score=-inf, total=   0.3s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378, score=-inf, total=   0.2s
[CV] criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=lad, max_depth=1, max_features=sqrt, n_estimators=378, score=-inf, total=   0.2s
[CV] criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312 
[CV]  criterion=mae, learning_rate=0.1, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=128, score=0.6474112760224219, total=   0.8s
[CV] criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312, score=-inf, total=   0.1s
[CV] criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312, score=-inf, total=   0.2s
[CV] criterion=mae, learning_rate=0.1, loss=lad, max_depth=1, max_features=auto, n_estimators=180 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=friedman_mse, learning_rate=5, loss=quantile, max_depth=1, max_features=log2, n_estimators=312, score=-inf, total=   0.2s
[CV] criterion=mae, learning_rate=0.1, loss=lad, max_depth=1, max_features=auto, n_estimators=180 
[CV]  criterion=mae, learning_rate=0.1, loss=lad, max_depth=1, max_features=auto, n_estimators=180, score=0.7203330882231112, total=   0.8s
[CV] criterion=mae, learning_rate=0.1, loss=lad, max_depth=1, max_features=auto, n_estimators=180 
[CV]  criterion=mae, learning_rate=0.1, loss=lad, max_depth=1, max_features=auto, n_estimators=180, score=0.6916690591794521, total=   1.5s
[CV] criterion=mse, learning_rate=5, loss=quantile, max_depth=3, max_features=log2, n_estimators=190 
[CV]  criterion=mse, learning_rate=5, loss=quantile, max_depth=3, max_features=log2, n_estimators=190, score=-9.109386275170741e+228, total=   0.2s
[CV] criterion=mse, learning_rate=5, loss=quantile, max_depth=3, max_features=log2, n_estimators=190 
[CV]  criterion=mae, learning_ra

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=9, max_features=sqrt, n_estimators=405, score=-inf, total=   1.9s
[CV] criterion=mse, learning_rate=5, loss=lad, max_depth=9, max_features=sqrt, n_estimators=405 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=9, max_features=sqrt, n_estimators=405, score=-inf, total=   2.6s
[CV] criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274, score=-inf, total=   0.7s
[CV] criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274, score=-inf, total=   1.2s
[CV] criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274 


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mae, learning_rate=5, loss=quantile, max_depth=7, max_features=sqrt, n_estimators=274, score=-inf, total=   0.3s


  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


[CV]  criterion=mse, learning_rate=5, loss=lad, max_depth=9, max_features=sqrt, n_estimators=405, score=-inf, total=   3.5s


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   51.1s finished
  array_means[:, np.newaxis]) ** 2,
  array_means[:, np.newaxis]) ** 2,


Best Parameters:  {'criterion': 'mae', 'learning_rate': 0.1, 'loss': 'huber', 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 369}
Accuracy - Train CV:  0.791102433867587
Accuracy - Train :  0.9999624822674393
Accuracy - Test :  0.5717709438572165


In [1]:
import pandas as pd
import numpy as np
#from pylab import *
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import linear_model,feature_selection
import statsmodels.graphics.api as smg
from statsmodels.stats.outliers_influence import variance_inflation_factor,OLSInfluence
import statsmodels.api as sm
import statsmodels.stats.diagnostic as ssd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

seed = 2017
##### READ DATA

dataframe=pd.read_csv('BostonHausing.csv')

## FEATURE SELECTION WITH VIF
###### REMOVE MULTICOLLINEARITY
listnames = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
#listnames = ['CRIM', 'ZN', 'AGE', 'DIS', 'RAD', 'LSTAT']
## calc VIF

# use the list to select a subset from original DataFrame
X = dataframe[listnames]
Y = dataframe['MEDV']


for i in np.arange(0,len(listnames)):
    vif = [variance_inflation_factor(X[listnames].values, ix) for ix in range(X[listnames].shape[1])]
    maxloc = vif.index(max(vif))
    if max(vif) > 10:
        #print('vif :', vif)
        #print('dropping' + X[listnames].columns[maxloc] + 'at index: ' + str(maxloc))
        del listnames[maxloc]
    else:
        break
print('Final variables:', listnames)

X = dataframe[listnames]
Y = dataframe['MEDV']

# Normalize Data
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
# evaluate the model by splitting into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)
'''
### CREATE FITTED MODEL
lm=sm.OLS(Y_train,X_train)
lmf = lm.fit()
### PRINT SUMMARY
print(lmf.summary())  # check p-values
'''

## CREAT FITTED MODEL
model = linear_model.LinearRegression()
modelf=model.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print('\nScore without k-fold, train',modelf.score(X_train, Y_train))
print('\nScore without k-fold, test',metrics.r2_score(Y_test,y_pred))

# Using ExtraTreesRegressor
model_gbr=ExtraTreesRegressor(random_state=seed)
# specify parameters and distributions to sample from
param_dist = {'n_estimators':sp_randint(100,1000),
'criterion': ['mse', 'mae'],
'max_features': ['auto', 'sqrt', 'log2'],
'max_depth':[None,5,30],
'bootstrap':['False','True'],
'oob_score':['False','True']
}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(model_gbr, param_distributions=param_dist,cv=None, n_iter=n_iter_search,
verbose=5, n_jobs=-1, random_state=seed)
random_search.fit(X_train, Y_train)
# report(random_search.cv_results_)
print ('Best Parameters: ', random_search.best_params_)
results = model_selection.cross_val_score(random_search.best_estimator_,X_train,Y_train, cv=None)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.r2_score(random_search.best_estimator_.predict(X_train), Y_train))
print ("Accuracy - Test : ", metrics.r2_score(random_search.best_estimator_.predict(X_test), Y_test))

#Best Parameters:  {'bootstrap': 'True', 'criterion': 'mae', 'max_depth': None, 'max_features': 'auto', 'n_estimators': 461, 'oob_score': 'True'}
#Accuracy - Train CV:  0.7838837144860213
#Accuracy - Train :  0.9692502753490986
#Accuracy - Test :  0.5669731711564234


Final variables: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'DIS', 'RAD', 'LSTAT']

Score without k-fold, train 0.6614040986850935

Score without k-fold, test 0.5624711065581123
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] bootstrap=True, criterion=mae, max_depth=30, max_features=sqrt, n_estimators=910, oob_score=False 
[CV] bootstrap=True, criterion=mae, max_depth=30, max_features=sqrt, n_estimators=910, oob_score=False 
[CV]  bootstrap=True, criterion=mae, max_depth=30, max_features=sqrt, n_estimators=910, oob_score=False, score=0.7500485958631056, total=   3.4s
[CV] bootstrap=True, criterion=mae, max_depth=30, max_features=sqrt, n_estimators=910, oob_score=False 
[CV]  bootstrap=True, criterion=mae, max_depth=30, max_features=sqrt, n_estimators=910, oob_score=False, score=0.6864354513836836, total=   3.6s
[CV] bootstrap=False, criterion=mse, max_depth=None, max_features=auto, n_estimators=140, oob_score=False 
[CV]  bootstrap=False, criterion=mse, max_depth=None, max_featu

[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   24.0s


[CV]  bootstrap=False, criterion=mae, max_depth=5, max_features=sqrt, n_estimators=420, oob_score=True, score=0.6048111210214726, total=   3.2s
[CV] bootstrap=False, criterion=mse, max_depth=30, max_features=auto, n_estimators=608, oob_score=True 
[CV]  bootstrap=False, criterion=mse, max_depth=30, max_features=auto, n_estimators=608, oob_score=True, score=0.7312164913555741, total=   3.1s
[CV] bootstrap=False, criterion=mse, max_depth=30, max_features=auto, n_estimators=608, oob_score=True 
[CV]  bootstrap=False, criterion=mse, max_depth=30, max_features=auto, n_estimators=608, oob_score=True, score=0.7904418470480665, total=   1.4s
[CV] bootstrap=False, criterion=mse, max_depth=None, max_features=sqrt, n_estimators=756, oob_score=True 
[CV]  bootstrap=False, criterion=mse, max_depth=30, max_features=auto, n_estimators=608, oob_score=True, score=0.8201007283838212, total=   1.4s
[CV] bootstrap=False, criterion=mse, max_depth=None, max_features=sqrt, n_estimators=756, oob_score=True 
[

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.5min finished


Best Parameters:  {'bootstrap': 'True', 'criterion': 'mae', 'max_depth': None, 'max_features': 'auto', 'n_estimators': 461, 'oob_score': 'True'}
Accuracy - Train CV:  0.7838837144860213
Accuracy - Train :  0.9692502753490986
Accuracy - Test :  0.5669731711564234


In [1]:
import pandas as pd
import numpy as np
#from pylab import *
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import linear_model,feature_selection
import statsmodels.graphics.api as smg
from statsmodels.stats.outliers_influence import variance_inflation_factor,OLSInfluence
import statsmodels.api as sm
import statsmodels.stats.diagnostic as ssd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

seed = 2017
##### READ DATA

dataframe=pd.read_csv('BostonHausing.csv')

## FEATURE SELECTION WITH VIF
###### REMOVE MULTICOLLINEARITY
listnames = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
#listnames = ['CRIM', 'ZN', 'AGE', 'DIS', 'RAD', 'LSTAT']
## calc VIF

# use the list to select a subset from original DataFrame
X = dataframe[listnames]
Y = dataframe['MEDV']


for i in np.arange(0,len(listnames)):
    vif = [variance_inflation_factor(X[listnames].values, ix) for ix in range(X[listnames].shape[1])]
    maxloc = vif.index(max(vif))
    if max(vif) > 10:
        #print('vif :', vif)
        #print('dropping' + X[listnames].columns[maxloc] + 'at index: ' + str(maxloc))
        del listnames[maxloc]
    else:
        break
print('Final variables:', listnames)

X = dataframe[listnames]
Y = dataframe['MEDV']

# Normalize Data
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
# evaluate the model by splitting into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 0)
'''
### CREATE FITTED MODEL
lm=sm.OLS(Y_train,X_train)
lmf = lm.fit()
### PRINT SUMMARY
print(lmf.summary())  # check p-values
'''

## CREAT FITTED MODEL
model = linear_model.LinearRegression()
modelf=model.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print('\nScore without k-fold, train',modelf.score(X_train, Y_train))
print('\nScore without k-fold, test',metrics.r2_score(Y_test,y_pred))



# evaluate the model using 10-fold cross-validation
train_scores = cross_val_score(model, X_train, Y_train, cv=5)
test_scores = cross_val_score(model, X_test, Y_test, cv=5)
print ("\nTrain Fold Scores: ", train_scores)
print ("Train CV Score: ", train_scores.mean())
print ("Test Fold Scores: ", test_scores)
print ("Test CV Score: ", test_scores.mean())



# Using AdaBoostRegressor
model_abr=AdaBoostRegressor(base_estimator=model)
# specify parameters and distributions to sample from
param_dist = {'n_estimators':sp_randint(50,500),
'learning_rate': sp_randint(1,20),
'loss': ['linear', 'square', 'exponential']
}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(model_abr, param_distributions=param_dist,cv=None, n_iter=n_iter_search,
verbose=5, n_jobs=-1, random_state=seed)
random_search.fit(X_train, Y_train)
# report(random_search.cv_results_)
print ('Best Parameters: ', random_search.best_params_)
results = model_selection.cross_val_score(random_search.best_estimator_,X_train,Y_train, cv=None)
print ("Accuracy - Train CV: ", results.mean())
print ("Accuracy - Train : ", metrics.r2_score(random_search.best_estimator_.predict(X_train), Y_train))
print ("Accuracy - Test : ", metrics.r2_score(random_search.best_estimator_.predict(X_test), Y_test))

#Best Parameters:  {'learning_rate': 1, 'loss': 'linear', 'n_estimators': 184}
#Accuracy - Train CV:  0.5934515729681066
#Accuracy - Train :  0.49911852143960267
#Accuracy - Test :  0.3572295579715925

Final variables: ['CRIM', 'ZN', 'INDUS', 'CHAS', 'DIS', 'RAD', 'LSTAT']

Score without k-fold, train 0.6614040986850935

Score without k-fold, test 0.5624711065581123

Train Fold Scores:  [0.62255679 0.56031802 0.58291084 0.6616309  0.70104557]
Train CV Score:  0.6256924244196325
Test Fold Scores:  [0.54563004 0.41087265 0.49748181 0.32003529 0.79723598]
Test CV Score:  0.5142511517118004
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] learning_rate=10, loss=exponential, n_estimators=191 ............
[CV] learning_rate=10, loss=exponential, n_estimators=191 ............
[CV]  learning_rate=10, loss=exponential, n_estimators=191, score=0.12178044529654208, total=   0.3s
[CV] learning_rate=10, loss=exponential, n_estimators=191 ............
[CV]  learning_rate=10, loss=exponential, n_estimators=191, score=0.07938924206802522, total=   0.4s
[CV] learning_rate=11, loss=linear, n_estimators=228 .................
[CV]  learning_rate=10, loss=exponential, n_estimators=191, s

[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.5s


[CV] learning_rate=4, loss=square, n_estimators=243 ..................
[CV]  learning_rate=4, loss=square, n_estimators=243, score=-12.349504330754462, total=   0.1s
[CV] learning_rate=4, loss=square, n_estimators=243 ..................
[CV]  learning_rate=4, loss=square, n_estimators=243, score=-3.694854670430627, total=   0.1s
[CV] learning_rate=1, loss=linear, n_estimators=242 ..................
[CV]  learning_rate=1, loss=linear, n_estimators=242, score=0.5428755839452278, total=   0.0s
[CV] learning_rate=1, loss=linear, n_estimators=242 ..................
[CV]  learning_rate=1, loss=linear, n_estimators=242, score=0.5527756639026871, total=   0.0s
[CV] learning_rate=1, loss=linear, n_estimators=242 ..................
[CV]  learning_rate=1, loss=linear, n_estimators=242, score=0.6870985328492785, total=   0.0s
[CV] learning_rate=10, loss=exponential, n_estimators=342 ............
[CV]  learning_rate=4, loss=square, n_estimators=243, score=-3.2499503974674147, total=   0.6s
[CV] lea

[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.1s finished
