In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# load data
housing_df = pd.read_csv('HousingData.csv')
housing_df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,,36.2


In [3]:
# drop null values
housing_df = housing_df.dropna()

In [4]:
# declare X and y
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [5]:
#Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
#Create the regressor: reg
reg = LinearRegression()

In [7]:
#Fit the regressor to the training data
reg.fit(X_train, y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Predict on the test data: y_pred
y_pred = reg.predict(X_test)

In [9]:
# Compute and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [10]:

#Exercise146 begins from here


In [11]:
from sklearn.model_selection import cross_val_score

In [12]:
def regression_model_cv(model, k=5):
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean ())

In [13]:
regression_model_cv(LinearRegression())

Reg rmse: [3.54810173 4.50937757 5.82330106 8.05737628 5.2191234 ]
Reg mean: 5.431456008148543


In [14]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 4.40674889  6.15899    19.11540116]
Reg mean: 9.893713348149243


In [15]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [ 3.2569782   4.0645006   5.69790223  4.05568051 10.69290989  4.23497297]
Reg mean: 5.333824068010533


In [21]:

#Exercise147 begins from here


In [18]:
from sklearn.neighbors import KNeighborsRegressor
regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 9.45732105  7.9848147  10.9163503  11.09935544  6.57277173]
Reg mean: 9.206122643945283


In [19]:
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

Reg rmse: [ 8.88815792  8.24186836 10.94271657 10.96090976  7.01075639]
Reg mean: 9.208881798413609


In [20]:
regression_model_cv(KNeighborsRegressor(n_neighbors=7))

Reg rmse: [ 9.65464687  8.1939451  11.38313391 11.32567661  6.5464274 ]
Reg mean: 9.420765975578501


In [22]:
regression_model_cv(KNeighborsRegressor(n_neighbors=10))

Reg rmse: [ 7.81856109  8.26859385 11.49600156 10.69216151  5.94022576]
Reg mean: 8.84310875340935


In [23]:

#Exercise148 begins from here


In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
neighbors = np.linspace(1, 20, 20)

In [27]:
k = neighbors.astype(int)

In [28]:
param_grid = {'n_neighbors': k}

In [29]:
knn = KNeighborsRegressor()

In [30]:
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

In [31]:
knn_tuned.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [32]:
k = knn_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'n_neighbors': 17}
Best score: 8.908161360197397


In [33]:

#Exercise149 begins from here


In [34]:
from sklearn import tree
regression_model_cv(tree.DecisionTreeRegressor())

Reg rmse: [3.92707254 6.76109496 6.46172764 6.52558742 6.15295426]
Reg mean: 5.965687363299925


In [35]:
from sklearn.ensemble import RandomForestRegressor
regression_model_cv(RandomForestRegressor())

Reg rmse: [3.36399232 3.91732665 4.93002375 6.3685335  5.80821269]
Reg mean: 4.877617781853548




In [37]:

#Exercise150 begins from here


In [38]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100))

Reg rmse: [3.39170277 3.71170163 4.60788038 6.45159401 6.10758199]
Reg mean: 4.854092155859847


In [39]:
from sklearn.model_selection import RandomizedSearchCV

In [40]:
param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3],
             'max_features': ['auto', 'sqrt']}

In [41]:
reg = RandomForestRegressor(n_jobs = -1)

In [42]:
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')

In [43]:
reg_tuned.fit(X, y)





RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=-1, oob_score=False,
                                                   random_state=Non

In [44]:
p = reg_tuned.best_params_
print("Best n_neighbors: {}".format(p))
score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 100}
Best score: 4.660732470370791


In [45]:
# Setup the hyperparameter grid
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500))

Reg rmse: [3.44912811 3.55687398 4.87946051 6.51052193 6.29685798]
Reg mean: 4.938568500515866
