In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# load data
housing_df = pd.read_csv('HousingData.csv')
housing_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [3]:
# drop null values
housing_df = housing_df.dropna()

In [4]:
# declare X and y
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [5]:
#Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [6]:
#Create the regressor: reg
reg = LinearRegression()

In [7]:
#Fit the regressor to the training data
reg.fit(X_train, y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
# Predict on the test data: y_pred
y_pred = reg.predict(X_test)

In [9]:
# Compute and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [15]:
#Exercise02 begins from here

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
def regression_model_cv(model, k=5):
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean ())

In [18]:
regression_model_cv(LinearRegression())

Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
Reg mean: 5.33786896287833


In [19]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 3.72504914  6.01655701 23.20863933]
Reg mean: 10.983415161090818


In [20]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
Reg mean: 5.0865908108011


In [24]:
#Exercise 3 begins from here

In [25]:
from sklearn.neighbors import KNeighborsRegressor
regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]
Reg mean: 8.495356738515685


In [26]:
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

Reg rmse: [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]
Reg mean: 8.600813339223432


In [27]:
regression_model_cv(KNeighborsRegressor(n_neighbors=7))

Reg rmse: [ 7.99710601  8.68309183 10.66332898  8.90261573  5.51032355]
Reg mean: 8.351293217401393


In [28]:
regression_model_cv(KNeighborsRegressor(n_neighbors=10))

Reg rmse: [ 7.47549287  8.62914556 10.69543822  8.91330686  6.52982222]
Reg mean: 8.448641147609868


In [39]:
#Exercise 4 begins from here

In [31]:
from sklearn.model_selection import GridSearchCV

# 1. Setup hyper-parameter grid
neighbors = np.linspace(1, 20, 20)
# Convert floats to int (required by knn)
k = neighbors.astype(int)
# Place grid in dictionary
param_grid = {'n_neighbors': k}

In [36]:
# 2. Build model for each neighbor
knn = KNeighborsRegressor()

In [38]:
# Instantiate the GridSearchCV object: knn_tuned
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
# Fit knn_tuned to the data
knn_tuned.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [45]:
# 3. Print the tuned parameters and score
k = knn_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'n_neighbors': 7}
Best score: 8.523048500643897


In [46]:
#Exercise 5 begins from here

In [47]:
from sklearn import tree
regression_model_cv(tree.DecisionTreeRegressor())

Reg rmse: [3.73093466 7.17241551 8.56035164 6.62898316 5.68535004]
Reg mean: 6.355607003553722


In [48]:
from sklearn.ensemble import RandomForestRegressor
regression_model_cv(RandomForestRegressor())

Reg rmse: [3.36606934 4.19973929 6.13967673 6.65576794 4.59948004]
Reg mean: 4.992146668618635




In [49]:
#Exercise 6 begins from here

In [50]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100))

Reg rmse: [3.17114517 3.8954232  4.47400625 6.59059091 3.93546126]
Reg mean: 4.413325357365643


In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3],
             'max_features': ['auto', 'sqrt']}

In [53]:
reg = RandomForestRegressor(n_jobs = -1)

In [54]:
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
# Fit reg_tuned to the data
reg_tuned.fit(X, y)






RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': [None, 10, 30, 50, 70, 100, 200, 400], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'max_features': ['auto', 'sqrt']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [55]:
p = reg_tuned.best_params_
print("Best n_neighbors: {}".format(p))
score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 70}
Best score: 4.5291297683433935


In [56]:
# Setup the hyperparameter grid
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500))

Reg rmse: [3.18400301 3.76596999 4.88485476 6.51350148 4.09019033]
Reg mean: 4.4877039155564535


In [60]:
#Exercise 12 step 3 and 4 begins from here

In [61]:
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [62]:
from sklearn.ensemble import AdaBoostRegressor
regression_model_cv(AdaBoostRegressor())


Reg rmse: [3.75023024 3.48211969 5.46911888 6.30026928 4.13913715]
Reg mean: 4.628175048702711
