## Cross Validated Grid Search

Let's do a Cross Validated Grid Search for the following Models using features selected in EDA
- Ridge
- Lasso
- KNN (K Nearest Neighbor)
- SVM (Support Vector Machine)
- Decision Tree

In [1]:
cd ..

/home/jovyan/Ames_Housing_Data/ipynb


In [2]:
cd ..

/home/jovyan/Ames_Housing_Data


In [3]:
!pip --quiet install mglearn

In [4]:
import mglearn
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler, scale
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
run src/load_data_2.py

In [6]:
housing_df = load_train_data()

In [7]:
clean_data(housing_df)
housing_df.shape
#housing_df.dtypes

(1423, 78)

#### Split features & target

In [8]:
features, target = split_features_target(housing_df)

#### Scale numerical features & One hot encode the categorical features

In [9]:
numerical_features, categorical_features = split_numerical_categorical(features)

scaled_numerical_features = log_scale_features(numerical_features)
categorical_features = one_hot_encode_features(categorical_features)

scaled_features_df = scaled_numerical_features.merge(categorical_features, left_index=True, right_index=True, how='left')
unscaled_features_df = numerical_features.merge(categorical_features, left_index=True, right_index=True, how='left')

#### Split the dataset into train and test

In [10]:
target = housing_df["SalePrice"]

X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(scaled_features_df, target, test_size = .25, random_state = 42)

X_train, X_test, y_train, y_test = train_test_split(unscaled_features_df, target, test_size = .25, random_state = 42)

### (1) Grid Search CV on Ridge

In [11]:
ridge = Ridge()
ridge.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [12]:
ridge_params = [
    { 'alpha': np.logspace(.1, 1.2, 22), 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'] }
]
n_folds = 10

In [13]:
ridge_gs = GridSearchCV(ridge, param_grid= ridge_params, cv=n_folds, n_jobs=-1)

In [14]:
ridge_gs.fit(X_train_scaled, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'alpha': array([  1.25893,   1.4203 ,   1.60237,   1.80777,   2.0395 ,   2.30094,
         2.59589,   2.92864,   3.30406,   3.72759,   4.20542,   4.7445 ,
         5.35268,   6.03882,   6.81292,   7.68625,   8.67152,   9.78309,
        11.03716,  12.45197,  14.04815,  15.84893]), 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
ridge_gs.best_score_

0.86242686511892264

In [16]:
ridge_gs.best_params_

{'alpha': 9.7830931901782865, 'solver': 'auto'}

In [17]:
ridge_gs.best_estimator_

Ridge(alpha=9.7830931901782865, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)

In [18]:
ridge_gs.score(X_train_scaled, y_train), ridge_gs.score(X_test_scaled, y_test)

(0.90556998499539842, 0.81896478353533375)

### (2) Grid Search CV on Lasso

In [19]:
lasso = Lasso()

In [61]:
lasso_params = [
    { 'alpha': np.logspace(.1, 1, 12) }, # last parameter represents the number of samples
    { 'alpha': np.logspace(-3, 3, 5) },
]
n_folds = 10

In [62]:
lasso_gs = GridSearchCV(lasso, param_grid= lasso_params, cv=n_folds, n_jobs=-1)

In [63]:
lasso_gs.fit(X_train_scaled, y_train)







GridSearchCV(cv=10, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'alpha': array([  1.25893,   1.51991,   1.835  ,   2.21541,   2.67469,   3.22917,
         3.8986 ,   4.70682,   5.68258,   6.86062,   8.28289,  10.     ])}, {'alpha': array([  1.00000e-03,   3.16228e-02,   1.00000e+00,   3.16228e+01,
         1.00000e+03])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [64]:
lasso_gs.best_score_

0.85068991423769502

In [65]:
lasso_gs.best_params_

{'alpha': 31.622776601683793}

In [66]:
lasso_gs.best_estimator_

Lasso(alpha=31.622776601683793, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [67]:
lasso_gs.score(X_train_scaled, y_train)

0.92775043920757738

In [68]:
lasso_gs.score(X_test_scaled, y_test)

0.82424734999142324

### (3) Grid Search CV on KNN

In [29]:
knn = KNeighborsRegressor()

In [69]:
knn_params = [
    { 'n_neighbors': range(1, 30, 1) }  
]
n_folds = 10

In [70]:
knn_gs = GridSearchCV(knn, param_grid= knn_params, cv=n_folds, n_jobs=-1)

In [71]:
knn_gs.fit(X_train_scaled, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'n_neighbors': range(1, 30)}], pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [72]:
knn_gs.best_score_

0.77129398483404243

In [73]:
knn_gs.best_params_

{'n_neighbors': 3}

In [74]:
knn_gs.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform')

In [75]:
y_test_pred = knn_gs.predict(X_test_scaled)

In [76]:
knn_gs.score(X_train_scaled, y_train)

0.88888133223170551

In [77]:
knn_gs.score(X_test_scaled, y_test)

0.71007490822956554

### (4) Grid Search CV on SVM

In [39]:
svm = SVR()

In [40]:
svm_params = [
    { 'C': np.logspace(-3,3,7), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': [0.001, 0.0001], 'epsilon':[.1,.2] }
]
n_folds = 5

In [41]:
svm_gs = GridSearchCV(svm, param_grid= svm_params, cv=n_folds, n_jobs=-1)

In [42]:
svm_gs.fit(X_train_scaled, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02,   1.00000e+03]), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': [0.001, 0.0001], 'epsilon': [0.1, 0.2]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
svm_gs.best_score_

0.84464771882238443

In [44]:
svm_gs.best_params_

{'C': 1000.0, 'epsilon': 0.1, 'gamma': 0.001, 'kernel': 'linear'}

In [45]:
svm_gs.best_estimator_

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [46]:
y_test_pred = svm_gs.predict(X_test_scaled)

In [47]:
svm_gs.score(X_train_scaled, y_train)

0.86204740393669055

In [48]:
svm_gs.score(X_test_scaled, y_test)

0.80679472263180796

### (5) Grid Search CV on DecisionTreeRegressor

In [49]:
dtree = DecisionTreeRegressor()

In [50]:
dtree_params = [
    { 'criterion': ['mse', 'friedman_mse', 'mae'], 'splitter': ['best', 'random'], 'max_depth': range(1,10,1) }  
]
n_folds = 10

In [51]:
dtree_gs = GridSearchCV(dtree, param_grid= dtree_params, cv=n_folds, n_jobs=-1)

In [52]:
dtree_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'criterion': ['mse', 'friedman_mse', 'mae'], 'splitter': ['best', 'random'], 'max_depth': range(1, 10)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
dtree_gs.best_score_

0.74424708967434805

In [54]:
dtree_gs.best_params_

{'criterion': 'mse', 'max_depth': 7, 'splitter': 'best'}

In [55]:
dtree_gs.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [56]:
y_test_pred = dtree_gs.predict(X_test)

In [57]:
dtree_gs.score(X_train, y_train)

0.94771390784821441

In [58]:
dtree_gs.score(X_test, y_test)

0.83308594690114868