In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print( "Type of california housing dataset:", type(housing))

Type of california housing dataset: <class 'sklearn.utils._bunch.Bunch'>


### Load data

In [3]:
house_df = pd.DataFrame(housing['data'] )
house_df.columns = housing['feature_names']
house_df['PRICE']= housing['target']
house_df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
from sklearn.model_selection  import train_test_split
from sklearn.utils import shuffle

In [5]:
y = house_df['PRICE']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(house_df.drop('PRICE', axis=1), 
                                                    y, 
                                                    test_size=0.3, random_state=0)

### Defining Models

In [6]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor, 
                              AdaBoostRegressor, GradientBoostingRegressor,
                              HistGradientBoostingRegressor)

from sklearn.metrics import r2_score

### Default Training

In [7]:
models = {
    'linear': LinearRegression(),
    'ridge': RidgeCV(alphas = np.linspace(1e-3,10)),
    'lasso': LassoCV(alphas = np.linspace(1e-3,10)),
    'SVR': SVR(C = 0.5),
    'knn': KNeighborsRegressor(n_neighbors = 5),
    'dt' : DecisionTreeRegressor(max_depth = 5),
    'rf' : RandomForestRegressor(),
    'et' : ExtraTreesRegressor(),
    'gbm': GradientBoostingRegressor(),
    'histgbm': HistGradientBoostingRegressor()
}

scores_results = {k: np.array([0,0])for k in models.keys()}

In [8]:
for k, model in models.items():
    model.fit(X_train, y_train)
    scores_results[k] = np.array([r2_score(y_train,model.predict(X_train)),
                                  r2_score(y_test,model.predict(X_test))])

In [9]:
df_scores = pd.DataFrame.from_dict(scores_results, 
                                   orient='index',
                                   columns = ['Train Score', 'Test Score'])

In [10]:
df_scores

Unnamed: 0,Train Score,Test Score
linear,0.611294,0.592609
ridge,0.611289,0.592612
lasso,0.611257,0.592629
SVR,-0.038183,-0.038798
knn,0.438298,0.140096
dt,0.631082,0.596863
rf,0.972422,0.79159
et,1.0,0.803486
gbm,0.807847,0.782294
histgbm,0.883238,0.832029


# Model Tuning

In [11]:
from sklearn.model_selection import GridSearchCV

#### Knn Tuning

In [12]:
grid_knn = GridSearchCV(models['knn'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {'n_neighbors' : np.linspace(2,50,num = 15).astype(int)}                
                   )

In [13]:
grid_knn.fit(X_train, y_train)

In [14]:
score_cols = ['mean_test_score','rank_test_score']

In [15]:
pd.DataFrame(grid_knn.cv_results_).sort_values('rank_test_score')[['param_n_neighbors',*score_cols]]

Unnamed: 0,param_n_neighbors,mean_test_score,rank_test_score
2,8,0.119762,1
3,12,0.11406,2
4,15,0.104821,3
1,5,0.099362,4
5,19,0.097268,5
6,22,0.092623,6
7,26,0.083617,7
8,29,0.079209,8
9,32,0.074951,9
10,36,0.069093,10


#### Decision Tree

In [16]:
grid_dt = GridSearchCV(models['dt'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {'max_depth' : np.linspace(2,50,num = 15).astype(int)}                
                   )

In [17]:
grid_dt.fit(X_train, y_train)

In [18]:
pd.DataFrame(grid_dt.cv_results_).sort_values('rank_test_score')[['param_max_depth',*score_cols]]

Unnamed: 0,param_max_depth,mean_test_score,rank_test_score
2,8,0.665432,1
3,12,0.641301,2
4,15,0.608608,3
1,5,0.607672,4
12,43,0.590179,5
13,46,0.589862,6
6,22,0.586469,7
5,19,0.586396,8
8,29,0.586105,9
10,36,0.58426,10


#### Tradition Random Forest

In [19]:
grid_rf = GridSearchCV(models['rf'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {'min_samples_leaf' : np.linspace(2,15,num = 15).astype(int),
                                  'max_features': np.array([0.5, 0.66, 0.75, 0.8, 0.9])
                                 }                
                   )

In [20]:
grid_rf.fit(X_train, y_train)

In [21]:
pd.DataFrame(grid_rf.cv_results_).sort_values('rank_test_score')[['param_min_samples_leaf',
                                                                  'param_max_features',
                                                                  *score_cols]]

Unnamed: 0,param_min_samples_leaf,param_max_features,mean_test_score,rank_test_score
1,2,0.5,0.810069,1
0,2,0.5,0.809996,2
2,3,0.5,0.808271,3
15,2,0.66,0.806959,4
16,2,0.66,0.806630,5
...,...,...,...,...
58,14,0.8,0.780199,71
44,15,0.75,0.779342,72
73,14,0.9,0.779097,73
59,15,0.8,0.778672,74


#### Extra Tree

In [22]:
grid_et = GridSearchCV(models['et'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {'min_samples_leaf' : np.linspace(2,15,num = 15).astype(int),
                                  'max_features': [0.5, 0.66,0.75,0.8,0.9]
                                 }                
                   )

In [23]:
grid_et.fit(X_train, y_train)

In [24]:
score_cols = ['mean_test_score','rank_test_score']
pd.DataFrame(grid_rf.cv_results_).sort_values('rank_test_score')[['param_min_samples_leaf',
                                                                  'param_max_features',
                                                                  *score_cols]]

Unnamed: 0,param_min_samples_leaf,param_max_features,mean_test_score,rank_test_score
1,2,0.5,0.810069,1
0,2,0.5,0.809996,2
2,3,0.5,0.808271,3
15,2,0.66,0.806959,4
16,2,0.66,0.806630,5
...,...,...,...,...
58,14,0.8,0.780199,71
44,15,0.75,0.779342,72
73,14,0.9,0.779097,73
59,15,0.8,0.778672,74


#### SVR

In [25]:
grid_svr = GridSearchCV(models['SVR'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {
                                  'C': np.linspace(1, 100, num=10)
                                 }                
                   )

In [26]:
grid_svr.fit(X_train, y_train)

In [27]:
pd.DataFrame(grid_svr.cv_results_).sort_values('rank_test_score')[['param_C',*score_cols]]

Unnamed: 0,param_C,mean_test_score,rank_test_score
9,100.0,0.473768,1
8,89.0,0.460383,2
7,78.0,0.445348,3
6,67.0,0.42456,4
5,56.0,0.399404,5
4,45.0,0.359311,6
3,34.0,0.303316,7
2,23.0,0.227075,8
1,12.0,0.121449,9
0,1.0,-0.031922,10


#### Hist GBM

In [28]:
grid_histgbm = GridSearchCV(models['histgbm'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {
                                  'learning_rate': np.linspace(1e-3, 0.5, num=50)
                                 }                
                   )

In [29]:
grid_histgbm.fit(X_train, y_train)

In [30]:
pd.DataFrame(grid_histgbm.cv_results_).sort_values('rank_test_score')[['param_learning_rate',*score_cols]]

Unnamed: 0,param_learning_rate,mean_test_score,rank_test_score
17,0.174122,0.834391,1
18,0.184306,0.83394,2
15,0.153755,0.833706,3
14,0.143571,0.833111,4
12,0.123204,0.83265,5
19,0.19449,0.831996,6
16,0.163939,0.831808,7
20,0.204673,0.831417,8
13,0.133388,0.831129,9
21,0.214857,0.831103,10


#### GBM

In [31]:
grid_gbm = GridSearchCV(models['gbm'],
                    cv = 5,
                    n_jobs = 4,
                    param_grid = {'learning_rate': np.linspace(1e-3, 0.5, num=50)
                                 }                
                   )

In [32]:
grid_gbm.fit(X_train, y_train)

In [33]:
pd.DataFrame(grid_gbm.cv_results_).sort_values('rank_test_score')[['param_learning_rate',*score_cols]]

Unnamed: 0,param_learning_rate,mean_test_score,rank_test_score
33,0.337061,0.813311,1
42,0.428714,0.813066,2
38,0.38798,0.811609,3
48,0.489816,0.811556,4
45,0.459265,0.811501,5
37,0.377796,0.811358,6
30,0.30651,0.811212,7
39,0.398163,0.810741,8
34,0.347245,0.810692,9
43,0.438898,0.810616,10


### Training New Model

In [34]:
new_models = {
    'SVR': SVR(C = 0.5),
    'knn': KNeighborsRegressor(n_neighbors = 8),
    'dt' : DecisionTreeRegressor(max_depth = 8),
    'rf' : RandomForestRegressor(min_samples_leaf = 2, max_features = 0.5),
    'et' : ExtraTreesRegressor(min_samples_leaf = 2, max_features = 0.5),
    'gbm': GradientBoostingRegressor(learning_rate = 0.337061),
    'histgbm': HistGradientBoostingRegressor(learning_rate = 0.184306)
}

new_scores_results = {k: np.array([0,0])for k in new_models.keys()}

In [35]:
for k, model in new_models.items():
    model.fit(X_train, y_train)
    new_scores_results[k] = np.array([r2_score(y_train,model.predict(X_train)),
                                  r2_score(y_test,model.predict(X_test))])

In [36]:
df_new_scores = pd.DataFrame.from_dict(new_scores_results, 
                                   orient='index',
                                   columns = ['Train Score', 'Test Score'])

df_new_scores.index = 'tuned_' + df_new_scores.index 

In [37]:
pd.concat([df_scores,df_new_scores]).sort_values('Test Score',ascending= False)

Unnamed: 0,Train Score,Test Score
tuned_histgbm,0.906846,0.835539
histgbm,0.883238,0.832029
tuned_gbm,0.855144,0.811253
tuned_et,0.961004,0.808369
tuned_rf,0.954766,0.807331
et,1.0,0.803486
rf,0.972422,0.79159
gbm,0.807847,0.782294
tuned_dt,0.755842,0.667557
dt,0.631082,0.596863
