In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline

In [23]:
df = pd.read_csv("../data/dfperf.csv", index_col = 0)
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,price,size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3,4,6,1,6.353,49.437424
1,1.01,3,5,5,9.183,167.551728
2,0.72,3,4,3,7.983,116.024916
3,1.08,2,3,1,8.371,175.1412
4,0.36,4,3,4,6.588,57.7395


In [24]:
df.describe().T.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
carat,40396.0,0.794495,0.468762,0.2,0.4,0.7,1.04,2.72
cut,40396.0,2.700837,1.151197,0.0,2.0,3.0,4.0,4.0
color,40396.0,3.41061,1.702246,0.0,2.0,3.0,5.0,6.0
clarity,40396.0,3.051688,1.644971,0.0,2.0,3.0,4.0,7.0
price,40396.0,7.781305,1.015292,5.787,6.851,7.777,8.577,9.842


In [25]:
y = df["price"]
X = df.drop(columns=["price"])

In [26]:
pipe = Pipeline(steps=[('estimator', LinearRegression())])

In [27]:
params_grid = [{
                'estimator':[LinearRegression()],
                },
                {
                'estimator': [DecisionTreeRegressor()],
                'estimator__max_depth': [4, 5, 6, 7],
                'estimator__max_features': ["sqrt", "log2"],
                'estimator__min_samples_split': [10, 50, 100], 
                'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[RandomForestRegressor()],
                 'estimator__n_estimators':[50,100,200,300],
                 'estimator__max_depth':[4, 5, 6, 7], 
                 'estimator__min_samples_split':[2,3], 
                 'estimator__min_samples_leaf':[1,2,3],
                },
                {'estimator':[KNeighborsRegressor()],
                 'estimator__n_neighbors':[3,5,6,7, 8, 9, 10],
                }

              ]

In [28]:
grid = GridSearchCV(pipe, params_grid, n_jobs= -1, verbose = 3)

In [29]:
%%time
res = grid.fit(X, y)

Fitting 5 folds for each of 176 candidates, totalling 880 fits
Wall time: 16min 58s


In [32]:
best_model = grid.best_params_
best_model

{'estimator': KNeighborsRegressor(n_neighbors=8), 'estimator__n_neighbors': 8}

In [33]:
results = pd.DataFrame(grid.cv_results_).head()
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator,param_estimator__max_depth,param_estimator__max_features,param_estimator__min_samples_leaf,param_estimator__min_samples_split,param_estimator__n_estimators,param_estimator__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.056567,0.002652,0.01879,0.007853646,LinearRegression(),,,,,,,{'estimator': LinearRegression()},0.890025,0.888411,0.890612,0.892459,0.887688,0.889839,0.001683,174
1,0.125527,0.019282,0.012993,0.0006330131,DecisionTreeRegressor(),4.0,sqrt,1.0,10.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",0.887543,0.92509,0.937438,0.862584,0.934113,0.909354,0.029368,171
2,0.117731,0.023307,0.011994,0.0008926051,DecisionTreeRegressor(),4.0,sqrt,1.0,50.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",0.939601,0.929932,0.810047,0.937023,0.82837,0.888995,0.057362,176
3,0.118932,0.015431,0.010794,0.0007481118,DecisionTreeRegressor(),4.0,sqrt,1.0,100.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",0.930833,0.929614,0.907237,0.941916,0.934406,0.928801,0.011603,160
4,0.120331,0.022443,0.010994,5.309834e-07,DecisionTreeRegressor(),4.0,sqrt,2.0,10.0,,,"{'estimator': DecisionTreeRegressor(), 'estima...",0.907609,0.90547,0.932391,0.93055,0.937093,0.922622,0.013321,164


In [34]:
results = cross_validate(estimator=KNeighborsRegressor(n_neighbors=8), 
               X=X, y=y, scoring='neg_root_mean_squared_error', cv=10, verbose = 3,  n_jobs= -1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.8s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.3s finished


In [35]:
rmse_ = abs(results["test_score"].mean())
rmse_

0.12453347491707267