# Домашнее задание по теме «Улучшение качества модели. Продвинутые алгоритмы классификации»

#### Владимир Никифоров

Для выполнения домашнего задания необходимо взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [2]:
# turn off all warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
# set constants
RANDOM_STATE = 777
N_FOLDS = 5

In [4]:
df = load_boston()

In [5]:
X, y = df['data'], df['target']

In [6]:
X.shape, y.shape

((506, 13), (506,))

In [7]:
# train and apply standartscaler to X
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
param_rf = dict(max_depth=[3,5,7,10,12,15,20], n_estimators=[5,10,15,20,25,50,100])

In [9]:
rand = RandomizedSearchCV(RandomForestRegressor(), param_rf, cv=10, n_iter=10, scoring="neg_mean_squared_error", random_state=RANDOM_STATE)
rand.fit(X, y)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_state=

In [10]:
print(rand.cv_results_['mean_test_score'])

[-22.17574253 -25.22049734 -22.72014993 -21.64450997 -20.42166155
 -22.45337367 -24.47506931 -23.11310426 -23.46864887 -21.37373655]


In [11]:
print(rand.cv_results_['params'])

[{'n_estimators': 5, 'max_depth': 12}, {'n_estimators': 15, 'max_depth': 5}, {'n_estimators': 50, 'max_depth': 5}, {'n_estimators': 50, 'max_depth': 7}, {'n_estimators': 20, 'max_depth': 12}, {'n_estimators': 15, 'max_depth': 7}, {'n_estimators': 15, 'max_depth': 12}, {'n_estimators': 25, 'max_depth': 5}, {'n_estimators': 5, 'max_depth': 10}, {'n_estimators': 20, 'max_depth': 20}]


In [12]:
# examine the best model
print(rand.best_score_)
print(rand.best_params_)
print(rand.best_estimator_)

-20.42166155319852
{'n_estimators': 20, 'max_depth': 12}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


Now we can use best params from randomized search to do detailed gridsearch.

In [13]:
def get_score(X, y, model, grid_params, n_folds = N_FOLDS):
    ''' Function to train input model with grid_params '''
    # define grid with cross-validation
    gridsearch = GridSearchCV(model, grid_params, scoring='neg_mean_squared_error', cv=n_folds, n_jobs=-1)
    # fit grid
    gridsearch.fit(X, y)
    # get score of best model
    l_score = gridsearch.best_score_
    print(gridsearch.best_estimator_,'\nScore=',l_score)
    print('*'*50)
    return str(gridsearch.best_estimator_), l_score

In [14]:
test_score = get_score(X, y, LinearRegression(n_jobs=-1), {'normalize': [True,False]})

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False) 
Score= -37.08304954892714
**************************************************


In [15]:
models = [(LinearRegression(n_jobs=-1), {'normalize': [True,False]}),
          (RandomForestRegressor(n_jobs=-1,random_state=RANDOM_STATE), {'max_depth': [11,12,13,15], 'n_estimators': [15,20,25,50], 'max_features': [0.2,0.5,0.7,0.8]}),
          (CatBoostRegressor(loss_function='RMSE',random_state=RANDOM_STATE, silent=True), {'depth': [5,6,7,8,10], 'learning_rate': [0.01, 0.05, 0.1], 'iterations': [20, 50, 100]})
         ]

In [16]:
models_score = {}
for mdl, params in models:
    m, s = get_score(X, y, mdl, params)
    models_score[m] = s

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False) 
Score= -37.08304954892714
**************************************************
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
                      max_features=0.5, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
                      oob_score=False, random_state=777, verbose=0,
                      warm_start=False) 
Score= -19.162788936991596
**************************************************
<catboost.core.CatBoostRegressor object at 0x7f19441ed748> 
Score= -21.20953697500465
**************************************************


### Final scores of best models

In [17]:
models_score

{'LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)': -37.08304954892714,
 "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,\n                      max_features=0.5, max_leaf_nodes=None,\n                      min_impurity_decrease=0.0, min_impurity_split=None,\n                      min_samples_leaf=1, min_samples_split=2,\n                      min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,\n                      oob_score=False, random_state=777, verbose=0,\n                      warm_start=False)": -19.162788936991596,
 '<catboost.core.CatBoostRegressor object at 0x7f19441ed748>': -21.20953697500465}