In [1]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
%matplotlib inline
import numpy as np
import pandas as pd

In [4]:
# Load the Boston housing dataset bundled with scikit-learn; later cells read
# its 'data', 'target' and 'feature_names' entries.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version before re-running this notebook.
boston = datasets.load_boston()

In [5]:
# Wrap the feature matrix in a DataFrame, with columns named after the features,
# so the first rows can be inspected.
data = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
data.head()  # head() shows the first 5 rows by default

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
# Feature matrix and regression target taken from the loaded dataset.
X = boston.data
y = boston.target

In [12]:
import random

In [13]:
# Seed every global RNG the notebook can draw from.
# random.seed only affects the stdlib generator; scikit-learn estimators created
# with random_state=None use NumPy's global generator, so it must be seeded too
# for Restart & Run All to give repeatable results.
random.seed(42)
np.random.seed(42)

In [15]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the scaler on the training split only, then apply the same transformation
# to both splits, so the test split never influences the scaling statistics.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# re-running the split would scale already-scaled data.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [18]:
# 1. Lasso (L1 penalty): tune alpha and max_iter.
l = Lasso()
params_l = dict(
    alpha=[1, 0.1, 0.01, 0.001],
    max_iter=[100, 1000, 10000],
)
# Exhaustive grid search with 10-fold cross-validation, scored by R^2.
grid_l = GridSearchCV(estimator=l, param_grid=params_l, scoring='r2', cv=10)
grid_l.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=Lasso(),
             param_grid={'alpha': [1, 0.1, 0.01, 0.001],
                         'max_iter': [100, 1000, 10000]},
             scoring='r2')

In [19]:
# Show the best parameters, the best cross-validation score and the best estimator.
print(grid_l.best_params_, grid_l.best_score_, grid_l.best_estimator_, sep='\n')

{'alpha': 0.01, 'max_iter': 100}
0.6988357132543653
Lasso(alpha=0.01, max_iter=100)


In [20]:
# 2. Ridge (L2 penalty): tune alpha, tol, max_iter and solver.
r = Ridge()
params_r = dict(
    alpha=[1, 0.1, 0.01, 0.001],
    max_iter=[100, 1000, 10000],
    tol=[0.1, 0.01, 0.001, 0.0001],
    solver=['svd', 'cholesky', 'lsqr', 'saga'],
)
# Exhaustive grid search with 10-fold cross-validation, scored by R^2.
grid_r = GridSearchCV(estimator=r, param_grid=params_r, scoring='r2', cv=10)
grid_r.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': [1, 0.1, 0.01, 0.001],
                         'max_iter': [100, 1000, 10000],
                         'solver': ['svd', 'cholesky', 'lsqr', 'saga'],
                         'tol': [0.1, 0.01, 0.001, 0.0001]},
             scoring='r2')

In [21]:
# Show the best parameters, the best cross-validation score and the best estimator.
print(grid_r.best_params_, grid_r.best_score_, grid_r.best_estimator_, sep='\n')

{'alpha': 1, 'max_iter': 100, 'solver': 'lsqr', 'tol': 0.001}
0.6991650171845755
Ridge(alpha=1, max_iter=100, solver='lsqr')


In [30]:
# 3. Elastic net (mixed L1/L2 penalty): tune alpha, l1_ratio and max_iter.
en = ElasticNet()
params_en = dict(
    alpha=[1, 0.1, 0.01, 0.001],
    l1_ratio=[0, 0.5, 1],
    max_iter=[1000, 10000, 100000],
)
# Exhaustive grid search with 10-fold cross-validation, scored by R^2;
# fits run in parallel on all cores and progress is logged.
grid_en = GridSearchCV(
    estimator=en,
    param_grid=params_en,
    scoring='r2',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid_en.fit(X_train, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   13.4s finished
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=10, estimator=ElasticNet(), n_jobs=-1,
             param_grid={'alpha': [1, 0.1, 0.01, 0.001],
                         'l1_ratio': [0, 0.5, 1],
                         'max_iter': [1000, 10000, 100000]},
             scoring='r2', verbose=2)

In [31]:
# Show the best parameters, the best cross-validation score and the best estimator.
print(grid_en.best_params_, grid_en.best_score_, grid_en.best_estimator_, sep='\n')

{'alpha': 0.01, 'l1_ratio': 0, 'max_iter': 10000}
0.6992551267540726
ElasticNet(alpha=0.01, l1_ratio=0, max_iter=10000)


In [32]:
# Since grid_en.best_estimator_ has l1_ratio=0, the search effectively fell back to pure L2 (ridge) regularization.

In [34]:
# 4. Decision tree (decision trees can usually improve the score on datasets
# with a large number of categorical variables).
dt = DecisionTreeRegressor()
params_dt = dict(
    max_depth=range(1, 13, 2),
    min_samples_leaf=range(1, 8),
    min_samples_split=range(2, 10, 2),
)
# Exhaustive grid search with 10-fold cross-validation and the estimator's default scorer.
grid_dt = GridSearchCV(estimator=dt, param_grid=params_dt, cv=10)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(1, 13, 2),
                         'min_samples_leaf': range(1, 8),
                         'min_samples_split': range(2, 10, 2)})

In [35]:
# Show the best parameters, the best cross-validation score and the best estimator.
print(grid_dt.best_params_, grid_dt.best_score_, grid_dt.best_estimator_, sep='\n')

{'max_depth': 7, 'min_samples_leaf': 7, 'min_samples_split': 8}
0.7291340046046763
DecisionTreeRegressor(max_depth=7, min_samples_leaf=7, min_samples_split=8)


In [37]:
# 5. RandomForestRegressor
rfr = RandomForestRegressor()
params_rfr = {'n_estimators': range(1,10),
    'max_depth': range (1,13, 2),
              'min_samples_leaf': range (1,8),
              'min_samples_split': range (2,10,2) }
grid_dt = GridSearchCV(rfr, params_rfr, cv=10)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': range(1, 13, 2),
                         'min_samples_leaf': range(1, 8),
                         'min_samples_split': range(2, 10, 2),
                         'n_estimators': range(1, 10)})

In [38]:
# Выведем наилучшие параметры и оценку
print(grid_dt.best_params_)
print(grid_dt.best_score_)
print(grid_dt.best_estimator_)

{'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 8}
0.8319778547405766
RandomForestRegressor(max_depth=11, min_samples_split=4, n_estimators=8)


In [None]:
# As expected, RandomForestRegressor gave a substantial improvement in quality (many categorical variables).
# NOTE(review): most columns in the preview above look continuous — verify this explanation.

In [39]:
# 6. GradientBoostingRegressor: tune the loss, the number of boosting stages,
# the split threshold and the tree depth.
# NOTE(review): the 'ls' loss name was renamed to 'squared_error' in scikit-learn 1.0
# and removed in 1.2 — confirm against the installed version.
gbr = GradientBoostingRegressor()
params_gbr = dict(
    loss=['ls', 'huber'],
    n_estimators=[5, 10, 15],
    min_samples_split=[2, 4, 6],
    max_depth=[None, 1, 5, 10, 20],
)
# Exhaustive grid search with 10-fold cross-validation and the estimator's default scorer.
grid_gbr = GridSearchCV(estimator=gbr, param_grid=params_gbr, cv=10)
grid_gbr.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(),
             param_grid={'loss': ['ls', 'huber'],
                         'max_depth': [None, 1, 5, 10, 20],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [5, 10, 15]})

In [40]:
# Show the best parameters, the best cross-validation score and the best estimator.
print(grid_gbr.best_params_, grid_gbr.best_score_, grid_gbr.best_estimator_, sep='\n')

{'loss': 'huber', 'max_depth': 5, 'min_samples_split': 6, 'n_estimators': 15}
0.7913156215464927
GradientBoostingRegressor(loss='huber', max_depth=5, min_samples_split=6,
                          n_estimators=15)


In [46]:
# 7. Try Bayesian hyperparameter optimization (hyperopt) for RandomForestRegressor
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score

In [48]:
# Define the hyperopt search space for the random-forest hyperparameters.
space = dict(
    # Quantized to steps of 10 between 10 and 1200; hyperopt returns these as floats.
    max_depth=hp.quniform('max_depth', 10, 1200, 10),
    # Float values: scikit-learn reads them as fractions of the sample count.
    min_samples_leaf=hp.uniform('min_samples_leaf', 0, 0.5),
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    n_estimators=hp.choice('n_estimators', [5, 10, 15]),
)

In [49]:
def objective(space):
    """Evaluate one sampled hyperparameter set for hyperopt.

    Args:
        space: dict of sampled values with the keys 'max_depth',
            'min_samples_leaf', 'min_samples_split' and 'n_estimators'.

    Returns:
        dict with 'loss' (the negated mean cross-validation score, because
        fmin minimizes its objective) and 'status' (STATUS_OK).
    """
    model = RandomForestRegressor(
        # hp.quniform yields floats (e.g. 1060.0), but a tree depth must be an
        # integer; newer scikit-learn versions reject a float here.
        max_depth=int(space['max_depth']),
        min_samples_leaf=space['min_samples_leaf'],
        min_samples_split=space['min_samples_split'],
        n_estimators=space['n_estimators'],
    )

    # cross_val_score uses the estimator's default scorer, which for a regressor
    # is R^2 (not an accuracy).
    mean_cv_score = cross_val_score(model, X_train, y_train).mean()
    return {'loss': -mean_cv_score, 'status': STATUS_OK}
    
# Run 80 TPE evaluations of the objective, recording each one in `trials`.
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=80, trials=trials)
# NOTE: for hp.choice parameters fmin reports the index of the selected option,
# not the option itself (n_estimators is an index into [5, 10, 15]);
# hyperopt.space_eval(space, best) decodes it.
best

100%|███████████████████████████████████████████████| 80/80 [00:09<00:00,  8.64trial/s, best loss: -0.8231426209180567]


{'max_depth': 1060.0,
 'min_samples_leaf': 0.0006187942199335278,
 'min_samples_split': 0.04337420278004596,
 'n_estimators': 1}

In [None]:
# The best cross-validation score was 0.8231426209180567