In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [6]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()
df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [7]:
y = boston_dataset.target
x = df

In [8]:
x1 = pd.get_dummies(x, columns=['RAD'])
x2 = pd.get_dummies(x1, columns=['TAX'])

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [10]:
x_scal = scaler.fit_transform(x2)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_scal, y, test_size=0.3, random_state=42)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor

In [14]:
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

RandomizedSearchCV

In [15]:
models=[ 
      {'name':'Lr',"model": LinearRegression()  , 'params':{'fit_intercept':[True, False], 'normalize':[True, False]}},
      {'name':'R',"model": Ridge(), 'params':{'alpha': uniform(loc=0, scale=4), 'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}},
      {'name':'L',"model": Lasso(), 'params':{'alpha': uniform(loc=0, scale=4), 'selection':['cyclic', 'random']}},
      {'name':'RF',"model": RandomForestRegressor(), 'params':{'n_estimators':[10,25,50,100,150,200], 'criterion':['squared_error', 'absolute_error', 'poisson'], 'max_depth':[3,5,7,9,11]}},
      {'name':'KN',"model": KNeighborsRegressor(), 'params':{'n_neighbors':list(range(1,30)),'weights': ['uniform', 'distance'], 'p':[1,2,3]}},
      {'name':'DT',"model": DecisionTreeRegressor(), 'params':{'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'max_depth':[3,5,7,9,11]}},
      {'name':'BR',"model": BaggingRegressor(), 'params':{'n_estimators':[10,25,50,100,150,200], 'max_samples':list(range(100, 1000, 10))}}
]

In [16]:
res=[]
for v in  models:
    res.append((v['name'], RandomizedSearchCV(v['model'], v['params'], cv=10).fit(x_train, y_train)))

In [17]:
for r in res:
    print(r[0], r[1].best_score_, r[1].best_params_)

Lr -3.795045615697275e+22 {'normalize': True, 'fit_intercept': False}
R 0.7027859616207135 {'alpha': 2.881949392308373, 'solver': 'saga'}
L 0.6838744158905387 {'alpha': 0.28749897493049614, 'selection': 'random'}
RF 0.8223331423020941 {'n_estimators': 100, 'max_depth': 7, 'criterion': 'squared_error'}
KN 0.6850716828749421 {'weights': 'uniform', 'p': 1, 'n_neighbors': 2}
DT 0.7026122596573803 {'max_depth': 5, 'criterion': 'squared_error'}
BR 0.8104763246790968 {'n_estimators': 25, 'max_samples': 140}


In [18]:
best_model = res[3][1].best_estimator_


randomForest после RandomizedSearchCV на тестовых данных выдает:

In [19]:
best_model.score(x_test, y_test)

0.8700923113851383

**Попробуем улучшить качество в лучших окрестнястях пространства гиперпараметров, которые показал RandomizedSearch, с помощью GridSearch**

Оставим только RandomForest и BaggingRegressor


In [26]:
models_2=[ 
      {'name':'RF',"model": RandomForestRegressor(), 'params':{'n_estimators':list(range(95, 106)), 'max_depth':list(range(6, 11))}},
      {'name':'BR',"model": BaggingRegressor(), 'params':{'n_estimators':list(range(15, 31)), 'max_samples':list(range(130, 151))}}
]

In [27]:
res_2=[]
for v in  models_2:
    res_2.append((v['name'], GridSearchCV(v['model'], v['params'], cv=10).fit(x_train, y_train)))

In [28]:
for r in res_2:
    print(r[0], r[1].best_score_, r[1].best_params_)

RF 0.8236893012690107 {'max_depth': 7, 'n_estimators': 104}
BR 0.8290185129893807 {'max_samples': 146, 'n_estimators': 18}


BaggingRegressor после GridSearchCV на тестовых данных выдает:

In [31]:
res_2[1][1].score(x_test, y_test)


0.8399747845574403

RandomForest после GridSearchCV на тестовых данных выдает:

In [33]:
res_2[0][1].best_estimator_.score(x_test, y_test)

0.8595484373793179

**В итоге, так получилось, что модель, найденная RandomizedSearchCV, показывает результат лучше, чем после GridSearchCV**