In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
ds = load_boston()
X = ds.data
y = ds.target
X.shape

(506L, 13L)

In [3]:
pd.DataFrame(X).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
0     506 non-null float64
1     506 non-null float64
2     506 non-null float64
3     506 non-null float64
4     506 non-null float64
5     506 non-null float64
6     506 non-null float64
7     506 non-null float64
8     506 non-null float64
9     506 non-null float64
10    506 non-null float64
11    506 non-null float64
12    506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

# Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr_params = {'fit_intercept' : [True, False]}

In [7]:
grid_lr = GridSearchCV(lr, lr_params, cv=10)
grid_lr.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
print(grid_lr.best_score_)
print(grid_lr.best_params_)
print(grid_lr.best_estimator_)

0.6857708360776597
{'fit_intercept': True}
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


# Decision Tree Regression

In [9]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt_params = {
    'criterion' : ['mse', 'mae'],
    'max_depth' : [2, 3, 4, 5, 7, 9, None],
    'min_samples_leaf' : [1, 2, 4, 8, 16]
}

In [10]:
grid_dt = GridSearchCV(dt, dt_params, cv = 10)
grid_dt.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['mse', 'mae'], 'max_depth': [2, 3, 4, 5, 7, 9, None], 'min_samples_leaf': [1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
print(grid_dt.best_score_)
print(grid_dt.best_params_)
print(grid_dt.best_estimator_)

0.8204438526940513
{'criterion': 'mse', 'max_depth': 5, 'min_samples_leaf': 2}
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')


# Random Forest Regression

In [12]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf_params = {
    'n_estimators' : [5, 10, 20, 50],
    'max_depth' : [2, 3, 4, 5, 7, 9, None],
    'min_samples_leaf' : [1, 2, 4, 8, 16]
}

In [13]:
grid_rf = GridSearchCV(rf, rf_params, cv = 10)
grid_rf.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [5, 10, 20, 50], 'max_depth': [2, 3, 4, 5, 7, 9, None], 'min_samples_leaf': [1, 2, 4, 8, 16]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
print(grid_rf.best_score_)
print(grid_rf.best_params_)
print(grid_rf.best_estimator_)

0.8687317760420571
{'n_estimators': 50, 'max_depth': None, 'min_samples_leaf': 1}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)


# KNeighbors Regression

In [15]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn_params = {
    'n_neighbors' : [1, 3, 5, 7, 9],
    'p' : [1, 2]
}

In [16]:
grid_knn = GridSearchCV(knn, knn_params, cv = 10)
grid_knn.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 3, 5, 7, 9], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
print(grid_knn.best_score_)
print(grid_knn.best_params_)
print(grid_knn.best_estimator_)

0.770406661841183
{'n_neighbors': 3, 'p': 2}
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='uniform')


# SVM

In [20]:
from sklearn.svm import SVR
svr = SVR()
svr_params = {
    'C' : [0.001, 0.01, 0.1, 0.25, 0.5],
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}

In [21]:
grid_svr = GridSearchCV(svr, svr_params, cv = 10)
grid_svr.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [0.001, 0.01, 0.1, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
print(grid_svr.best_score_)
print(grid_svr.best_params_)
print(grid_svr.best_estimator_)

0.6621765516181111
{'kernel': 'linear', 'C': 0.5}
SVR(C=0.5, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)


# Проверяем все модели на X_valid

In [23]:
estimators = {
    'Linear Regresion' : grid_lr,
    'Decision Tree' : grid_dt,
    'Random Forest' : grid_rf,
    'KNN' : grid_knn,
    'SVM' : grid_svr
}

In [26]:
for k in estimators:
    v = estimators[k]
    print(k, 'CV score:', v.best_score_, 'Validation score:', v.best_estimator_.score(X_valid, y_valid))

('KNN', 'CV score:', 0.770406661841183, 'Validation score:', 0.7462916828852413)
('Linear Regresion', 'CV score:', 0.6857708360776597, 'Validation score:', 0.7481742629736345)
('SVM', 'CV score:', 0.6621765516181111, 'Validation score:', 0.7321385604522765)
('Random Forest', 'CV score:', 0.8687317760420571, 'Validation score:', 0.8600905845522919)
('Decision Tree', 'CV score:', 0.8204438526940513, 'Validation score:', 0.8036103355205555)


Random Forest выиграл