### Подключение модулей

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Загрузка данных

In [None]:
# Load the housing dataset from a CSV file expected in the working directory.
df = pd.read_csv('USA_Housing.csv')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# pairplot is a figure-level function: it creates its own figure and grid of
# axes, so a preceding plt.figure() call only leaves an empty extra figure in
# the output. Control the size through pairplot's own `height` argument if needed.
sns.pairplot(df);

In [None]:
# Distribution of the target. distplot is deprecated in seaborn; histplot with
# a KDE overlay on a density scale is its closest replacement.
sns.histplot(df['Price'], kde=True, stat='density');

In [None]:
# Scatter of price against average area income with a fitted regression line.
sns.regplot(data=df, x='Avg. Area Income', y='Price')

In [None]:
# Correlation heatmap. Restrict to numeric columns explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently in DataFrame.corr() and raises instead.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Pairwise correlations as a table. Non-numeric columns are excluded explicitly
# because pandas >= 2.0 raises on them instead of dropping them silently.
df.select_dtypes(include='number').corr()

### Обучение модели линейной регрессии

In [None]:
# List the available column names before selecting features.
df.columns

In [None]:
# Feature matrix: the five area-level columns used as predictors.
X = df[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population']]

In [None]:
# Regression target.
y = df['Price']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

### Линейная регрессия

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression on the unscaled training features.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

In [None]:
# Fitted intercept term.
lrm.intercept_

In [None]:
# Fitted coefficients, in the same order as the columns of X_train.
lrm.coef_

In [None]:
# Predictions for the held-out test rows.
predictions = lrm.predict(X_test)

In [None]:
predictions

In [None]:
# True test targets, rounded to 3 decimals, for comparison with the predictions above.
y_test.round(3)

In [None]:
plt.scatter(y_test, predictions)

In [None]:
sns.regplot(x=y_test,y=predictions,line_kws={"color":"black"})

In [None]:
# Distribution of the residuals (actual minus predicted). distplot is deprecated
# in seaborn; histplot with a KDE overlay on a density scale replaces it.
sns.histplot(y_test - predictions, kde=True, stat='density');

### Метрики оценки качества регрессии


Ниже приведены три распространённые метрики качества для задач регрессии:

**Mean Absolute Error** (MAE, средняя абсолютная ошибка) — это среднее значение модулей ошибок:

$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$

**Mean Squared Error** (MSE Среднеквадратическая ошибка) - это среднее значение квадратов ошибок:

$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$

**Root Mean Squared Error** (RMSE, корень из среднеквадратической ошибки) — это квадратный корень из среднего значения квадратов ошибок:

$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$


In [None]:
from sklearn import metrics

In [None]:
# MAE of the linear model on the test set.
metrics.mean_absolute_error(y_test, predictions)

In [None]:
# MSE of the linear model on the test set.
metrics.mean_squared_error(y_test, predictions)

In [None]:
# RMSE: square root of the MSE, expressed in the same units as the target.
np.sqrt(metrics.mean_squared_error(y_test, predictions))

In [None]:
# R^2 (coefficient of determination) on the test set.
metrics.r2_score(y_test, predictions)

### Модель KNeighborsRegressor для задачи регрессии из sklearn.

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with the library's default settings.
knn = KNeighborsRegressor()

In [None]:
# Fit on the unscaled training features.
knn.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score

# Predict on both splits so train and test R^2 can be compared side by side.
pred_train = knn.predict(X_train)
pred_test = knn.predict(X_test)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

### Модель DecisionTreeRegressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree limited to depth 9, with a fixed random_state, fitted on the
# unscaled training features.
tree_1 = DecisionTreeRegressor(random_state=1, max_depth=9)
tree_1.fit(X_train, y_train)

In [None]:
# Train vs. test R^2 for the tree.
pred_train = tree_1.predict(X_train)
pred_test = tree_1.predict(X_test)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Summary statistics of the features before any scaling.
X.describe()

### Масштабирование данных

Есть два основных вида масштабирования данных: нормализация и стандартизация.

### Нормализация

Чтобы нормализовать данные, нужно для каждого признака вычислить его минимум (min) и максимум (max), а затем выполнить следующее преобразование:

$$x = \frac{x - min}{max - min}$$

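После такого преобразования у каждого признака на обучающей выборке $min = 0, max = 1$ (на тестовой выборке значения могут немного выходить за эти границы, так как параметры считаются только по обучающей).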

In [None]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
mms.fit(X_train)

In [None]:
mms.transform(X_train)

In [None]:
X_train_norm = pd.DataFrame(mms.transform(X_train), columns=X_train.columns)
X_train_norm

In [None]:
X_train_norm.describe()

In [None]:
X_test_norm = pd.DataFrame(mms.transform(X_test), columns=X_train.columns)
X_test_norm.describe()

#### Обучение моделей на нормализованных данных

In [None]:
# Re-fit the existing knn object on the min-max-scaled features.
# NOTE: fit() is called on the same estimator object, so the state fitted
# earlier on unscaled data is replaced.
knn.fit(X_train_norm, y_train)

In [None]:
# Train vs. test R^2 for KNN on normalized features.
pred_train = knn.predict(X_train_norm)
pred_test = knn.predict(X_test_norm)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-fit the existing tree_1 object on the normalized features (replaces its earlier fit).
tree_1.fit(X_train_norm, y_train)

In [None]:
# Train vs. test R^2 for the tree on normalized features.
pred_train = tree_1.predict(X_train_norm)
pred_test = tree_1.predict(X_test_norm)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-fit the existing linear model on the normalized features (replaces its earlier fit,
# so intercept_ and coef_ now refer to the scaled features).
lrm.fit(X_train_norm, y_train)

In [None]:
# Train vs. test R^2 for linear regression on normalized features.
pred_train = lrm.predict(X_train_norm)
pred_test = lrm.predict(X_test_norm)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

### Стандартизация

Чтобы стандартизовать данные, нужно для каждого признака вычислить его среднее значение (`mean`) и стандартное отклонение (`std`), а затем выполнить следующее преобразование:

$$x = \frac{x - mean}{std}$$

После такого преобразования $mean = 0, std = 1$.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
X_train_std = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_train_std

In [None]:
X_train_std.describe()

In [None]:
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

In [None]:
X_train_std.describe()

In [None]:
X_test_std = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)
X_test_std.describe()

#### Обучение моделей на стандартизованных данных

In [None]:
# Re-fit the existing knn object on the standardized features (replaces its previous fit).
knn.fit(X_train_std, y_train)

In [None]:
# Train vs. test R^2 for KNN on standardized features.
pred_train = knn.predict(X_train_std)
pred_test = knn.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-fit the existing tree_1 object on the standardized features.
# NOTE: from here on tree_1 no longer holds the tree trained on unscaled data.
tree_1.fit(X_train_std, y_train)

In [None]:
# Train vs. test R^2 for the tree on standardized features.
pred_train = tree_1.predict(X_train_std)
pred_test = tree_1.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-fit the existing linear model on the standardized features (replaces its previous fit).
lrm.fit(X_train_std, y_train)

In [None]:
# Train vs. test R^2 for linear regression on standardized features.
pred_train = lrm.predict(X_train_std)
pred_test = lrm.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# A second tree with the same hyperparameters and seed as tree_1, fitted on the
# standardized features.
tree_2 = DecisionTreeRegressor(random_state=1, max_depth=9)
tree_2.fit(X_train_std, y_train)
pred_train = tree_2.predict(X_train_std)
pred_test = tree_2.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
from sklearn.tree import plot_tree

# By this point tree_1 has been re-fitted on standardized features, and tree_2
# uses the same hyperparameters, seed and data, so plotting tree_1 would just
# duplicate the tree_2 plot. Fit a fresh tree with the same settings on the
# unscaled features so the two plots actually compare raw vs. scaled inputs.
tree_raw = DecisionTreeRegressor(random_state=1, max_depth=9).fit(X_train, y_train)

plt.figure(figsize=(10, 8))
# Feature names are passed as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
plot_tree(tree_raw, max_depth=1, filled=True, feature_names=list(X_train.columns));

In [None]:
# Top two levels of the tree trained on standardized features.
plt.figure(figsize=(10, 8))
# Feature names are passed as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
plot_tree(tree_2, max_depth=1, filled=True, feature_names=list(X_train.columns));

### Итоги
- Масштабирование данных нужно для более **стабильного** обучения модели.
- Есть два основных вида масштабирования:
    1. ***Нормализация***
        - После преобразования min = 0, max = 1 у всех признаков
        - Лучше зарекомендовала себя в методах МЛ, которые работают с расстояниями (KNN)
    2. ***Стандартизация***
        - После преобразования mean = 0, std = 1 у всех признаков
        - Лучше зарекомендовала себя в линейных методах МЛ (LinearRegression, SVM)
- Для моделей, основанных на **деревьях решений** (DecisionTree, Bagging, RandomForest, Boosting), масштабирование данных **необязательно**

### Настройка гиперпараметров модели

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline on standardized features.
# NOTE: despite its name, tree_3 is a RandomForestRegressor (an ensemble), not a single tree.
tree_3 = RandomForestRegressor(random_state=1, max_depth=9)
tree_3.fit(X_train_std, y_train)
pred_train = tree_3.predict(X_train_std)
pred_test = tree_3.predict(X_test_std)

# Train vs. test R^2 for the forest.
print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid: max_depth 1..9 and min_samples_leaf 1..5 (45 combinations).
params = {"max_depth": range(1,10,1), "min_samples_leaf": range(1,6,1)} 

In [None]:
# Grid search over the random forest with cv=5 and n_jobs=-1.
grid = GridSearchCV(tree_3, params, cv = 5, n_jobs = -1)

In [None]:
%%timeit

# обучение 
grid.fit(X_train_std, y_train) 

In [None]:
# Hyperparameters of the best-scoring combination.
grid.best_params_ 

In [None]:
# Best estimator found by the search (a regressor, despite the `clf` name).
best_clf = grid.best_estimator_ 

In [None]:
# Train vs. test R^2 for the tuned random forest.
pred_train = best_clf.predict(X_train_std)
pred_test = best_clf.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-create and re-fit the single decision tree on standardized features as the
# untuned reference (same settings as the earlier tree_2 cell).
tree_2 = DecisionTreeRegressor(random_state=1, max_depth=9)
tree_2.fit(X_train_std, y_train)
pred_train = tree_2.predict(X_train_std)
pred_test = tree_2.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Hyperparameter grid: max_depth 1..9 and min_samples_leaf 1..5 (same grid as for the forest).
params = {"max_depth": range(1,10,1), "min_samples_leaf": range(1,6,1)} 

In [None]:
# Grid search over the single decision tree with cv=5 and n_jobs=-1.
grid2 = GridSearchCV(tree_2, params, cv = 5, n_jobs = -1)

In [None]:
%%timeit

# обучение 
grid2.fit(X_train_std, y_train) 

In [None]:
# Hyperparameters of the best-scoring combination.
grid2.best_params_ 

In [None]:
# Best estimator found by the search (a regressor, despite the `clf` name).
best_clf2 = grid2.best_estimator_ 

In [None]:
# Train vs. test R^2 for the tuned decision tree.
pred_train = best_clf2.predict(X_train_std)
pred_test = best_clf2.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# Re-fit the linear model on standardized features as the untuned reference.
lrm.fit(X_train_std, y_train)
pred_train = lrm.predict(X_train_std)
pred_test = lrm.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')

In [None]:
# словарь с гиперпараметрами
linear_params = {'normalize': (True, False)}

In [None]:
grid3 = GridSearchCV(lrm, linear_params, cv = 5, n_jobs = -1)

In [None]:
%%timeit

# обучение 
grid3.fit(X_train_std, y_train) 

In [None]:
# Hyperparameters of the best-scoring combination.
grid3.best_params_ 

In [None]:
# Best estimator found by the search (a regressor, despite the `clf` name).
best_clf3 = grid3.best_estimator_ 

In [None]:
# Train vs. test R^2 for the selected linear model.
pred_train = best_clf3.predict(X_train_std)
pred_test = best_clf3.predict(X_test_std)

print(f'Train R2 {r2_score(y_train, pred_train):.2f}')
print(f'Test R2 {r2_score(y_test, pred_test):.2f}')