# Task 1

Подготовьте данные, проведите обучение моделей:

1. Линейная регрессия;
2. Регрессия дерева решений;
3. LASSO;
4. Гребневая регрессия;
5. Elastic Net регрессия.

Найдите реализации методов в sklearn, оставьте в нотбуке ссылки на документацию. Найдите наилучшие гиперпараметры. Оцените качество моделей: R2, Mean Square Error(MSE), Root Mean Square Error(RMSE),  mean absolute error (MAE). Свои действия снабжайте пояснениями.

In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
forest_fires = pd.read_csv("../data/forest_fires_preprocessed.csv")
forest_fires.pop('Unnamed: 0')
forest_fires.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,6,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,7,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
forest_fires.describe()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,7.475822,4.259188,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,2.27599,2.072929,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,1.0,1.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,7.0,2.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,8.0,5.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,9.0,6.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,12.0,7.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [4]:
forest_fires = shuffle(forest_fires, random_state=0)

In [5]:
X = forest_fires.drop("area", 1)
y = forest_fires["area"]

In [6]:
# Масштабируем признаки
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [7]:
from numpy import mean
from numpy import std

In [8]:
def print_metrics(metrics, names = ["R2", "MSE", "RMSE", "MAE"]):
    for i, m in enumerate(metrics):
        print(f'{names[i]} metric\nmean: {-mean(m)}\nstd: {std(m)}\n')

### Linear regression

In [9]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

from sklearn.linear_model import LinearRegression

model = LinearRegression()

scores = list()
scores.append(cross_val_score(model, X, y, cv=5, scoring='r2'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error'))

In [10]:
print_metrics(scores)

R2 metric
mean: 0.2708253783665082
std: 0.3709288577545106

MSE metric
mean: 4098.859977490176
std: 4388.427441688517

RMSE metric
mean: 55.02246142783495
std: 32.73207472667432

MAE metric
mean: 20.213623701471956
std: 3.0056486431920457



### Decision tree regression

In [11]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

from sklearn.tree import DecisionTreeRegressor

parameters = {'max_depth': np.arange(1, 21, 1)}

decision_tree_model = DecisionTreeRegressor()

model = GridSearchCV(decision_tree_model, parameters)

model.fit(X, y)

model.best_params_

{'max_depth': 2}

In [12]:
model = DecisionTreeRegressor(max_depth=2)

scores = list()
scores.append(cross_val_score(model, X, y, cv=5, scoring='r2'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error'))

In [13]:
print_metrics(scores)

R2 metric
mean: 0.0695293991339527
std: 0.060023945806815235

MSE metric
mean: 4283.796688660224
std: 4939.251206527353

RMSE metric
mean: 54.4112016737566
std: 36.376061098997674

MAE metric
mean: 18.035167716595204
std: 6.233327979450484



### Lasso regression

In [14]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso

from sklearn.linear_model import Lasso

parameters = {'alpha':np.arange(0.1, 2, 0.1)}
lasso_model = Lasso()

model = GridSearchCV(lasso_model, parameters)
model.fit(X, y)
model.best_params_

{'alpha': 1.9000000000000001}

In [15]:
model = Lasso(alpha=1.9)

scores = list()
scores.append(cross_val_score(model, X, y, cv=5, scoring='r2'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error'))

In [16]:
print_metrics(scores)

R2 metric
mean: 0.10814197737986933
std: 0.15250959184531854

MSE metric
mean: 4052.3068352334317
std: 4471.504141366694

RMSE metric
mean: 53.77697300225814
std: 34.063822597410166

MAE metric
mean: 18.94211057352556
std: 4.06806315617139



### Ridge regression

In [17]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge

from sklearn.linear_model import Ridge

parameters = {'alpha':np.arange(0.1, 2, 0.1)}
ridge_model = Ridge()

model = GridSearchCV(ridge_model, parameters)
model.fit(X, y)
model.best_params_

{'alpha': 1.9000000000000001}

In [18]:
model = Ridge(alpha=1.9)

scores = list()
scores.append(cross_val_score(model, X, y, cv=5, scoring='r2'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error'))

In [19]:
print_metrics(scores)

R2 metric
mean: 0.26255284232324544
std: 0.3608292341424691

MSE metric
mean: 4094.94843466053
std: 4391.354282121362

RMSE metric
mean: 54.95400393312265
std: 32.78728238782332

MAE metric
mean: 20.15859882117379
std: 3.0375808035999645



### Elastic Net regression

In [20]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet

from sklearn.linear_model import ElasticNet

parameters = {'alpha':np.arange(0.1, 2, 0.1),
             'l1_ratio':np.arange(0.1, 1, 0.1)}
elastic_net_model = ElasticNet()

model = GridSearchCV(elastic_net_model, parameters)
model.fit(X, y)
model.best_params_

{'alpha': 1.9000000000000001, 'l1_ratio': 0.1}

In [21]:
model = ElasticNet(alpha=1.9, l1_ratio=0.1)

scores = list()
scores.append(cross_val_score(model, X, y, cv=5, scoring='r2'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error'))
scores.append(cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error'))

In [22]:
print_metrics(scores)

R2 metric
mean: 0.06684763225291288
std: 0.09658921504605927

MSE metric
mean: 4033.725583105702
std: 4485.38771732169

RMSE metric
mean: 53.40247264465283
std: 34.37879431485065

MAE metric
mean: 18.661012047292473
std: 4.330067657444015



# Task 2

Реализуйте в отдельном модуле(!) линейную регрессию и регрессию дерева решений. Протестируйте их работу на своих данных в нотбуке. При реализации не допускается использование сторонних модулей (за исключением numpy). Модуль не должен находиться в папке notebooks(!).

In [23]:
from sys import path
path.insert(0, r"..\modules")

In [24]:
from regression_models import LR

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [26]:
model = LR()

In [27]:
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_test)

In [30]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [41]:
scores = list()
scores.append(r2_score(np.array(y_test), np.array(y_pred)))
scores.append(mean_squared_error(np.array(y_test), np.array(y_pred), squared=True))
scores.append(mean_squared_error(np.array(y_test), np.array(y_pred), squared=False))
scores.append(mean_absolute_error(np.array(y_test), np.array(y_pred)))

In [42]:
print_metrics(scores)

R2 metric
mean: 0.5135970406107357
std: 0.0

MSE metric
mean: -1276.9080417337123
std: 0.0

RMSE metric
mean: -35.73385008271166
std: 0.0

MAE metric
mean: -20.38971762608918
std: 0.0

