In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [2]:
# Load the advertising dataset from the CSV file into a DataFrame.
data_1 = pd.read_csv(filepath_or_buffer='../DATA/Advertising.csv')

In [3]:
# Target: the 'sales' column. Features: every remaining column.
y = data_1['sales']
X = data_1.drop(columns='sales')

In [4]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [5]:
#Feature scaling

In [4]:
# Scaler instance; it is fitted and reused by the cells that follow.
scaler = StandardScaler() 

In [7]:
# Learn the scaling statistics from the training features only.
scaler.fit(X=X_train)

In [8]:
# Scale both splits with the already-fitted scaler (train first, then test).
# NOTE(review): the names are overwritten, so re-running this cell would scale
# the already-scaled arrays again.
X_train, X_test = map(scaler.transform, (X_train, X_test))

In [9]:
#Linear regression without regularization

In [10]:
# Baseline estimator: linear regression with default settings.
model = LinearRegression()

In [11]:
# Train the baseline model on the scaled training data.
model.fit(X=X_train, y=y_train)

In [12]:
# Baseline predictions for the test features.
y_predict = model.predict(X=X_test)

In [13]:
#Linear regression with regularization

In [14]:
# Ridge regression with regularization strength alpha=100.
ridge_model = Ridge(alpha=100)

In [15]:
# Train the Ridge model on the scaled training data.
ridge_model.fit(X=X_train, y=y_train)

In [16]:
# Ridge predictions for the test features.
# Note: this reuses the name y_predict, replacing the baseline predictions.
y_predict = ridge_model.predict(X=X_test)

In [17]:
#Compute the error

In [18]:
# Mean squared error of the current y_predict against the test targets.
mean_squared_error(y_true=y_test, y_pred=y_predict)

7.341775789034128

In [19]:
#Choosing the optimal alpha parameter

In [20]:
# Second Ridge candidate with a much weaker penalty (alpha=1).
ridge_model_2 = Ridge(alpha=1)

In [21]:
# Train the alpha=1 Ridge model on the scaled training data.
ridge_model_2.fit(X=X_train, y=y_train)

In [22]:
# Test-set predictions of the alpha=1 Ridge model.
y_predict_2 = ridge_model_2.predict(X=X_test)

In [23]:
# Test-set MSE of the alpha=1 Ridge predictions.
mean_squared_error(y_true=y_test, y_pred=y_predict_2)

2.3190215794287514

In [24]:
#Splitting the data into 3 parts

In [38]:
# First split: 70% for training, 30% set aside to be divided further.
X_train, X_other, y_train, y_other = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [39]:
# Second split: halve the set-aside part into a validation set and a test set.
X_eval, X_test, y_eval, y_test = train_test_split(
    X_other,
    y_other,
    test_size=0.5,
    random_state=101,
)

In [40]:
# Refit the scaler on the new training split only.
scaler.fit(X=X_train)

In [41]:
# Scale all three splits with the scaler fitted on the training split.
X_train, X_eval, X_test = (
    scaler.transform(part) for part in (X_train, X_eval, X_test)
)

In [None]:
#Alpha = 100

In [42]:
# Candidate 1: Ridge with alpha=100 (reuses the name `model`).
model = Ridge(alpha=100)

In [43]:
# Train candidate 1 on the training split.
model.fit(X=X_train, y=y_train)

In [44]:
# Candidate 1 predictions on the validation split.
y_eval_predict = model.predict(X=X_eval)

In [45]:
# Validation MSE of candidate 1.
mean_squared_error(y_true=y_eval, y_pred=y_eval_predict)

7.320101458823869

In [None]:
#Alpha = 1

In [50]:
# Candidate 2: Ridge with alpha=1.
model_2 = Ridge(alpha=1)

In [51]:
# Train candidate 2 on the training split.
model_2.fit(X=X_train, y=y_train)

In [54]:
# Candidate 2 predictions on the validation split.
y_eval_predict_2 = model_2.predict(X=X_eval)

In [55]:
# Validation MSE of candidate 2.
mean_squared_error(y_true=y_eval, y_pred=y_eval_predict_2)

2.3837830750569853

In [56]:
#Final metric on the test set

In [57]:
# Predictions of model_2 on the test split, which was not used for fitting or selection.
y_final_predict = model_2.predict(X=X_test)

In [58]:
# Final test-set MSE of model_2.
mean_squared_error(y_true=y_test, y_pred=y_final_predict)

2.254260083800517

In [59]:
#K-fold cross-validation

In [63]:
# Back to a two-way 70/30 split; validation is handled by cross-validation below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [64]:
# Fit the scaler on the training split.
scaler.fit(X=X_train)

In [65]:
# Scale the training and test features with the fitted scaler.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
#Alpha = 100

In [66]:
# Unfitted Ridge(alpha=100) estimator to be scored by cross-validation.
model = Ridge(alpha=100)

In [68]:
# Cross-validated metric on the training split; cv is the number of folds.
# NOTE(review): X_train was scaled by a scaler fitted on the whole training
# split, so each fold's validation rows influenced the scaling. Wrapping the
# scaler and model in a Pipeline would avoid this.
scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [71]:
# Average fold metric for alpha = 100 (sign dropped, since the scorer is negated MSE).
abs(np.mean(scores))

8.215396464543607

In [None]:
#Alpha = 1

In [72]:
# Unfitted Ridge(alpha=1) estimator to be scored by cross-validation.
model_2 = Ridge(alpha=1)

In [73]:
# Same 5-fold cross-validation as for alpha = 100, now for alpha = 1.
scores_2 = cross_val_score(
    estimator=model_2,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [74]:
# Average fold metric for alpha = 1 (sign dropped).
abs(np.mean(scores_2))

3.344839296530695

In [75]:
#The cross_validate() function accepts a LIST of metrics and reports several values per fold (timings and each metric)

In [76]:
# Recreate the 70/30 split from the unscaled X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [77]:
# Fit the scaler on the training split.
scaler.fit(X=X_train)

In [78]:
# Scale the training and test features with the fitted scaler.
X_train, X_test = [scaler.transform(part) for part in (X_train, X_test)]

In [89]:
# Ridge(alpha=1) estimator evaluated with cross_validate below.
model = Ridge(alpha=1)

In [90]:
# 10-fold cross-validation that records two metrics at once.
scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
)

In [91]:
# Convert the cross_validate result into a DataFrame for display.
# Note: the name `scores` is reused for the converted object.
scores = pd.DataFrame(data=scores)

In [92]:
# Display the per-fold timings and metric values.
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.001995,0.000382,-2.962508,-1.457174
1,0.001033,0.000807,-3.057378,-1.555308
2,0.000999,0.001404,-2.17374,-1.23877
3,0.0,0.0,-0.833034,-0.768938
4,0.0,0.0,-3.464018,-1.434489
5,0.0,0.0,-8.232647,-1.494316
6,0.0,0.0,-1.905864,-1.081362
7,0.0,0.0,-2.765048,-1.250011
8,0.001033,0.000986,-4.989505,-1.580971
9,0.000992,0.001022,-2.846438,-1.223326


In [93]:
# Fit the estimator on the full training split before scoring it on the test set.
model.fit(X=X_train, y=y_train)

In [94]:
# Test-set predictions of the fitted model.
y_predict = model.predict(X=X_test)

In [95]:
# Test-set MSE of the fitted model.
mean_squared_error(y_true=y_test, y_pred=y_predict)

2.3190215794287514

In [None]:
#Grid search for selecting the optimal hyperparameters

In [5]:
# 70/30 train/test split used by the grid search section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [6]:
# Fit the scaler on the training split.
scaler.fit(X=X_train)

In [7]:
# Scale the training and test features with the fitted scaler.
X_train, X_test = map(scaler.transform, (X_train, X_test))

In [10]:
# ElasticNet with default settings; the grid search below supplies alpha and l1_ratio.
base_model = ElasticNet()

In [16]:
# Hyperparameter values to explore. Each key must be spelled exactly like the
# corresponding parameter of the model.
param_grid = {
    'alpha': [0.1, 1, 5, 50, 100],
    'l1_ratio': [0.1, 0.5, 0.7, 0.95, 0.99, 1],
}

In [17]:
#Every combination of parameters is tried, and cross-validation is run for each combination; the search then picks
#the best combination according to the metric and returns a model with those parameters

In [18]:
# Grid search over param_grid with 5-fold cross-validation, scored by negated MSE.
# NOTE(review): as with the cross-validation above, the data was scaled before
# the folds are formed; a Pipeline inside the search would keep scaling per fold.
grid_model = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [19]:
# Run the search on the scaled training data (logs one line per fit because verbose=2).
grid_model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.1; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.5; total time=   0.0s
[CV] END ............................alpha=0.1, l1_ratio=0.7; total time=   0.0s
[CV] END ............................alpha=0.1,

In [20]:
grid_model.best_estimator_ #Best model together with its parameters

In [22]:
# First five rows of the per-combination cross-validation results.
pd.DataFrame(data=grid_model.cv_results_).head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001487,0.001739,0.000276,0.000366,0.1,0.1,"{'alpha': 0.1, 'l1_ratio': 0.1}",-3.453021,-1.40519,-5.789125,-2.187302,-4.645576,-3.496043,1.591601,6
1,0.000356,0.000442,0.000661,0.000837,0.1,0.5,"{'alpha': 0.1, 'l1_ratio': 0.5}",-3.32544,-1.427522,-5.59561,-2.163089,-4.451679,-3.392668,1.506827,5
2,0.000205,0.00041,0.000194,0.000388,0.1,0.7,"{'alpha': 0.1, 'l1_ratio': 0.7}",-3.26988,-1.442432,-5.502437,-2.16395,-4.356738,-3.347088,1.462765,4
3,0.000802,0.000401,0.0002,0.000399,0.1,0.95,"{'alpha': 0.1, 'l1_ratio': 0.95}",-3.213052,-1.472417,-5.396258,-2.177452,-4.24108,-3.300052,1.406248,3
4,0.0008,0.0004,0.000593,0.000484,0.1,0.99,"{'alpha': 0.1, 'l1_ratio': 0.99}",-3.208124,-1.478489,-5.380242,-2.181097,-4.222968,-3.294184,1.396953,2


In [23]:
# Test-set predictions from the fitted grid search object.
y_predict = grid_model.predict(X=X_test)

In [24]:
# Test-set MSE of the grid search predictions.
mean_squared_error(y_true=y_test, y_pred=y_predict)

2.387342642087474