# Лабораторная работа 1. Введение в машинное обучение. Обучение с учителем. Задача регрессии

<b>Выполните следующие задания:</b>
- загрузите датасет для регрессии, выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- вычислите значения метрик $R^2$, MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель;
- самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2);
- самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента);
- обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/mumbai_houses_task.csv')
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,New Property,2.0,,2.0,Flat
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,New Property,0.0,Unfurnished,0.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,Resale,0.0,Semi-Furnished,0.0,Flat
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,Resale,1.0,Unfurnished,1.0,Flat
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,Resale,0.0,,0.0,Flat
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,Resale,0.0,Furnished,0.0,Flat


## Предобработка данных

In [3]:
#подсчитываем количество пропущенных значений в столбцах
data.isnull().sum()

price                  0
area                   0
latitude               0
longitude              0
Bedrooms               0
Bathrooms              0
Balcony                0
Status               481
neworold               0
parking                0
Furnished_status    2655
Lift                   0
type_of_building       0
dtype: int64

In [4]:
#смотрим названия всех колонок
data.columns.values.tolist()

['price',
 'area',
 'latitude',
 'longitude',
 'Bedrooms',
 'Bathrooms',
 'Balcony',
 'Status',
 'neworold',
 'parking',
 'Furnished_status',
 'Lift',
 'type_of_building']

In [5]:
# значения в колонке "Status"
data["Status"].value_counts()

Status
Ready to Move         5771
Under Construction       3
Name: count, dtype: int64

In [6]:
# значения в колонке "neworold"
data["neworold"].value_counts()

neworold
Resale          5515
New Property     740
Name: count, dtype: int64

In [7]:
# значения в колонке "type_of_building"
data["type_of_building"].value_counts()

type_of_building
Flat                5828
Individual House     427
Name: count, dtype: int64

In [8]:
# one-hot кодирование категориального признака
data = pd.get_dummies(data, columns=['type_of_building'], drop_first=True, dtype=int)
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,New Property,0.0,,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,New Property,2.0,,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,New Property,0.0,Unfurnished,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,Resale,0.0,Semi-Furnished,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,Resale,1.0,Unfurnished,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,Resale,0.0,,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,Resale,0.0,Furnished,0.0,0


In [9]:
#поменяем строчные данные на числовые
data["neworold"] = data["neworold"].map({"Resale": 0, "New Property": 1})
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,1,0.0,,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,1,2.0,,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,Unfurnished,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,0,0.0,Semi-Furnished,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,0,1.0,Unfurnished,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,0,0.0,,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,0,0.0,Furnished,0.0,0


In [10]:
# значения в колонке "neworold"
data["neworold"].value_counts()

neworold
0    5515
1     740
Name: count, dtype: int64

In [11]:
# значения в колонке "Furnished_status"
data["Furnished_status"].value_counts()

Furnished_status
Unfurnished       1384
Semi-Furnished    1195
Furnished         1021
Name: count, dtype: int64

In [12]:
#поменяем строчные данные на числовые
data["Furnished_status"] = data["Furnished_status"].map({"Unfurnished": 0, "Semi-Furnished": 1, "Furnished": 2})
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,1,0.0,,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,1,2.0,,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,0,0.0,,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,0,0.0,2.0,0.0,0


In [13]:
# Заполним пропущенные значения средними по столбцу
data["Furnished_status"] = data["Furnished_status"].fillna(data["Furnished_status"].median())
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,0,0.0,2.0,0.0,0


In [14]:
#поменяем строчные данные на числовые
data["Status"] = data["Status"].map({"Ready to Move": 1, "Under Construction": 0})
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [15]:
# Заполним пропущенные значения средними по столбцу
data["Status"] = data["Status"].fillna(data["Status"].median())
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [16]:
# удалbv строки, где остались пропущенные значения
data = data.dropna()
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [17]:
# удалим дубликаты строк во всех столбцах
data.drop_duplicates(inplace=True)
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [18]:
# красиво переименуем все столбцы
data.rename(mapper=str.lower, axis=1, inplace=True)
data

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [19]:
# проверим не осталось ли пропущенных значений
data.isnull().sum()

price                                0
area                                 0
latitude                             0
longitude                            0
bedrooms                             0
bathrooms                            0
balcony                              0
status                               0
neworold                             0
parking                              0
furnished_status                     0
lift                                 0
type_of_building_individual house    0
dtype: int64

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6238 entries, 0 to 6254
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   price                              6238 non-null   float64
 1   area                               6238 non-null   float64
 2   latitude                           6238 non-null   float64
 3   longitude                          6238 non-null   float64
 4   bedrooms                           6238 non-null   float64
 5   bathrooms                          6238 non-null   float64
 6   balcony                            6238 non-null   float64
 7   status                             6238 non-null   float64
 8   neworold                           6238 non-null   int64  
 9   parking                            6238 non-null   float64
 10  furnished_status                   6238 non-null   float64
 11  lift                               6238 non-null   float64
 1

In [21]:
#приведем все данные к типу float
data = data.astype({'neworold': float, 'type_of_building_individual house': float})

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6238 entries, 0 to 6254
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   price                              6238 non-null   float64
 1   area                               6238 non-null   float64
 2   latitude                           6238 non-null   float64
 3   longitude                          6238 non-null   float64
 4   bedrooms                           6238 non-null   float64
 5   bathrooms                          6238 non-null   float64
 6   balcony                            6238 non-null   float64
 7   status                             6238 non-null   float64
 8   neworold                           6238 non-null   float64
 9   parking                            6238 non-null   float64
 10  furnished_status                   6238 non-null   float64
 11  lift                               6238 non-null   float64
 1

In [23]:
# сохраним предобработанные данные
data.to_csv("../data/mumbai.csv")

### Вычисление метрик

In [24]:
def metrics(y_test, y_pred):
    print(f'MAE: {np.mean(abs(y_test-y_pred))}')  #средняя абсолютная ошибка
    print(f'MSE: {np.mean(np.square(y_test-y_pred))}') #средняя квадратичная ошибка
    print(f'RMSE: {np.sqrt(np.mean(np.square(y_test-y_pred)))}') #корень из средней квадратичной ошибки
    print(f'MAPE: {sqrt(np.sum(np.abs(y_test - y_pred)/y_test)/np.size(y_test))}') # средняя абсолютная ошибка в процентах
    slope, intercept = np.polyfit(y_test, y_pred, 1)
    r_squared = 1 - (sum((y_pred - (slope * y_test + intercept))**2) / ((len(y_pred) - 1) * np.var(y_pred, ddof=1)))
    print(f'R^2: {r_squared}') # коэффициент детерминации

### Выделение целевого признака и предикторов

In [25]:
y = data["price"]  #целевой признак
X = data.drop(["price"], axis=1) #предикторы

In [26]:
X

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,0.0
4,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6250,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6251,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
6252,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6253,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0


In [27]:
y

0       22400000.0
1       35000000.0
2       31700000.0
3       18700000.0
4       13500000.0
           ...    
6250    19500000.0
6251    22000000.0
6252    20000000.0
6253    11000000.0
6254    15000000.0
Name: price, Length: 6238, dtype: float64

### Разделение данных на обучающую и тестовую выборки

In [28]:
# функция из sklearn для разбиения данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=59)
#shuffle = False

In [29]:
X_train

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
257,1250.0,19.212571,72.873522,3.0,3.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
5364,700.0,19.085650,72.908220,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5165,800.0,19.198821,72.870106,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4053,980.0,19.100530,72.911665,3.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5808,870.0,18.968870,72.808820,2.0,2.0,1.0,1.0,0.0,1.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3900,1100.0,19.200165,72.868551,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1389,660.0,19.309393,72.848594,2.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0
664,840.0,19.211035,72.862365,2.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5339,1600.0,18.963299,72.843292,3.0,4.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0


In [30]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape #размерности

((5614, 12), (5614,), (624, 12), (624,))

### Линейная регрессия (Linear Regression)

Линейная регрессия - это вычисление $\hat{y}$ по формуле: $\hat{y}_i=\sum_{j=1}^{n}{{w_{j}}{X_{ij}}}$.

$w$ - веса

в процессе обучения <b>подбираются (модель самостоятельно это делает) такие значения параметров $w$, которые позволят получать очень близкие к истинным значения.

Функция потерь
$L=\frac{1}{m}[y-(\sum_{j=1}^{n}{{w_{j}}{X_{ij}}})]^2$

Ее нужно минимизировать

Обучение модели на обучающей выборке

In [31]:
lr = LinearRegression().fit(X_train, y_train)

In [32]:
lr.predict(X_test)

array([ 4.61338494e+07,  1.63890755e+07,  2.02771663e+07,  4.81626288e+07,
        9.21089449e+07,  1.03036262e+07,  5.09277538e+06, -1.57174496e+06,
        2.15087305e+07, -6.98600574e+06,  2.93019953e+07,  2.44210162e+07,
        8.41362534e+06,  2.42821431e+07,  4.65412048e+07,  2.04208539e+07,
        1.84405304e+07,  1.99615401e+07,  1.12137177e+07,  2.51789110e+07,
        1.74161584e+07,  1.84196106e+07, -4.88101894e+06,  2.10553106e+07,
        2.99639777e+07,  2.46528490e+07,  1.44621480e+07,  6.54601193e+06,
        9.31609740e+06,  2.88760770e+07,  3.84663541e+07,  1.30766455e+07,
        2.25752763e+07,  4.42592209e+07,  1.22601338e+07,  2.32650710e+07,
        3.73797635e+07,  2.47692077e+07,  2.31966723e+07,  4.85768732e+07,
        1.95766438e+04,  1.98590070e+07,  3.46602119e+07,  2.52750064e+07,
        7.54607676e+07,  8.21151267e+06,  7.47449468e+07, -1.20426413e+06,
        3.84434192e+07,  5.73093415e+07,  1.71903792e+07,  5.40752512e+07,
        8.60803755e+06,  

Получение предсказания для тестовой выборки

In [33]:
y_pred = lr.predict(X_test)

In [34]:
# посмотрим метрики
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 8914245.88432459
MSE: 204629615466344.7
RMSE: 14304880.826708928
MAPE: 0.71255546195285
R^2: 0.7643676620376749


In [35]:
#сравним с теми, что реализованы мной
metrics(y_test, y_pred)

MAE: 8914245.88432459
MSE: 204629615466344.7
RMSE: 14304880.826708928
MAPE: 0.71255546195285
R^2: 0.7684808830678711


In [36]:
len(lr.coef_) # коэффициенты в модели (значения весов)
lr.coef_

array([ 2.05863652e+04, -8.25866290e+07, -7.40326266e+07,  4.46742549e+05,
        8.19390151e+06, -4.40704955e+03, -3.63767811e+06,  2.71521099e+04,
       -4.40704955e+03,  1.88823120e+06, -4.40704955e+03, -2.07034000e+05])

#### Регуляризации

Для борьбы с переобучением накладываем дополнительные ограничения на значения переменных (весов), неявное ограничение добавляется непосредственно в целевую функцию.
Ridge и Lasso отличаются только функцией потерь

функция потерь минимизируется, поэтому посредством регуляризации мы стремимся не допускать, чтобы значениями переменных были большие числа (более того, некоторые веса станут равны 0, что в будущем позволит говорить о низкой значимости соответствующего предиктора). Параметр  𝛼
  определяет интенсивность регуляризации. Он принимает значения от 0 до 1 включительно. При  𝛼
 =0 получается линейная регрессия без регуляризации.

### L1  (Ridge)
$L=\frac{1}{m}[y-(\sum_{j=1}^{n}{{w_{j}}{X_{ij}}})]^2+\frac{1}{m}\alpha{\sum_{j=1}^n{|w_j|}}$.

In [37]:
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = ridge.predict(X_test)

#метрики, в которых есть слово "ошибка", мы хотим уменьшить. 
#А коэффициент детерминации - наоборот увеличить (он кстати принимает значения от 0 до 1 включительно).
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 8888365.587432252
MSE: 204544216271599.12
RMSE: 14301895.548199166
MAPE: 0.7094091949909705
R^2: 0.7644659997678815


array([ 2.06035232e+04, -8.18814910e+07, -7.20966573e+07,  4.36216178e+05,
        8.22111017e+06, -8.02644411e+03, -3.12973205e+06,  2.67199301e+04,
       -8.02644412e+03,  1.90379807e+06, -8.02644412e+03, -2.20197601e+05])

#### Подбор оптимального значения гиперпараметра

In [38]:
#формируем словарь (ключи-названия гиперпараметров, значения - список (массив numpy) c возможными значениями)
parameters = {'alpha': np.arange(0, 1, 0.1)} 

In [39]:
# подбор значения с помощью GridSearchCV (перебирает все возможные варианты)
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
# выводим оптимальное значение параметра
ridge_optimal.best_params_

{'alpha': 0.1}

In [40]:
# подбор значения с помощью RandomizedSearchCV (перебирает только случайные варианты)
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
# выводим оптимальное значение параметра
ridge_optimal.best_params_

{'alpha': 0.1}

### L2 (Lasso)
 $L=\frac{1}{m}[y-(\sum_{j=1}^{n}{{w_{j}}{X_{ij}}})]^2+\frac{1}{m}\alpha{\sum_{j=1}^n{w_j^2}}$.

In [41]:
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 8914245.517658576
MSE: 204629613238455.03
RMSE: 14304880.748837266
MAPE: 0.7125554175391399
R^2: 0.7643676646031045


array([ 2.05863656e+04, -8.25866202e+07, -7.40325962e+07,  4.46742299e+05,
        8.19390170e+06, -4.01349657e+04, -3.63749126e+06,  2.71518844e+04,
        1.79892737e+02,  1.88823115e+06,  2.67336781e+04, -2.07032779e+05])

#### Подбор оптимального значения гиперпараметра

In [42]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [43]:
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_optimal.best_params_

{'alpha': 0.9}

### Полиномиальная регрессия

в случае полиномиальной регрессии модель подбирает коэффициенты в большом полиноме, в котором значения предикторов входят с более высокими степенями

Полиномиальная регрессия является частным случае линейной регрессии.

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=59)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5614, 12), (5614,), (624, 12), (624,))

In [45]:
# создаем объект, который позволит расширить множество предикторов
p = PolynomialFeatures(2)
# добавляем новые предикторы
X_train=p.fit_transform(X_train)
X_test=p.fit_transform(X_test)
X_train.shape

(5614, 91)

In [46]:
lr2 = LinearRegression().fit(X_train, y_train)
lr2.coef_

array([-5.94781402e+02,  5.20524063e+06, -2.08817797e+10, -2.89965106e+10,
       -1.03534677e+09,  2.48822262e+09, -2.78830704e+06,  4.42906755e+08,
       -3.00269584e+08, -2.78852434e+06, -7.79126357e+07, -2.78837880e+06,
       -3.13013637e+08, -2.56352213e+00, -6.96748969e+04, -9.29377290e+04,
        3.06707330e+03,  1.39515314e+03,  2.48038126e+02,  2.91218762e+06,
       -6.25857094e+02,  3.24121492e+02,  1.07759940e+03,  5.03178395e+02,
        6.82642353e+03,  3.17816099e+07,  4.43823521e+08, -2.32114210e+06,
       -2.76532204e+07,  9.94429031e+05, -1.25963321e+10,  5.22853957e+06,
        9.94429047e+05, -7.65042115e+06,  9.94429596e+05,  1.17144768e+07,
        1.18434717e+08,  2.86412661e+07, -3.57582865e+07, -1.87787297e+05,
        3.25445402e+09,  4.91244014e+06, -1.87787237e+05, -1.96342808e+06,
       -1.87797602e+05,  9.77164957e+06, -1.03046468e+06,  6.85602408e+05,
       -1.75346942e+05, -1.00457741e+09,  2.17213498e+06, -1.75347306e+05,
        9.81371884e+04, -

In [47]:
y_pred = lr2.predict(X_test)
y_pred

array([ 2.54513935e+07,  1.34102480e+07,  1.51748168e+07,  4.96698477e+07,
        1.21765468e+08,  1.30494513e+07,  1.16738843e+07,  6.63361375e+06,
        1.30532762e+07,  5.45992717e+06,  2.52587937e+07,  2.14696508e+07,
        1.36905474e+07,  2.33898163e+07,  4.45145264e+07,  1.66176517e+07,
        2.01418054e+07,  2.09838584e+07,  1.27651458e+07,  2.41632755e+07,
        1.78990638e+07,  1.70200732e+07,  8.04344473e+06,  1.49137825e+07,
        2.80569695e+07,  2.12235750e+07,  1.61753002e+07,  1.14075939e+07,
        1.08994566e+07,  2.29834865e+07,  3.46608535e+07,  1.31552163e+07,
        2.26457402e+07,  4.75031137e+07,  1.53691191e+07,  1.93742545e+07,
        2.51789440e+07,  2.34587708e+07,  2.29879106e+07,  5.16937885e+07,
        9.85739090e+06,  1.77075041e+07,  1.73874121e+07,  2.43305520e+07,
        9.06792980e+07,  1.18791711e+07,  9.14246270e+07,  5.37590906e+06,
        3.82080895e+07,  5.10521786e+07,  1.75426429e+07,  6.08308094e+07,
        1.25426105e+07,  

In [55]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr2.score(X_test, y_test)}')
lr2.coef_

MAE: 8913302.274406146
MSE: 232875375760818.88
RMSE: 15260254.773784705
MAPE: 0.7001204776302845
R^2: 0.7835561036278091


array([-5.94781402e+02,  5.20524063e+06, -2.08817797e+10, -2.89965106e+10,
       -1.03534677e+09,  2.48822262e+09, -2.78830704e+06,  4.42906755e+08,
       -3.00269584e+08, -2.78852434e+06, -7.79126357e+07, -2.78837880e+06,
       -3.13013637e+08, -2.56352213e+00, -6.96748969e+04, -9.29377290e+04,
        3.06707330e+03,  1.39515314e+03,  2.48038126e+02,  2.91218762e+06,
       -6.25857094e+02,  3.24121492e+02,  1.07759940e+03,  5.03178395e+02,
        6.82642353e+03,  3.17816099e+07,  4.43823521e+08, -2.32114210e+06,
       -2.76532204e+07,  9.94429031e+05, -1.25963321e+10,  5.22853957e+06,
        9.94429047e+05, -7.65042115e+06,  9.94429596e+05,  1.17144768e+07,
        1.18434717e+08,  2.86412661e+07, -3.57582865e+07, -1.87787297e+05,
        3.25445402e+09,  4.91244014e+06, -1.87787237e+05, -1.96342808e+06,
       -1.87797602e+05,  9.77164957e+06, -1.03046468e+06,  6.85602408e+05,
       -1.75346942e+05, -1.00457741e+09,  2.17213498e+06, -1.75347306e+05,
        9.81371884e+04, -

#### Подбор оптимального значения гиперпараметра

In [49]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [50]:
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.9}

In [51]:
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_optimal.best_params_

{'alpha': 0.0}

### L1

In [52]:
ridge = Ridge(alpha=0.9).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 7928973.313203209
MSE: 194537205826731.84
RMSE: 13947659.510711173
MAPE: 0.6057287479552821
R^2: 0.7759891376175214


array([ 0.00000000e+00,  8.95649053e+06, -7.75732106e+05, -5.51337340e+05,
       -3.44877658e+05,  3.33545344e+06, -1.63597841e+06,  8.22158339e+03,
       -9.74281262e+04, -1.63597841e+06,  9.15307980e+05, -1.63597841e+06,
       -5.45965428e+05, -2.72639326e+00, -7.40559315e+04, -1.03268264e+05,
        3.46223175e+03,  1.16127264e+03,  3.69749154e+02, -2.43556966e+03,
       -2.63160213e+02,  3.69745841e+02,  1.51736697e+03,  3.69745837e+02,
        7.21251646e+03, -6.94889039e+05,  1.45507519e+06,  1.09355675e+06,
       -3.13947851e+07,  1.14587383e+06, -6.49856742e+05,  5.91010999e+06,
        1.14587383e+06, -9.43018938e+06,  1.14587383e+06,  8.21860942e+06,
        4.65411765e+04, -2.72486218e+05,  8.11693556e+06, -2.57763745e+05,
        4.04972761e+04, -1.55240874e+06, -2.57763745e+05,  2.40864063e+06,
       -2.57763738e+05, -2.14430603e+06, -4.28124814e+05, -2.56491369e+05,
       -1.89353214e+05, -3.79472254e+05,  2.53937526e+06, -1.89353214e+05,
        4.48053039e+05, -

### L2

In [53]:
lasso = Lasso(alpha=0.05).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 8913302.274406146
MSE: 232875375760818.88
RMSE: 15260254.773784705
MAPE: 0.7001204776302845
R^2: 0.7318424846798306


array([ 0.00000000e+00,  2.65193850e+04, -9.31324239e+07, -9.59311184e+07,
       -3.55946963e+05,  1.37443092e+06, -2.70764792e+04, -4.93577793e+05,
       -1.30434156e+06,  3.16251590e+04,  1.10623062e+06,  1.56626995e+04,
       -1.06404833e+05, -1.96896404e+00, -2.34083393e+03,  6.24837479e+02,
        7.46648926e+02,  3.88639105e+03, -1.71792437e+03, -1.44441623e+04,
       -1.64488275e+03,  1.93870175e+03, -8.28642405e+02,  1.08694973e+02,
        8.84373883e+03,  1.89005486e+06, -7.04487142e+05, -1.93610121e+06,
       -4.66263890e+05,  6.46280897e+04, -3.60065521e+06,  2.50473902e+05,
       -3.58809314e+03, -3.40086701e+05, -4.70684056e+03, -1.40002012e+05,
        2.16854280e+05,  5.12081309e+05,  1.94461115e+04, -1.76840553e+04,
        1.11693831e+06, -4.68537278e+04,  8.23718828e+02,  9.50884230e+04,
        1.34525687e+03, -9.21483104e+04, -7.39044210e+05,  7.68181841e+05,
        7.87313995e+04,  3.43520885e+06,  4.36551701e+06, -3.21427441e+05,
       -5.29310449e+05,  

Лучшая модель L1 полиномиальной регрессии: метрика MAPE минимальная, R2 почти самая высокая

### Построение модели

In [53]:
class BestLinLassoModel:
    def __init__(self):
        self.y = np.array([])
        
    def fit(self, X, y, learning_rate = 0.0001, max_iter = 100, alpha = 1.):
        X = np.array(X)
        y = np.array(y)
        self.w = np.array([0 for i in range(X.shape[1])])
        self.w = np.append(self.w, 1)
        for t in range(max_iter):
            grad = np.array([])
            for i in range(X.shape[1]):
                grad = np.append(grad, (X[:, i]*(y - X.dot(self.w[:-1]) - self.w[-1])).sum() + alpha * self.w[i])
            grad = np.append(grad,(y - X.dot(self.w[:-1]) - self.w[-1]).sum() + alpha * self.w[-1])
            w_next = self.w - 2 * grad * learning_rate / len(X)
        self.w = w_next
        print(self.w)
        
    def predict(self, X):
        X = np.array(X)
        self.y = X.dot(self.w[:-1]) + self.w[-1]
        return self.y

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=59)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5614, 12), (5614,), (624, 12), (624,))

In [55]:
modelka = BestLinLassoModel()
modelka.fit(X_train, y_train)
y_pred = modelka.predict(X_test)

[-8.80052549e+06 -1.02593917e+05 -3.91199592e+05 -1.56055293e+04
 -1.57227473e+04 -5.12937746e+03 -5.36617401e+03 -6.97577817e+02
 -5.12937746e+03 -5.36198663e+03 -5.12937746e+03 -3.72741347e+02
 -5.36834821e+03]


In [56]:
metrics(y_test, y_pred)

MAE: 10491380288.988789
MSE: 1.4824203855747442e+20
RMSE: 12175468720.237198
MAPE: 24.41473259839892
R^2: 0.6981204750982295
