# Лабораторная работа 1. Введение в машинное обучение. Обучение с учителем. Задача регрессии

<b>Выполните следующие задания:</b>
- загрузите датасет для регрессии, выделите целевой признак и предикторы, разбейте данные на обучающую и тестовую выборку;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (линейная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- решите задачу регрессии на ваших данных с использованием моделей sklearn (полиномиальная регрессия + L1, L2), для моделей с регуляризациями подберите гиперпараметр;
- вычислите значения метрик $R^2$, MAE, MSE, RMSE, MAPE для всех обученных моделей; выберите лучшую модель;
- самостоятельно реализуйте (желательно в виде класса) модель линейной регрессии с регуляризацией (можете выбрать L1 или L2);
- самостоятельно реализуйте вычисление всех используемых метрик (в виде функций, принимающих два аргумента);
- обучите вашу модель линейной регрессии на ваших данных; оцените качество с помощью реализованных вами метрик.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('../data/mumbai_houses_task.csv')
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,New Property,0.0,,0.0,Flat
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,New Property,2.0,,2.0,Flat
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,New Property,0.0,Unfurnished,0.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,Resale,0.0,Semi-Furnished,0.0,Flat
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,Resale,1.0,Unfurnished,1.0,Flat
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,Resale,0.0,,0.0,Flat
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,Resale,0.0,Furnished,0.0,Flat


## Предобработка данных

In [3]:
data = pd.get_dummies(data, columns=['type_of_building'], drop_first=True, dtype=int) # категориальный признак

In [4]:
data["neworold"] = data["neworold"].map({"Resale": 0, "New Property": 1})

In [5]:
data["Furnished_status"] = data["Furnished_status"].map({"Unfurnished": 0, "Semi-Furnished": 1, "Furnished": 2})
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,1,0.0,,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,1,0.0,,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,1,2.0,,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,0,0.0,,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,0,0.0,2.0,0.0,0


In [6]:
data["Furnished_status"] = data["Furnished_status"].fillna(data["Furnished_status"].median())
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,Under Construction,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,Under Construction,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,Under Construction,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,Ready to Move,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,Ready to Move,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,Ready to Move,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,Ready to Move,0,0.0,2.0,0.0,0


In [7]:
data["Status"] = data["Status"].map({"Ready to Move": 1, "Under Construction": 0})
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [8]:
data["Status"] = data["Status"].fillna(data["Status"].median())
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [9]:
data = data.dropna()
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [10]:
data.drop_duplicates(inplace=True)
data

Unnamed: 0,price,area,latitude,longitude,Bedrooms,Bathrooms,Balcony,Status,neworold,parking,Furnished_status,Lift,type_of_building_Individual House
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [11]:
data.rename(mapper=str.lower, axis=1, inplace=True)
data

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [12]:
data.isnull().sum()

price                                0
area                                 0
latitude                             0
longitude                            0
bedrooms                             0
bathrooms                            0
balcony                              0
status                               0
neworold                             0
parking                              0
furnished_status                     0
lift                                 0
type_of_building_individual house    0
dtype: int64

In [13]:
data

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
0,22400000.0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,35000000.0,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,31700000.0,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,18700000.0,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,13500000.0,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,19500000.0,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,22000000.0,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,20000000.0,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,11000000.0,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


### Вычисление метрик

In [14]:
def metrics(y_test, y_pred):
    print(f'MAE: {np.mean(abs(y_test-y_pred))}')  #средняя абсолютная ошибка
    print(f'MSE: {np.mean(np.square(y_test-y_pred))}') #средняя квадратичная ошибка
    print(f'RMSE: {np.sqrt(np.mean(np.square(y_test-y_pred)))}') #корень из средней квадратичной ошибки
    print(f'MAPE: {sqrt(np.sum(np.abs(y_test - y_pred)/y_test)/np.size(y_test))}') # средняя абсолютная ошибка в процентах
    slope, intercept = np.polyfit(y_test, y_pred, 1)
    r_squared = 1 - (sum((y_pred - (slope * y_test + intercept))**2) / ((len(y_pred) - 1) * np.var(y_pred, ddof=1)))
    print(f'R^2: {r_squared}') # коэффициент детерминации

### Выделение целевого признака и предикторов

In [15]:
y = data["price"]
X = data.drop(["price"], axis=1)

In [16]:
X

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
0,629.0,19.032800,72.896357,2.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
1,974.0,19.032800,72.896357,3.0,2.0,0.0,0.0,1,0.0,1.0,0.0,0
2,968.0,19.085600,72.909277,3.0,3.0,0.0,0.0,1,0.0,1.0,0.0,0
3,629.0,19.155756,72.846862,2.0,2.0,2.0,1.0,1,2.0,1.0,2.0,0
4,1090.0,19.177555,72.849887,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6250,810.0,19.138320,72.810020,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6251,1400.0,19.221920,72.854250,3.0,3.0,1.0,1.0,0,1.0,0.0,1.0,0
6252,750.0,19.144320,72.824111,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
6253,700.0,19.047201,72.872225,2.0,2.0,0.0,1.0,0,0.0,2.0,0.0,0


In [17]:
y

0       22400000.0
1       35000000.0
2       31700000.0
3       18700000.0
4       13500000.0
           ...    
6250    19500000.0
6251    22000000.0
6252    20000000.0
6253    11000000.0
6254    15000000.0
Name: price, Length: 6238, dtype: float64

### Разделение данных на обучающую и тестовую выборки

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=61)
#, shuffle = False   random_state=59, 

In [19]:
X_train

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building_individual house
5254,2700.0,19.224833,72.836704,4.0,4.0,4.0,1.0,0,4.0,1.0,4.0,0
4585,850.0,19.211040,72.869411,2.0,2.0,0.0,1.0,1,0.0,0.0,0.0,1
282,1715.0,19.119446,72.864410,2.0,2.0,5.0,1.0,0,5.0,2.0,5.0,0
5064,646.0,19.126460,72.859070,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
5297,650.0,19.277014,72.872141,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6195,800.0,19.176718,72.944581,2.0,3.0,0.0,1.0,1,0.0,1.0,0.0,1
2069,600.0,19.209944,72.830872,2.0,2.0,0.0,1.0,1,0.0,2.0,0.0,0
4009,1100.0,19.019738,72.833943,2.0,2.0,0.0,1.0,0,0.0,1.0,0.0,0
3609,990.0,19.174674,72.943104,3.0,3.0,0.0,1.0,0,0.0,2.0,0.0,1


In [20]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5614, 12), (5614,), (624, 12), (624,))

### Линейная регрессия (Linear Regression)

Обучение модели на обучающей выборке

In [21]:
lr = LinearRegression().fit(X_train, y_train)

In [22]:
lr.predict(X_test)

array([ 3.97160969e+07,  4.29031930e+07,  1.92762599e+07,  6.08746714e+07,
        2.07391267e+07,  4.55617653e+07,  4.14753378e+07,  2.96017010e+07,
        2.96959675e+06,  6.43322399e+07,  1.50955907e+07,  2.07954735e+07,
        1.57838963e+07,  1.43707385e+07,  6.28930222e+07, -3.37603811e+06,
        1.98649607e+07,  3.96878589e+07,  1.66190960e+07,  2.65998860e+07,
        2.68391900e+07, -1.66943135e+06,  3.19809461e+07,  1.28095951e+07,
        2.49308726e+07,  6.02873674e+07,  3.72262482e+07,  6.61602996e+07,
        1.63235988e+07,  1.23573544e+07,  5.03438292e+06,  3.88955075e+07,
        1.10741238e+07,  4.41375282e+07,  2.03035005e+07,  1.96192992e+07,
        1.91906940e+07,  2.04517366e+07,  1.44186190e+07,  4.79202266e+07,
        4.16698671e+07,  2.32126983e+07,  1.71782255e+07,  3.73613076e+07,
        1.93671115e+07,  5.28901323e+07,  1.14685273e+08,  2.20074960e+07,
        4.78533062e+06,  6.75775413e+07, -6.66439428e+06, -5.72664099e+06,
        2.11614239e+07,  

Получение предсказания для тестовой выборки

In [23]:
y_pred = lr.predict(X_test)

In [24]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr.score(X_test, y_test)}')

MAE: 8695516.334163895
MSE: 199175460250162.7
RMSE: 14112953.633104684
MAPE: 0.6877512220603761
R^2: 0.5946270739633887


In [25]:
metrics(y_test, y_pred)

MAE: 8695516.334163895
MSE: 199175460250162.7
RMSE: 14112953.633104684
MAPE: 0.6877512220603761
R^2: 0.6325264909873296


In [26]:
len(lr.coef_) # коэффициенты в модели (значения весов)
lr.coef_

array([ 2.29777235e+04, -8.20753457e+07, -7.47847185e+07, -3.91875507e+05,
        7.79426085e+06, -8.03397468e+04, -4.29989349e+06,  1.08717265e+05,
       -8.03397468e+04,  1.82809931e+06, -8.03397468e+04, -2.03303588e+05])

#### Регуляризации

### L1

In [27]:
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 8675332.44237951
MSE: 199061044207919.5
RMSE: 14108899.468346903
MAPE: 0.6856649036691315
R^2: 0.5948599398283472


array([ 2.29937559e+04, -8.13742249e+07, -7.28481756e+07, -4.00996607e+05,
        7.82181264e+06, -8.40450962e+04, -3.69220483e+06,  1.13472421e+05,
       -8.40450962e+04,  1.84510628e+06, -8.40450962e+04, -2.15234516e+05])

#### Подбор оптимального значения гиперпараметра

In [28]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [29]:
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.1}

### L2

In [30]:
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 8695514.806515614
MSE: 199175444292299.78
RMSE: 14112953.067742405
MAPE: 0.687751078990807
R^2: 0.5946271064417149


array([ 2.29777236e+04, -8.20753016e+07, -7.47845699e+07, -3.91870787e+05,
        7.79425831e+06, -2.70742060e+05, -4.29895647e+06,  1.08716829e+05,
       -1.35635019e+03,  1.82809921e+06,  3.10780239e+04, -2.03297496e+05])

#### Подбор оптимального значения гиперпараметра

In [31]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [32]:
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_optimal.best_params_

{'alpha': 0.9}

### Полиномиальная регрессия

In [33]:
p = PolynomialFeatures(2)
X_p = p.fit_transform(X) 
X_p

array([[  1.       , 629.       ,  19.0327996, ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 974.       ,  19.0327996, ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 968.       ,  19.0856   , ...,   0.       ,
          0.       ,   0.       ],
       ...,
       [  1.       , 750.       ,  19.1443196, ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 700.       ,  19.0472012, ...,   0.       ,
          0.       ,   0.       ],
       [  1.       , 900.       ,  19.1688179, ...,   0.       ,
          0.       ,   0.       ]])

In [34]:
y_p = y.values.reshape(-1, 1)
y_p

array([[22400000.],
       [35000000.],
       [31700000.],
       ...,
       [20000000.],
       [11000000.],
       [15000000.]])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p, test_size=0.1, random_state=61)

In [36]:
lr2 = LinearRegression().fit(X_p, y_p)
lr2.coef_

array([[-3.88618732e+06,  5.51856901e+06, -2.08427646e+10,
        -3.03665226e+10, -1.09399154e+09,  2.21524597e+09,
        -2.77635823e+06,  4.60293311e+08, -3.04405694e+08,
        -2.77634953e+06, -9.13841944e+07, -2.77636422e+06,
        -2.70266109e+08, -1.26398884e+00, -6.48240698e+04,
        -1.00620723e+05,  1.97422926e+03,  1.54454790e+03,
        -5.76711829e+02,  3.06455667e+06,  2.80154012e+02,
        -2.39248125e+02,  9.47745503e+02,  8.46384346e+02,
         6.28681327e+03,  2.79309827e+07,  4.40059177e+08,
        -2.65627897e+06, -2.89429745e+07,  8.54602163e+05,
        -1.22147764e+10,  4.11017365e+06,  8.54602424e+05,
        -6.89222547e+06,  8.54596368e+05,  1.02414193e+07,
         1.28967957e+08,  3.02585380e+07, -3.14038741e+07,
        -1.51774574e+05,  3.15471625e+09,  5.12641470e+06,
        -1.51774526e+05, -2.03412517e+06, -1.51865095e+05,
         8.39589794e+06, -7.59468673e+05,  1.98835787e+05,
        -3.31134356e+04, -1.05631606e+09,  2.71619863e+0

In [37]:
y_Ppred = lr2.predict(X_test)

In [38]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lr2.score(X_test, y_test)}')

MAE: 8695514.806515614
MSE: 199175444292299.78
RMSE: 14112953.067742405
MAPE: 0.687751078990807
R^2: 0.6386876360514491


### L1

In [39]:
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_Ppred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 8695514.806515614
MSE: 199175444292299.78
RMSE: 14112953.067742405
MAPE: 0.687751078990807
R^2: 0.5826354925893177


array([[ 0.00000000e+00,  9.69800565e+06, -1.47410079e+06,
        -1.06661728e+06, -2.75873782e+06,  4.87322941e+06,
        -3.41348749e+06,  1.70782873e+04, -1.51608226e+04,
        -3.41348749e+06,  1.69122272e+06, -3.41348749e+06,
        -1.04993072e+06, -5.09123853e-01, -8.27452352e+04,
        -1.11256690e+05,  1.17347401e+03,  6.75661671e+02,
        -2.31312872e+02,  6.77042969e+03, -6.59900201e+02,
        -2.31309172e+02,  1.39636787e+03, -2.31309398e+02,
         6.06946164e+03,  2.97625133e+06, -5.47316878e+05,
         5.47517504e+06, -3.00460169e+07,  1.14733166e+06,
        -1.24100209e+06,  4.46469624e+06,  1.14733166e+06,
        -7.42233763e+06,  1.14733166e+06,  7.79475701e+06,
         3.93401817e+05, -1.36518108e+06,  7.74570995e+06,
        -2.12620017e+05,  1.55827843e+05, -1.19320533e+06,
        -2.12620018e+05,  1.83797916e+06, -2.12620020e+05,
        -2.02065928e+06,  2.21774206e+05, -6.86882092e+05,
         7.09776254e+04, -2.81409474e+06,  3.76992887e+0

#### Подбор оптимального значения гиперпараметра

In [40]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [41]:
ridge_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.9}

### L2

In [42]:
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_Ppred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

MAE: 8695514.806515614
MSE: 199175444292299.78
RMSE: 14112953.067742405
MAPE: 0.687751078990807
R^2: 0.6066276396511256


array([ 0.00000000e+00,  2.67284706e+04, -9.44551810e+07, -9.89455084e+07,
       -8.32884924e+05,  1.78639630e+06, -1.02870582e+06,  1.90017734e+06,
       -8.57846757e+05,  1.85388369e+04,  8.80529265e+05,  1.52550419e+04,
       -2.58502463e+05,  1.33672816e-01, -2.79340638e+03,  6.15224808e+02,
       -1.16491536e+03,  4.15867175e+03, -2.38712053e+03, -5.51888326e+03,
        2.50296019e+02,  7.26749252e+02, -1.26136694e+03,  1.63187370e+02,
        6.91239795e+03,  2.00593329e+06, -7.50464523e+05, -1.79221124e+06,
       -4.24803319e+05,  1.77755005e+05, -3.85892149e+06,  9.65909708e+04,
        5.50088709e+02, -2.27166576e+05, -3.85013032e+03, -7.79541257e+04,
        2.39833840e+05,  5.07046063e+05,  2.01255985e+04, -1.38851931e+04,
        1.07959237e+06, -3.51881172e+04,  5.05465727e+02,  7.35794603e+04,
        1.27452863e+03, -7.43402124e+04, -3.31292321e+05,  9.71633117e+04,
        9.86956557e+05,  2.40699762e+06,  3.29045725e+06, -2.27581368e+05,
        1.21037089e+05, -

#### Подбор оптимального значения гиперпараметра

In [43]:
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [44]:
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

{'alpha': 0.9}

### Построение модели

In [45]:
def printMetrics(y_test, y_pred):
    mae = np.sum(np.abs(y_test - y_pred))/np.size(y_test)
    mse = np.sum((y_test - y_pred)**2)/np.size(y_test)
    rmse = sqrt(mse)
    mape = sqrt(np.sum(np.abs(y_test - y_pred)/y_test)/np.size(y_test))
    r2 = np.corrcoef(y_test, y_pred)[0,1]**2
    print(f'MAE: {mae}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')
    print(f'MAPE: {mape}')
    print(f'R^2: {r2}')


class LinearRegresion:
        def fit(self, X_train, Y_train):
            self.m, self.n = X.shape
            self.W = np.zeros(self.n)
            self.b = 0
            self.X = X_train
            self.Y = Y_train
            for i in range(3000):
                y_pred = self.predict(self.X)
                dW = (-(2 * (self.X.T).dot(self.Y - y_pred)))/self.m
                db = -2 * np.sum(self.Y - y_pred)/self.m
                self.W = self.W-0.00001*dW
                self.b = self.b-0.00001*db
            return self
    
        def predict(self, X):
            return X.dot(self.W)+self.b
    
    
class Solution:
    data = pd.read_csv('../data/mumbai_houses_task.csv')
    data["Furnished_status"] = data["Furnished_status"].map({"Unfurnished": 0, "Semi-Furnished": 1, "Furnished": 2})
    data["Furnished_status"] = data["Furnished_status"].fillna(data["Furnished_status"].median())
    data["Status"] = data["Status"].map({"Ready to Move": 1, "Under Construction": 0})
    data["Status"] = data["Status"].fillna(data["Status"].median())
    data = data.dropna().drop_duplicates(inplace=True).rename(mapper=str.lower, axis=1, inplace=True)
    data = pd.get_dummies(data['type_of_building'], drop_first=True, dtype=int)
    y = data["price"]
    X = data.drop(["price"], axis=1)
    X_train = None
    y_train = None
    
    def splitTrainTest(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2)
        
    def LinearSolution(self):
        model = LinearRegresion()
        model.fit(self.X_train, self.y_train)
        self.y_pred = model.predict(self.X_test)
        
    def findAlpha(self):
        parameters = {'alpha': np.arange(0.1, 1, 0.1)}
        self.ridge_optimal = GridSearchCV(Ridge(), parameters).fit(self.X_train, self.y_train)
        self.lasso_optimal = GridSearchCV(Lasso(), parameters).fit(self.X_train, self.y_train)
        
    def RidgeLearn(self):
        ridge = Ridge(alpha=0.5).fit(self.X_train, self.y_train)
        self.y_pred = ridge.predict(self.X_test)
        
    def LassoLearn(self):
        lasso = Lasso(alpha=0.5).fit(self.X_train, self.y_train)
        self.y_pred = lasso.predict(self.X_test)
        
    def getPred(self):
        return self.y_pred
    
    def getTest(self):
        return self.y_test
    

FileNotFoundError: [Errno 2] No such file or directory: '../data/trip_duration_task.csv'

In [None]:
example = Solution()

In [None]:
example.splitTrainTest()
example.LinearSolution()