In [22]:
import pandas as pd

data = pd.read_csv('mumbai_houses_task_filtred.csv')
data

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building
0,22400000,629,19.032800,72.896357,2,2,0,0,1,0,0,0,0
1,35000000,974,19.032800,72.896357,3,2,0,0,1,0,0,0,0
2,31700000,968,19.085600,72.909277,3,3,0,0,1,0,0,0,0
3,18700000,629,19.155756,72.846862,2,2,2,1,1,2,0,2,0
4,13500000,1090,19.177555,72.849887,2,2,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5668,19500000,810,19.138320,72.810020,2,2,0,1,0,0,2,0,0
5669,22000000,1400,19.221920,72.854250,3,3,1,1,0,1,0,1,0
5670,20000000,750,19.144320,72.824111,2,2,0,1,0,0,0,0,0
5671,11000000,700,19.047201,72.872225,2,2,0,1,0,0,1,0,0


# Выделение целевого признака и предикторов

In [23]:
y = data['price']
X = data.drop(['price'], axis=1)
y

0       22400000
1       35000000
2       31700000
3       18700000
4       13500000
          ...   
5668    19500000
5669    22000000
5670    20000000
5671    11000000
5672    15000000
Name: price, Length: 5673, dtype: int64

In [24]:
X

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building
0,629,19.032800,72.896357,2,2,0,0,1,0,0,0,0
1,974,19.032800,72.896357,3,2,0,0,1,0,0,0,0
2,968,19.085600,72.909277,3,3,0,0,1,0,0,0,0
3,629,19.155756,72.846862,2,2,2,1,1,2,0,2,0
4,1090,19.177555,72.849887,2,2,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5668,810,19.138320,72.810020,2,2,0,1,0,0,2,0,0
5669,1400,19.221920,72.854250,3,3,1,1,0,1,0,1,0
5670,750,19.144320,72.824111,2,2,0,1,0,0,0,0,0
5671,700,19.047201,72.872225,2,2,0,1,0,0,1,0,0


# Разбиение на обучающую и тестовую выборки

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4538, 12), (4538,), (1135, 12), (1135,))

# Использование моделей для обучения

In [26]:
# Импорт моделей обучения
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Импорт метрик качества
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [27]:
# Функция для отображения оценки предсказанных значений относительно реальных
from math import sqrt

def errors(predict_array):
    print(
        f'MAE = {mean_absolute_error(y_test, predict_array)}',
        f'MSE = {mean_squared_error(y_test, predict_array)}',
        f'RMSE = {sqrt(mean_squared_error(y_test, predict_array))}',
        f'MAPE = {mean_absolute_percentage_error(y_test, predict_array)}',
        f'R^2 = {r2_score(y_test, predict_array)}',
        sep='\n'
    )

In [28]:
# Предсказания для тестовой выборки (Линейная регрессия)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [29]:
# Предсказанные значения
y_pred_lr

array([16139655.58364773, 19878220.50589561, 23143266.16240025, ...,
       12629983.71419239,   420322.78141689, 29575350.01133251],
      shape=(1135,))

In [30]:
# Оценка предсказанных значения
errors(y_pred_lr)

MAE = 5114285.118291035
MSE = 51558104459323.08
RMSE = 7180397.235482385
MAPE = 0.339075662997055
R^2 = 0.6306331856626719


In [31]:
# Коэффициенты модели
lr.coef_

array([ 1.26283316e+04, -5.99256652e+07, -4.58067410e+07,  2.28327121e+06,
        3.18027988e+06, -5.12379086e+04, -4.54381198e+06,  4.46373164e+05,
       -5.12379086e+04,  1.49851711e+05, -5.12379086e+04, -6.38467884e+05])

In [32]:
# L1 модель (Lasso)
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
errors(y_pred_lasso)
lasso.coef_

MAE = 5114284.470223368
MSE = 51558106202699.695
RMSE = 7180397.356880724
MAPE = 0.33907550838098244
R^2 = 0.6306331731729677


  model = cd_fast.enet_coordinate_descent(


array([ 1.26283341e+04, -5.99256177e+07, -4.58065913e+07,  2.28327057e+06,
        3.18027962e+06, -1.53647030e+05, -4.54305686e+06,  4.46372218e+05,
       -9.87249168e+03,  1.49850535e+05,  9.80434553e+03, -6.38461316e+05])

In [33]:
# L2 модель (Ridge)
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
errors(y_pred_ridge)
ridge.coef_

MAE = 5107414.41521954
MSE = 51591350074591.99
RMSE = 7182711.888596952
MAPE = 0.33728399369127304
R^2 = 0.6303950111383883


array([ 1.26429803e+04, -5.92756889e+07, -4.43371863e+07,  2.27922459e+06,
        3.19858520e+06, -5.48175383e+04, -3.91530817e+06,  4.48206824e+05,
       -5.48175383e+04,  1.51424426e+05, -5.48175383e+04, -6.42542117e+05])

In [34]:
# Elastic Net (комбинация L1 и L2)
el = ElasticNet(alpha=0.5).fit(X_train, y_train)
y_pred_el = el.predict(X_test)
errors(y_pred_el)
el.coef_

MAE = 6997871.174776766
MSE = 91137608345692.2
RMSE = 9546601.9266382
MAPE = 0.520898328786816
R^2 = 0.34708212386803106


array([   17833.3559226 , -2522380.49215358,  -525819.26127652,
        1305007.00434454,  1923430.68704802,  -341816.17949745,
         -28265.3475858 ,   302367.46833136,  -341816.17838571,
         271622.67735294,  -341816.17952676,   -65580.62366624])

In [35]:
# Полиномиальная регрессия

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_p = poly.fit_transform(X_train)
lr2 = LinearRegression().fit(X_train_p, y_train)
X_test_p = poly.transform(X_test)
y_pred_p = lr2.predict(X_test_p)
errors(y_pred_p)
lr2.coef_

MAE = 4996029.65583281
MSE = 48698864523682.164
RMSE = 6978457.173593757
MAPE = 0.3255134710716283
R^2 = 0.6511170331106899


array([-6.88231805e+02,  3.01001804e+06, -8.10929672e+09, -1.52649248e+10,
       -2.54782249e+07, -4.85106009e+08,  1.43371730e+07,  2.23001781e+08,
       -1.95477857e+08,  1.43373185e+07,  1.71370379e+08,  1.43372983e+07,
       -2.43702824e+08, -7.46478514e-01, -3.21284488e+04, -3.19284781e+04,
       -4.47364107e+02,  8.84511261e+02,  4.16094584e+02, -5.59398480e+04,
       -2.87543839e+03,  3.74852308e+02,  1.30305265e+03,  3.75002394e+02,
       -6.94545771e+02, -1.82898466e+07,  1.73602514e+08, -2.05101548e+06,
       -1.56990641e+06,  1.50919590e+05, -3.85272721e+09, -2.71978633e+06,
        1.50919549e+05, -3.06035405e+06,  1.50919536e+05,  5.60583701e+06,
        7.48183713e+07,  7.57750629e+05,  4.42649504e+06, -4.33557045e+05,
        9.94084541e+08,  5.65741966e+06, -4.33557059e+05, -3.84762127e+06,
       -4.33557058e+05,  8.54843009e+06, -1.36602388e+06,  6.69636299e+05,
       -9.38619953e+04,  1.79677880e+07,  4.39371138e+06, -9.38619982e+04,
        1.71294895e+05, -

# Поиск оптимального гиперпараметра для моделей 

In [36]:
# Импорт методов для подбора гиперпараметра
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [40]:
# Какие значения будет принимать гиперпараметр
parameters = {'alpha': np.arange(0.1, 1, 0.1)}

In [41]:
# Гиперпараметр для L1 (Lasso)

lasso_grid_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_grid_optimal.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'alpha': np.float64(0.1)}

In [42]:
lasso_random_optimal = RandomizedSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_random_optimal.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'alpha': np.float64(0.1)}

In [43]:
# Гиперпараметр для L2 (Ridge)

ridge_grid_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_grid_optimal.best_params_

{'alpha': np.float64(0.1)}

In [44]:
ridge_random_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_random_optimal.best_params_



{'alpha': np.float64(0.1)}

In [45]:
# Гиперпараметр для Elastic Net

el_grid_optimal = GridSearchCV(ElasticNet(), parameters).fit(X_train, y_train)
el_grid_optimal.best_params_

{'alpha': np.float64(0.1)}

In [46]:
el_random_optimal = RandomizedSearchCV(ElasticNet(), parameters).fit(X_train, y_train)
el_random_optimal.best_params_



{'alpha': np.float64(0.1)}