In [250]:
import pandas as pd

data = pd.read_csv('mumbai_houses_task_filtred.csv')
data

Unnamed: 0,price,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building
0,22400000,629,19.032800,72.896357,2,2,0,0,1,0,0,0,0
1,35000000,974,19.032800,72.896357,3,2,0,0,1,0,0,0,0
2,31700000,968,19.085600,72.909277,3,3,0,0,1,0,0,0,0
3,18700000,629,19.155756,72.846862,2,2,2,1,1,2,0,2,0
4,13500000,1090,19.177555,72.849887,2,2,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5668,19500000,810,19.138320,72.810020,2,2,0,1,0,0,2,0,0
5669,22000000,1400,19.221920,72.854250,3,3,1,1,0,1,0,1,0
5670,20000000,750,19.144320,72.824111,2,2,0,1,0,0,0,0,0
5671,11000000,700,19.047201,72.872225,2,2,0,1,0,0,1,0,0


# Выделение целевого признака и предикторов

In [251]:
y = data['price']
X = data.drop(['price'], axis=1)
y

0       22400000
1       35000000
2       31700000
3       18700000
4       13500000
          ...   
5668    19500000
5669    22000000
5670    20000000
5671    11000000
5672    15000000
Name: price, Length: 5673, dtype: int64

In [252]:
X

Unnamed: 0,area,latitude,longitude,bedrooms,bathrooms,balcony,status,neworold,parking,furnished_status,lift,type_of_building
0,629,19.032800,72.896357,2,2,0,0,1,0,0,0,0
1,974,19.032800,72.896357,3,2,0,0,1,0,0,0,0
2,968,19.085600,72.909277,3,3,0,0,1,0,0,0,0
3,629,19.155756,72.846862,2,2,2,1,1,2,0,2,0
4,1090,19.177555,72.849887,2,2,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5668,810,19.138320,72.810020,2,2,0,1,0,0,2,0,0
5669,1400,19.221920,72.854250,3,3,1,1,0,1,0,1,0
5670,750,19.144320,72.824111,2,2,0,1,0,0,0,0,0
5671,700,19.047201,72.872225,2,2,0,1,0,0,1,0,0


# Разбиение на обучающую и тестовую выборки

In [253]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4538, 12), (4538,), (1135, 12), (1135,))

# Использование моделей для обучения

In [254]:
# Импорт моделей обучения
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Импорт метрик качества
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [255]:
# Функция для отображения оценки предсказанных значений относительно реальных
from math import sqrt

def errors(predict_array):
    print(
        f'MAE = {mean_absolute_error(y_test, predict_array)}',
        f'MSE = {mean_squared_error(y_test, predict_array)}',
        f'RMSE = {sqrt(mean_squared_error(y_test, predict_array))}',
        f'MAPE = {mean_absolute_percentage_error(y_test, predict_array)}',
        f'R^2 = {r2_score(y_test, predict_array)}',
        sep='\n'
    )

In [256]:
# Предсказания для тестовой выборки (Линейная регрессия)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [257]:
# Предсказанные значения
y_pred_lr

array([18319515.00043583, 22377839.12760067, 27654959.30328369, ...,
       20660910.95917892, 15662460.38487434, 25862629.02737045])

In [258]:
# Оценка предсказанных значения
errors(y_pred_lr)

MAE = 4901473.360361942
MSE = 47561449080796.01
RMSE = 6896480.920063218
MAPE = 0.33315448589790847
R^2 = 0.6736678792829439


In [259]:
# Коэффициенты модели
lr.coef_

array([ 1.23240704e+04, -5.99568412e+07, -4.76759121e+07,  2.23103684e+06,
        3.09149902e+06, -7.52495530e+04, -4.60881549e+06,  4.19480916e+05,
       -7.52495530e+04,  1.87640855e+05, -7.52495530e+04, -3.22667183e+05])

In [260]:
# L1 модель (Lasso)
lasso = Lasso(alpha=0.5).fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
errors(y_pred_lasso)
lasso.coef_

MAE = 4901472.416986111
MSE = 47561443567582.805
RMSE = 6896480.520351145
MAPE = 0.333154252453812
R^2 = 0.6736679171106088


  model = cd_fast.enet_coordinate_descent(


array([ 1.23240724e+04, -5.99567936e+07, -4.76757516e+07,  2.23103615e+06,
        3.09149885e+06, -2.29876791e+05, -4.60805983e+06,  4.19479909e+05,
       -8.69579822e+03,  1.87639751e+05,  1.28224873e+04, -3.22661226e+05])

In [261]:
# L2 модель (Ridge)
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
errors(y_pred_ridge)
ridge.coef_

MAE = 4890659.01995477
MSE = 47502933891752.805
RMSE = 6892237.219637235
MAPE = 0.3305549549644295
R^2 = 0.6740693680118133


array([ 1.23343713e+04, -5.93006205e+07, -4.60456532e+07,  2.22517166e+06,
        3.11229793e+06, -7.88238240e+04, -3.96554641e+06,  4.20724145e+05,
       -7.88238240e+04,  1.89776215e+05, -7.88238240e+04, -3.32473437e+05])

In [262]:
# Elastic Net (комбинация L1 и L2)
el = ElasticNet(alpha=0.5).fit(X_train, y_train)
y_pred_el = el.predict(X_test)
errors(y_pred_el)
el.coef_

MAE = 7067552.814186947
MSE = 87922357922931.53
RMSE = 9376692.269821567
MAPE = 0.5952507888945732
R^2 = 0.3967406360833675


array([   17304.80339883, -2509061.40098743,  -516465.21300624,
        1244660.31033113,  1885208.25823148,  -354964.02056011,
         -27832.70534507,   282808.1272905 ,  -354964.01942919,
         317543.85417458,  -354964.02060867,   -55969.694521  ])

In [263]:
# Полиномиальная регрессия

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_p = poly.fit_transform(X_train)
lr2 = LinearRegression().fit(X_train_p, y_train)
X_test_p = poly.transform(X_test)
y_pred_p = lr2.predict(X_test_p)
errors(y_pred_p)
lr2.coef_

MAE = 4777105.974142388
MSE = 44944925207295.64
RMSE = 6704097.643031136
MAPE = 0.31966164672622954
R^2 = 0.6916205657769083


array([-3.29771086e+02,  2.55163660e+06, -9.68235991e+09, -1.79332165e+10,
       -1.86441307e+08, -2.24016327e+08,  2.27232017e+07,  2.62614118e+08,
       -1.95488562e+08,  2.27232073e+07,  2.22451697e+08,  2.27232091e+07,
       -2.86545776e+08, -2.85007176e+00, -3.45669965e+04, -3.12815545e+04,
        9.14041248e+02,  1.04598145e+03,  3.32091286e+02,  4.03285655e+05,
       -2.41599582e+03,  3.08351860e+02,  5.26360083e+02,  3.08670676e+02,
        1.54259189e+02, -1.70413555e+07,  2.06026185e+08,  7.23075310e+05,
       -4.09752663e+06,  4.65815358e+04, -4.68746814e+09, -2.85578611e+06,
        4.65815804e+04, -3.70151406e+06,  4.65815720e+04,  6.53857249e+06,
        8.73577793e+07,  4.39121220e+06,  9.16768239e+05, -6.37115758e+05,
        1.20956710e+09,  5.15006956e+06, -6.37115758e+05, -5.08174055e+06,
       -6.37115762e+05,  1.00536869e+07, -1.32717322e+06,  9.09527381e+05,
       -1.16804239e+05, -1.41129923e+08,  3.53022445e+06, -1.16804237e+05,
       -9.90672044e+04, -

# Поиск оптимального гиперпараметра для моделей 

In [264]:
# Импорт методов для подбора гиперпараметра
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np

In [265]:
# Какие значения будет принимать гиперпараметр
parameters = {'alpha': np.arange(0, 1, 0.1)}

In [266]:
# Гиперпараметр для L1 (Lasso)

lasso_grid_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
lasso_grid_optimal.best_params_

{'alpha': np.float64(0.1)}

In [267]:
lasso_random_optimal = RandomizedSearchCV(Ridge(), parameters).fit(X_train, y_train)
lasso_random_optimal.best_params_

{'alpha': np.float64(0.1)}