In [1]:
import pandas as pd
import numpy as np
import matplotlib as mlp
import seaborn as sns
import plotly as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("final_data.csv")

In [5]:
data.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
0,43,27.0,4.0,12.558974,1983.0,1.909804,6.399301,2.107025,15.594247,2.590241,1.422391,29.0048,44898.7,11.84,0.014,15.581952,0.0
1,34,19.0,3.0,12.558974,1958.0,1.909804,6.399301,2.107025,15.864842,0.9367,9.503405,28.9525,44898.7,11.84,0.014,15.60727,0.0
2,43,29.0,2.0,12.558974,1958.0,1.909804,6.399301,2.107025,15.613141,2.120999,5.6048,28.8082,44898.7,11.84,0.014,15.555977,0.0
3,89,50.0,9.0,12.558974,2000.0,1.909804,6.399301,2.107025,15.914449,1.489049,2.677824,28.9655,44898.7,11.92,0.014,16.388123,0.0
4,77,77.0,4.0,12.558974,1915.0,1.909804,6.399301,2.107025,16.091227,1.257186,11.616653,29.4625,44898.7,11.92,0.014,16.608603,0.0


In [7]:
X = data.drop('log_price_doc', axis=1)
Y = data['log_price_doc']

In [9]:
standart_scaler = StandardScaler()
standart_scaler.fit(X)
X_scaler = standart_scaler.transform(X)

In [10]:
model = Ridge(alpha=0.001)
model.fit(X_scaler, Y)

In [11]:
cv_result = cross_validate(model, X_scaler, Y, 
                            scoring='neg_mean_squared_error',
                            cv=5, return_train_score=True)

In [12]:
print(f"Среднее MSLE на тренировочных фолдах: {-np.mean(cv_result['train_score']).round(3)}")
print(f"Среднее MSLE на тестовых фолдах: {-np.mean(cv_result['test_score']).round(3)}")

Среднее MSLE на тренировочных фолдах: 0.125
Среднее MSLE на тестовых фолдах: 0.127


Применим последний шаг. У нас есть один категориальный признак, который существенно влияет на наш таргет - тип недвижимости, а именно, первичка или вторичка (product_type). Сделаем две модели под каждый тип отдельно, что должно помочь нам с точностью прогноза.

In [13]:
Owner_Occupier = data[data['product_type_OwnerOccupier'] == 1].copy()
Investment = data[data['product_type_OwnerOccupier'] == 0].copy()

In [14]:
X_Occupier = Owner_Occupier.drop('log_price_doc', axis=1)
X_Investment = Investment.drop('log_price_doc', axis=1)

Y_Occupier = Owner_Occupier['log_price_doc']
Y_Investment = Investment['log_price_doc']

Модель для Owner_Occupier

In [15]:
standart_scaler = StandardScaler()
standart_scaler.fit(X_Occupier)
X_Occupier_scaler = standart_scaler.transform(X_Occupier)

In [16]:
model_OO = Ridge(alpha=0.001)
model_OO.fit(X_Occupier_scaler, Y_Occupier)

In [17]:
cv_model_OO_result = cross_validate(model_OO, X_Occupier_scaler, Y_Occupier, 
                                scoring='neg_mean_squared_error',
                                cv=5, return_train_score=True)

In [18]:
print(f"Среднее MSLE на тренировочных фолдах: {-np.mean(cv_model_OO_result['train_score']).round(3)}")
print(f"Среднее MSLE на тестовых фолдах: {-np.mean(cv_model_OO_result['test_score']).round(3)}")

Среднее MSLE на тренировочных фолдах: 0.035
Среднее MSLE на тестовых фолдах: 0.039


Модель для Investment

In [19]:
standart_scaler = StandardScaler()
standart_scaler.fit(X_Investment)
X_Investment_scaler = standart_scaler.transform(X_Investment)

In [20]:
model_I = Ridge(alpha=0.001)
model_I.fit(X_Investment_scaler, Y_Investment)

In [21]:
cv_model_I_result = cross_validate(model_I, X_Investment_scaler, Y_Investment, 
                                scoring='neg_mean_squared_error',
                                cv=5, return_train_score=True)

In [22]:
print(f"Среднее MSLE на тренировочных фолдах: {-np.mean(cv_model_I_result['train_score']).round(3)}")
print(f"Среднее MSLE на тестовых фолдах: {-np.mean(cv_model_I_result['test_score']).round(3)}")

Среднее MSLE на тренировочных фолдах: 0.173
Среднее MSLE на тестовых фолдах: 0.177
