In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [133]:
train_data_linear_p2 = pd.read_csv('../Data/train_data_with_feats_p2.csv')
test_data_linear_p2 = pd.read_csv('../Data/test_data_with_feats_p2.csv')
train_data_linear_p3 = pd.read_csv('../Data/train_data_with_feats_p3.csv')
test_data_linear_p3 = pd.read_csv('../Data/test_data_with_feats_p3.csv')
train_data_ne = pd.read_csv('../Data/train_data_with_feats.csv')
test_data_ne = pd.read_csv('../Data/test_data_with_feats.csv')

In [64]:
cat_feats = ['Ecology_2', 'Ecology_3', 'Shops_2']

In [66]:
for cat in cat_feats:
    print(f'{cat}: {X_train_linear_p2[cat].unique()}')

Ecology_2: ['B' 'A']
Ecology_3: ['A' 'B']
Shops_2: ['B' 'A']


In [29]:
def cat_feats_encode(df, cat_feats):
    result = df.copy()
    for cat in cat_feats:
        result[cat] = result[cat].replace({'A': 0, 'B': 1})
    return result

In [67]:
train_data_linear_p2 = cat_feats_encode(train_data_linear_p2, cat_feats)
test_data_linear_p2 = cat_feats_encode(test_data_linear_p2, cat_feats)
train_data_linear_p3 = cat_feats_encode(train_data_linear_p3, cat_feats)
test_data_linear_p3 = cat_feats_encode(test_data_linear_p3, cat_feats)
train_data = cat_feats_encode(train_data_ne, cat_feats)
test_data = cat_feats_encode(test_data_ne, cat_feats)

In [70]:
y_train = train_data['Price']
y_test = test_data['Price']

В качестве бейзлайна возьмем среднее значение целевой переменной.

In [72]:
def baseline_model(y):
    result = y.apply(lambda x: y.mean())
    return result

In [73]:
y_b_train = baseline_model(y_train)
y_b_test = baseline_model(y_test)

In [76]:
mse_b_train = mean_squared_error(y_train, y_b_train)
mse_b_test = mean_squared_error(y_test, y_b_test)
r2_b_train = r2_score(y_train, y_b_train)
r2_b_test = r2_score(y_test, y_b_test)

In [129]:
models_results = pd.DataFrame({'Model': ['Baseline'], 'CV_MSE': ['NaN'], 'CV_RMSE': ['NaN'], 'CV_R2': ['NaN'], 'MSE_train': [mse_b_train], 'RMSE_train': [mse_b_train**0.5], 
                             'MSE_test': [mse_b_test], 'RMSE_test': [mse_b_test**0.5], 'R2_train': [r2_b_train], 
                              'R2_test': [r2_b_test]})

In [130]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0


In [79]:
X_train_linear_p2 = train_data_linear_p2.drop(['DistrictId', 'Price'], axis=1)
X_test_linear_p2 = test_data_linear_p2.drop(['DistrictId', 'Price'], axis=1)

In [108]:
X_train_linear_p3 = train_data_linear_p3.drop(['DistrictId', 'Price'], axis=1)
X_test_linear_p3 = test_data_linear_p3.drop(['DistrictId', 'Price'], axis=1)

In [109]:
X_train = train_data.drop(['DistrictId', 'Price'], axis=1)
X_test = test_data.drop(['DistrictId', 'Price'], axis=1)

In [146]:
num_feats = [x for x in X_train.columns if x not in cat_feats]

In [80]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [107]:
st_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
r_scaler = RobustScaler()
scalers = [st_scaler, mm_scaler, r_scaler, 'None', 'Normalize']

In [110]:
data_list = [('Data', X_train, X_test), ('Data_p2', X_train_linear_p2, X_test_linear_p2), ('Data_p3', X_train_linear_p3, X_test_linear_p3)]

In [90]:
from sklearn.model_selection import cross_validate

In [131]:
%%time
for name, X_train, X_test in data_list:
    for scaler in scalers:
        if not isinstance(scaler, str):
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            scaler_name = str(type(scaler)).split('.')[-1][:-2]
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
            scaler_name = scaler
        if scaler == 'Normalize': 
            model = LinearRegression(normalize=True)
        else:
            model = LinearRegression()
        model_cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring=['r2', 'neg_mean_squared_error'])
        model.fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)
        model_mse_train = mean_squared_error(y_train, y_pred_train)
        model_mse_test = mean_squared_error(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        r2_train = r2_score(y_test, y_pred_test)
        df_res = pd.DataFrame({'Model': ['LinearRegression+' + name + '+' + scaler_name], 'CV_MSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())], 
                              'CV_RMSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())**0.5], 
                              'CV_R2': [model_cv_results['test_r2'].mean()], 'MSE_train': [model_mse_train], 
                              'RMSE_train': [model_mse_train**0.5], 
                              'MSE_test': [model_mse_test], 'RMSE_test': [model_mse_test**0.5], 'R2_train': [r2_train], 
                              'R2_test': [r2_train]})
        models_results = pd.concat([models_results, df_res], axis=0)

Wall time: 3.33 s


In [132]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data+StandardScaler,3640090000.0,60333.1,0.576847,3430853000.0,58573.486171,4239733000.0,65113.233694,0.507442,0.507442
0,LinearRegression+Data+MinMaxScaler,3643830000.0,60364.1,0.576409,3429486000.0,58561.813858,4241458000.0,65126.475445,0.507242,0.507242
0,LinearRegression+Data+RobustScaler,3640610000.0,60337.5,0.576785,3429926000.0,58565.572984,4239003000.0,65107.628694,0.507527,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.507635,0.507635
0,LinearRegression+Data+Normalize,3673790000.0,60611.8,0.572927,3429542000.0,58562.290739,4238341000.0,65102.541186,0.507604,0.507604
0,LinearRegression+Data_p2+StandardScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+MinMaxScaler,3080220000.0,55499.7,0.642231,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+RobustScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+None,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077546,0.584325,0.584325


In [135]:
scalers = [st_scaler, mm_scaler, r_scaler, 'None']

In [142]:
%%time
for name, X_train, X_test in [data_list[0]]:
    for scaler in scalers:
        if not isinstance(scaler, str):
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)
            scaler_name = str(type(scaler)).split('.')[-1][:-2]
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
            scaler_name = scaler
        model = RandomForestRegressor()
        model_cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring=['r2', 'neg_mean_squared_error'])
        model.fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)
        model_mse_train = mean_squared_error(y_train, y_pred_train)
        model_mse_test = mean_squared_error(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        r2_train = r2_score(y_test, y_pred_test)
        df_res = pd.DataFrame({'Model': ['RandomForestRegressor+' + name + '+' + scaler_name], 'CV_MSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())], 
                              'CV_RMSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())**0.5], 
                              'CV_R2': [model_cv_results['test_r2'].mean()], 'MSE_train': [model_mse_train], 
                              'RMSE_train': [model_mse_train**0.5], 
                              'MSE_test': [model_mse_test], 'RMSE_test': [model_mse_test**0.5], 'R2_train': [r2_train], 
                              'R2_test': [r2_train]})
        models_results = pd.concat([models_results, df_res], axis=0)

Wall time: 3min 16s


In [143]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data+StandardScaler,3640090000.0,60333.1,0.576847,3430853000.0,58573.486171,4239733000.0,65113.233694,0.507442,0.507442
0,LinearRegression+Data+MinMaxScaler,3643830000.0,60364.1,0.576409,3429486000.0,58561.813858,4241458000.0,65126.475445,0.507242,0.507242
0,LinearRegression+Data+RobustScaler,3640610000.0,60337.5,0.576785,3429926000.0,58565.572984,4239003000.0,65107.628694,0.507527,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.507635,0.507635
0,LinearRegression+Data+Normalize,3673790000.0,60611.8,0.572927,3429542000.0,58562.290739,4238341000.0,65102.541186,0.507604,0.507604
0,LinearRegression+Data_p2+StandardScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+MinMaxScaler,3080220000.0,55499.7,0.642231,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+RobustScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+None,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077546,0.584325,0.584325


In [144]:
X_train_ne = train_data_ne.drop(['DistrictId', 'Price'], axis=1)
X_test_ne = test_data_ne.drop(['DistrictId', 'Price'], axis=1)

In [154]:
%%time
for name, X_train, X_test in [('Data_ne', X_train_ne, X_test_ne)]:
    for scaler in scalers:
        if not isinstance(scaler, str):
            X_train_scaled = scaler.fit_transform(X_train[num_feats])
            X_train_scaled = pd.concat([pd.DataFrame(X_train_scaled, columns=num_feats), X_train[cat_feats]], axis=1)
            X_test_scaled = scaler.transform(X_test[num_feats])
            X_test_scaled = pd.concat([pd.DataFrame(X_test_scaled, columns=num_feats), X_test[cat_feats]], axis=1)
            scaler_name = str(type(scaler)).split('.')[-1][:-2]
        else:
            X_train_scaled = X_train
            X_test_scaled = X_test
            scaler_name = scaler
        model = CatBoostRegressor(cat_features=cat_feats, silent=True)
        model_cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring=['r2', 'neg_mean_squared_error'])
        model.fit(X_train_scaled, y_train)
        y_pred_train = model.predict(X_train_scaled)
        y_pred_test = model.predict(X_test_scaled)
        model_mse_train = mean_squared_error(y_train, y_pred_train)
        model_mse_test = mean_squared_error(y_test, y_pred_test)
        r2_train = r2_score(y_train, y_pred_train)
        r2_train = r2_score(y_test, y_pred_test)
        df_res = pd.DataFrame({'Model': ['CatBoostRegressor+' + name + '+' + scaler_name], 'CV_MSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())], 
                              'CV_RMSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())**0.5], 
                              'CV_R2': [model_cv_results['test_r2'].mean()], 'MSE_train': [model_mse_train], 
                              'RMSE_train': [model_mse_train**0.5], 
                              'MSE_test': [model_mse_test], 'RMSE_test': [model_mse_test**0.5], 'R2_train': [r2_train], 
                              'R2_test': [r2_train]})
        models_results = pd.concat([models_results, df_res], axis=0)

Wall time: 1min 15s


In [155]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data+StandardScaler,3640090000.0,60333.1,0.576847,3430853000.0,58573.486171,4239733000.0,65113.233694,0.507442,0.507442
0,LinearRegression+Data+MinMaxScaler,3643830000.0,60364.1,0.576409,3429486000.0,58561.813858,4241458000.0,65126.475445,0.507242,0.507242
0,LinearRegression+Data+RobustScaler,3640610000.0,60337.5,0.576785,3429926000.0,58565.572984,4239003000.0,65107.628694,0.507527,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.507635,0.507635
0,LinearRegression+Data+Normalize,3673790000.0,60611.8,0.572927,3429542000.0,58562.290739,4238341000.0,65102.541186,0.507604,0.507604
0,LinearRegression+Data_p2+StandardScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+MinMaxScaler,3080220000.0,55499.7,0.642231,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+RobustScaler,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077543,0.584325,0.584325
0,LinearRegression+Data_p2+None,3080250000.0,55500.0,0.642228,2669497000.0,51667.179548,3577963000.0,59816.077546,0.584325,0.584325
