In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
train_data_linear_p2 = pd.read_csv('../Data/train_data_with_feats_p2.csv')
test_data_linear_p2 = pd.read_csv('../Data/test_data_with_feats_p2.csv')
train_data_linear_p3 = pd.read_csv('../Data/train_data_with_feats_p3.csv')
test_data_linear_p3 = pd.read_csv('../Data/test_data_with_feats_p3.csv')
train_data_ne = pd.read_csv('../Data/train_data_with_feats.csv')
test_data_ne = pd.read_csv('../Data/test_data_with_feats.csv')

In [3]:
cat_feats = ['Ecology_2', 'Ecology_3', 'Shops_2']

In [5]:
for cat in cat_feats:
    print(f'{cat}: {train_data_ne[cat].unique()}')

Ecology_2: ['B' 'A']
Ecology_3: ['A' 'B']
Shops_2: ['B' 'A']


In [6]:
def cat_feats_encode(df, cat_feats):
    result = df.copy()
    for cat in cat_feats:
        result[cat] = result[cat].replace({'A': 0, 'B': 1})
    return result

In [7]:
train_data_linear_p2 = cat_feats_encode(train_data_linear_p2, cat_feats)
test_data_linear_p2 = cat_feats_encode(test_data_linear_p2, cat_feats)
train_data_linear_p3 = cat_feats_encode(train_data_linear_p3, cat_feats)
test_data_linear_p3 = cat_feats_encode(test_data_linear_p3, cat_feats)
train_data = cat_feats_encode(train_data_ne, cat_feats)
test_data = cat_feats_encode(test_data_ne, cat_feats)

In [8]:
y_train = train_data['Price']
y_test = test_data['Price']

В качестве бейзлайна возьмем среднее значение целевой переменной.

In [9]:
def baseline_model(y):
    result = y.apply(lambda x: y.mean())
    return result

In [10]:
y_b_train = baseline_model(y_train)
y_b_test = baseline_model(y_test)

In [11]:
mse_b_train = mean_squared_error(y_train, y_b_train)
mse_b_test = mean_squared_error(y_test, y_b_test)
r2_b_train = r2_score(y_train, y_b_train)
r2_b_test = r2_score(y_test, y_b_test)

In [117]:
models_results = pd.DataFrame({'Model': ['Baseline'], 'CV_MSE': ['NaN'], 'CV_RMSE': ['NaN'], 'CV_R2': ['NaN'], 'MSE_train': [mse_b_train], 'RMSE_train': [mse_b_train**0.5], 
                             'MSE_test': [mse_b_test], 'RMSE_test': [mse_b_test**0.5], 'R2_train': [r2_b_train], 
                              'R2_test': [r2_b_test]})

In [118]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0


In [14]:
X_train_linear_p2 = train_data_linear_p2.drop(['DistrictId', 'Price'], axis=1)
X_test_linear_p2 = test_data_linear_p2.drop(['DistrictId', 'Price'], axis=1)

In [15]:
X_train_linear_p3 = train_data_linear_p3.drop(['DistrictId', 'Price'], axis=1)
X_test_linear_p3 = test_data_linear_p3.drop(['DistrictId', 'Price'], axis=1)

In [77]:
X_train = train_data.drop(['DistrictId', 'Price'], axis=1)
X_test = test_data.drop(['DistrictId', 'Price'], axis=1)

In [78]:
X_test_linear = X_test.copy()
X_test_linear['1'] = 1
X_train_linear = X_train.copy()
X_train_linear['1'] = 1

In [17]:
num_feats = [x for x in X_train.columns if x not in cat_feats]

In [18]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [19]:
st_scaler = StandardScaler()
mm_scaler = MinMaxScaler()
r_scaler = RobustScaler()
scalers = [st_scaler, mm_scaler, r_scaler, 'None']

In [88]:
data_list = [('Data_lr', X_train_linear, X_test_linear, y_train, y_test), ('Data', X_train, X_test, y_train, y_test), 
             ('Data_p2', X_train_linear_p2, X_test_linear_p2, y_train, y_test), 
             ('Data_p3', X_train_linear_p3, X_test_linear_p3, y_train, y_test)]

In [145]:
data_list[0]

('Data_lr',
       Rooms      Square  KitchenSquare  Floor  HouseFloor  HouseYear  \
 0       2.0   40.320370            6.0    3.0        12.0       1966   
 1       2.0   38.220579            7.0    9.0         9.0       1968   
 2       1.0   86.988870            1.0    8.0         8.0       1977   
 3       2.0   40.075940            5.0    5.0         5.0       1962   
 4       4.0  101.392298           10.0   17.0        22.0       1998   
 ...     ...         ...            ...    ...         ...        ...   
 6969    1.0   47.929851            1.0   11.0        11.0       1977   
 6970    3.0   76.157858            9.0   17.0        22.0       1986   
 6971    3.0   65.904217            7.0    8.0        12.0       1976   
 6972    2.0   60.299356            1.0    5.0         5.0       1977   
 6973    1.0   36.854948            9.0    1.0        12.0       1969   
 
       Ecology_1  Ecology_2  Ecology_3  Social_1  ...  log_Square  \
 0      0.307467          1          0   

In [86]:
feats_not_to_scale = cat_feats[:]
feats_not_to_scale.append('1')

In [21]:
from sklearn.model_selection import cross_validate

In [22]:
from sklearn.base import clone

In [125]:
model_lr = LinearRegression()
model_catboost = CatBoostRegressor(silent=True)
model_forest = RandomForestRegressor()
model_lightgbm = LGBMRegressor()

In [114]:
def model_fit_results(estimator, model_name, data_list, scaler_list, feats_not_to_scale, df_for_results):
    df_res = df_for_results.copy()
    for name, X_train, X_test, y_train, y_test in data_list:
        feats_to_scale = [x for x in X_train.columns if x not in feats_not_to_scale]
        for sc in scaler_list:
            if not isinstance(sc, str):
                scaler = clone(sc)              
                X_train_scaled = np.hstack((scaler.fit_transform(X_train[feats_to_scale]), X_train.drop(feats_to_scale, axis=1).values))
                X_test_scaled = np.hstack((scaler.transform(X_test[feats_to_scale]), X_test.drop(feats_to_scale, axis=1).values))
                scaler_name = scaler.__str__()[:-2]
            elif sc == 'None':
                X_train_scaled = X_train
                X_test_scaled = X_test
                scaler_name = sc
            model = clone(estimator)
            model_cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring=['r2', 'neg_mean_squared_error'])
            model.fit(X_train_scaled, y_train)
            y_pred_train = model.predict(X_train_scaled)
            y_pred_test = model.predict(X_test_scaled)
            model_mse_train = mean_squared_error(y_train, y_pred_train)
            model_mse_test = mean_squared_error(y_test, y_pred_test)
            r2_train = r2_score(y_train, y_pred_train)
            r2_test = r2_score(y_test, y_pred_test)
            model_results = pd.DataFrame({'Model': [model_name + '+' + name + '+' + scaler_name], 'CV_MSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())], 
                                  'CV_RMSE': [abs(model_cv_results['test_neg_mean_squared_error'].mean())**0.5], 
                                  'CV_R2': [model_cv_results['test_r2'].mean()], 'MSE_train': [model_mse_train], 
                                  'RMSE_train': [model_mse_train**0.5], 
                                  'MSE_test': [model_mse_test], 'RMSE_test': [model_mse_test**0.5], 'R2_train': [r2_train], 
                              'R2_test': [r2_test]})
            df_res = pd.concat([df_res, model_results], axis=0)
    
    return df_res

In [119]:
%%time
models_results = model_fit_results(model_lr, 'LinearRegression', data_list, scalers, feats_not_to_scale, models_results)

Wall time: 3.22 s


In [120]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
0,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
0,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
0,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
0,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
0,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
0,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [121]:
scalers_for_gb = ['None']

In [126]:
%%time
models_results = model_fit_results(model_catboost, 'CatBoostRegressor_encoded', [data_list[1]], scalers_for_gb, feats_not_to_scale, models_results)

Wall time: 17.5 s


In [127]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
0,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
0,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
0,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
0,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
0,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
0,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [128]:
%%time
models_results = model_fit_results(model_forest, 'RandomForestRegressor', [data_list[1]], scalers_for_gb, feats_not_to_scale, models_results)

Wall time: 50.5 s


In [129]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
0,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
0,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
0,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
0,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
0,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
0,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [130]:
%%time
models_results = model_fit_results(model_lightgbm, 'LGBMRegressor', [data_list[1]], scalers_for_gb, feats_not_to_scale, models_results)

Wall time: 1.49 s


In [131]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
0,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
0,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
0,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
0,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
0,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
0,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [132]:
X_train_ne = train_data_ne.drop(['DistrictId', 'Price'], axis=1)
X_test_ne = test_data_ne.drop(['DistrictId', 'Price'], axis=1)

In [134]:
model_catboost_ne = CatBoostRegressor(cat_features=cat_feats, silent=True)

In [135]:
%%time
models_results = model_fit_results(model_catboost_ne, 'CatBoostRegressor_ne', [('Data_ne', X_train_ne, X_test_ne, y_train, y_test)], 
                                   scalers_for_gb, feats_not_to_scale, models_results)

Wall time: 17.6 s


In [136]:
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
0,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
0,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
0,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
0,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
0,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
0,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
0,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
0,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [140]:
models_results = models_results.reset_index().drop('index', axis=1)
models_results

Unnamed: 0,Model,CV_MSE,CV_RMSE,CV_R2,MSE_train,RMSE_train,MSE_test,RMSE_test,R2_train,R2_test
0,Baseline,,,,8593945000.0,92703.532793,8607589000.0,92777.092629,0.0,0.0
1,LinearRegression+Data_lr+StandardScaler,3639280000.0,60326.4,0.576941,3429410000.0,58561.160775,4250756000.0,65197.825703,0.60095,0.506162
2,LinearRegression+Data_lr+MinMaxScaler,3641520000.0,60345.0,0.576679,3430432000.0,58569.890956,4238983000.0,65107.470545,0.600832,0.50753
3,LinearRegression+Data_lr+RobustScaler,3639460000.0,60328.0,0.576915,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
4,LinearRegression+Data_lr+None,3640170000.0,60333.8,0.576834,3429308000.0,58560.291135,4238063000.0,65100.403463,0.600962,0.507637
5,LinearRegression+Data+StandardScaler,3640280000.0,60334.7,0.576822,3430676000.0,58571.975264,4239123000.0,65108.544625,0.600803,0.507513
6,LinearRegression+Data+MinMaxScaler,3641300000.0,60343.2,0.576703,3430781000.0,58572.866267,4247854000.0,65175.56063,0.600791,0.506499
7,LinearRegression+Data+RobustScaler,3639200000.0,60325.8,0.576944,3429926000.0,58565.572984,4239003000.0,65107.628694,0.60089,0.507527
8,LinearRegression+Data+None,3639810000.0,60330.8,0.576875,3429517000.0,58562.078999,4238075000.0,65100.501558,0.600938,0.507635
9,LinearRegression+Data_p2+StandardScaler,3080140000.0,55499.1,0.64224,2669495000.0,51667.155766,3577984000.0,59816.25057,0.689375,0.584322


In [141]:
models_results.to_csv('../Data/models_results.csv', index=False)