In [185]:
%matplotlib inline

### Создадим по 2 предикта для train и test, которые будут использоваться для обучения основной модели

In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold

plt.style.use('seaborn-v0_8-talk')
plt.rcParams['figure.figsize'] = (12,8)

font = {'family': 'Verdana',
        'weight': 'normal'}
plt.rc('font', **font)

In [187]:
df_train = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")
df_train.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,560,,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,4510000
1,1,2011-1,667,,10,50,2,1,25.0,,...,0,0,0,0,0,0,0,0,0,13231000
2,2,2011-1,90,0.0,1,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2008000
3,3,2011-1,94,1.0,3,62,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,12680000
4,4,2011-1,232,0.0,3,60,3,0,25.0,,...,0,0,0,0,0,0,0,0,0,3335000


In [188]:
X_train = df_train.iloc[:, np.arange(0, 24)]
Y_train = df_train['price']
X_test = df_test
X_test.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,459,,1,60,3,1,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,344,1.0,10,52,2,1,,,...,0,0,0,0,0,0,0,0,0,0
2,100002,2012-3,585,0.0,4,54,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,494,,2,52,2,1,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,622,1.0,9,60,3,1,15.0,,...,0,0,0,0,0,0,0,0,0,0


In [189]:
def df_fill_missed(df):
    df.loc[:, "g_lift"] = df.groupby("floor")["g_lift"].transform(lambda s: s.fillna(s.mode().iloc[0]))
    df.loc[:, "build_tech"] = df["build_tech"].transform(lambda s: s.fillna(s.mean()))
    df.loc[:, "metro_dist"] = df.groupby("street_id")["metro_dist"].transform(lambda s: s.fillna(s.mean()))
    return df

In [190]:
X_train = df_fill_missed(X_train)
X_test = df_fill_missed(X_test)
X_train.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,0,2011-1,560,0.529837,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2011-1,667,0.529837,10,50,2,1,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2,2011-1,90,0.0,1,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,2011-1,94,1.0,3,62,3,1,30.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2011-1,232,0.0,3,60,3,0,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [191]:
def df_transform_date(df):
    dt = pd.to_datetime(df["date"], format="%Y-%m")
    uniq = dt.sort_values().unique()
    mp = {d: i for i, d in enumerate(uniq)}
    df.loc[:, "date"] = dt.map(mp)
    return df

In [192]:
X_train = df_transform_date(X_train)
X_test = df_transform_date(X_test)

In [193]:
def df_create_new_features(df):
    df['area_per_room'] = df['area'] / df['rooms']
    df['build_tech_balcon'] = df['balcon'] * df['build_tech']
    df['build_tech_balcon_area'] = df['build_tech_balcon'] * df['area']
    return df

In [194]:
X_train = df_create_new_features(X_train)
X_test = df_create_new_features(X_test)
X_train.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw7,kw8,kw9,kw10,kw11,kw12,kw13,area_per_room,build_tech_balcon,build_tech_balcon_area
0,0,0,560,0.529837,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,19.666667,0.0,0.0
1,1,0,667,0.529837,10,50,2,1,25.0,1.0,...,0,0,0,0,0,0,0,25.0,0.529837,26.491852
2,2,0,90,0.0,1,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,24.0,0.0,0.0
3,3,0,94,1.0,3,62,3,1,30.0,1.0,...,0,0,0,0,0,0,0,20.666667,1.0,62.0
4,4,0,232,0.0,3,60,3,0,25.0,1.0,...,0,0,0,0,0,0,0,20.0,0.0,0.0


In [195]:
def df_normalize_train_test(X_train, X_test, categorical_features, numeric_features):
    scaler = RobustScaler()
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    
    X_train_num = scaler.fit_transform(X_train[numeric_features])
    X_train_cat = ohe.fit_transform(X_train[categorical_features])
    
    X_test_num = scaler.transform(X_test[numeric_features])
    X_test_cat = ohe.transform(X_test[categorical_features])
    
    X_train_union = np.hstack([X_train_num, X_train_cat])
    X_test_union = np.hstack([X_test_num, X_test_cat])
    
    all_cols = numeric_features + list(ohe.get_feature_names_out(categorical_features))
    
    X_train_normalized = pd.DataFrame(X_train_union, columns=all_cols, index=X_train.index)
    X_test_normalized = pd.DataFrame(X_test_union, columns=all_cols, index=X_test.index)
    
    return X_train_normalized, X_test_normalized

In [196]:
categorical_features = ['street_id', 'balcon', 'rooms']
numeric_features = [
    'date', 'build_tech', 'floor',
    'area', 'metro_dist',
    'area_per_room', 'build_tech_balcon', 
    'build_tech_balcon_area',
]

X_train_scaled, X_test_scaled = df_normalize_train_test(
    X_train, X_test, categorical_features, numeric_features
)
Y_train_scaled = np.log(Y_train)
X_train_scaled

Unnamed: 0,date,build_tech,floor,area,metro_dist,area_per_room,build_tech_balcon,build_tech_balcon_area,street_id_0,street_id_1,...,street_id_671,balcon_0,balcon_1,balcon_2,rooms_1,rooms_2,rooms_3,rooms_4,rooms_5,rooms_6
0,-0.888889,0.000000,-0.4,0.35,0.333333,-0.460526,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.888889,0.000000,1.2,-0.10,0.000000,-0.039474,0.529837,0.756910,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.888889,-0.529837,-0.6,-0.20,0.000000,-0.118421,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.888889,0.470163,-0.2,0.50,0.333333,-0.381579,1.000000,1.771429,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.888889,-0.529837,-0.2,0.40,0.000000,-0.434211,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.666667,-0.529837,0.2,-1.10,0.333333,0.355263,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99996,0.666667,-0.529837,0.0,-0.90,0.333333,0.671053,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
99997,0.666667,0.000000,-0.2,0.00,-1.333333,0.039474,0.000000,0.000000,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
99998,0.666667,0.470163,-0.2,6.45,-1.333333,0.844737,1.000000,5.171429,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Используя K-fold, будем постепенно находить предикты для каждого объекта выборки

In [197]:
def create_oof_predictions(X_train, y_train, X_test, model_params, n_folds=5, random_state=42):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    
    oof_train_preds = np.zeros(len(X_train))  
    test_preds = np.zeros(len(X_test))        
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):        
        X_fold_train = X_train.iloc[train_idx]
        X_fold_val = X_train.iloc[val_idx]
        y_fold_train = y_train.iloc[train_idx]
        y_fold_val = y_train.iloc[val_idx]
        
        model = CatBoostRegressor(**model_params)
        model.fit(X_fold_train, y_fold_train)
        
        oof_train_preds[val_idx] = model.predict(X_fold_val)
        
        fold_test_preds = model.predict(X_test)
        test_preds += fold_test_preds / n_folds
    
    return oof_train_preds, test_preds

In [198]:
def create_multiple_oof_features(X_train, y_train, X_test):    
    model_configs = [
        {
            'name': 'CatBoostRegressor_predict',
            'params': {
                'iterations': 2000,
                'depth': 6,
                'learning_rate': 0.8,
                'l2_leaf_reg': 3,
                'random_seed': 42,
                'loss_function': 'MAE',  
                'silent': True
            }
        },
        {
            'name': 'RobustScaler_CatBoostRegressor_predict', 
            'params': {
                'iterations': 1500,
                'depth': 5,
                'learning_rate': 0.5,
                'l2_leaf_reg': 8,
                'random_seed': 42,
                'loss_function': 'MAE',  
                'silent': True
            }
        }
    ]
    
    oof_features_train = pd.DataFrame(index=X_train.index)
    oof_features_test = pd.DataFrame(index=X_test.index)
    
    for config in model_configs:        
        oof_train, oof_test = create_oof_predictions(
            X_train, y_train, X_test, config['params'], 10
        )
        train_col_name = f"{config['name']}"
        test_col_name = f"{config['name']}"  
        
        oof_features_train[train_col_name] = oof_train
        oof_features_test[test_col_name] = oof_test  
        
    return oof_features_train, oof_features_test

In [199]:
oof_train_features, oof_test_features = create_multiple_oof_features(
    X_train_scaled, Y_train, X_test_scaled
)
oof_train_features

Unnamed: 0,CatBoostRegressor_predict,RobustScaler_CatBoostRegressor_predict
0,4.440805e+06,4.301709e+06
1,1.159053e+07,1.299189e+07
2,2.100388e+06,2.053757e+06
3,1.844535e+07,1.465414e+07
4,4.012178e+06,4.306491e+06
...,...,...
99995,1.557028e+06,1.635755e+06
99996,3.654711e+06,3.488520e+06
99997,5.924082e+06,5.860592e+06
99998,2.397073e+07,2.586346e+07


In [200]:
oof_test_features

Unnamed: 0,CatBoostRegressor_predict,RobustScaler_CatBoostRegressor_predict
0,7.222853e+06,7.217352e+06
1,3.536005e+06,3.731788e+06
2,3.099441e+06,3.418726e+06
3,2.546569e+06,2.489952e+06
4,5.021386e+06,5.657773e+06
...,...,...
99995,2.034673e+06,2.082788e+06
99996,6.449927e+06,6.334637e+06
99997,4.880881e+06,4.656134e+06
99998,3.668659e+06,3.733241e+06


In [201]:
oof_train_features.to_csv('train_ensemble_predicts.csv', index=False)
oof_test_features.to_csv('test_ensemble_predicts.csv', index=False)

In [202]:
from sklearn.metrics import mean_absolute_error

In [203]:
Y_pred = oof_train_features['RobustScaler_CatBoostRegressor_predict']
mean_absolute_error(Y_pred, Y_train)

652434.6009115992

In [204]:
Y_pred

0        4.301709e+06
1        1.299189e+07
2        2.053757e+06
3        1.465414e+07
4        4.306491e+06
             ...     
99995    1.635755e+06
99996    3.488520e+06
99997    5.860592e+06
99998    2.586346e+07
99999    3.715449e+06
Name: RobustScaler_CatBoostRegressor_predict, Length: 100000, dtype: float64

### Получили неплохой MAE для train выборки