In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
import config
import xgboost as xgb
from catboost import CatBoostRegressor

In [6]:
df = pd.read_csv(config.CONFIG['paths']['train_with_folds_fe'])
df = pd.read_csv(config.CONFIG['paths']['train_with_folds_fe'])
TARGET_COL = config.CONFIG['validation']['target_column']
N_SPLITS = config.CONFIG['validation']['n_splits']

feature_cols = [c for c in df.columns if c not in [TARGET_COL, 'fold'] and pd.api.types.is_numeric_dtype(df[c])]
print('Фич', len(feature_cols), 'Строк', len(df))

Фич 296 Строк 1460


In [7]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [8]:
def run_cv_oof(model_class, model_params, df, target_col=TARGET_COL, n_splits=N_SPLITS):
    oof_preds = np.zeros(len(df))
    scores = []

    for fold in range(n_splits):
        train_mask = df['fold'] != fold
        val_mask = df['fold'] == fold
        X_train = df.loc[train_mask, feature_cols]
        y_train = df.loc[train_mask, target_col]
        X_val = df.loc[val_mask, feature_cols]
        y_val = df.loc[val_mask, target_col]
        
        model = model_class(**model_params)
        model.fit(X_train, y_train)
        oof_preds[val_mask] = model.predict(X_val)
        scores.append(rmse(y_val, oof_preds[val_mask]))

    return oof_preds, scores

In [10]:
params_ridge = {**config.CONFIG['models']['ridge']}
params_rf = {**config.CONFIG['models']['random_forest']}
params_xgb = {**config.CONFIG['models']['xgboost'], 'random_state': config.CONFIG['seed']}
params_cat = {**config.CONFIG['models']['catboost'], 'verbose': 0}

In [11]:
oof_ridge, scores_ridge = run_cv_oof(Ridge, params_ridge, df)
print('Ridge: RMSE', round(np.mean(scores_ridge), 4), '+-', round(np.std(scores_ridge), 4))

Ridge: RMSE 0.1435 +- 0.0373


In [12]:
oof_rf, scores_rf = run_cv_oof(RandomForestRegressor, params_rf, df)
print('RF: RMSE', round(np.mean(scores_rf), 4), '+-', round(np.std(scores_rf), 4))

RF: RMSE 0.1432 +- 0.0177


In [15]:
oof_xgb, scores_xgb = run_cv_oof(xgb.XGBRegressor, params_xgb, df)
print('XGBoost: RMSE', round(np.mean(scores_xgb), 4), '+-', round(np.std(scores_xgb), 4))

XGBoost: RMSE 0.1257 +- 0.0173


In [14]:
oof_cat, scores_cat = run_cv_oof(CatBoostRegressor, params_cat, df)
print('CatBoost: RMSE', round(np.mean(scores_cat), 4), '+-', round(np.std(scores_cat), 4))

CatBoost: RMSE 0.1221 +- 0.018


In [None]:
estimators = [
    ('ridge', Ridge(**params_ridge)),
    ('rf', RandomForestRegressor(**params_rf)),
    ('xgb', xgb.XGBRegressor(**params_xgb)),
    ('cat', CatBoostRegressor(**params_cat))
]

In [18]:
voting = VotingRegressor(estimators=estimators)
_, scores_vote = run_cv_oof(VotingRegressor, {'estimators': estimators}, df)
print('Voting OOF RMSE:', round(np.mean(scores_vote), 4), '+-', round(np.std(scores_vote), 4))

Voting OOF RMSE: 0.1245 +- 0.0231


In [20]:
y_true = df[TARGET_COL].values
oof_list = [oof_ridge, oof_rf, oof_xgb, oof_cat]

In [24]:
X_stack = np.column_stack(oof_list)
meta_ridge = Ridge(alpha=1.0)
meta_ridge.fit(X_stack, y_true)
pred_stack_ridge = meta_ridge.predict(X_stack)
rmse_stack_ridge = rmse(y_true, pred_stack_ridge)
print('Stacking (Ridge на OOF) RMSE:', round(rmse_stack_ridge, 4))

meta_lr = LinearRegression()
meta_lr.fit(X_stack, y_true)
pred_stack_lr = meta_lr.predict(X_stack)
rmse_stack_lr = rmse(y_true, pred_stack_lr)
print('Stacking (LinearRegression на OOF) RMSE:', round(rmse_stack_lr, 4))

Stacking (Ridge на OOF) RMSE: 0.1232
Stacking (LinearRegression на OOF) RMSE: 0.1229


In [25]:
oof_avg = np.mean(oof_list, axis=0)
rmse_avg = rmse(y_true, oof_avg)

results = [
    {'model': 'Ridge', 'mean_rmse': np.mean(scores_ridge), 'std_rmse': np.std(scores_ridge)},
    {'model': 'RF', 'mean_rmse': np.mean(scores_rf), 'std_rmse': np.std(scores_rf)},
    {'model': 'XGBoost', 'mean_rmse': np.mean(scores_xgb), 'std_rmse': np.std(scores_xgb)},
    {'model': 'CatBoost', 'mean_rmse': np.mean(scores_cat), 'std_rmse': np.std(scores_cat)},
    {'model': 'Averaging', 'mean_rmse': rmse_avg, 'std_rmse': 0.0},
    {'model': 'Voting', 'mean_rmse': np.mean(scores_vote), 'std_rmse': np.std(scores_vote)},
    {'model': 'StackingRidge', 'mean_rmse': rmse_stack_ridge, 'std_rmse': 0.0},
    {'model': 'StackingLR', 'mean_rmse': rmse_stack_lr, 'std_rmse': 0.0},
]

res_df = pd.DataFrame(results).sort_values('mean_rmse', ascending=False)
print(res_df.to_string(index=False))

path_ens = config.CONFIG['paths']['ensemble_results']
res_df.to_csv(path_ens, index=False)
print('Сохранено:', path_ens)

        model  mean_rmse  std_rmse
        Ridge   0.143453  0.037257
           RF   0.143215  0.017718
    Averaging   0.126675  0.000000
      XGBoost   0.125665  0.017309
       Voting   0.124548  0.023111
StackingRidge   0.123185  0.000000
   StackingLR   0.122918  0.000000
     CatBoost   0.122060  0.017975
Сохранено: C:\newTry2\classicMLpractice\ProjectKaggle\HousePrices\checkpoints\ensemble_results.csv
