Cross-validation stacking for development

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import PredictionErrorDisplay

def plot_predictions_errors(y, y_pred, title):
    """Show a two-panel prediction-error figure for one set of predictions.

    The left panel plots actual vs. predicted values, the right panel plots
    residuals vs. predicted values. Both panels are drawn with
    ``PredictionErrorDisplay.from_predictions`` and the figure is displayed
    immediately with ``plt.show()``.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted target values, passed as ``y_pred`` to the display.
    title : str
        Figure-level title (``fig.suptitle``).

    Returns
    -------
    None
    """
    fig, axs = plt.subplots(ncols=2, figsize=(8, 4))

    # (kind, panel title, number of points to subsample) for each axis.
    # The scatter panel uses a smaller subsample (100) than the residual panel (1000).
    panels = (
        ("actual_vs_predicted", "Actual vs. Predicted values", 100),
        ("residual_vs_predicted", "Residuals vs. Predicted Values", 1000),
    )
    for ax, (kind, panel_title, n_points) in zip(axs, panels):
        PredictionErrorDisplay.from_predictions(
            y,
            y_pred=y_pred,
            kind=kind,
            subsample=n_points,
            ax=ax,
            # Use the notebook-wide SEED for both panels so the subsampling follows
            # the same seed as the rest of the notebook.
            random_state=SEED,
        )
        ax.set_title(panel_title)

    fig.suptitle(title)
    plt.tight_layout()
    plt.show()

In [None]:
# CV stacking setup: fold splitter, base regressors and candidate meta regressors.
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Base regressors, keyed by the name used as column label for their predictions.
regressors = {
    'LGBMRegressor11': LGBMRegressor(
        random_state=SEED,
        n_jobs=-1,
        boosting_type='gbdt',
        num_leaves=48,
        max_depth=14,
        learning_rate=0.08,
        n_estimators=240,
        subsample=0.7,
        colsample_bytree=0.6,
    ),
    'XGBRegressor6': XGBRegressor(
        random_state=SEED,
        n_jobs=-1,
        learning_rate=0.055,
        n_estimators=200,
        max_depth=8,
        min_child_weight=1,
        gamma=0.07,
        colsample_bytree=0.67,
        colsample_bylevel=0.67,
        colsample_bynode=0.8,
        subsample=0.7,
        objective='reg:squarederror',
    ),
    # Promising but fails on the cv
    'CatBoostRegressor': CatBoostRegressor(random_state=SEED, silent=True),
    'HistGradientBoostingRegressor3': HistGradientBoostingRegressor(
        random_state=SEED,
        max_iter=1000,
        max_depth=10,
        learning_rate=0.1,
        l2_regularization=0.1,
        max_leaf_nodes=100,
        min_samples_leaf=20,
        max_bins=255,
    ),
}

# Candidate meta regressors as (name, estimator) pairs; each is fitted on the
# base regressors' predictions.
meta_regressors = [
    ('LinearRegression', LinearRegression()),
    ('RidgeCV', RidgeCV(alphas=np.logspace(-3, 3, 13), cv=cv)),
    (
        'ElasticNetCV',
        ElasticNetCV(
            alphas=np.logspace(-3, 3, 13),
            cv=cv,
            l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
            max_iter=100000,
        ),
    ),
    # Disabled alternatives, kept for reference:
    # ('LassoCV', LassoCV(alphas=np.logspace(-3, 3, 13), cv=cv, max_iter=100000)),  # = ElasticNetCV with l1_ratio=1
    # ('LarsCV', LarsCV(cv=cv, max_iter=100000, n_jobs=-1)),
    # ('OrthogonalMatchingPursuitCV', OrthogonalMatchingPursuitCV(cv=cv, n_jobs=-1)),
    # ('LassoLarsCV', LassoLarsCV(cv=cv, max_iter=10000, n_jobs=-1)),
    # ('BayesianRidge', BayesianRidge()),
]

# Run-time switches for the stacking loop that follows.
FIT_REGRESSORS = False  # True: refit the base regressors on every fold; False: reuse predictions already in memory
DISPLAY_REGRESSOR_RESULTS = True  # print each base regressor's hold-out (and test) RMSE while fitting
PLOT_ERRORS = False  # draw prediction-error plots for every meta regressor
TARGET_TRANSFORMATION = False  # wrap regressors in TransformedTargetRegressor

# Reusing predictions only works if an earlier run left the three stores in the
# kernel. After a restart they do not exist, so fall back to fitting instead of
# failing later with a NameError inside the fold loop.
_META_STORES = ('X_meta_trains', 'X_meta_hold_outs', 'X_meta_tests')
if not FIT_REGRESSORS and any(store not in globals() for store in _META_STORES):
    print('No cached out-of-fold predictions found in this kernel; fitting the base regressors.')
    FIT_REGRESSORS = True

if FIT_REGRESSORS:
    # Store out of fold predictions for meta learner, keyed by fold number
    X_meta_trains = {}  # Dict of datasets used for meta learner training
    X_meta_hold_outs = {}  # Dict of datasets used for meta learner validation
    X_meta_tests = {}  # Dict of datasets used for meta learner testing

# Evaluate the stack fold by fold. For every outer fold:
#   1. (only if FIT_REGRESSORS) build the meta features: out-of-fold predictions of each
#      base regressor on the fold's training part, plus its predictions on the hold-out
#      part and on the test set, and store them in X_meta_trains / X_meta_hold_outs /
#      X_meta_tests under the fold number;
#   2. fit every meta regressor on the stored meta features and report its RMSE.
# NOTE(review): `mean_squared_error(..., squared=False)` is deprecated in recent
# scikit-learn releases in favour of `root_mean_squared_error` -- confirm against the
# installed version before upgrading.
for i, (train_index, hold_out_index) in enumerate(cv.split(X_train_prep)):
    t0 = time.time()
    print(f'Fold {i+1} of {cv.get_n_splits()}')
    X_train_cv = X_train_prep.iloc[train_index].copy()
    X_hold_out = X_train_prep.iloc[hold_out_index].copy()
    y_train_cv = y_train.iloc[train_index].copy()
    y_hold_out = y_train.iloc[hold_out_index].copy()

    if FIT_REGRESSORS:
        # Scratch frames with one column per base regressor. They are only needed when
        # fitting; otherwise the stored frames of this fold are used below.
        base_names = list(regressors)
        X_meta_train = pd.DataFrame(index=train_index, columns=base_names)
        X_meta_hold_out = pd.DataFrame(index=hold_out_index, columns=base_names)
        X_meta_test = pd.DataFrame(index=X_test_prep.index, columns=base_names)

        for name, regressor in regressors.items():
            print(f'Fitting {name} ...')
            if TARGET_TRANSFORMATION:
                ttr = TransformedTargetRegressor(regressor=regressor, func=target_transform,
                                                 inverse_func=inverse_target_transform, check_inverse=False)
            else:
                ttr = regressor

            # Out-of-fold predictions on the training part of this fold.
            # CatBoostRegressor fails on n_jobs=-1, so it is run without that argument.
            parallel = {} if name == 'CatBoostRegressor' else {'n_jobs': -1}
            X_meta_train[name] = cross_val_predict(ttr, X_train_cv, y_train_cv, cv=cv, verbose=0, **parallel)

            # Fit the model on the full cv training set, then predict hold-out and test
            ttr.fit(X_train_cv, y_train_cv)
            X_meta_hold_out[name] = ttr.predict(X_hold_out)
            X_meta_test[name] = ttr.predict(X_test_prep)

            if DISPLAY_REGRESSOR_RESULTS:
                print(f'Hold out score of {name}: {mean_squared_error(y_hold_out, X_meta_hold_out[name], squared=False):.4f}')
                if not SUBMIT:
                    print(f'Test score of {name}: {mean_squared_error(y_test, X_meta_test[name], squared=False):.4f}')

        # Store datasets for meta learner
        X_meta_trains[i] = X_meta_train.copy()
        X_meta_hold_outs[i] = X_meta_hold_out.copy()
        X_meta_tests[i] = X_meta_test.copy()

    # Transform the predictions of regressors with target transform
    if TARGET_TRANSFORMATION:
        X_meta_train = target_transform(X_meta_trains[i])
        X_meta_hold_out = target_transform(X_meta_hold_outs[i])
        X_meta_test = target_transform(X_meta_tests[i])
    else:
        X_meta_train = X_meta_trains[i]
        X_meta_hold_out = X_meta_hold_outs[i]
        X_meta_test = X_meta_tests[i]

    for name, meta_regressor in meta_regressors:
        # Fit the final estimator on the out-of-fold predictions of the base regressors
        if TARGET_TRANSFORMATION:
            meta_ttr = TransformedTargetRegressor(
                meta_regressor,
                func=target_transform, inverse_func=inverse_target_transform, check_inverse=False)
        else:
            meta_ttr = meta_regressor

        meta_ttr.fit(X_meta_train, y_train_cv)
        y_hold_out_pred = meta_ttr.predict(X_meta_hold_out)
        y_test_pred = meta_ttr.predict(X_meta_test)

        rmse_hold_out = mean_squared_error(y_hold_out, y_hold_out_pred, squared=False)
        if not SUBMIT:
            score_eval = mean_squared_error(y_test, y_test_pred, squared=False)
        else:
            score_eval = np.nan

        # The fitted attributes live on the wrapped estimator when a
        # TransformedTargetRegressor is used; attributes an estimator lacks show as nan.
        fitted_meta = meta_ttr.regressor_ if TARGET_TRANSFORMATION else meta_ttr
        l1_ratio = getattr(fitted_meta, 'l1_ratio_', np.nan)
        alpha = getattr(fitted_meta, 'alpha_', np.nan)
        coef = getattr(fitted_meta, 'coef_', np.nan)
        intercept = getattr(fitted_meta, 'intercept_', np.nan)

        # The reported time is the time elapsed since the start of the current fold.
        print(f'Meta regressor: {name}, RMSE hold out: {rmse_hold_out:.4f},',
              f'RMSE test: {score_eval:.4f}, fit time: {time.time() - t0:.2f} s,',
              f'Coefficients: {coef}, intercept: {intercept}, l1_ratio: {l1_ratio}, alpha: {alpha}'
        )
        if PLOT_ERRORS:
            plot_predictions_errors(y_hold_out, y_hold_out_pred, 'Hold out')
            if not SUBMIT:
                plot_predictions_errors(y_test, y_test_pred, 'Test')

    print('-'*80, end='\n\n')

Save and load the out-of-fold predictions to avoid refitting the estimators

In [None]:
# Save each fold's meta-learner datasets (train, hold-out and test) to separate CSV files.
# Iterate over the folds actually stored instead of a hard-coded fold count, so the
# cell keeps working if the number of CV splits changes.
for i in X_meta_trains:
    X_meta_trains[i].to_csv(f'datasets/X_meta_trains_{i}.csv')
    X_meta_hold_outs[i].to_csv(f'datasets/X_meta_hold_outs_{i}.csv')
    X_meta_tests[i].to_csv(f'datasets/X_meta_tests_{i}.csv')

In [None]:
# Load each fold's meta-learner datasets (train, hold-out and test) back from their CSV files.
# All three target dicts must exist before the loop assigns into them.
X_meta_trains_2, X_meta_hold_outs_2, X_meta_tests_2 = {}, {}, {}
for i in range(5):  # one file set per fold; must match the number of folds that were saved
    X_meta_trains_2[i] = pd.read_csv(f'datasets/X_meta_trains_{i}.csv', index_col=0)
    X_meta_hold_outs_2[i] = pd.read_csv(f'datasets/X_meta_hold_outs_{i}.csv', index_col=0)
    X_meta_tests_2[i] = pd.read_csv(f'datasets/X_meta_tests_{i}.csv', index_col=0)

For the final predictions, use scikit-learn's `StackingRegressor`

In [None]:
# Simple stacking with sklearn without target transformation.
# Base estimators as (name, estimator) pairs, the format StackingRegressor expects.
regressors = [
    (
        'LGBMRegressor11',
        LGBMRegressor(
            random_state=SEED,
            n_jobs=1,
            boosting_type='gbdt',
            num_leaves=48,
            max_depth=14,
            learning_rate=0.08,
            n_estimators=240,
            subsample=0.7,
            colsample_bytree=0.6,
        ),
    ),
    (
        'XGBRegressor6',
        XGBRegressor(
            random_state=SEED,
            n_jobs=-1,
            learning_rate=0.055,
            n_estimators=200,
            max_depth=8,
            min_child_weight=1,
            gamma=0.07,
            colsample_bytree=0.67,
            colsample_bylevel=0.67,
            colsample_bynode=0.8,
            subsample=0.7,
            objective='reg:squarederror',
        ),
    ),
    ('CatBoostRegressor', CatBoostRegressor(random_state=SEED, silent=True)),
    (
        'HistGradientBoostingRegressor2',
        HistGradientBoostingRegressor(
            random_state=SEED,
            max_iter=1000,
            max_depth=10,
            learning_rate=0.1,
            l2_regularization=0.1,
            max_leaf_nodes=100,
            min_samples_leaf=20,
            max_bins=255,
        ),
    ),
]

# Stack the base regressors with a plain linear meta learner.
# LinearRegression has no `cv` or `max_iter` parameters (passing them raises a
# TypeError), so it is constructed with its defaults.
# NOTE(review): the development cell avoids n_jobs=-1 for CatBoostRegressor because
# it fails there -- confirm it behaves under StackingRegressor(n_jobs=-1).
model = StackingRegressor(
    estimators=regressors,
    final_estimator=LinearRegression(),
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Fit on the full training set and predict the test set.
model.fit(X_train_prep, y_train)
y_pred_StackingRegressor = model.predict(X_test_prep)

# Write the stacked predictions into the sample submission's `price` column and
# save the result under a timestamped file name.
sub = pd.read_csv('submissions/sample_submission.csv').assign(price=y_pred_StackingRegressor)
now = time.strftime("%Y-%m-%d %H_%M_%S")
sub.to_csv(
    f'submissions/submission_StackingRegressor{now}.csv',
    index=False,
)