In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import optuna

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import StackingRegressor

In [2]:
# Remove the cap on how many columns pandas shows when displaying a frame.
pd.options.display.max_columns = None

In [3]:
# Load the three CSV inputs in one pass: the training frame, the frame to
# predict on, and the additional flood dataset.
df, df_pred, original_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv', 'data/flood.csv')
)

In [4]:
# Drop the id column from both frames.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError when 'id' has already been removed.
df = df.drop(columns='id', errors='ignore')
df_pred = df_pred.drop(columns='id', errors='ignore')

In [5]:
# Separate the target column from the predictors.
y = df['FloodProbability']
X = df.drop(columns='FloodProbability')

# Hold out 20% of the rows for validation; random_state is fixed to 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Keyword arguments unpacked into CatBoostRegressor(**cat_params).
cat_params = dict(
    random_seed=42,
    verbose=0,
    eval_metric='R2',
    iterations=1498,
    learning_rate=0.06715518880782499,
    depth=11,
    l2_leaf_reg=1.7801302641717838,
)

# Keyword arguments unpacked into LGBMRegressor(**lgbm_params).
# NOTE(review): 'R2' is passed as the metric name; confirm LightGBM accepts it.
lgbm_params = dict(
    metric='R2',
    verbosity=-1,
    boosting_type="gbdt",
    random_state=42,
    learning_rate=0.04267423467176455,
    n_estimators=998,
    lambda_l1=1.0058562181737075e-08,
    lambda_l2=5.572970804653375,
    max_depth=21,
    min_child_samples=18,
    feature_fraction=0.841233081764558,
    bagging_fraction=0.8761104983674854,
)

# Keyword arguments unpacked into XGBRegressor(**xgbr_params).
# NOTE(review): unlike the CatBoost and LightGBM settings, no seed is set here.
xgbr_params = dict(
    max_depth=9,
    learning_rate=0.8472487450322445,
    n_estimators=603,
    min_child_weight=2,
    gamma=0.000605797605256695,
    subsample=0.7538618508364402,
    colsample_bytree=0.07179130792340982,
    reg_alpha=0.5002004024869536,
    reg_lambda=0.27763193395884594,
)


In [9]:
# Base learners of the stack, given as (name, estimator) pairs.
# An XGBoost base learner -- ('XGBoost', XGBRegressor(**xgbr_params)) -- is
# currently disabled.
base_models = []
base_models.append(('LightGBM', LGBMRegressor(**lgbm_params)))
base_models.append(('CatBoost', CatBoostRegressor(**cat_params)))

# Meta-model experiments and their recorded scores:
#   default RidgeCV                    score: 0.8478528009750723
#   XGBRegressor(**xgbr_params)        score: 0.8605694219420743, Kaggle score: 0.86038
#   LGBMRegressor(**lgbm_params)       score: not recorded
#   CatBoostRegressor(**cat_params)    score: not recorded
meta_model = XGBRegressor(**xgbr_params)

# Test score: not recorded.

In [10]:
# Validation model: fit the stacked ensemble on the training split
# (fit returns the fitted estimator itself, so it can be chained).
stacking_valid_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
).fit(X_train, y_train)

# Score the held-out split with R2; the bare name on the last line displays it.
y_pred_val = stacking_valid_model.predict(X_test)
r2_val = r2_score(y_true=y_test, y_pred=y_pred_val)
r2_val

0.8605694219420743

In [11]:
# Testing model (disabled): a second stack refit on all labelled rows.
#   stacking_test_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
#   stacking_test_model.fit(X, y)
# NOTE(review): predictions below come from the model fitted on the training
# split only -- confirm that skipping the full-data refit is intended.

pred = stacking_valid_model.predict(X=df_pred)

In [12]:
# The id column was dropped from df_pred earlier, so the ids are read back
# from the test file. Only the 'id' column is loaded (usecols) instead of
# parsing the whole CSV a second time.
df_pred_duble = pd.read_csv('data/test.csv', usecols=['id'])
df_pred_Id = df_pred_duble['id']

# Pair every id with its predicted probability and write the submission file
# without the DataFrame index.
output = pd.DataFrame({'id': df_pred_Id, 'FloodProbability': pred})
output.to_csv('data/ensemble_model.csv', index=False)