### Import libraries and read the dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns  
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

seed = 42
N_SPLITS = 5
N_REPEATS = 1

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold, StratifiedKFold
from sklearn.base import clone
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.ensemble import VotingRegressor

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

import optuna

from sklearn.metrics import mean_squared_error, r2_score

#### Convert the original dataset into its respective statistical features and use only those features,
#### as we observed in our [previous notebook](https://www.kaggle.com/code/shlokshivkar/s4e5-flood-eda-featengg-stats-beginnerfriendly) that using only these features we obtain the best model.

In [None]:
'''
cols = df_train.drop(columns='FloodProbability').columns
cols_test = df_test.drop(columns='id').columns

def add_stat_features(df, cols):
    df['Sum'] = df[cols].sum(axis = 1)   
    df['Mean'] = 0.1*df[cols].mean(axis = 1)
    df['Max'] = df[cols].max(axis = 1)
    df['Min'] = df[cols].min(axis = 1)
    df['Median'] = 0.1*df[cols].median(axis = 1)
    df['Std'] = df[cols].std(axis = 1)
    quantiles = df[cols].quantile([0.25, 0.75], axis=1)
    df['q1'] = quantiles.loc[0.25]
    df['q3'] = quantiles.loc[0.75]
    df['IQR'] = df['q3'] - df['q1']
    df['ptp'] = df[cols].apply(lambda x: np.ptp(x), axis=1)  #ptp stands for peak to peak and it is basically [max - min]
    
    return df

add_stat_features(df_train, cols)
add_stat_features(df_test, cols_test)

'''

In [81]:
df_train = pd.read_csv('statistical_train.csv')
df_test = pd.read_csv('statistical_test.csv')

In [82]:
TARGET = 'FloodProbability'
NUMERIC_COLS = df_train.select_dtypes(include='number').columns.drop(TARGET).tolist()

In [90]:
X = df_train.drop(columns='FloodProbability')
y = df_train.FloodProbability

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#optuna for XGB
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 10),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'random_state': 42,
        'verbose': -1
    }
    
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    return r2    

In [92]:
study_XGB = optuna.create_study(direction='maximize')
study_XGB.optimize(objective, n_trials=20)

[I 2024-05-16 00:24:17,251] A new study created in memory with name: no-name-5f9a7ec4-f1ea-468a-960c-c7d4b3e35d8a
[I 2024-05-16 00:24:28,550] Trial 0 finished with value: 0.8564611316868064 and parameters: {'n_estimators': 232, 'max_depth': 10, 'learning_rate': 0.22354989617017423, 'subsample': 0.8994866520966754, 'colsample_bytree': 0.7650420316285818, 'min_child_weight': 8, 'reg_alpha': 8, 'reg_lambda': 8, 'gamma': 3}. Best is trial 0 with value: 0.8564611316868064.
[I 2024-05-16 00:24:44,720] Trial 1 finished with value: 0.8129018117357888 and parameters: {'n_estimators': 355, 'max_depth': 5, 'learning_rate': 0.12321391008440057, 'subsample': 0.34839667732429735, 'colsample_bytree': 0.23863132675401907, 'min_child_weight': 9, 'reg_alpha': 10, 'reg_lambda': 0, 'gamma': 6}. Best is trial 0 with value: 0.8564611316868064.
[I 2024-05-16 00:25:22,050] Trial 2 finished with value: 0.8085778117934843 and parameters: {'n_estimators': 950, 'max_depth': 9, 'learning_rate': 0.2559252001653466,

KeyboardInterrupt: 

In [None]:
XGB_params = {'n_estimators': 111,
            'max_depth': 8,
            'learning_rate': 0.21377892857200032,
            'subsample': 0.976892966708101,
            'colsample_bytree': 0.984880269641595,
            'min_child_weight': 4,
            'reg_alpha': 2,
            'reg_lambda': 3,
            'gamma': 0}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#optuna for LGB
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_int('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_int('reg_lambda', 0, 10),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'random_state': 42,
        'verbose': -1
    }
    
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    return r2

In [None]:
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective, n_trials=20)

In [None]:
LGB_params = {'n_estimators': 544,
 'max_depth': 7,
 'learning_rate': 0.09078206198624092,
 'subsample': 0.37888095128502086,
 'colsample_bytree': 0.872126903661994,
 'min_child_weight': 4,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'verbose': -1,
 'gamma': 4}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#optuna for CAT

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'random_state': 42,
        'verbose': 0
    }
    
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)   
    
    return r2

In [None]:
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective, n_trials=20)

In [None]:
CAT_params = {'n_estimators': 575,
 'max_depth': 9,
 'learning_rate': 0.08349124099475252,
 'subsample': 0.6660595433113499}

In [55]:
VotingRegressor1 = VotingRegressor(estimators=[('XGB', XGBRegressor(**XGB_params)), ('LGB', LGBMRegressor(**LGB_params)), ('CAT', CatBoostRegressor(**CAT_params))])

VotingRegressor1.fit(X_train, y_train)
y_pred = VotingRegressor1.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(r2)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the train set: 894365, number of used features: 10
[LightGBM] [Info] Start training from score 0.504480
0:	learn: 0.0474552	total: 180ms	remaining: 1m 43s
1:	learn: 0.0442277	total: 347ms	remaining: 1m 39s
2:	learn: 0.0412944	total: 524ms	remaining: 1m 39s
3:	learn: 0.0386595	total: 702ms	remaining: 1m 40s
4:	learn: 0.0362856	total: 871ms	remaining: 1m 39s
5:	learn: 0.0341428	total: 1.03s	remaining: 1m 38s
6:	learn: 0.0322280	total: 1.19s	remaining: 1m 36s
7:	learn: 0.0305378	total: 1.53s	remaining: 1m 48s
8:	learn: 0.0290287	total: 1.81s	remaining: 1m 53s
9:	learn: 0.0276933	total: 2s	remaining: 1m 53s
10:	learn: 0.0265109	total: 2.23s	remaining: 1m 54s
11:	learn: 0.0254684	total: 2.44s	remaining: 1m 54s
12:	learn: 0.0245544	total: 2.66s	remaining: 1m

In [58]:
submission = VotingRegressor1.predict(df_test.drop(columns = 'id'))
submission



array([0.57842254, 0.45622205, 0.45010377, ..., 0.62164899, 0.54878674,
       0.52970055])

In [59]:
submission_csv = pd.DataFrame({'id': df_test.id, 'FloodProbability': submission})
submission_csv.to_csv('submission6.csv', index=False)