In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import early_stopping,log_evaluation, Dataset, LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import roc_auc_score, r2_score,make_scorer 
from sklearn import set_config
import warnings
import optuna
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from pprint import pprint
from time import time
from scipy.stats import kurtosis, skew, gmean, mode,trim_mean, mstats
from scipy.stats.mstats import winsorize
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta

# Suppress every warning raised from here on (no message/category filter is given).
warnings.simplefilter('ignore')

# Plot styling: white seaborn theme with the viridis palette; keep the palette handy as `pal`.
sns.set_theme(style='white', palette='viridis')
pal = sns.color_palette('viridis')

# pandas display / behaviour options and scikit-learn output container.
pd.options.display.max_rows = 100
set_config(transform_output='pandas')
pd.set_option('mode.chained_assignment', None)

In [None]:
# Read the competition's train and test CSV files into `train` and `test`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e5/train.csv',
        '/kaggle/input/playground-series-s4e5/test.csv',
    )
)

## Quick EDA

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
from sklearn.mixture import GaussianMixture

# Names of the int64/float64 columns of `train`, leaving out the row id and the target.
numerical_columns = [
    name
    for name, dtype in train.dtypes.items()
    if dtype in ['int64', 'float64'] and name not in ('id', 'FloodProbability')
]

#num_subplots = min(len(numerical_columns), 20)  
#fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(15, 15))

# Plot KDE plots for train and test sets
#for i in range(num_subplots):
#    row = i // 2
#    col = i % 2
    
#    sns.kdeplot(data=train, x=numerical_columns[i], ax=axes[row, col], color='blue', label='Train')
#    sns.kdeplot(data=test, x=numerical_columns[i], ax=axes[row, col], color='orange', label='Test')
   
#    axes[row, col].set_title(numerical_columns[i])
#    axes[row, col].legend()

#plt.tight_layout()
#plt.show()

In [None]:
def get_gmm_class_features(feat, n):
    """Add a ``{feat}_class`` column to the global ``train`` and ``test`` frames.

    A one-dimensional Gaussian mixture with ``n`` components is fitted on the
    training values of ``feat``; the predicted component index of every row is
    stored as the new column in both frames (the frames are modified in place).

    Parameters
    ----------
    feat : str
        Name of a numeric column present in both ``train`` and ``test``.
    n : int
        Number of mixture components.

    Returns
    -------
    None
    """
    # Impute both frames with the TRAIN median: the mixture is fitted on
    # train-imputed data, so test rows must be imputed the same way to be
    # assigned consistently (the original used the test median for test rows).
    fill_value = train[feat].median()
    train_values = train[feat].fillna(fill_value).values.reshape(-1, 1)
    test_values = test[feat].fillna(fill_value).values.reshape(-1, 1)

    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(train_values)
    train[f'{feat}_class'] = gmm.predict(train_values)
    test[f'{feat}_class'] = gmm.predict(test_values)


# One 5-component mixture per numeric input column.
for col in numerical_columns:
    get_gmm_class_features(col, 5)
    

## Modelling

In [None]:
## Preprocessing idea
# https://www.kaggle.com/code/oscarm524/ps-s4-ep5-eda-modeling-submission#Loading-Libraries

class Model:
    """Feature engineering, K-fold training and hyper-parameter search.

    The instance keeps references to the frames it is given; ``preprocess``
    adds the engineered columns to those frames IN PLACE, so the caller's
    ``train`` / ``test`` objects are modified.

    Attributes
    ----------
    train, test : the frames passed to the constructor.
    model_dict : dict mapping ``'fold_{k}'`` to the estimator fitted on fold k
        (overwritten by later ``fit`` calls on the same instance).
    test_predict_list : list of per-fold test predictions; ``fit`` appends to it.
    risk_factors : names of the 20 input columns the features are derived from.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test
        self.model_dict = dict()
        self.test_predict_list = list()
        self.risk_factors = ['MonsoonIntensity','TopographyDrainage','RiverManagement',
         'Deforestation','Urbanization','ClimateChange','DamsQuality','Siltation','AgriculturalPractices',
         'Encroachments','IneffectiveDisasterPreparedness','DrainageSystems','CoastalVulnerability','Landslides',
         'Watersheds','DeterioratingInfrastructure','PopulationScore',
         'WetlandLoss','InadequatePlanning','PoliticalFactors']

    def preprocess(self):
        """Add every engineered feature to ``self.train`` and ``self.test`` in place.

        Steps, applied identically to both frames:
        1. row-wise quantiles / statistics / interactions (``_add_risk_features``),
        2. ``cnt_{v}`` columns counting how many risk factors equal each value
           ``v`` observed in either frame,
        3. per-column log / exponential transforms (``add_transform_features``).
        """
        frames = [self.train, self.test]

        for df in frames:
            self._add_risk_features(df)

        ## Unique Values
        # Collected over BOTH frames so train and test get the same cnt_ columns.
        unique_vals = []
        for df in frames:
            for col in self.risk_factors:
                unique_vals += list(df[col].unique())
        unique_vals = list(set(unique_vals))

        for df in frames:
            risk = df[self.risk_factors]
            for v in unique_vals:
                df['cnt_{}'.format(v)] = (risk == v).sum(axis=1)

        self.add_transform_features(self.risk_factors)

    def _add_risk_features(self, df):
        """Add row-wise statistics and interaction features of the risk factors to ``df``.

        ``df`` is modified in place. Columns are created in a fixed order so
        that train and test end up with identical layouts.
        """
        # Snapshot of the inputs: column-list indexing copies, so adding
        # columns to ``df`` below does not affect ``risk`` / ``values``.
        risk = df[self.risk_factors]
        values = risk.to_numpy()

        # Row-wise quantiles at 0.00, 0.01, ..., 1.00. The column name embeds
        # the float produced by np.arange as-is.
        for i in np.arange(0.0, 1.01, 0.01):
            df[f'Q_{i}_Risk'] = risk.quantile(i, axis=1)

        # Product of all factors. Computed in float64: the value is squared and
        # cubed further down, which silently wraps around in int64.
        df['fsum2'] = risk.astype('float64').product(axis=1)
        df['log_fsum2'] = np.log1p(df['fsum2'])
        df['fmin'] = np.linalg.norm(values, axis=1)
        df['total'] = risk.sum(axis=1)
        df['special1'] = df['total'].isin(np.arange(72, 76))
        df['special2'] = df['total'].isin(np.arange(87, 94))
        df['Avg_Risk'] = risk.mean(axis=1)
        df['Med_Risk'] = risk.median(axis=1)
        df['Std_Risk'] = risk.std(axis=1)
        df['Min_Risk'] = risk.min(axis=1)
        df['Max_Risk'] = risk.max(axis=1)
        df['fsqt'] = np.sqrt(risk).sum(axis=1)
        df['range'] = df['Max_Risk'] - df['Min_Risk']
        df['Variance'] = risk.var(axis=1)
        df['Kurtosis'] = kurtosis(values, axis=1)
        df['Skew'] = skew(values, axis=1)
        df['gmean'] = gmean(values, axis=1)
        # np.ptp instead of ndarray.ptp: the method was removed in NumPy 2.0.
        df['ptp'] = np.ptp(values, axis=1)
        # ravel: older SciPy returns shape (n, 1), newer returns shape (n,).
        df['mode'] = np.ravel(mode(values, axis=1)[0])

        q75 = risk.quantile(0.75, axis=1)
        q25 = risk.quantile(0.25, axis=1)
        df['IQR'] = q75 - q25
        df['CV'] = df['Std_Risk'] / df['Avg_Risk']

        # Ratios between the summary statistics (division by zero yields inf/NaN).
        df['std_range'] = df['Std_Risk'] / df['range']
        df['mean_range'] = df['Avg_Risk'] / df['range']
        df['std_mean'] = df['Std_Risk'] / df['Avg_Risk']
        df['std_median'] = df['Std_Risk'] / df['Med_Risk']
        df['mean_median'] = df['Avg_Risk'] / df['Med_Risk']
        df['std_fsum2'] = df['Std_Risk'] / df['fsum2']
        df['mean_fsum2'] = df['Avg_Risk'] / df['fsum2']
        df['std_fsqt'] = df['Std_Risk'] / df['fsqt']
        df['mean_fsqt'] = df['Avg_Risk'] / df['fsqt']
        df['std_log_fsum'] = df['Std_Risk'] / df['log_fsum2']
        df['fsum_sq'] = df['fsum2'] ** 2
        df['fsum_cu'] = df['fsum2'] ** 3
        df['euclidean_distance'] = np.sqrt((risk ** 2).sum(axis=1))
        # NOTE: n / mean(1/x), i.e. n times the harmonic mean (kept as in the original).
        df['harmonic'] = len(self.risk_factors) / (1 / risk).mean(axis=1)
        # Mean of the row-standardised values.
        df['zscore'] = (
            risk.sub(df['Avg_Risk'], axis=0).div(df['Std_Risk'], axis=0).mean(axis=1)
        )
        df['cv'] = df['Std_Risk'] / df['Avg_Risk']
        df['Skewness_75'] = (q75 - df['Avg_Risk']) / df['Std_Risk']
        df['Skewness_25'] = (q25 - df['Avg_Risk']) / df['Std_Risk']
        df['2ndMoment'] = (risk ** 2).mean(axis=1)
        df['3rdMoment'] = (risk ** 3).mean(axis=1)
        # x*log(x) is NaN for x == 0; the NaN-skipping sum treats it as 0.
        df['entropy'] = -1 * (risk * np.log(risk)).sum(axis=1)

        # Hand-crafted groupings and pairwise interactions.
        df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
        df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
        df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
        df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
        df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
        df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
        df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']
        df['ClimateImpact_Urbanization'] = df['MonsoonIntensity'] * df['Urbanization']
        df['DamsQuality_Siltation'] = df['DamsQuality'] * df['Siltation']
        df['Encroachments_Landslides'] = df['Encroachments'] * df['Landslides']
        df['Deforestation_Urbanization'] = df['Deforestation'] / df['Urbanization']
        df['DrainageSystems_DeterioratingInfrastructure'] = df['DrainageSystems'] / df['DeterioratingInfrastructure']
        df['CoastalVulnerability_Landslides'] = df['CoastalVulnerability'] / df['Landslides']

    def add_transform_features(self, initial_features):
        """Add per-column log / exponential transforms and their row aggregates.

        For every column ``c`` in ``initial_features`` and every prefix below a
        column ``{prefix}_{c}`` is added to both frames, followed by one
        ``{prefix}_sum`` aggregate. Finally ``sum_log = log1p(total)`` is added,
        so the ``total`` column must already exist (``preprocess`` creates it).

        Note the historical naming: ``log2_`` is a base-10 logarithm, and
        ``exp2_sum`` / ``exp3_sum`` hold the row MEAN rather than the sum.
        """
        # (prefix, element-wise transform, aggregate with mean instead of sum)
        transforms = [
            ('log', lambda s: np.log1p(s + 1e-4), False),
            ('log2', lambda s: np.log10(s + 1e-4), False),
            ('exp', lambda s: 1.2 ** s, False),
            ('exp2', lambda s: 2.5 ** s, True),
            ('exp3', lambda s: 4 ** s, True),
            ('exp4', lambda s: 2 ** s, False),
        ]

        for df in [self.train, self.test]:
            for prefix, transform, use_mean in transforms:
                new_cols = []
                for col in initial_features:
                    new_name = f"{prefix}_{col}"
                    df[new_name] = transform(df[col])
                    new_cols.append(new_name)
                transformed = df[new_cols]
                if use_mean:
                    df[f'{prefix}_sum'] = transformed.mean(axis=1)
                else:
                    df[f'{prefix}_sum'] = transformed.sum(axis=1)

            df['sum_log'] = np.log1p(df['total'])

    def fit(self, params, name):
        """Run feature engineering, then 5-fold cross-validated training.

        Parameters
        ----------
        params : dict
            Keyword arguments forwarded to the estimator constructor.
        name : str
            ``'lgbm'`` -> ``LGBMRegressor``, ``'catboost'`` ->
            ``CatBoostRegressor``, anything else -> ``XGBRegressor``.

        Returns
        -------
        (scores, test_predict_list)
            ``scores`` holds the out-of-fold R2 (one entry); the second item is
            ``self.test_predict_list`` with one test prediction appended per fold.
        """
        self.preprocess()
        target_col = ['FloodProbability']
        drop_col = ['id']

        train_cols = [col for col in self.train.columns.to_list() if col not in target_col + drop_col]
        scores = list()

        # Slice the (wide) frames once instead of once per fold.
        X = self.train[train_cols]
        y = self.train[target_col]
        X_test = self.test[train_cols]

        for i in range(1):
            mskf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)
            oof_valid_preds = np.zeros(X.shape[0])

            for fold, (train_idx, valid_idx) in enumerate(mskf.split(X, y)):
                X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
                X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

                if name == 'lgbm':
                    algo = LGBMRegressor(random_state=i+fold, **params)
                    algo.fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
                elif name == 'catboost':
                    algo = CatBoostRegressor(random_state=i+fold, **params)
                    # The original never fitted this estimator, so predict() below failed.
                    algo.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
                else:
                    # The engineered ratios can be +inf; tell XGBoost to treat that as missing.
                    algo = XGBRegressor(random_state=i+fold, missing=float('inf'), **params)
                    algo.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

                valid_preds = algo.predict(X_valid)
                oof_valid_preds[valid_idx] = valid_preds
                test_predict = algo.predict(X_test)
                self.test_predict_list.append(test_predict)
                score = r2_score(y_valid, valid_preds)
                print(f"The r2 score for fold {fold+1} is {score}")
                self.model_dict[f'fold_{fold}'] = algo

            oof_score = r2_score(y, oof_valid_preds)
            print(f"The OOF r2 score for iteration {i+1} is {oof_score}")
            scores.append(oof_score)
        return scores, self.test_predict_list

    def report_perf(self, optimizer, X, y, title, callbacks=None):
        """
        Fit a search optimizer, print its timing and best CV score, return the best parameters.

        optimizer = a sklearn or a skopt optimizer
        X = the training set
        y = our target
        title = a string label for the experiment
        callbacks = optional callbacks forwarded to ``optimizer.fit(..., callback=...)``
        """
        start = time()
        if callbacks:
            optimizer.fit(X, y, callback=callbacks)
        else:
            optimizer.fit(X, y)
        best_score = optimizer.best_score_
        best_score_std = optimizer.cv_results_['std_test_score'][optimizer.best_index_]
        best_params = optimizer.best_params_
        print((title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
               +u"\u00B1"+" %.3f") % (time() - start,
                                      len(optimizer.cv_results_['params']),
                                      best_score,
                                      best_score_std))
        print('Best parameters:')
        pprint(best_params)
        print()
        return best_params

    def find_params(self):
        """Bayesian search over LightGBM hyper-parameters; return the best setting.

        Runs ``preprocess`` first, then a ``BayesSearchCV`` with 10-fold CV and
        R2 scoring. The search stops on ``DeltaXStopper(0.001)`` or after the
        11-hour ``DeadlineStopper``, whichever triggers first.

        Returns
        -------
        The best parameter set as reported by ``report_perf``.
        """
        self.preprocess()
        # `needs_proba=False` was the default and the argument no longer exists
        # in recent scikit-learn releases, so it is simply not passed.
        avg_r2 = make_scorer(r2_score, greater_is_better=True)
        search_spaces = {
            'num_leaves': Integer(2, 210, prior='uniform'),
            'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
            'n_estimators': Integer(50, 1000, prior='uniform'),
            'subsample_for_bin': Integer(10000, 500000, prior='uniform'),
            'min_child_samples': Integer(1, 100, prior='uniform'),
            'reg_alpha': Real(1e-10, 1e-5, prior='log-uniform'),
            'reg_lambda': Real(1e-10, 1e-5, prior='log-uniform'),
            'colsample_bytree': Real(0.1, 0.9, prior='uniform'),
            'subsample': Real(0.5, 1.0, prior='uniform'),
            'max_depth': Integer(2, 20, prior='uniform')
        }
        skf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=42)

        opt = BayesSearchCV(LGBMRegressor(verbosity=-1, device='gpu'),
                    search_spaces,
                    scoring=avg_r2,
                    cv=skf,
                    n_iter=10000,
                    n_jobs=1,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
        target_col = ['FloodProbability']
        drop_col = ['id']
        train_cols = [col for col in self.train.columns.to_list() if col not in target_col + drop_col]
        X = self.train[train_cols]
        y = self.train[target_col]
        best_params = self.report_perf(opt, X, y, 'LightBoost',
                          callbacks=[DeltaXStopper(0.001),
                                     DeadlineStopper(60*60*11)])
        # The original computed this value and then dropped it.
        return best_params
        

In [None]:
# First LightGBM configuration, followed by a cross-validated run with it.
lgbm_params1 = dict(
    num_leaves=183,
    learning_rate=0.01183688880802108,
    n_estimators=577,
    subsample_for_bin=165697,
    min_child_samples=114,
    reg_alpha=2.075080888948164e-06,
    reg_lambda=3.838938366471552e-07,
    colsample_bytree=0.9634044234652241,
    subsample=0.9592138618622019,
    max_depth=9,
    # device='gpu',
    verbosity=-1,
)

model = Model(train, test)
lgbm_scores1, lgbm_preds1 = model.fit(params=lgbm_params1, name='lgbm')
print(f'The average r2 score is {np.mean(lgbm_scores1)}')

In [None]:
# Second LightGBM configuration.
lgbm_params = dict(
    num_leaves=210,
    learning_rate=0.00895710669100346,
    n_estimators=727,
    subsample_for_bin=21372,
    min_child_samples=100,
    reg_alpha=4.5427528680311086e-07,
    reg_lambda=6.3824100237054236e-09,
    colsample_bytree=0.565257393643049,
    subsample=0.5774186207711538,
    max_depth=11,
    # device='gpu',
    verbosity=-1,
)

In [None]:
# Cross-validated run with the second LightGBM configuration.
model = Model(train, test)
lgbm_scores, lgbm_preds = model.fit(params=lgbm_params, name='lgbm')
print(f'The average r2 score is {np.mean(lgbm_scores)}')

## Submission

In [None]:
# Predictions taken from two external submission files.
gluon_preds = pd.read_csv(
    '/kaggle/input/autogluon-starter/submission.csv'
).FloodProbability
voting_preds = pd.read_csv(
    '/kaggle/input/flood-prediction-lightboost-oof-preds/submission.csv'
).FloodProbability

# Average of the per-fold test predictions of the second LightGBM run.
lgbm_prediction = np.mean(lgbm_preds, axis=0)

# Weighted blend; the weights 0.05 + 0.35 + 0.35 + 0.25 add up to 1.
final_preds = (
    0.05 * np.mean(lgbm_preds1, axis=0)
    + lgbm_prediction * 0.35
    + 0.35 * gluon_preds
    + 0.25 * voting_preds
)

In [None]:
# Fill the sample submission with the blended predictions and write it out.
submit = pd.read_csv('/kaggle/input/playground-series-s4e5/sample_submission.csv')
# Bracket assignment always writes a column; attribute assignment would
# silently set a plain attribute if the column were ever missing.
submit['FloodProbability'] = final_preds
submit.to_csv('submission.csv', index=False)
submit