In [1]:
import numpy as np
import pandas as pd
import optuna

import matplotlib.pyplot as plt
import missingno as msno

import lightgbm as lgb
import catboost as cb

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
train_path = '/kaggle/input/playground-series-s4e5/train.csv'
test_path = '/kaggle/input/playground-series-s4e5/test.csv'
submission_path = '/kaggle/input/playground-series-s4e5/sample_submission.csv'

In [3]:
def read_data(path, index_col=None):
    df = pd.read_csv(path)
    if index_col != None:
        df = df.set_index(index_col)
    initial_features = list(df.columns)
    if 'FloodProbability' in initial_features:
        initial_features.remove('FloodProbability')
    sorted_features = [f"sort_{i}" for i in np.arange(len(initial_features))]
    df['fsum'] = df[initial_features].sum(axis=1)
    df[sorted_features] = np.sort(df[initial_features], axis=1)
    return df

train_df = read_data(train_path, 'id')
test_df = read_data(test_path, 'id')

In [4]:
label = 'FloodProbability'
X = train_df.drop(columns=[label])
y = train_df[[label]]

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

### LightGBM

In [5]:
lgb_params = {
    'boosting_type': 'gbdt', 
    'n_estimators':1500, 
    'learning_rate' :  0.012,    
    'num_leaves' : 250, 
    'subsample_for_bin': 165700, 
    'min_child_samples': 114, 
    'reg_alpha': 2.075e-06, 
    'reg_lambda': 3.839e-07, 
    'colsample_bytree': 0.9634,
    'subsample': 0.9592, 
    'max_depth': 10,
    'random_state':0,
    'verbosity':-1
}
num_round = 100

In [6]:
train_data_lgbm = lgb.Dataset(X_train, label=y_train)
val_data_lgbm = lgb.Dataset(X_val, label=y_val, reference=train_data_lgbm)

lgbm_model = lgb.train(lgb_params, train_data_lgbm, num_round, valid_sets=[val_data_lgbm])



In [7]:
yhat = lgbm_model.predict(X_val)
print(r2_score(y_val, yhat))

0.8690085569113408


### CatBoost

In [8]:
catboost_model = cb.CatBoostRegressor(verbose=0)
catboost_model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7d332dd61ed0>

In [9]:
yhat_cat = catboost_model.predict(X_val)
print(r2_score(y_val, yhat_cat))

0.868744812503325


In [10]:
bag_model_yhat = (yhat + yhat_cat) / 2.0
print(r2_score(y_val, yhat_cat))

0.868744812503325


### Submission

In [11]:
submission = pd.read_csv(submission_path)
preds = lgbm_model.predict(test_df)
submission['FloodProbability'] = preds
submission.to_csv('submission.csv', index=False)
!head submission.csv

id,FloodProbability
1117957,0.5781638513513134
1117958,0.4535679258646477
1117959,0.4482374327486966
1117960,0.46976291946352733
1117961,0.4697024333929305
1117962,0.5080146686610356
1117963,0.5348359580314328
1117964,0.5281182878772168
1117965,0.47398988774585726
