In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files_here):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition CSV files in one pass: the training data,
# the test data, and the sample submission.
train, test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-sep-2021/train.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
        '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Preview the first rows of the sample submission.
sample_solution.head(n=5)

In [None]:
# Preview the first rows of the training data.
train.head(n=5)

In [None]:
# Preview the first rows of the test data.
test.head(n=5)

In [None]:
# Stack train on top of test so both can be preprocessed together;
# they are separated again by position further down.
data = pd.concat(objs=[train, test], axis=0, sort=False)

In [None]:
# Preview the first rows of the combined train + test frame.
data.head(n=5)

In [None]:
# Row counts of train, test and the combined frame, in that order.
print(*map(len, (train, test, data)))

In [None]:
# Number of missing values in each column of the combined frame.
data.isna().sum()

In [None]:
# Missing values per column as a percentage of all rows in the combined frame.
data.isna().sum().div(len(data)).mul(100)

In [None]:
# Impute missing values column by column: every gap in a column is filled with
# one value drawn uniformly from [mean - std, mean + std] of that column.
# A seeded generator keeps the imputed values identical across
# "Restart & Run All" executions.
rng = np.random.default_rng(0)

# test.columns[1:] skips the first column of the test frame
# (presumably the row `id` — verify against the data).
for col in test.columns[1:]:
    col_mean = data[col].mean()
    col_std = data[col].std()
    fill_value = rng.uniform(col_mean - col_std, col_mean + col_std)
    # Assign the filled column back explicitly. Calling
    # `data[col].fillna(..., inplace=True)` acts on an intermediate object
    # (chained assignment) and does not update `data` when pandas
    # Copy-on-Write is active.
    data[col] = data[col].fillna(fill_value)

In [None]:
# Undo the earlier concat by position: the first len(train) rows are the
# training rows, everything after them is the test set.
n_train_rows = len(train)
train = data.iloc[:n_train_rows]
test = data.iloc[n_train_rows:]

In [None]:
# Separate the target from the features. `id` is dropped together with the
# target so the model is not trained on it (assumed to be a row identifier
# rather than a real feature — TODO confirm against the data description).
# `test` carries a `claim` column only because of the earlier concat with
# `train`, so it is removed there as well.
y_train = train['claim']
X_train = train.drop(columns=['id', 'claim'])
X_test = test.drop(columns=['id', 'claim'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation. The split is stratified
# on the target and uses a fixed random_state so it is repeatable.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=0,
    stratify=y_train,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
import lightgbm as lgb

# Build the LightGBM datasets used for the final training run. The validation
# set is tied to the training set through `reference`, the same way
# `objective` builds its datasets, so both use the training set's binning.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

In [None]:
from sklearn.metrics import roc_auc_score

def objective(trial):
    """Train one LightGBM model for an Optuna trial and return its validation AUC.

    Samples `max_bin` and `num_leaves` from the trial, trains on the
    notebook-level `X_train` / `y_train` with early stopping, and scores the
    best iteration on `X_valid` / `y_valid`.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        ROC AUC of the predictions on the validation split. Higher is
        better, so the study that runs this objective should maximize it.
    """
    params = {
        'objective': 'binary',
        'max_bin': trial.suggest_int('max_bin', 255, 500),
        'learning_rate': 0.05,
        'num_leaves': trial.suggest_int('num_leaves', 32, 128),
    }

    # Fresh datasets per trial, so each trial's `max_bin` is applied when
    # the data is binned.
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set)

    # Early stopping and periodic logging are passed as callbacks. The
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of
    # `lgb.train` were removed in LightGBM 4.0.
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set, valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=10),
        ],
    )

    # Score with the iteration selected by early stopping.
    y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    return roc_auc_score(y_valid, y_pred_valid)

In [None]:
import optuna

# `objective` returns ROC AUC, where higher is better. Optuna minimizes by
# default, which would select the worst hyperparameters, so the direction is
# set explicitly. The sampler seed keeps the 25 random trials repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.RandomSampler(seed=0),
)
study.optimize(objective, n_trials=25)

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_trial.params

In [None]:
# Retrain with the hyperparameters selected by the Optuna study, then
# predict the test set.
best_params = study.best_params
params = {
    'objective': 'binary',
    'max_bin': best_params['max_bin'],
    'learning_rate': 0.05,
    'num_leaves': best_params['num_leaves'],
}

# Early stopping and periodic logging are passed as callbacks. The
# `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
# were removed in LightGBM 4.0.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
    ],
)

# Predict with the iteration selected by early stopping.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Put the test-set predictions into the sample submission layout and save
# the result as a CSV without the index column.
sub = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'
).assign(claim=y_pred)
sub.to_csv('submission_lightgbm.csv', index=False)