In [15]:
import pandas as pd 
import numpy as np
from lightgbm import LGBMClassifier,plot_importance
from sklearn import model_selection
from sklearn.metrics import roc_auc_score,log_loss
import matplotlib.pyplot as plt
import optuna
import joblib
from optuna.integration import LightGBMPruningCallback

In [16]:
# Read the competition training set into memory.
df = pd.read_csv(filepath_or_buffer="../input/tabular-playground-series-sep-2021/train.csv")

In [17]:
def make_folds(df, n_splits=5, random_state=42):
    '''
    Shuffle the dataset and assign every row to a stratified fold.

    Parameters
    ----------
    df : dataframe
        Training data. Must contain the target column ``claim``, which is
        used for stratification.
    n_splits : int, default 5
        Number of stratified folds to create.
    random_state : int or None, default 42
        Seed for the row shuffle. A fixed seed makes the fold assignment
        reproducible across kernel restarts; pass None for a fresh shuffle
        on every call.

    Returns
    -------
    dataframe
        A shuffled copy of ``df`` with a fresh 0..n-1 index and an integer
        ``kfold`` column holding the fold each row is validated in.
        The input frame is not modified.
    '''

    # sample() returns a new frame, so adding the column afterwards leaves
    # the caller's frame untouched.
    folded = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    folded['kfold'] = -1

    y = folded.claim.values
    # Rows are already shuffled above, so the splitter itself does not shuffle.
    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # After reset_index the positional indices from split() are also labels,
    # which is what .loc expects.
    for fold, (_, valid_idx) in enumerate(kf.split(X=folded, y=y)):
        folded.loc[valid_idx, 'kfold'] = fold

    return folded

In [18]:
# Replace the raw frame with its shuffled, fold-annotated version.
df = make_folds(df=df)

In [20]:
# Registry of candidate estimators, looked up by name in run_folds.
models = dict(
    lgbm=LGBMClassifier(
        n_estimators=10,
        learning_rate=0.25350902193342617,
        num_leaves=1060,
        max_depth=12,
        min_data_in_leaf=6300,
        max_bin=245,
        lambda_l1=5,
        lambda_l2=75,
        min_gain_to_split=9.25444411414024,
        bagging_fraction=0.7,
        bagging_freq=1,
        feature_fraction=0.4,
    ),
)

In [21]:
def run_folds(df,fold,model):
    '''
    Train the named model on every fold except `fold`, evaluate it on `fold`,
    print the train/validation ROC AUC and save the fitted model with joblib.
    Input : Dataframe (with `claim`, `kfold` and `id` columns),
            Integer (fold held out for validation),
            Model (key into the module-level `models` dict)
    Output: None
    '''
    held_out = df.kfold == fold
    non_features = ['claim','kfold','id']

    train_part = df[~held_out].reset_index(drop=True)
    valid_part = df[held_out].reset_index(drop=True)

    # Features are handed to the estimator as plain numpy arrays.
    x_train = train_part.drop(columns=non_features).values
    y_train = train_part.claim.values
    x_valid = valid_part.drop(columns=non_features).values
    y_valid = valid_part.claim.values

    clf = models[model]
    clf.fit(x_train, y_train)

    # Keep only the second probability column for scoring.
    train_scores = clf.predict_proba(x_train)[:, 1]
    valid_scores = clf.predict_proba(x_valid)[:, 1]

    print(f'Fold{fold}')
    train_auc = roc_auc_score(y_train, train_scores)
    print('Train ROC_AUC:{}'.format(train_auc))
    valid_auc = roc_auc_score(y_valid, valid_scores)
    print('Validation ROC_AUC:{}'.format(valid_auc))
    print('*' * 50)

    # One file per (fold, model) pair, written to the working directory.
    joblib.dump(clf, f'dt_{fold}_{model}.bin')

In [22]:
# Train and score the LightGBM model once per fold (folds 0..4).
for fold_id in range(5):
    run_folds(df=df, fold=fold_id, model='lgbm')

Fold0
Train ROC_AUC:0.7919639040883236
Validation ROC_AUC:0.7899586605287185
**************************************************
Fold1
Train ROC_AUC:0.7916862429370205
Validation ROC_AUC:0.7910698452080063
**************************************************
Fold2
Train ROC_AUC:0.7912177009150665
Validation ROC_AUC:0.79227502588427
**************************************************
Fold3
Train ROC_AUC:0.792139815510605
Validation ROC_AUC:0.7918775071874831
**************************************************
Fold4
Train ROC_AUC:0.7902700982036898
Validation ROC_AUC:0.7881702300671002
**************************************************


In [None]:
# Load the competition test set; `id` is kept on `test` for the submission
# but removed from the frame passed to the model.
test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
test_df = test.drop(columns="id")
# Reload the model that run_folds saved for fold 4.
md = joblib.load("./dt_4_lgbm.bin")

In [None]:
# Keep the second column of the predicted class probabilities.
prediction = md.predict_proba(X=test_df)[:, 1]

In [None]:
# Two-column submission frame: test ids alongside their predicted scores.
submission = pd.DataFrame(data=dict(id=test["id"], claim=prediction))

In [None]:
# Write the submission without the pandas index column.
submission.to_csv(path_or_buf="submission_3.csv", index=False)

In [None]:
def optimize(trial,X,y):
    '''
    Optuna objective: sample LightGBM hyper-parameters from `trial`, fit one
    model per fold of a seeded 5-fold stratified split, and return the mean
    validation log loss across the folds.

    Parameters
    ----------
    trial : optuna trial
        Used to sample hyper-parameters and passed to the pruning callback.
    X : dataframe
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like
        Target aligned row-by-row with ``X`` (pandas Series or numpy array).

    Returns
    -------
    float
        Mean of the per-fold log losses.
    '''

    params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        )
            }

    # split() yields positional indices. Indexing a Series with `y[idx]` is
    # label-based, so it only matches X.iloc[idx] when the index happens to be
    # 0..n-1. A plain array makes the target selection positional as well.
    y_values = np.asarray(y)

    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)
    fold_losses = []

    for train_idx, valid_idx in kf.split(X, y_values):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = LGBMClassifier(objective="binary", **params)
        # NOTE(review): the `early_stopping_rounds` fit argument is not
        # accepted by LightGBM >= 4.0; on newer versions use the
        # lightgbm.early_stopping callback instead - confirm installed version.
        model.fit(
            X_train,
            y_train,
            early_stopping_rounds=100,
            eval_metric="binary_logloss",
            eval_set=[(X_valid, y_valid)],
            callbacks=[LightGBMPruningCallback(trial, "binary_logloss")],
        )

        valid_proba = model.predict_proba(X_valid)[:, 1]
        fold_losses.append(log_loss(y_valid, valid_proba))

    return np.mean(fold_losses)

In [None]:
# Feature matrix and target for the hyper-parameter search.
# `df` carries the `kfold` column added by make_folds; it is a bookkeeping
# column, not a feature, and run_folds excludes it too, so drop it here as
# well (errors='ignore' keeps the cell working on a frame without folds).
X = df.drop(columns=['id', 'claim']).drop(columns=['kfold'], errors='ignore')
y = df.claim

In [None]:
def func(trial):
    """Bind the notebook-level X and y so Optuna can call the objective with a trial only."""
    return optimize(trial, X, y)


# Lower objective values are better, so the study minimises; run 50 trials.
study = optuna.create_study(study_name="LGBM Classifier", direction="minimize")
study.optimize(func, n_trials=50)

In [None]:
# The study minimises the mean cross-validated log loss returned by
# optimize(), so label the best value accordingly (it is not an RMSE).
print(f"\tBest value (log loss): {study.best_value:.5f}")
print("\tBest params:")
for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
# Reference copy of the LightGBM hyper-parameters; the values match the ones
# hard-coded in the `models` cell (presumably taken from the Optuna study
# output - confirm). Stored as a dict so the cell is valid Python: the
# original comma-separated assignments did not parse.
best_lgbm_params = {
    "n_estimators": 10,
    "learning_rate": 0.25350902193342617,
    "num_leaves": 1060,
    "max_depth": 12,
    "min_data_in_leaf": 6300,
    "max_bin": 245,
    "lambda_l1": 5,
    "lambda_l2": 75,
    "min_gain_to_split": 9.25444411414024,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "feature_fraction": 0.4,
}