In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:

NUM_FOLD = 5

train = pd.read_csv('../dataset/train.csv', index_col='id')
test = pd.read_csv('../dataset/test.csv', index_col='id')
original = pd.read_csv('../dataset/original.csv')
sample_submission = pd.read_csv('../dataset/sample_submission.csv')

# accepted_cols = ['person_income', 'person_home_ownership', 'loan_intent',
#        'loan_grade', 'loan_int_rate', 'loan_percent_income','loan_status']

# train = train[accepted_cols]
# original = original[accepted_cols]
# accepted_cols.remove("loan_status")
# test = test[accepted_cols]



cat_cols = list(test.select_dtypes(include=['object']).columns)

for df in [train, test, original]:
    for col in cat_cols:
        df[col] = df[col].astype('str').astype('category')


X = train.drop(['loan_status'], axis=1)
y = train['loan_status']
X_original = original.drop(['loan_status'], axis=1)
y_original = original['loan_status']





In [6]:
params = {
    'verbose':-1,
    'random_state':42,
    'data_sample_strategy':'goss',
    'n_estimators':4000,
    'learning_rate':0.01,
    'col_sample_bytree':0.6,
    'max_depth':17,
    'max_bin': 4000
}

from lightgbm import LGBMClassifier

model = LGBMClassifier(**params)

skf = StratifiedKFold(n_splits=NUM_FOLD, shuffle=True, random_state=42)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1}")
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)])
    oof[val_idx] = model.predict_proba(X_valid)[:, 1]
    predictions += model.predict_proba(test)[:, 1] / NUM_FOLD
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = X.columns
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print("AUC: ", roc_auc_score(y_valid, oof[val_idx]))


Fold 1
AUC:  0.9525559676947923
Fold 2
AUC:  0.9636808696951459
Fold 3
AUC:  0.9577705906409668
Fold 4
AUC:  0.9619959603608172
Fold 5
AUC:  0.9590644538539981


In [8]:
# prediction

preds = model.predict_proba(test)[:, 1]

In [10]:
submission = pd.DataFrame({'id': test.index, 'loan_status': preds})

In [11]:
submission.to_csv('submission_lgb.csv', index=False)