In [29]:
import lightgbm as lgb

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from lightgbm import LGBMClassifier

In [20]:

# %%

# import dataset
dataset = pd.read_csv('csv/cleaned_train_all.csv')


x = dataset.drop(['credit_card_default'], axis = 1)
y = dataset['credit_card_default'].values
print(x.shape, y.shape)


# %%


scalar =  preprocessing.StandardScaler()

# copy of datasets
X_train = x.copy()

# numerical features

# #for all 
num_cols = ['net_yearly_income','no_of_days_employed','yearly_debt_payments','credit_limit']

# #for less
# num_cols = ["credit_limit_used(%)", "credit_score"]

#apply standardization on numerical features
for i in num_cols:
    
    # fit on training data column
    scale = scalar.fit(X_train[[i]])
    
    # transform the training data column
    X_train[i] = scale.transform(X_train[[i]])

X_train = X_train.set_index('customer_id')
X_train.head()


(43508, 16) (43508,)


Unnamed: 0_level_0,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
CST_115179,46,0,0.0,1,0.0,-0.1359,-0.480983,1.0,1.0,0.072774,-0.163817,73,544,2,1
CST_121920,29,1,0.0,1,0.0,-0.133081,-0.465489,2.0,0.0,-0.954202,-0.038524,52,857,0,0
CST_109330,37,1,0.0,1,0.0,0.042769,-0.483912,2.0,0.0,0.961142,-0.013187,43,650,0,0
CST_128288,39,0,0.0,1,0.0,-0.114861,-0.399679,2.0,0.0,-0.534813,-0.072174,20,754,0,0
CST_151355,46,1,1.0,1,0.0,0.272478,-0.474905,1.0,0.0,0.374525,0.061461,75,927,0,0


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size = 0.33, random_state=42)


In [30]:
param_grid = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(30, 150)),
    'learning_rate': [0.01,0.1,0.5],
    'subsample_for_bin': [20000,50000,100000,120000,150000],
    'min_child_samples': [20,50,100,200,500],
    'colsample_bytree': [0.6,0.8,1],
    "max_depth": [5,10,50,100]
}

lgbm = LGBMClassifier() # Shortname the LGBMClassifier()
lgbm.fit(X_train, y_train) # Train the lgbm on train sets

LGBMClassifier()

In [31]:
from sklearn.metrics import accuracy_score

lgbm_tuned = LGBMClassifier(boosting_type = 'gbdt',
                            class_weight = None,
                            min_child_samples = 20,
                            num_leaves = 30,
                            subsample_for_bin = 20000,
                            learning_rate=0.01, 
                            max_depth=10, 
                            n_estimators=40, 
                            colsample_bytree=0.6) # LightGBM Classifier with optimum paramteres
lgbm_tuned.fit(X_train, y_train)

y_test_pred = lgbm_tuned.predict(X_test) #Predicting X_test to find the solutiın
score = round(accuracy_score(y_test, y_test_pred), 3) # Find the accuracy of y_test and predicitons, and round the result
print(score)



0.918


In [32]:
y_test_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [33]:
# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     13177
           1       0.00      0.00      0.00      1181

    accuracy                           0.92     14358
   macro avg       0.46      0.50      0.48     14358
weighted avg       0.84      0.92      0.88     14358



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y_test,y_test_pred, labels=[1,0])
print('Confusion matrix : \n',matrix)

Confusion matrix : 
 [[    0  1181]
 [    0 13177]]
