# Credit Fraud Imbalanced dataset
## Light GBM

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import pickle
from imblearn.over_sampling import SMOTE

In [1]:
from lightgbm import LGBMClassifier

In [3]:
df = pd.read_csv('C:/Users/admin/Downloads/credit_dataset.csv', index_col=False)

In [4]:
# Converting into type int for simplicity
df['FAMILY SIZE'] = df['FAMILY SIZE'].astype(int)

In [5]:
# label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label_df = df.copy()
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
for row in object_cols:
    label_df[row] = le.fit_transform(df[row])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(le_name_mapping)

{'F': 0, 'M': 1}
{'N': 0, 'Y': 1}
{'N': 0, 'Y': 1}
{'Commercial associate': 0, 'Pensioner': 1, 'State servant': 2, 'Student': 3, 'Working': 4}
{'Academic degree': 0, 'Higher education': 1, 'Incomplete higher': 2, 'Lower secondary': 3, 'Secondary / secondary special': 4}
{'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
{'Co-op apartment': 0, 'House / apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'With parents': 5}


In [6]:
label_df.drop(label_df.columns[0], axis=1, inplace=True)
label_df.drop(['ID', 'GENDER', 'REALITY','NO_OF_CHILD', 'HOUSE_TYPE', 'FLAG_MOBIL', 'WORK_PHONE', 'E_MAIL'], axis=1, inplace=True)
label_df.head()

Unnamed: 0,CAR,INCOME,INCOME_TYPE,EDUCATION_TYPE,FAMILY_TYPE,PHONE,FAMILY SIZE,BEGIN_MONTH,AGE,YEARS_EMPLOYED,TARGET
0,1,112500.0,4,4,1,0,2,29,59,3,0
1,0,270000.0,0,4,3,1,1,4,52,8,0
2,0,270000.0,0,4,3,1,1,26,52,8,0
3,0,270000.0,0,4,3,1,1,26,52,8,0
4,0,270000.0,0,4,3,1,1,38,52,8,0


In [7]:
#Dropping the values greater the 20
label_df=label_df[label_df['YEARS_EMPLOYED'] < 20]

In [8]:
#removing values those are greater then 600000
label_df=label_df[label_df['INCOME'] < 600000]

In [10]:
print(label_df[label_df["TARGET"] == 1].shape)
print(label_df[label_df["TARGET"] == 0].shape)

(404, 11)
(22957, 11)


In [11]:
# Choosing features and Target for training and testing
X = label_df.copy()
y = X.pop('TARGET')

In [15]:
X_smote,y_smote=SMOTE().fit_resample(X,y)

In [16]:
#shape of dataset after using SMOTE
print("X",X_smote.shape)
print("y",y_smote.shape)

X (45914, 10)
y (45914,)


In [17]:
# Splitting the dataset into train and test
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_smote, y_smote, test_size = 0.25, random_state = 99)

In [18]:
model = LGBMClassifier()
# fit the model with the training data
model.fit(xtrain,ytrain)

# predict the target on the train dataset
y_pred = model.predict(xtest)
print('\nTarget on train data',y_pred) 

from sklearn.metrics import accuracy_score
# Accuray Score on train dataset
result = accuracy_score(ytest,y_pred)
print('\naccuracy_score: ', result)


Target on train data [1 0 0 ... 1 1 1]

accuracy_score:  0.9324854081365973


In [19]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[5288  417]
 [ 358 5416]]


In [22]:
from sklearn import metrics
print(metrics.classification_report(y_pred,ytest))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93      5646
           1       0.94      0.93      0.93      5833

    accuracy                           0.93     11479
   macro avg       0.93      0.93      0.93     11479
weighted avg       0.93      0.93      0.93     11479



In [23]:
filename = 'LightGBM_model.pkl'
pickle.dump(model, open(filename, 'wb'))