## XGBoost
# Credit Fraud Dataset

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pickle

In [2]:
df = pd.read_csv('C:/Users/admin/Downloads/credit_dataset.csv')

In [3]:
# Converting into type int for simplicity
df['FAMILY SIZE'] = df['FAMILY SIZE'].astype(int)

In [4]:
# label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label_df = df.copy()
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
for row in object_cols:
    label_df[row] = le.fit_transform(df[row])
label_df.drop(['ID', 'GENDER', 'REALITY','NO_OF_CHILD', 'HOUSE_TYPE', 'FLAG_MOBIL', 'WORK_PHONE', 'E_MAIL'], axis=1, inplace=True)
label_df.drop(label_df.columns[0], axis=1, inplace=True)

In [5]:
label_df.head()

Unnamed: 0,CAR,INCOME,INCOME_TYPE,EDUCATION_TYPE,FAMILY_TYPE,PHONE,FAMILY SIZE,BEGIN_MONTH,AGE,YEARS_EMPLOYED,TARGET
0,1,112500.0,4,4,1,0,2,29,59,3,0
1,0,270000.0,0,4,3,1,1,4,52,8,0
2,0,270000.0,0,4,3,1,1,26,52,8,0
3,0,270000.0,0,4,3,1,1,26,52,8,0
4,0,270000.0,0,4,3,1,1,38,52,8,0


In [6]:
# Choosing features and Target for training and testing
X = label_df.copy()
y = X.pop('TARGET')

In [7]:
# Splitting the dataset into train and test
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [8]:
model = XGBClassifier()

# fit the model with the training data
model.fit(xtrain,ytrain)


# predict the target on the train dataset
y_pred = model.predict(xtest)
print('\nTarget on train data',y_pred) 

# Accuray Score on train dataset
result = accuracy_score(ytest,y_pred)
print('\naccuracy_score: ', result)




Target on train data [0 0 0 ... 0 0 0]

accuracy_score:  0.9834500318268619


## SMOTE

In [9]:
import imblearn

In [10]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [11]:
# define pipeline
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

In [12]:
X1, y1 = pipeline.fit_resample(xtrain,ytrain)
X1_test, y1_test = pipeline.fit_resample(xtest, ytest)

In [13]:
model = XGBClassifier()

# fit the model with the training data
model.fit(X1,y1)


# predict the target on the train dataset
y_pred = model.predict(X1_test)
print('\nTarget on train data',y_pred) 

# Accuray Score on train dataset
result = accuracy_score(y1_test,y_pred)
print('\naccuracy_score: ', result)




Target on train data [0 0 0 ... 0 0 0]

accuracy_score:  0.823878984332793


In [14]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y1_test, y_pred)
  
print ("Confusion Matrix : \n", cm)

Confusion Matrix : 
 [[1187   47]
 [ 279  338]]


In [15]:
filename = 'XGboost_model.pkl'
pickle.dump(model, open(filename, 'wb'))