In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


**Data reading**

In [None]:
df = pd.read_csv("https://raw.githubusercontent.com/sunnysavita10/credit_card_pw_hindi/main/creditCardFraud_28011964_120214.csv")

-- Dataset Information


* LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit
* SEX: Gender (1=male, 2=female)
* EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
MARRIAGE: Marital status (1=married, 2=single, 3=others)
* AGE: Age in years
* PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
* PAY_2: Repayment status in August, 2005 (scale same as above)
* PAY_3: Repayment status in July, 2005 (scale same as above)
* PAY_4: Repayment status in June, 2005 (scale same as above)
* PAY_5: Repayment status in May, 2005 (scale same as above)
* PAY_6: Repayment status in April, 2005 (scale same as above)
* BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
* BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
* BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
* BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
* BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
* BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
* PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
* PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
* PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
* PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
* PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
* PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
* default.payment.next.month: Default payment (1=yes, 0=no)




In [None]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


In [None]:
df.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [None]:
df.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [None]:
!pip install pandas-profiling
from pandas_profiling import ProfileReport



In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   1001 non-null   int64
 1   SEX                         1001 non-null   int64
 2   EDUCATION                   1001 non-null   int64
 3   MARRIAGE                    1001 non-null   int64
 4   AGE                         1001 non-null   int64
 5   PAY_0                       1001 non-null   int64
 6   PAY_2                       1001 non-null   int64
 7   PAY_3                       1001 non-null   int64
 8   PAY_4                       1001 non-null   int64
 9   PAY_5                       1001 non-null   int64
 10  PAY_6                       1001 non-null   int64
 11  BILL_AMT1                   1001 non-null   int64
 12  BILL_AMT2                   1001 non-null   int64
 13  BILL_AMT3                   1001 non-null   int64
 14  BILL_AMT

In [None]:
profile = ProfileReport(df, title="Pandas Profiling Report")

In [None]:
# profile.to_file("your_report.html")

In [None]:
df['default payment next month'].value_counts()

0    787
1    214
Name: default payment next month, dtype: int64

In [None]:
# For seeing analysis in html form w/o downloading page
# profile.to_notebook_iframe()

In [None]:
x = df.drop(labels=['default payment next month'],axis = 1)

In [None]:
x

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
1,50000,1,1,2,37,0,0,0,0,0,...,57608,19394,19619,20024,2500,1815,657,1000,1000,800
2,500000,1,1,2,29,0,0,0,0,0,...,445007,542653,483003,473944,55000,40000,38000,20239,13750,13770
3,100000,2,2,2,23,0,-1,-1,0,0,...,601,221,-159,567,380,601,0,581,1687,1542
4,140000,2,3,1,28,0,0,2,0,0,...,12108,12211,11793,3719,3329,0,432,1000,1000,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,100000,1,2,1,29,0,0,0,0,-1,...,67782,-2618,95748,101299,3320,5000,0,100000,7186,0
997,200000,2,2,1,28,0,0,0,0,0,...,8441,97041,103541,3632,5000,2000,89000,6500,91,1504
998,90000,2,2,1,40,-1,-1,-1,-1,-1,...,1114,657,1332,780,0,2806,2256,2274,780,0
999,360000,1,1,2,36,1,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y = df['default payment next month']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25,random_state=50)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()


In [None]:
X_train_scaled = scaler.fit_transform(x_train)

In [None]:
X_test_scaled = scaler.transform(x_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [None]:
from sklearn.model_selection import GridSearchCV
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
clf = GridSearchCV(clf,param_grid=params_NB,cv=2,verbose=3)


In [None]:
clf.fit(X_train_scaled, y_train)

Fitting 2 folds for each of 100 candidates, totalling 200 fits
[CV 1/2] END .................var_smoothing=1.0;, score=0.776 total time=   0.0s
[CV 2/2] END .................var_smoothing=1.0;, score=0.757 total time=   0.0s
[CV 1/2] END ..var_smoothing=0.8111308307896871;, score=0.765 total time=   0.0s
[CV 2/2] END ..var_smoothing=0.8111308307896871;, score=0.731 total time=   0.0s
[CV 1/2] END ...var_smoothing=0.657933224657568;, score=0.757 total time=   0.0s
[CV 2/2] END ...var_smoothing=0.657933224657568;, score=0.696 total time=   0.0s
[CV 1/2] END ...var_smoothing=0.533669923120631;, score=0.747 total time=   0.0s
[CV 2/2] END ...var_smoothing=0.533669923120631;, score=0.661 total time=   0.0s
[CV 1/2] END .var_smoothing=0.43287612810830584;, score=0.757 total time=   0.0s
[CV 2/2] END .var_smoothing=0.43287612810830584;, score=0.621 total time=   0.0s
[CV 1/2] END ..var_smoothing=0.3511191734215131;, score=0.760 total time=   0.0s
[CV 2/2] END ..var_smoothing=0.351119173421513

In [None]:
clf.best_params_

{'var_smoothing': 1.0}

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
clf = GaussianNB(var_smoothing = 0.8111308307896871)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
accuracy_score(y_test,y_pred)
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.84      0.93      0.88       201\n           1       0.52      0.30      0.38        50\n\n    accuracy                           0.80       251\n   macro avg       0.68      0.62      0.63       251\nweighted avg       0.78      0.80      0.78       251\n'

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier()

In [None]:
xgb.fit(X_train_scaled,y_train)

In [81]:
param_grid_xgboost = { "n_estimators": [50,100, 130],
                        "max_depth": range(3,11,1),
                      "random_state":[0,50,100]
                     }

# Creating an object of the Grid Search class
grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid_xgboost,verbose=3,cv=5,n_jobs=-1)

In [82]:
grid.fit(X_train_scaled,y_train)
y_pred2 = grid.predict(X_test_scaled)
print(accuracy_score(y_test,y_pred2))
print(classification_report(y_test,y_pred2))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
0.7768924302788844
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       201
           1       0.43      0.38      0.40        50

    accuracy                           0.78       251
   macro avg       0.64      0.63      0.63       251
weighted avg       0.77      0.78      0.77       251



In [85]:
grid.best_params_

{'max_depth': 4, 'n_estimators': 100, 'random_state': 0}

In [87]:
grid = XGBClassifier(max_depth = 4, n_estimators = 100, random_state = 0)

In [88]:
grid.fit(X_train_scaled, y_train)
y_pred1 = grid.predict(X_test_scaled)

In [89]:
accuracy_score(y_test,y_pred1)

0.7768924302788844

-- So we can see that Gaussian Naive Bayes is giving us 80% accuracy. So we will use that for our predictions


In [90]:
import pickle
pickle.dump(grid,open('xgboost','wb'))

SyntaxError: ignored