In [1]:
# Importing the necessary classes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Loading the data from csv file to a Pandas DataFrame
data = pd.read_csv("CreditCardFraud.csv")
data

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,100000,1,2,1,29,0,0,0,0,-1,...,-2618,95748,101299,3320,5000,0,100000,7186,0,0
997,200000,2,2,1,28,0,0,0,0,0,...,97041,103541,3632,5000,2000,89000,6500,91,1504,0
998,90000,2,2,1,40,-1,-1,-1,-1,-1,...,657,1332,780,0,2806,2256,2274,780,0,0
999,360000,1,1,2,36,1,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Printing the first 20 rows of the dataframe
data.head(20)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0
5,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
6,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
7,260000,2,1,2,51,-1,-1,-1,-1,-1,...,8517,22287,13668,21818,9966,8583,22301,0,3640,0
8,630000,2,2,2,41,-1,0,-1,-1,-1,...,6500,6500,2870,1000,6500,6500,6500,2870,0,0
9,70000,1,2,2,30,1,2,2,0,0,...,66782,36137,36894,3200,0,3000,3000,1500,0,1


In [4]:
# Number of rows and columns in the dataframe
data.shape

(1001, 24)

In [5]:
# Name of all columns in the dataframe
data.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [6]:
# Getting more information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   LIMIT_BAL                   1001 non-null   int64
 1   SEX                         1001 non-null   int64
 2   EDUCATION                   1001 non-null   int64
 3   MARRIAGE                    1001 non-null   int64
 4   AGE                         1001 non-null   int64
 5   PAY_0                       1001 non-null   int64
 6   PAY_2                       1001 non-null   int64
 7   PAY_3                       1001 non-null   int64
 8   PAY_4                       1001 non-null   int64
 9   PAY_5                       1001 non-null   int64
 10  PAY_6                       1001 non-null   int64
 11  BILL_AMT1                   1001 non-null   int64
 12  BILL_AMT2                   1001 non-null   int64
 13  BILL_AMT3                   1001 non-null   int64
 14  BILL_AMT

In [7]:
# checking for missing values in each column
data.isnull().sum()

LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [8]:
# Getting some statistical measures about the data
data.describe()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
count,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,...,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0,1001.0
mean,167532.467532,1.589411,1.776224,1.604396,34.945055,-0.004995,-0.161838,-0.164835,-0.283716,-0.283716,...,40748.408591,39078.666334,38012.011988,5382.33966,5051.400599,4176.14985,4671.488511,5331.04995,5090.704296,0.213786
std,130587.92132,0.492187,0.750916,0.532298,9.21976,1.173446,1.228732,1.262459,1.184662,1.170224,...,68206.92951,63108.238729,63074.415024,12180.755275,15626.153184,10514.647502,13269.943983,16812.536877,23658.888052,0.410183
min,10000.0,1.0,1.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,...,-3684.0,-28335.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1423.0,1206.0,830.0,1000.0,390.0,228.0,148.0,189.0,0.0,0.0
50%,140000.0,2.0,2.0,2.0,33.0,0.0,0.0,0.0,0.0,0.0,...,17710.0,17580.0,15846.0,2184.0,1710.0,1206.0,1398.0,1306.0,1250.0,0.0
75%,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,...,48851.0,46404.0,46557.0,5090.0,4500.0,3720.0,4000.0,3745.0,3784.0,0.0
max,700000.0,2.0,6.0,3.0,75.0,8.0,7.0,7.0,7.0,7.0,...,628699.0,484612.0,473944.0,199646.0,285138.0,133657.0,188840.0,195599.0,528666.0,1.0


In [9]:
# Grouping the data based on the target variable
data.groupby('default payment next month').mean()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
default payment next month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,170127.064803,1.602287,1.763659,1.60864,34.687421,-0.174079,-0.265565,-0.287166,-0.383736,-0.382465,...,44411.481576,40364.139771,38420.895807,37769.662008,5939.581957,5607.763659,4526.462516,5086.377382,5741.191868,5140.388818
1,157990.654206,1.542056,1.82243,1.588785,35.892523,0.616822,0.219626,0.285047,0.084112,0.079439,...,46816.691589,42161.584112,41497.663551,38903.271028,3333.042056,3005.336449,2887.850467,3145.705607,3822.724299,4907.985981


In [10]:
# distribution of target Variable
data['default payment next month'].value_counts()

0    787
1    214
Name: default payment next month, dtype: int64

In [11]:
pip install pandas-profiling

Note: you may need to restart the kernel to use updated packages.


In [12]:
from pandas_profiling import ProfileReport

  from pandas_profiling import ProfileReport


In [13]:
# Exploratory Data Analysis using Pandas Profiling 
profile = ProfileReport(data, title = "Pandas Profile Report")

In [14]:
# Summarizing the data based on the target variable
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [15]:
profile.to_file(output_file='Credit_Card.html')

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
x = data.drop(labels = ['default payment next month'],axis = 1)
x

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,50000,1,2,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
1,50000,1,1,2,37,0,0,0,0,0,...,57608,19394,19619,20024,2500,1815,657,1000,1000,800
2,500000,1,1,2,29,0,0,0,0,0,...,445007,542653,483003,473944,55000,40000,38000,20239,13750,13770
3,100000,2,2,2,23,0,-1,-1,0,0,...,601,221,-159,567,380,601,0,581,1687,1542
4,140000,2,3,1,28,0,0,2,0,0,...,12108,12211,11793,3719,3329,0,432,1000,1000,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,100000,1,2,1,29,0,0,0,0,-1,...,67782,-2618,95748,101299,3320,5000,0,100000,7186,0
997,200000,2,2,1,28,0,0,0,0,0,...,8441,97041,103541,3632,5000,2000,89000,6500,91,1504
998,90000,2,2,1,40,-1,-1,-1,-1,-1,...,1114,657,1332,780,0,2806,2256,2274,780,0
999,360000,1,1,2,36,1,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y = data['default payment next month']
y

0       0
1       0
2       0
3       0
4       0
       ..
996     0
997     0
998     0
999     1
1000    1
Name: default payment next month, Length: 1001, dtype: int64

In [18]:
#Splitting the data to training data & Test data
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,random_state = 50)

In [19]:
print(x.shape, x_train.shape, x_test.shape)

(1001, 23) (750, 23) (251, 23)


In [20]:
#preprocessing steps
train_scaler=StandardScaler()
test_scaler=StandardScaler()

In [21]:
#Data Standardization
scaled_train_data = train_scaler.fit_transform(x_train)
scaled_test_data = train_scaler.transform(x_test)

In [39]:
# Model Training using Decision Tree Classifier
model = DecisionTreeClassifier()

In [40]:
# training the model with training data
model.fit(x_train, y_train)
predictions = model.predict(scaled_train_data)



In [41]:
# accuracy score on training data
accuracy_score(y_train,predictions)

0.796

In [38]:
# Hyper-parameter tuning to find the best parameters

In [27]:
model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [28]:
param_dist = {
    "criterion":["gini","entropy"],
    "max_depth":[1,2,3,4,5,6,None],
    "splitter":["best","random"],
    "min_samples_split":[1,2,3,4,5,6,None],
    "min_samples_leaf":[1,2,3,4,5,6,None]
}

In [29]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(model,param_grid=param_dist, cv=5,verbose=3)

In [30]:
grid.fit(x_train,y_train)

Fitting 5 folds for each of 1372 candidates, totalling 6860 fits
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=best;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=random;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, min_samples_leaf=1, min_samples_split=1, splitter=random;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_de

2660 fits failed out of a total of 6860.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
840 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sapta\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sapta\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\sapta\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

-------------------------------------------------------------------------

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, None],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, None],
                         'min_samples_split': [1, 2, 3, 4, 5, 6, None],
                         'splitter': ['best', 'random']},
             verbose=3)

In [31]:
grid.best_estimator_

DecisionTreeClassifier(max_depth=4, min_samples_leaf=6, min_samples_split=6,
                       splitter='random')

In [32]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'min_samples_leaf': 6,
 'min_samples_split': 6,
 'splitter': 'random'}

In [42]:
model1 = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_leaf=3,min_samples_split=4, splitter='random')
model1.fit(scaled_train_data,y_train)
y_pred = model1.predict(scaled_test_data)
accuracy_score(y_test, y_pred)*100

82.07171314741036

In [34]:
# Predictive Modeling with random readings from the user based array

In [35]:
# Checking head values that may be used later for predictions
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
1,50000,1,1,2,37,0,0,0,0,0,...,19394,19619,20024,2500,1815,657,1000,1000,800,0
2,500000,1,1,2,29,0,0,0,0,0,...,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
3,100000,2,2,2,23,0,-1,-1,0,0,...,221,-159,567,380,601,0,581,1687,1542,0
4,140000,2,3,1,28,0,0,2,0,0,...,12211,11793,3719,3329,0,432,1000,1000,1000,0


In [36]:
input_data = np.array([[55000,1,2,1,59,-1,0,-1,0,0,0,8100,
                                        6000,35000,21000,20000,20000,2000,
                                        37000,11000,9000,700,700]])
# standardize the data
std_data = train_scaler.fit_transform(input_data)

prediction = model1.predict(std_data)
print(prediction)


if (prediction == 0):
  print("The person is not a credit card defaulter")

else:
  print("The person is a credit card defaulter")

[0]
The person is not a credit card defaulter
