### **Importing the required packages**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

### **Reading and Exploring the Data**

In [None]:
# Load the transactions table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Preview the first five rows to get a feel for the columns.
data.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,Target
0,0.114697,0.796303,-0.149553,-0.823011,0.878763,-0.553152,0.939259,-0.108502,0.111137,-0.390521,-1.949546,-0.494436,-0.353696,0.158729,-0.267239,0.234802,-0.754936,-0.343012,0.312175,-0.042711,-0.335776,-0.807853,-0.05594,-1.025281,-0.369557,0.204653,0.242724,0.085713,0.89,0
1,-0.039318,0.495784,-0.810884,0.546693,1.986257,4.386342,-1.344891,-1.743736,-0.563103,-0.616315,-0.587786,0.317419,-0.408521,0.719639,0.226926,-0.296316,-0.040143,0.119177,1.057682,0.926255,-1.377003,-0.0722,-0.197573,1.014807,1.011293,-0.167684,0.113136,0.256836,85.0,0
2,2.275706,-1.531508,-1.021969,-1.602152,-1.220329,-0.462376,-1.196485,-0.147058,-0.950224,1.560463,-1.753256,-1.33101,-0.061941,-0.405532,0.048083,-0.307503,0.289363,0.189739,0.022546,-0.408289,-0.193271,-0.103533,0.150945,-0.811083,-0.197913,-0.128446,0.014197,-0.051289,42.7,0
3,1.940137,-0.357671,-1.210551,0.382523,0.050823,-0.171322,-0.109124,-0.002115,0.869258,-0.001965,0.607629,1.048673,-0.514821,0.329538,-1.041463,-0.498797,-0.276887,0.114245,0.379447,-0.19928,0.157994,0.650355,0.034206,0.739535,0.223605,-0.195509,-0.012791,-0.056841,29.99,0
4,1.081395,-0.502615,1.075887,-0.543359,-1.472946,-1.065484,-0.443231,-0.143374,1.659826,-1.131238,0.173132,1.430172,0.915609,-0.336588,1.140171,-0.653626,-0.016567,0.066287,0.242537,0.05988,0.224157,0.821209,-0.137223,0.986259,0.563228,-0.574206,0.089673,0.052036,68.0,0


In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

(56962, 30)

In [None]:
# Count the missing values in every column.
data.isna().sum()

Unnamed: 0,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0
V10,0


In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep='first').sum()

np.int64(675)

In [None]:
# Show the rows flagged as repeats of an earlier row.
data.loc[data.duplicated()]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,Target
1181,2.010213,0.063667,-1.620606,0.341472,0.368741,-0.586677,0.034489,-0.043750,0.303938,-0.201902,1.246987,0.621225,-0.606263,-0.521835,-0.400476,0.428054,0.375451,0.108121,0.206974,-0.192159,-0.301625,-0.820824,0.362729,0.595190,-0.345739,0.150098,-0.066713,-0.040558,1.79,0
1936,1.302378,-0.606529,-0.681986,-1.904603,1.326623,3.436312,-1.145127,0.959147,1.671302,-1.022946,-0.191423,0.631027,0.031907,-0.031425,1.446627,-0.121820,-0.651405,0.617970,0.927600,0.005757,-0.064208,-0.080587,-0.072991,1.018136,0.663575,-0.671323,0.096801,0.028697,1.00,0
2530,2.055797,-0.326668,-2.752041,-0.842316,2.463072,3.173856,-0.432126,0.727706,0.608606,-0.075186,0.063504,0.350564,-0.141238,0.690972,1.275257,-0.371962,-0.601957,-0.052640,-0.330590,-0.180370,0.269765,0.844627,0.020675,0.726212,0.366624,-0.398828,0.027735,-0.060282,1.00,0
2878,1.076018,-0.126284,1.320255,1.154681,-0.892714,0.356662,-0.792107,0.396302,0.630048,0.007419,1.275818,0.718550,-1.013817,0.184727,0.265956,0.161364,-0.249229,0.170324,-0.500635,-0.231074,0.071098,0.320206,0.055667,0.204490,0.230282,-0.396762,0.077135,0.024214,1.00,0
3301,1.109985,0.368032,-0.061407,1.376844,0.070437,-1.100573,0.610397,-0.487201,0.920589,-0.380884,0.667781,-2.023532,2.151358,1.919033,-0.098700,-0.256589,0.392087,-0.138442,-0.485983,0.013662,-0.088583,-0.126343,-0.217999,0.358122,0.814626,-0.345815,-0.048111,0.020956,89.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56809,1.886717,-0.517305,-1.351317,-0.141112,0.586967,1.052636,-0.330743,0.353181,0.936002,-0.165864,0.739023,1.066571,-0.573594,0.431464,-0.123380,-0.981977,0.272484,-1.240351,-0.187481,-0.305305,-0.071110,0.074763,0.201228,-1.628444,-0.245710,0.023253,0.014937,-0.081462,20.00,0
56830,1.284143,0.462738,-0.371277,0.825644,0.464456,-0.466731,0.459673,-0.186236,-0.549594,0.204381,0.787650,0.882733,0.417111,0.642421,-0.010537,0.161756,-0.835325,0.301955,0.293297,-0.098584,0.013440,0.084371,-0.246710,-0.325573,0.927623,-0.272459,-0.009998,-0.005489,0.89,0
56865,1.018412,1.036663,-1.689814,1.315476,1.698436,0.528807,0.331715,0.364539,-0.711798,-1.570288,3.463018,0.538411,-0.378095,-3.045495,1.468911,-0.029742,3.664588,-0.105190,-2.097944,-0.167555,-0.040238,0.096172,-0.092549,-1.345664,0.510305,-0.182674,0.107058,0.071818,0.89,0
56893,2.060160,0.018599,-1.072853,0.381576,0.018414,-1.063353,0.240911,-0.365617,0.382032,0.033555,-0.570247,1.007335,1.147933,0.062697,-0.025528,-0.193756,-0.318779,-1.067639,0.093688,-0.160514,-0.275957,-0.600087,0.325765,-0.054364,-0.267467,0.201880,-0.060525,-0.058989,1.29,0


In [None]:
# Drop the repeated rows in place, keeping the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Count the rows per class to see how skewed the target is.
data.loc[:, 'Target'].value_counts()

Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
0,56189
1,98


In [None]:
# Share of majority-class rows (Target == 0), computed from the frame itself
# so the figure stays correct if the data or the de-duplication step changes.
float((data['Target'] == 0).mean())

0.9982589230195249

In [None]:
# Share of minority-class rows (Target == 1), computed from the frame itself
# so the figure stays correct if the data or the de-duplication step changes.
float((data['Target'] == 1).mean())

0.0017410769804750653

### **To do for imbalanced data**:

1. Oversampling (SMOTE).
2. Undersampling (RandomUnderSampler).
3. Use Tree-Based Algorithms.

### **Machine Learning Process**

In [None]:
# Separate the label column from the feature columns.
y = data['Target']
X = data.drop(columns='Target')

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10000,
    stratify=y,
)

#### **Apply Random Forest Classifier on the data**

In [None]:
# Seed the forest so its bootstrap samples and feature subsets, and therefore
# the metrics computed from it, are reproducible from run to run.
rfc = RandomForestClassifier(random_state=10000)
rfc.fit(X_train, y_train)

In [None]:
# Hard 0/1 class predictions for the held-out rows.
y_pred = rfc.predict(X=X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9991117427607035

#### Note: Accuracy is misleading on imbalanced data — always predicting the majority class would already score about 99.8% here. Use metrics such as roc_auc_score, precision and recall instead.

In [None]:
# ROC AUC measures how well the model *ranks* positives above negatives, so it
# needs a continuous score: the predicted probability of the positive class
# (column 1 of predict_proba). Hard 0/1 labels collapse the ROC curve to a
# single threshold and understate the AUC.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

np.float64(0.7999110161950526)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[11236,     2],
       [    8,    12]])

In [None]:
# Keep the confusion matrix so its individual cells can be pulled out.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Unpack the 2x2 matrix row by row: first row is true class 0, second row is true class 1.
(TN, FP), (FN, TP) = cm

In [None]:
# Print the four cells in the order: true negatives, true positives, false negatives, false positives.
print(f"{TN} {TP} {FN} {FP}")

11236 12 8 2


In [None]:
# Precision: of the rows predicted positive, the fraction that really are positive.
precision_score(y_true=y_test, y_pred=y_pred)

0.8571428571428571

In [None]:
# Recall: of the rows that really are positive, the fraction the model caught.
recall_score(y_true=y_test, y_pred=y_pred)

0.6