##  CREDIT CARD FRAUD DETECTION

### 1: Import the required libraries

In [76]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### 2: Load Dataset 

In [77]:
# Read the transactions CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [78]:
# (rows, columns) of the loaded frame.
data.shape

(284807, 31)

In [79]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

### Observation:
- The data contains 284807 rows and 31 columns.
- `Class` is the target variable.

In [80]:
# Share of each class, expressed as a percentage of all rows.
data['Class'].value_counts(normalize=True).mul(100)

0    99.827251
1     0.172749
Name: Class, dtype: float64

### Observation:
1. 0 represents a genuine transaction
2. 1 represents a fraudulent transaction
3. We can see that 99.83% of the transactions are genuine and only 0.17% are fraudulent, so the classes are heavily imbalanced.

### 3: Separate X and Y

In [81]:
# Target is the `Class` label; features are every remaining column.
y = data['Class']
x = data.drop(columns='Class')

### 4: Split the dataset into train and test set

In [82]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; with so few fraud rows, consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

### 5: Apply Logistic Regression algorithm

In [83]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier constructed with no arguments (library defaults).
lr = LogisticRegression()
lr

LogisticRegression()

In [84]:
# Fit the classifier on the training split of the original (imbalanced) data.
lr.fit(X=x_train, y=y_train)

LogisticRegression()

### 6: Prediction

In [85]:
# Predicted class labels for the held-out rows.
y_pred = lr.predict(X=x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### 7: Evaluations

In [86]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, f1_score, precision_score, average_precision_score,  recall_score,  precision_recall_curve

In [87]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9989817773252344

In [88]:
def print_scores(y_test, y_pred):
    """Print balanced accuracy, F1, precision and recall to two decimals.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.
    """
    # (label, metric) pairs in print order; labels are padded so the colons line up.
    report = (
        ('Balanced  ', balanced_accuracy_score),
        ('F1        ', f1_score),
        ('Precision ', precision_score),
        ('Recall    ', recall_score),
    )
    for label, metric in report:
        print(f'{label}:{metric(y_test, y_pred):.2f}')


print_scores(y_test, y_pred)

Balanced  :0.84
F1        :0.70
Precision :0.73
Recall    :0.67


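### Conclusion:
- The accuracy obtained by applying Logistic Regression to the class-imbalanced data is 99.90%.
- This accuracy is misleading: 99.83% of the transactions are genuine, so a model that always predicts "genuine" would score almost as high, while recall on the fraud class is only 0.67.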

##### Dealing with Class Imbalance
1. Random Oversampling the Minority Class: Increasing the rows of the minority class.

In [89]:
# Absolute number of rows per class.
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

- Here the minority class is 1 (fraud transactions). In random oversampling we resample the minority class, with replacement, until it has as many rows as the majority class.

#### Step 1: Separate the data for the minority and majority classes

In [90]:
# Rows labelled 1 (fraud) form the minority frame; rows labelled 0 form the majority frame.
df_minority = data.loc[data['Class'].eq(1)]
df_majority = data.loc[data['Class'].eq(0)]

#### Step 2: Oversample the minority dataframe

In [91]:
from sklearn.utils import resample

# Draw fraud rows with replacement until the minority class has as many rows as
# the majority class. The target size is taken from df_majority rather than a
# hard-coded count, so it stays correct if the underlying data changes.
df_minority_oversampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=0,
)
df_minority_oversampled.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
68633,53076.0,1.296231,0.417447,0.193963,0.901644,0.130531,-0.371634,0.158126,-0.202669,-0.079512,...,-0.112114,-0.220002,-0.121022,-0.440454,0.67154,-0.413518,0.032838,0.0206,1.18,1
10897,18690.0,-15.398845,7.472324,-19.026912,11.165526,-6.893856,-2.120937,-14.91333,-0.721214,-7.175097,...,-2.444884,0.727495,-0.345078,-0.981749,0.995271,0.816762,2.262942,-1.178063,1.0,1
42674,41194.0,-7.896886,5.38102,-8.451162,7.963928,-7.862419,-2.37682,-11.949723,5.051356,-6.912076,...,2.557944,0.926278,0.032795,0.638073,0.361887,0.444577,1.101923,0.205958,1.52,1
79536,58067.0,-0.264869,3.38614,-3.454997,4.367629,3.33606,-2.053918,0.25689,-2.957235,-2.855797,...,-1.394504,-0.166029,-1.452081,-0.251815,1.243461,0.452787,0.132218,0.424599,1.0,1
151462,95559.0,-16.30865,11.614801,-19.739386,10.463866,-12.599146,-1.202393,-23.380508,-5.781133,-7.811022,...,-4.884983,1.14091,1.392953,0.348997,-2.16751,-0.798754,-2.942775,-0.46268,1.63,1


In [92]:
# Shape of the oversampled fraud frame (compare with df_majority.shape).
df_minority_oversampled.shape

(284315, 31)

In [93]:
# Shape of the untouched majority (genuine) frame.
df_majority.shape

(284315, 31)

In [94]:
# Stack the oversampled fraud rows on top of the untouched majority rows.
data_oversampled = pd.concat(objs=[df_minority_oversampled, df_majority])
data_oversampled.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
68633,53076.0,1.296231,0.417447,0.193963,0.901644,0.130531,-0.371634,0.158126,-0.202669,-0.079512,...,-0.112114,-0.220002,-0.121022,-0.440454,0.67154,-0.413518,0.032838,0.0206,1.18,1
10897,18690.0,-15.398845,7.472324,-19.026912,11.165526,-6.893856,-2.120937,-14.91333,-0.721214,-7.175097,...,-2.444884,0.727495,-0.345078,-0.981749,0.995271,0.816762,2.262942,-1.178063,1.0,1
42674,41194.0,-7.896886,5.38102,-8.451162,7.963928,-7.862419,-2.37682,-11.949723,5.051356,-6.912076,...,2.557944,0.926278,0.032795,0.638073,0.361887,0.444577,1.101923,0.205958,1.52,1
79536,58067.0,-0.264869,3.38614,-3.454997,4.367629,3.33606,-2.053918,0.25689,-2.957235,-2.855797,...,-1.394504,-0.166029,-1.452081,-0.251815,1.243461,0.452787,0.132218,0.424599,1.0,1
151462,95559.0,-16.30865,11.614801,-19.739386,10.463866,-12.599146,-1.202393,-23.380508,-5.781133,-7.811022,...,-4.884983,1.14091,1.392953,0.348997,-2.16751,-0.798754,-2.942775,-0.46268,1.63,1


In [95]:
# (rows, columns) of the combined, class-balanced frame.
data_oversampled.shape

(568630, 31)

- The dataset now contains 568630 rows and 31 columns.

#### Separate x and y

In [96]:
# Rebind x / y to the oversampled data: label column as target, everything else as features.
y = data_oversampled['Class']
x = data_oversampled.drop(columns='Class')

#### split data into train test set

In [97]:
from sklearn.model_selection import train_test_split

# 80/20 split of the oversampled data with a fixed seed.
# NOTE(review): this split happens after oversampling, so copies of the same
# fraud row can land in both the training and the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [98]:
# Re-fit the existing `lr` estimator on the oversampled training split.
lr.fit(X=x_train, y=y_train)

LogisticRegression()

##### Prediction

In [99]:
# Predicted class labels for the held-out part of the oversampled data.
y_pred_os = lr.predict(X=x_test)
y_pred_os

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

#### Accuracy

In [100]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_os)

0.9448850746531137

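- After oversampling we got 94.49% accuracy.
- Note: the oversampling was applied before the train/test split, so copies of the same fraud rows can appear in both the training and the test set; these scores are therefore likely optimistic.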

In [101]:
def print_scores(y_test, y_pred_os):
    """Print accuracy, balanced accuracy, F1, precision and recall to two decimals.

    Parameters
    ----------
    y_test : true labels.
    y_pred_os : predicted labels, aligned with ``y_test``.
    """
    # (label, metric) pairs in print order; labels are padded so the colons line up.
    report = (
        ('Accuracy  ', accuracy_score),
        ('Balanced  ', balanced_accuracy_score),
        ('F1        ', f1_score),
        ('Precision ', precision_score),
        ('Recall    ', recall_score),
    )
    for label, metric in report:
        print(f'{label}:{metric(y_test, y_pred_os):.2f}')


print_scores(y_test, y_pred_os)

Accuracy  :0.94
Balanced  :0.94
F1        :0.94
Precision :0.96
Recall    :0.92
