#### About
> Credit Card Fraud detection.

> Dataset Link - https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('/home/suraj/ClickUp/Jan-Feb/data/creditcard.csv')

In [3]:
#checking unique values of target variable
np.unique(df['Class'].values.tolist())


array([0, 1])

In [4]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [5]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

#### 1. Data Pre-processing

In [6]:
# Data cleaning

# Check for duplicates
df = df.drop_duplicates()
# Check for missing values
df = df.dropna()
df


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [7]:
#feature scaling

scaler = StandardScaler()
df[['Time', 'Amount'] + ['V{}'.format(i) for i in range(1, 29)]] = scaler.fit_transform(df[['Time', 'Amount'] + ['V{}'.format(i) for i in range(1, 29)]])


In [8]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.996823,-0.701082,-0.041687,1.680101,0.976623,-0.247020,0.348012,0.193700,0.084434,0.333534,...,-0.024777,0.383483,-0.177444,0.110157,0.247059,-0.392622,0.333033,-0.065850,0.244200,0
1,-1.996823,0.608792,0.164138,0.109279,0.318998,0.042258,-0.060980,-0.065656,0.072903,-0.231703,...,-0.311372,-0.881454,0.162081,-0.561503,0.321175,0.260854,-0.027154,0.043219,-0.342584,0
2,-1.996802,-0.700336,-0.811337,1.174270,0.270648,-0.366756,1.352655,0.643223,0.210788,-1.381169,...,0.343094,1.065068,1.457772,-1.138484,-0.628161,-0.288861,-0.144325,-0.183824,1.158900,0
3,-1.996802,-0.499064,-0.109972,1.187383,-0.608355,-0.008814,0.937245,0.192079,0.320843,-1.264664,...,-0.149093,0.007299,-0.305465,-1.941446,1.242487,-0.460694,0.154039,0.185687,0.139886,0
4,-1.996781,-0.597606,0.535539,1.025470,0.287092,-0.297036,0.072873,0.481517,-0.228725,0.747917,...,-0.012516,1.101780,-0.220709,0.232904,-0.394800,1.041677,0.550001,0.654234,-0.073813,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,1.642235,-6.102103,6.118855,-6.519873,-1.459282,-3.897079,-1.956335,-4.007632,6.196662,1.749010,...,0.295375,0.154412,1.626230,-0.841382,2.757072,0.518377,2.380049,2.509507,-0.350252,0
284803,1.642257,-0.379208,-0.030938,1.347812,-0.520175,0.629193,0.795504,0.018351,0.250814,0.535282,...,0.296413,1.275826,0.019665,-1.678330,-1.163409,-0.820253,0.168567,-0.164849,-0.254325,0
284804,1.642278,0.982354,-0.180433,-2.155033,-0.392355,1.908988,2.276699,-0.243249,0.601561,0.396215,...,0.321057,0.798074,-0.060444,1.056626,0.510299,-0.181557,0.006802,-0.082640,-0.082239,0
284805,1.642278,-0.126465,0.324660,0.464577,0.489870,-0.275808,0.469130,-0.560399,0.576734,0.359367,...,0.366919,1.104223,-0.262138,0.203081,-1.091530,1.133734,0.270523,0.317004,-0.313391,0


In [9]:
#feature engineering
df['Amount_Max'] = df.groupby(['Time'])['Amount'].transform('max')
df['Amount_Min'] = df.groupby(['Time'])['Amount'].transform('min')
df['Amount_Mean'] = df.groupby(['Time'])['Amount'].transform('mean')
df['Amount_Median'] = df.groupby(['Time'])['Amount'].transform('median')

df['V_sum'] = df[['V{}'.format(i) for i in range(1, 29)]].sum(axis=1)
df['V_mean'] = df[['V{}'.format(i) for i in range(1, 29)]].mean(axis=1)


In [10]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V27,V28,Amount,Class,Amount_Max,Amount_Min,Amount_Mean,Amount_Median,V_sum,V_mean
0,-1.996823,-0.701082,-0.041687,1.680101,0.976623,-0.247020,0.348012,0.193700,0.084434,0.333534,...,0.333033,-0.065850,0.244200,0,0.244200,-0.342584,-0.049192,-0.049192,2.802489,0.100089
1,-1.996823,0.608792,0.164138,0.109279,0.318998,0.042258,-0.060980,-0.065656,0.072903,-0.231703,...,-0.027154,0.043219,-0.342584,0,0.244200,-0.342584,-0.049192,-0.049192,3.401201,0.121471
2,-1.996802,-0.700336,-0.811337,1.174270,0.270648,-0.366756,1.352655,0.643223,0.210788,-1.381169,...,-0.144325,-0.183824,1.158900,0,1.158900,0.139886,0.649393,0.649393,0.619255,0.022116
3,-1.996802,-0.499064,-0.109972,1.187383,-0.608355,-0.008814,0.937245,0.192079,0.320843,-1.264664,...,0.154039,0.185687,0.139886,0,1.158900,0.139886,0.649393,0.649393,-3.163440,-0.112980
4,-1.996781,-0.597606,0.535539,1.025470,0.287092,-0.297036,0.072873,0.481517,-0.228725,0.747917,...,0.550001,0.654234,-0.073813,0,-0.073813,-0.338670,-0.206242,-0.206242,6.451518,0.230411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,1.642235,-6.102103,6.118855,-6.519873,-1.459282,-3.897079,-1.956335,-4.007632,6.196662,1.749010,...,2.380049,2.509507,-0.350252,0,-0.350252,-0.350252,-0.350252,-0.350252,13.202981,0.471535
284803,1.642257,-0.379208,-0.030938,1.347812,-0.520175,0.629193,0.795504,0.018351,0.250814,0.535282,...,0.168567,-0.164849,-0.254325,0,-0.254325,-0.254325,-0.254325,-0.254325,-1.898147,-0.067791
284804,1.642278,0.982354,-0.180433,-2.155033,-0.392355,1.908988,2.276699,-0.243249,0.601561,0.396215,...,0.006802,-0.082640,-0.082239,0,-0.082239,-0.313391,-0.197815,-0.197815,6.606631,0.235951
284805,1.642278,-0.126465,0.324660,0.464577,0.489870,-0.275808,0.469130,-0.560399,0.576734,0.359367,...,0.270523,0.317004,-0.313391,0,-0.082239,-0.313391,-0.197815,-0.197815,7.060626,0.252165


In [11]:
#dimensionality reduction
pca = PCA(n_components=10)
pca_values = pca.fit_transform(df[['Time', 'Amount'] + ['V{}'.format(i) for i in range(1, 29)]])
pca_df = pd.DataFrame(data=pca_values, columns=['PCA_{}'.format(i) for i in range(1, 11)])
df = pd.concat([df.drop(['Time', 'Amount'] + ['V{}'.format(i) for i in range(1, 29)], axis=1), pca_df], axis=1)


In [12]:
df

Unnamed: 0,Class,Amount_Max,Amount_Min,Amount_Mean,Amount_Median,V_sum,V_mean,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10
0,0.0,0.244200,-0.342584,-0.049192,-0.049192,2.802489,0.100089,0.406023,-2.415979,0.387907,0.133958,-0.083658,0.052912,-0.014510,-0.550987,0.458126,0.222192
1,0.0,0.244200,-0.342584,-0.049192,-0.049192,3.401201,0.121471,-0.379466,-2.066637,-0.301409,-0.041380,0.286663,0.048511,-0.573105,0.691282,0.145388,0.167701
2,0.0,1.158900,0.139886,0.649393,0.649393,0.619255,0.022116,1.919133,-2.385297,-0.583583,0.845605,-0.182472,-0.836814,0.108906,0.334235,-1.470322,-0.713704
3,0.0,1.158900,0.139886,0.649393,0.649393,-3.163440,-0.112980,0.310605,-1.661335,0.020578,-0.238946,1.281676,-0.318594,0.314453,-1.193661,-1.631180,-2.100156
4,0.0,-0.073813,-0.338670,-0.206242,-0.206242,6.451518,0.230411,-0.055219,-1.475902,1.368235,-0.068990,1.250952,0.285838,-0.651053,0.132903,0.739820,-0.525126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,0.0,-0.350252,-0.350252,-0.350252,-0.350252,13.202981,0.471535,,,,,,,,,,
284803,0.0,-0.254325,-0.254325,-0.254325,-0.254325,-1.898147,-0.067791,,,,,,,,,,
284804,0.0,-0.082239,-0.313391,-0.197815,-0.197815,6.606631,0.235951,,,,,,,,,,
284805,0.0,-0.082239,-0.313391,-0.197815,-0.197815,7.060626,0.252165,,,,,,,,,,


In [13]:
df.columns

Index(['Class', 'Amount_Max', 'Amount_Min', 'Amount_Mean', 'Amount_Median',
       'V_sum', 'V_mean', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6',
       'PCA_7', 'PCA_8', 'PCA_9', 'PCA_10'],
      dtype='object')

#### 2. Feature Engineering

In [14]:
#creating new features


df['Amount_Range'] = df['Amount_Max'] - df['Amount_Min']
df['V_Range'] = df['V_sum'] - df['V_mean']
df['PCA_Sum'] = df['PCA_1'] + df['PCA_2'] + df['PCA_3'] + df['PCA_4'] + df['PCA_5'] + df['PCA_6'] + df['PCA_7'] + df['PCA_8'] + df['PCA_9'] + df['PCA_10']



In [15]:
df

Unnamed: 0,Class,Amount_Max,Amount_Min,Amount_Mean,Amount_Median,V_sum,V_mean,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,PCA_6,PCA_7,PCA_8,PCA_9,PCA_10,Amount_Range,V_Range,PCA_Sum
0,0.0,0.244200,-0.342584,-0.049192,-0.049192,2.802489,0.100089,0.406023,-2.415979,0.387907,0.133958,-0.083658,0.052912,-0.014510,-0.550987,0.458126,0.222192,0.586784,2.702400,-1.404015
1,0.0,0.244200,-0.342584,-0.049192,-0.049192,3.401201,0.121471,-0.379466,-2.066637,-0.301409,-0.041380,0.286663,0.048511,-0.573105,0.691282,0.145388,0.167701,0.586784,3.279730,-2.022453
2,0.0,1.158900,0.139886,0.649393,0.649393,0.619255,0.022116,1.919133,-2.385297,-0.583583,0.845605,-0.182472,-0.836814,0.108906,0.334235,-1.470322,-0.713704,1.019014,0.597139,-2.964313
3,0.0,1.158900,0.139886,0.649393,0.649393,-3.163440,-0.112980,0.310605,-1.661335,0.020578,-0.238946,1.281676,-0.318594,0.314453,-1.193661,-1.631180,-2.100156,1.019014,-3.050460,-5.216561
4,0.0,-0.073813,-0.338670,-0.206242,-0.206242,6.451518,0.230411,-0.055219,-1.475902,1.368235,-0.068990,1.250952,0.285838,-0.651053,0.132903,0.739820,-0.525126,0.264857,6.221107,1.001460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,0.0,-0.350252,-0.350252,-0.350252,-0.350252,13.202981,0.471535,,,,,,,,,,,0.000000,12.731446,
284803,0.0,-0.254325,-0.254325,-0.254325,-0.254325,-1.898147,-0.067791,,,,,,,,,,,0.000000,-1.830356,
284804,0.0,-0.082239,-0.313391,-0.197815,-0.197815,6.606631,0.235951,,,,,,,,,,,0.231151,6.370680,
284805,0.0,-0.082239,-0.313391,-0.197815,-0.197815,7.060626,0.252165,,,,,,,,,,,0.231151,6.808461,
