# Importing dataset and preprocessing

In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('C:\\Users\SHAMBHURAJ\Downloads\Fraud.csv')

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

In [3]:
df.head(4)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0


In [4]:
df.columns


Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [5]:
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

#### There are no null values in the dataset
#### For this dataset there are no outliers as these values are of bank balance

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1048575 non-null  int64  
 1   type            1048575 non-null  object 
 2   amount          1048575 non-null  float64
 3   nameOrig        1048575 non-null  object 
 4   oldbalanceOrg   1048575 non-null  float64
 5   newbalanceOrig  1048575 non-null  float64
 6   nameDest        1048575 non-null  object 
 7   oldbalanceDest  1048575 non-null  float64
 8   newbalanceDest  1048575 non-null  float64
 9   isFraud         1048575 non-null  int64  
 10  isFlaggedFraud  1048575 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 88.0+ MB


In [7]:
df['type'].value_counts()

CASH_OUT    373641
PAYMENT     353873
CASH_IN     227130
TRANSFER     86753
DEBIT         7178
Name: type, dtype: int64

#### here we can see there are 5 types of transactions in our dataset
##### we need to decode it

In [38]:
df_1 = df.copy(deep = True)

In [39]:
df_1['type'] = df_1['type'].map({'CASH_OUT':1,'PAYMENT':2,'CASH_IN':3,'TRANSFER':4,'DEBIT':5})

In [40]:
df_1['isFlaggedFraud'].value_counts()

0    1048575
Name: isFlaggedFraud, dtype: int64

In [41]:
df_1['isFraud'].value_counts()

0    1047433
1       1142
Name: isFraud, dtype: int64

#### Here we can see for feature the distribution for 'isFlaggedFraud' and 'isFraud'
#### this feature is totally imbalanced and that's why we removing it

In [42]:
df_1 = df_1.drop(columns = ['nameOrig','nameDest','isFraud','isFlaggedFraud'])

In [43]:
df_1.head(3)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1,2,9839.64,170136.0,160296.36,0.0,0.0
1,1,2,1864.28,21249.0,19384.72,0.0,0.0
2,1,4,181.0,181.0,0.0,0.0,0.0


In [44]:
#calculating VIF for multi-collinearity between variale's
from statsmodels.stats.outliers_influence import variance_inflation_factor
def cal_vif_fact(x):
    
    #calculating vif
    vif = pd.DataFrame()
    vif['variables'] = x.columns
    vif['VIF'] = [variance_inflation_factor(x.values,i) for i in range(x.shape[1])]
    
    return(vif)

In [45]:
cal_vif_fact(df_1)

Unnamed: 0,variables,VIF
0,step,2.652199
1,type,3.608318
2,amount,2.098141
3,oldbalanceOrg,645.126815
4,newbalanceOrig,651.862813
5,oldbalanceDest,37.642258
6,newbalanceDest,40.330536


In [36]:
df_1 = df_1.drop(columns=['oldbalanceDest'])

In [37]:
cal_vif_fact(df_1)

Unnamed: 0,variables,VIF
0,step,2.643058
1,type,3.60805
2,amount,1.710765
3,oldbalanceOrg,620.371955
4,newbalanceOrig,625.310091
5,newbalanceDest,1.330794


In [31]:
df_1['old'] = df_1['oldbalanceOrg'] * df_1['oldbalanceDest']

In [32]:
df_1['new'] = df_1['newbalanceOrig'] * df_1['newbalanceDest']

In [33]:
df_1 = df_1.drop(columns = ['oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'])

In [37]:
cal_vif_fact(df_1)

Unnamed: 0,variables,VIF
0,step,0.656036
1,type,0.512352
2,amount,1.039609
3,old,198.338256
4,new,198.188347
