# Preprocess mini_fraud

<img src="https://i.imgur.com/4mlit9j.png" width=700 align="center">

### Import libraries and data

In [1]:
# Third-party libraries for the notebook. In the cells visible here only pandas
# (pd) and matplotlib.pyplot (plt) are referenced; numpy, seaborn and Axes3D are
# imported but not used by any visible cell.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 
import warnings
# Silence every Python warning for the rest of the session. This also hides
# library deprecation notices, so remove it when debugging unexpected results.
warnings.filterwarnings('ignore')
from mpl_toolkits.mplot3d import Axes3D
# Reset matplotlib to its built-in defaults, enable inline rendering, then
# apply the ggplot style on top of the clean defaults.
plt.rcParams.update(plt.rcParamsDefault)
%matplotlib inline
plt.style.use('ggplot')

In [2]:
# Load the raw transaction log; the path is relative to the notebook's working
# directory.
# NOTE(review): the loaded frame has 1,048,575 rows (see fraud_df.shape below),
# which equals Excel's 1,048,576-row sheet limit minus one header row. The CSV
# may be a truncated spreadsheet export rather than the full dataset - confirm
# against the original source.
fraud_df = pd.read_csv("Fraud.csv")

In [3]:
# Render floats with two decimal places in every DataFrame/Series display.
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
# Preview the first five rows to check the column layout of the raw data.
fraud_df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
fraud_df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,26.97,158666.98,874009.54,893808.9,978160.05,1114197.97,0.0,0.0
std,15.62,264940.93,2971750.56,3008271.33,2296780.39,2416593.12,0.03,0.0
min,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,12149.07,0.0,0.0,0.0,0.0,0.0,0.0
50%,20.0,76343.33,16002.0,0.0,126377.21,218260.36,0.0,0.0
75%,39.0,213761.89,136642.02,174599.99,915923.47,1149807.51,0.0,0.0
max,95.0,10000000.0,38900000.0,38900000.0,42100000.0,42200000.0,1.0,0.0


In [6]:
# (rows, columns) of the frame as loaded.
fraud_df.shape

(1048575, 11)

In [7]:
# isFlaggedFraud is 0 on every row of this extract (its min and max are both 0
# in the describe() output), so it carries no information and is removed.
fraud_df = fraud_df.drop(columns='isFlaggedFraud')

In [8]:
# Helper for the same_value feature: flags whether two values are equal.
def same_value(amt, oldbal):
    """Return 1 if ``amt`` compares equal to ``oldbal``, otherwise 0.

    The comparison is a plain ``==``, so NaN never matches anything
    (including another NaN).
    """
    return 1 if amt == oldbal else 0

# same_value = 1 where the transferred amount equals the originator's balance
# before the transaction, else 0.
# This is the vectorised equivalent of calling same_value(amount, oldbalanceOrg)
# on every row: a single column-wise comparison instead of a Python-level
# apply(axis=1) loop over ~1M rows. NaN compares unequal, exactly as in the
# helper, and the explicit int64 cast matches the dtype the row-wise apply
# produced.
# NOTE(review): this flag correlates 0.99 with isFraud in the correlation table
# further down - confirm that such a dominant feature is intended for modelling.
fraud_df['same_value'] = (fraud_df['amount'] == fraud_df['oldbalanceOrg']).astype('int64')

In [9]:
# Reduce a nameDest account id to its leading character so it can be encoded
# as a small categorical feature.
def nameDest_prefix(name):
    """Return the first character of ``name`` ('' when ``name`` is empty)."""
    return name[0:1]
  
# One-hot encode the first character of nameDest (the rendered output shows the
# two resulting indicator columns, C and M).
# `.str[:1]` is the vectorised form of nameDest_prefix; it avoids a Python-level
# apply and the temporary 'prefix' column that was added only to be dropped again.
one_hot = pd.get_dummies(fraud_df['nameDest'].str[:1], dtype=int)
# Replace the raw id column with its indicator columns (joined on the row index).
fraud_df = fraud_df.drop(columns=['nameDest']).join(one_hot)
# Show a short preview only; dumping hundreds of rows bloats the notebook.
fraud_df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,0.00,0.00,0,0,0,1
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,0.00,0.00,0,0,0,1
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,0.00,0.00,1,1,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,21182.00,0.00,1,1,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,0.00,0.00,0,0,0,1
5,1,PAYMENT,7817.71,C90045638,53860.00,46042.29,0.00,0.00,0,0,0,1
6,1,PAYMENT,7107.77,C154988899,183195.00,176087.23,0.00,0.00,0,0,0,1
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,0.00,0.00,0,0,0,1
8,1,PAYMENT,4024.36,C1265012928,2671.00,0.00,0.00,0.00,0,0,0,1
9,1,DEBIT,5337.77,C712410124,41720.00,36382.23,41898.00,40348.79,0,0,1,0


In [10]:
# Column dtypes at this point; `type` and `nameOrig` are still object (string) columns.
fraud_df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
same_value          int64
C                   int32
M                   int32
dtype: object

In [11]:
# True if at least one cell anywhere in the frame is missing (NaN/None).
fraud_df.isna().to_numpy().any()

False

In [12]:
# (rows, columns) after the feature edits above; the row count should be unchanged.
fraud_df.shape

(1048575, 12)

In [13]:
# Number of distinct values per column, to gauge the cardinality of each feature.
fraud_df.nunique()

step                   95
type                    5
amount            1009606
nameOrig          1048317
oldbalanceOrg      391033
newbalanceOrig     440792
oldbalanceDest     590110
newbalanceDest     437054
isFraud                 2
same_value              2
C                       2
M                       2
dtype: int64

In [14]:
## Label encoding of type feature
# (Disabled) Alternative to the one-hot encoding in the next cell: collapse
# `type` into a single binary column, type_label, that is 1 for CASH_OUT or
# TRANSFER and 0 for every other transaction type. Kept for reference only;
# nothing in this cell executes.
# def label_encoding(type):
#   if (type == "CASH_OUT"):
#     return 1
#   elif (type == "TRANSFER"):
#     return 1
#   else:
#     return 0

# fraud_df['type_label'] = fraud_df['type'].apply(label_encoding)
# pd.set_option('display.max_rows', None)
# fraud_df = fraud_df.drop(columns=['type'])
# fraud_df.head()

In [15]:
# One-hot encoding of the `type` feature: one integer indicator column per
# distinct transaction type.
one_hot = pd.get_dummies(fraud_df['type'], dtype=int)
# Swap the original string column for its indicator columns (aligned on the index).
fraud_df = fraud_df.drop(columns='type').join(one_hot)
fraud_df.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,0.0,0.0,0,0,0,1,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,0.0,0.0,0,0,0,1,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,0.0,0.0,1,1,1,0,0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,21182.0,0.0,1,1,1,0,0,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,0.0,0.0,0,0,0,1,0,0,0,1,0


In [16]:
# Column dtypes after encoding `type`; nameOrig is the only remaining object column.
fraud_df.dtypes

step                int64
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
same_value          int64
C                   int32
M                   int32
CASH_IN             int32
CASH_OUT            int32
DEBIT               int32
PAYMENT             int32
TRANSFER            int32
dtype: object

In [17]:
# Collect every numeric column into one dataframe for modelling. In this frame
# that removes only the string identifier nameOrig (see the dtypes output above).
# The generic 'number' selector matches all numeric dtypes, so a column is not
# silently lost if its dtype falls outside a hand-written list (e.g. int8/uint8).
numerics = ['number']
numeric_df = fraud_df.select_dtypes(include=numerics)
numeric_df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,1,0,0,0,1,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1,0,0,0,1,0
2,1,181.0,181.0,0.0,0.0,0.0,1,1,1,0,0,0,0,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1,1,1,0,0,1,0,0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,1,0,0,0,1,0


In [18]:
# Pairwise correlation between all numeric features, rendered as a heat-mapped
# table rounded to two decimals.
numeric_df_corr = numeric_df.corr()
# axis=None with vmin/vmax fixes one colour scale over the whole table
# (-1 = deep blue, 0 = neutral, +1 = deep red). The default normalises each
# column separately, so equal colours would not mean equal correlations.
highlight_table = numeric_df_corr.round(2).style.background_gradient(
    cmap='coolwarm', axis=None, vmin=-1, vmax=1
)
# NOTE(review): in the rendered table M vs PAYMENT is 1.00 and C vs M is -1.00,
# i.e. these indicator columns look redundant - confirm whether all are needed.
highlight_table

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
step,1.0,-0.03,-0.01,-0.01,-0.0,-0.02,0.05,0.05,-0.02,0.02,-0.01,-0.01,-0.01,0.02,0.0
amount,-0.03,1.0,0.0,-0.0,0.22,0.31,0.13,0.12,0.4,-0.4,0.02,0.07,-0.05,-0.4,0.54
oldbalanceOrg,-0.01,0.0,1.0,1.0,0.09,0.06,0.0,0.0,0.19,-0.19,0.51,-0.2,-0.02,-0.19,-0.08
newbalanceOrig,-0.01,-0.0,1.0,1.0,0.1,0.06,-0.01,-0.01,0.19,-0.19,0.53,-0.21,-0.02,-0.19,-0.09
oldbalanceDest,-0.0,0.22,0.09,0.1,1.0,0.98,-0.01,-0.01,0.3,-0.3,0.11,0.13,0.01,-0.3,0.13
newbalanceDest,-0.02,0.31,0.06,0.06,0.98,1.0,-0.0,-0.0,0.33,-0.33,0.06,0.16,0.01,-0.33,0.2
isFraud,0.05,0.13,0.0,-0.01,-0.01,-0.0,1.0,0.99,0.02,-0.02,-0.02,0.01,-0.0,-0.02,0.05
same_value,0.05,0.12,0.0,-0.01,-0.01,-0.0,0.99,1.0,0.02,-0.02,-0.02,0.01,-0.0,-0.02,0.05
C,-0.02,0.4,0.19,0.19,0.3,0.33,0.02,0.02,1.0,-1.0,0.38,0.53,0.06,-1.0,0.21
M,0.02,-0.4,-0.19,-0.19,-0.3,-0.33,-0.02,-0.02,-1.0,1.0,-0.38,-0.53,-0.06,1.0,-0.21


In [19]:
# save df as csv
# Writes every column of numeric_df (which still includes same_value at this
# point) to the working directory, without the row index.
# NOTE(review): the file name looks like an experiment/configuration code; its
# meaning is not documented in this notebook - confirm.
numeric_df.to_csv('a2_b1_c1_d1.csv', index=False)

In [None]:
# Second export: the same feature set without the engineered same_value flag.
# errors='ignore' keeps the cell re-runnable: numeric_df is rebound here, so on
# a second execution the column is already gone and a plain drop would raise
# KeyError.
numeric_df = numeric_df.drop(columns=['same_value'], errors='ignore')
numeric_df.to_csv('a1_b2_c2.csv', index=False)