# Preprocess mini_fraud

<img src="https://i.imgur.com/4mlit9j.png" width=700 align="center">

### Import libraries and data

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd 
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/matplotlib.
warnings.filterwarnings('ignore')
# NOTE(review): Axes3D is not referenced in the cells visible here - confirm it is still needed.
from mpl_toolkits.mplot3d import Axes3D
# Reset matplotlib's rcParams to the library defaults before applying a style.
plt.rcParams.update(plt.rcParamsDefault)
# Render figures inline in the notebook output.
%matplotlib inline
# Use the 'ggplot' style sheet for all subsequent plots.
plt.style.use('ggplot')

In [2]:
# Load the sample file that the rest of the notebook preprocesses.
# Alternative input (presumably the full dataset - confirm): pd.read_csv("Fraud.csv")
fraud_df = pd.read_csv("mini_fraud.csv")

In [3]:
# Show floats with two decimals in every rendered frame.
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
# Preview the first five rows of the loaded frame.
fraud_df.head()

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
3,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
4,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
fraud_df.describe()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,253519.32,295780.69,109558.48,250980.92,875959.75,0.3,0.0
std,877084.29,1155283.18,705727.79,939750.88,2979490.22,0.46,0.0
min,8.73,0.0,0.0,0.0,0.0,0.0,0.0
25%,4313.3,0.0,0.0,0.0,0.0,0.0,0.0
50%,15852.32,11303.5,0.0,0.0,0.0,0.0,0.0
75%,141176.99,54416.89,7571.69,122635.73,170368.14,1.0,0.0
max,10000000.0,12930418.44,6309146.87,13010502.78,19169204.93,1.0,0.0


In [6]:
# (rows, columns) of the frame as loaded.
fraud_df.shape

(500, 10)

In [7]:
# Remove the isFlaggedFraud column; every other column is kept.
fraud_df = fraud_df.drop(columns=['isFlaggedFraud'])

In [8]:
# get same_value feature
def same_value(amt, oldbal):
    """Return 1 when ``amt`` equals ``oldbal`` exactly, otherwise 0."""
    return 1 if amt == oldbal else 0

# Flag rows whose transaction amount equals the originator's old balance.
# Vectorised equivalent of applying same_value() row by row: one column-wise
# comparison instead of building a Series per row via apply(axis=1).
# The cast keeps the column int64, matching the dtype the row-wise version produced.
fraud_df['same_value'] = fraud_df['amount'].eq(fraud_df['oldbalanceOrg']).astype('int64')

In [9]:
# transform nameDest to numeric feature by only using 1st character
def nameDest_prefix(name):
    """Return the first character of ``name`` (an empty string stays empty)."""
    return name[0:1]
  
# First character of each destination account name
dest_prefix = fraud_df['nameDest'].apply(nameDest_prefix)

# One indicator column per distinct prefix value
one_hot = pd.get_dummies(dest_prefix, dtype=int)
# Replace the raw nameDest column with its indicator columns
fraud_df = fraud_df.drop(columns=['nameDest']).join(one_hot)
fraud_df.head(200)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M
0,PAYMENT,9839.64,C1231006815,170136.00,160296.36,0.00,0.00,0,0,0,1
1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,0.00,0.00,0,0,0,1
2,PAYMENT,11668.14,C2048537720,41554.00,29885.86,0.00,0.00,0,0,0,1
3,PAYMENT,7817.71,C90045638,53860.00,46042.29,0.00,0.00,0,0,0,1
4,PAYMENT,7107.77,C154988899,183195.00,176087.23,0.00,0.00,0,0,0,1
5,PAYMENT,7861.64,C1912850431,176087.23,168225.59,0.00,0.00,0,0,0,1
6,PAYMENT,4024.36,C1265012928,2671.00,0.00,0.00,0.00,0,0,0,1
7,DEBIT,5337.77,C712410124,41720.00,36382.23,41898.00,40348.79,0,0,1,0
8,DEBIT,9644.94,C1900366749,4465.00,0.00,10845.00,157982.12,0,0,1,0
9,PAYMENT,3099.97,C249177573,20771.00,17671.03,0.00,0.00,0,0,0,1


In [10]:
# Column dtypes after the same_value and nameDest-prefix features were added.
fraud_df.dtypes

type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
same_value          int64
C                   int32
M                   int32
dtype: object

In [11]:
# Check for null values: True if any cell in fraud_df is missing, False otherwise
fraud_df.isnull().values.any()

False

In [12]:
# (rows, columns) after the feature-engineering cells above.
fraud_df.shape

(500, 11)

In [13]:
# Number of distinct values in each column.
fraud_df.nunique()

type                4
amount            430
nameOrig          500
oldbalanceOrg     285
newbalanceOrig    155
oldbalanceDest    224
newbalanceDest    145
isFraud             2
same_value          2
C                   2
M                   2
dtype: int64

In [14]:
# Label encoding of type feature
def label_encoding(type):
    """Map a transaction type to a binary label.

    Returns 1 for "CASH_OUT" and "TRANSFER", and 0 for every other value.
    """
    return 1 if type in ("CASH_OUT", "TRANSFER") else 0

# Build the label-encoded variant in its own frame. fraud_df itself must keep
# the raw 'type' column, because the one-hot encoding cell further down still
# reads fraud_df['type'] and would raise KeyError on a fresh top-to-bottom run
# if the column were dropped here.
fraud_df_label = (
    fraud_df
    .assign(type_label=fraud_df['type'].apply(label_encoding))
    .drop(columns=['type'])
)
# Global display option: from here on, frames are rendered without row truncation.
pd.set_option('display.max_rows', None)
fraud_df_label.head()

Unnamed: 0,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,type_label
0,9839.64,C1231006815,170136.0,160296.36,0.0,0.0,0,0,0,1,0
1,1864.28,C1666544295,21249.0,19384.72,0.0,0.0,0,0,0,1,0
2,11668.14,C2048537720,41554.0,29885.86,0.0,0.0,0,0,0,1,0
3,7817.71,C90045638,53860.0,46042.29,0.0,0.0,0,0,0,1,0
4,7107.77,C154988899,183195.0,176087.23,0.0,0.0,0,0,0,1,0


In [15]:
# One-hot encoding of type feature
# NOTE: reads the raw 'type' column, so it must still be present in fraud_df.
one_hot = pd.get_dummies(fraud_df['type'], dtype=int)
# Swap the raw column for its indicator columns in a single step
fraud_df = fraud_df.drop(columns=['type']).join(one_hot)
fraud_df.head()

Unnamed: 0,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,C1231006815,170136.0,160296.36,0.0,0.0,0,0,0,1,0,0,1,0
1,1864.28,C1666544295,21249.0,19384.72,0.0,0.0,0,0,0,1,0,0,1,0
2,11668.14,C2048537720,41554.0,29885.86,0.0,0.0,0,0,0,1,0,0,1,0
3,7817.71,C90045638,53860.0,46042.29,0.0,0.0,0,0,0,1,0,0,1,0
4,7107.77,C154988899,183195.0,176087.23,0.0,0.0,0,0,0,1,0,0,1,0


In [16]:
# Column dtypes after one-hot encoding 'type'; nameOrig is the only object column left.
fraud_df.dtypes

amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
same_value          int64
C                   int32
M                   int32
CASH_OUT            int32
DEBIT               int32
PAYMENT             int32
TRANSFER            int32
dtype: object

In [17]:
# Collect every column with one of the listed integer/float dtypes into numeric_df
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numeric_df = fraud_df.select_dtypes(include=numerics)
numeric_df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,1,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1,0,0,1,0
2,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,1,0,0,1,0
3,7817.71,53860.0,46042.29,0.0,0.0,0,0,0,1,0,0,1,0
4,7107.77,183195.0,176087.23,0.0,0.0,0,0,0,1,0,0,1,0


In [19]:
# Draw the correlation matrix: pairwise correlations of all numeric columns,
# rounded to two decimals and shaded with the 'coolwarm' colour map.
# (The bare `numeric_df_corr` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
numeric_df_corr = numeric_df.corr()
highlight_table = (
    numeric_df_corr
    .round(2)
    .style
    .background_gradient(cmap='coolwarm')
)
highlight_table

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,same_value,C,M,CASH_OUT,DEBIT,PAYMENT,TRANSFER
amount,1.0,0.77,0.05,0.06,0.23,0.29,0.23,0.22,-0.22,0.07,-0.06,-0.22,0.2
oldbalanceOrg,0.77,1.0,0.66,-0.03,0.07,0.19,0.15,0.01,-0.01,-0.03,-0.04,-0.01,0.08
newbalanceOrig,0.05,0.66,1.0,-0.04,-0.04,-0.08,-0.1,-0.17,0.17,-0.12,-0.01,0.17,-0.06
oldbalanceDest,0.06,-0.03,-0.04,1.0,0.74,-0.02,-0.02,0.2,-0.2,0.23,-0.0,-0.2,-0.03
newbalanceDest,0.23,0.07,-0.04,0.74,1.0,-0.04,-0.05,0.22,-0.22,0.18,-0.04,-0.22,0.07
isFraud,0.29,0.19,-0.08,-0.02,-0.04,1.0,0.95,0.5,-0.5,0.19,-0.14,-0.5,0.44
same_value,0.23,0.15,-0.1,-0.02,-0.05,0.95,1.0,0.47,-0.47,0.16,-0.14,-0.47,0.43
C,0.22,0.01,-0.17,0.2,0.22,0.5,0.47,1.0,-1.0,0.59,0.17,-1.0,0.39
M,-0.22,-0.01,0.17,-0.2,-0.22,-0.5,-0.47,-1.0,1.0,-0.59,-0.17,1.0,-0.39
CASH_OUT,0.07,-0.03,-0.12,0.23,0.18,0.19,0.16,0.59,-0.59,1.0,-0.17,-0.59,-0.4


In [19]:
# save df as csv (index not written); at this point numeric_df still contains same_value
numeric_df.to_csv('a1_b1_c2_d1.csv', index=False)

In [20]:
# Second export: the same features without same_value.
# errors='ignore' keeps this cell re-runnable - on a second run same_value is
# already gone and a plain drop would raise KeyError before the file is written.
numeric_df = numeric_df.drop(columns=['same_value'], errors='ignore')
numeric_df.to_csv('a1_b2_c2_d1.csv', index=False)