In [1]:
import numpy as np
import pandas as pd

In [12]:
# Load the transaction data with compact dtypes declared up front, so `df` has
# the same schema on a fresh top-to-bottom run regardless of which later cells
# have been executed. The dtypes match the ones used by the downcast cell
# further down (float32 for amounts/balances, int8 for the 0/1 flags).
# `step` is not listed in `usecols`, so it is not loaded.
df = pd.read_csv(
    "Fraud.csv",
    usecols=[
        'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
        'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
        'isFlaggedFraud',
    ],
    dtype={
        'amount': np.float32,
        'oldbalanceOrg': np.float32,
        'newbalanceOrig': np.float32,
        'oldbalanceDest': np.float32,
        'newbalanceDest': np.float32,
        'isFraud': np.int8,
        'isFlaggedFraud': np.int8,
    },
)

In [13]:
# (rows, columns) of the loaded frame.
df.shape

(6362620, 10)

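step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps: 744, i.e. 31 days of hourly steps (the source description calls this a 30-day simulation). This column is not loaded in this notebook.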

type - the transaction type: CASH_IN, CASH_OUT, DEBIT, PAYMENT or TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance of the recipient before the transaction. Note that there is no information for recipients whose names start with M (merchants).

newbalanceDest - new balance of the recipient after the transaction. Note that there is no information for recipients whose names start with M (merchants).

isFraud - marks the transactions made by the fraudulent agents inside the simulation. In this specific dataset, the fraudulent agents aim to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200,000 in a single transaction.

In [14]:
# Preview the first five transactions.
df.head(n=5)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [20]:
# Summarise column dtypes and memory usage. The dtypes reported here depend on
# which dtype casts have already been applied to `df` when this cell runs.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   type            object 
 1   amount          float32
 2   nameOrig        object 
 3   oldbalanceOrg   float32
 4   newbalanceOrig  float32
 5   nameDest        object 
 6   oldbalanceDest  float32
 7   newbalanceDest  float32
 8   isFraud         int8   
 9   isFlaggedFraud  int8   
dtypes: float32(5), int8(2), object(3)
memory usage: 279.1+ MB


In [16]:
# List the column labels currently present in the frame.
df.columns

Index(['type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [19]:
# Downcast the numeric columns in one pass to shrink the frame's memory
# footprint: amounts and balances become float32, the 0/1 flags become int8.
# NOTE: float32 carries roughly 7 significant digits, so large monetary values
# are rounded (sub-unit precision is lost on six- and seven-figure amounts).
# `step` is not loaded by the read_csv cell, so it is not cast here.
NUMERIC_DTYPES = {
    "amount": np.float32,
    "oldbalanceOrg": np.float32,
    "newbalanceOrig": np.float32,
    "oldbalanceDest": np.float32,
    "newbalanceDest": np.float32,
    "isFraud": np.int8,
    "isFlaggedFraud": np.int8,
}
# Re-running this cell is harmless: casting to the dtype a column already has
# leaves its values unchanged.
df = df.astype(NUMERIC_DTYPES)

In [22]:
# Reference: feature-selection techniques in machine learning, kept here as a
# reminder of the different methods available.
# https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/

In [23]:
# Count the missing values in each column.
df.isna().sum()

type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [25]:
# Count rows that repeat an earlier row across all loaded columns.
df.duplicated().sum()

0

In [26]:
# Show every transaction whose isFlaggedFraud flag is set.
df.loc[df["isFlaggedFraud"].eq(1)]

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2736446,TRANSFER,4953893.0,C728984460,4953893.0,4953893.0,C639921569,0.0,0.0,1,1
3247297,TRANSFER,1343002.0,C1100582606,1343002.0,1343002.0,C1147517658,0.0,0.0,1,1
3760288,TRANSFER,536624.4,C1035541766,536624.4,536624.4,C1100697970,0.0,0.0,1,1
5563713,TRANSFER,4892193.0,C908544136,4892193.0,4892193.0,C891140444,0.0,0.0,1,1
5996407,TRANSFER,10000000.0,C689608084,19585040.0,19585040.0,C1392803603,0.0,0.0,1,1
5996409,TRANSFER,9585040.0,C452586515,19585040.0,19585040.0,C1109166882,0.0,0.0,1,1
6168499,TRANSFER,3576297.0,C193696150,3576297.0,3576297.0,C484597480,0.0,0.0,1,1
6205439,TRANSFER,353874.2,C1684585475,353874.2,353874.2,C1770418982,0.0,0.0,1,1
6266413,TRANSFER,2542664.0,C786455622,2542664.0,2542664.0,C661958277,0.0,0.0,1,1
6281482,TRANSFER,10000000.0,C19004745,10399040.0,10399040.0,C1806199534,0.0,0.0,1,1
