***Problem Statement***

To build a machine learning model for a company in order to detect frauduelent transactions

In [1]:
import matplotlib as plt
import numpy as np
import pandas as pd
import sklearn as sklearn

In [2]:
df = pd.read_csv('Fraud.csv')

In [3]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


To Convert all the 'type' categories in dataset to integer format 

In [4]:
# Replacing Payment transactions to numeric values
df.type = df.type.replace({'CASH_IN':0, 'CASH_OUT':1, 'DEBIT':2, 'PAYMENT':3, 'TRANSFER':4})

In [5]:
df['type'].value_counts()

1    2237500
3    2151495
0    1399284
4     532909
2      41432
Name: type, dtype: int64

In [6]:
df.type = df.type.astype(float)
df.type

0          3.0
1          3.0
2          4.0
3          1.0
4          3.0
          ... 
6362615    1.0
6362616    4.0
6362617    1.0
6362618    4.0
6362619    1.0
Name: type, Length: 6362620, dtype: float64

In [7]:
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [8]:
df['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [9]:
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [11]:
# To check whih type of transactions account to fraud the most.
pd.crosstab(df.type, df.isFraud)

isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1399284,0
1.0,2233384,4116
2.0,41432,0
3.0,2151495,0
4.0,528812,4097


As Observed 'isFlaggedFraud', 'nameOrig' & 'nameDest' isn't that important as the factors showcase a high level fraud, customer who started the transaction and the customer who received the transaction respectively but those factors can be faked and can't be trusted upon. The purpose is to identify a frauduelent transaction and not categorise is as high level or low level. Therefore reducing the parameters can get better output.

In [12]:
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,3.0,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,3.0,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,4.0,181.0,181.0,0.0,0.0,0.0,1
3,1,1.0,181.0,181.0,0.0,21182.0,0.0,1
4,1,3.0,11668.14,41554.0,29885.86,0.0,0.0,0


In [13]:
y = df.isFraud
x = df.drop(columns=['isFraud'])

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [15]:
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [16]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((5090096, 7), (5090096,), (1272524, 7), (1272524,))

In [17]:
# Put models in dictionary
models = {"Random Forest": RandomForestClassifier()}
 
def fit_and_score(models, x_train, x_test, y_train, y_test):
    
        
    """
    Fits and evaluates given ML models.
    models: a dict of different Scikit-Learn ML models
    x_train : training data (no labels) 
    x_test : testing data (no labels)
    y_train : training labels
    y_test : test labels
    """
    np.random.seed(42)

    model_score = {}

    for name, model in models.items():
             
        model.fit(x_train, y_train)
        model.fit(x_train, y_train)
        model_score[name] = model.score(x_test, y_test)
    return model_score

In [18]:
model_score = fit_and_score(models=models,
                            x_train=x_train,
                            x_test=x_test,
                            y_train=y_train,
                            y_test=y_test)
model_score

{'Random Forest': 0.9997288852705332}