# Online Payment Fraud Detection

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 1. Importing dataset

In [34]:
# Read the CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='data.csv')
# Show the first ten rows as a quick look at the columns and their values.
df.head(n=10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [29]:
# Count, mean, spread, quartiles and extremes for each numeric column.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


## 2. Data Cleaning

In [30]:
# Number of missing entries in each column.
df.isnull().sum(axis=0)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

No null values

In [31]:
# `isna` is the same test as `isnull` under its other pandas name, so this
# repeats the per-column missing-entry count.
df.isna().sum(axis=0)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

No 'na' values either, as expected: `isna()` is an alias of `isnull()`, so this repeats the check above.

## 3. Exploratory data analysis

### 3.1 Transaction types used for fraud

In [14]:
# Every distinct transaction type, in order of first appearance.
Transaction_Types = df['type'].drop_duplicates().values

# The same, limited to the rows labelled as fraud.
fraud_rows = df['isFraud'] == 1
Fraudlent_Transaction_Types = df.loc[fraud_rows, 'type'].drop_duplicates().values

print('Types of transactions: ', Transaction_Types)
print('\nTransaction type use for fraud: ', Fraudlent_Transaction_Types)

Types of transactions:  ['PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT' 'CASH_IN']

Transaction type use for fraud:  ['TRANSFER' 'CASH_OUT']


In [20]:
# All rows labelled as fraud; the per-type subsets below are carved out of it.
No_of_Fradulent_Transactions = df.loc[df['isFraud'] == 1]
print('No. of fradulent transaction: ', len(No_of_Fradulent_Transactions))

fraud_type = No_of_Fradulent_Transactions['type']

Fraud_Transfers = No_of_Fradulent_Transactions.loc[fraud_type == 'TRANSFER']
print("\nNo. of fraud 'TRANSFER':  ", len(Fraud_Transfers))

Fraud_CashOut = No_of_Fradulent_Transactions.loc[fraud_type == 'CASH_OUT']
print("\nNo. of fraus 'CASH_OUT': ", len(Fraud_CashOut))

No. of fradulent transaction:  8213

No. of fraud 'TRANSFER':   4097

No. of fraus 'CASH_OUT':  4116


## 4. Correlation

In [24]:
# Pairwise correlation (Pearson, the pandas default) between the numeric
# columns, then each column's correlation with `isFraud`, strongest first.
#
# `df` also holds text columns (type, nameOrig, nameDest). Without
# numeric_only=True, pandas 1.5 emits a FutureWarning on this call and
# pandas 2.x raises when it meets the non-numeric data.
corr = df.corr(numeric_only=True)
print(corr['isFraud'].sort_values(ascending=False))

  corr = df.corr()


isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


## 5. Transforming categorical value into numerical form

<h4>We will convert payment types into the numerical form and isFraud values 0 & 1 to understandable labels</h4>

In [35]:
# Encode the transaction type as an integer code and turn the 0/1 fraud flag
# into a readable label.
#
# `Series.map` sends every value that is not a key of the mapping to NaN, so
# running this cell a second time on the already-converted frame would silently
# blank out both columns. Each conversion is therefore applied only while its
# column is still in raw form, which makes the cell safe to re-run.
TYPE_CODES = {'PAYMENT': 1, 'TRANSFER': 2, 'CASH_OUT': 3, 'DEBIT': 4, 'CASH_IN': 5}
FRAUD_LABELS = {1: 'Fraud', 0: 'No Fraud'}

if not pd.api.types.is_numeric_dtype(df['type']):   # still the raw text labels
    df['type'] = df['type'].map(TYPE_CODES)
if pd.api.types.is_numeric_dtype(df['isFraud']):    # still the raw 0/1 flag
    df['isFraud'] = df['isFraud'].map(FRAUD_LABELS)

# A NaN at this point means the data held a value absent from the mappings above.
assert not df['type'].isnull().any(), "unmapped value in column 'type'"
assert not df['isFraud'].isnull().any(), "unmapped value in column 'isFraud'"

df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,2,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,3,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0
5,1,1,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,No Fraud,0
6,1,1,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,No Fraud,0
7,1,1,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,No Fraud,0
8,1,1,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,No Fraud,0
9,1,4,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,No Fraud,0


## 6. Online payment fraud detection model

<h4>We will use a classification model to label each transaction as "Fraud" or "No Fraud". The classifier used here is a Decision Tree.</h4>

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import seaborn as sns
import time

### 6.1 Splitting the training and testing data

In [48]:
# Columns the model is trained on.
FEATURE_COLUMNS = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']
X = np.array(df[FEATURE_COLUMNS])

# Select the target with single brackets so Y is a 1-D vector of shape
# (n_samples,). `df[['isFraud']]` would give an (n_samples, 1) column, which
# scikit-learn treats as a multi-output target and which makes many estimators
# emit a DataConversionWarning.
Y = np.array(df['isFraud'])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

### 6.2 Training the model

In [49]:
# Pin the seed so that refitting on the same data yields the same tree.
# The classifier permutes the candidate features at every split, so without a
# fixed random_state equally good splits can be chosen differently from run
# to run and the reported metrics drift.
decisionTreeModel = DecisionTreeClassifier(random_state=42)

In [54]:
# Fit the tree on the training split and report how long the fit took.
start = time.time()
decisionTreeModel.fit(X_train, y_train)
end = time.time()
elapsed_seconds = end - start
print("Time taken for training a model: ", elapsed_seconds, "seconds")

Time taken for training a model:  19.73416805267334 seconds


### 6.3 Model evaluation

In [57]:
# Predict labels for the held-out rows and print the per-class
# precision / recall / F1 summary against the true labels.
Yhat = decisionTreeModel.predict(X_test)
report = classification_report(y_true=y_test, y_pred=Yhat)
print(report)

              precision    recall  f1-score   support

       Fraud       0.91      0.89      0.90       817
    No Fraud       1.00      1.00      1.00    635445

    accuracy                           1.00    636262
   macro avg       0.96      0.95      0.95    636262
weighted avg       1.00      1.00      1.00    636262



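<h3>Conclusion:</h3><h4>Overall accuracy rounds to 1.00, but that figure is dominated by the "No Fraud" class, which makes up almost all of the test set. For the "Fraud" class itself, the report above shows a precision of 0.91 and a recall of 0.89, so the model catches roughly nine out of ten fraudulent transactions.</h4>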

### 6.4 Prediction

Let's provide our custom data and see whether it'll be a potential Fraud or not

In [60]:
# Two hand-written samples, columns in the same order as the training features:
#   type, amount, oldbalanceOrg, newbalanceOrig
# Both use type code 3 and an amount of 1,000,000; only the balances differ.
data1 = np.array([[3, 1_000_000, 2_000_000, 1_000_000]])
data2 = np.array([[3, 1_000_000, 1_000_000, 0]])

# One prediction per sample, each returned as a length-1 array of labels.
prediction1, prediction2 = (decisionTreeModel.predict(sample) for sample in (data1, data2))
print(prediction1, prediction2)

['No Fraud'] ['Fraud']


## References

<h3>Article referred: <a href="https://thecleverprogrammer.com/2022/02/22/online-payments-fraud-detection-with-machine-learning/">Online Payments Fraud Detection</a> by <a href="https://thecleverprogrammer.com/author/amankharwal/">Aman Kharwal</a></h3>