In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv


#  Detecting Online Payment Fraud with Machine Learning

In the dynamic world of digital transactions, the threat of online payment fraud looms large. To tackle this challenge head-on, we're embarking on a Data Science Project: "Detecting Online Payment Fraud with Machine Learning." 🛡️🕵️‍♂️

Armed with the potent tools of Machine Learning and a dataset called "Synthetic Financial Datasets For Fraud Detection," [sourced from Kaggle](https://www.kaggle.com/datasets/ealaxi/paysim1), we're on a mission to uncover the intricate patterns of fraudulent activities that often go unnoticed. 💰🔍

By sifting through data, identifying anomalies, and using smart algorithms, we're creating a digital safeguard against online payment fraud. 📊💪

Just like a vigilant guardian, our model learns and adapts, making predictions based on past patterns. As we unveil the secrets of online payment fraud detection, we're making strides to secure your digital transactions and keep your financial journey safe and sound. 🚀🔒

## Columns

* step: represents a unit of time where 1 step equals 1 hour
* type: type of online transaction
* amount: the amount of the transaction
* nameOrig: customer starting the transaction
* oldbalanceOrg: balance before the transaction
* newbalanceOrig: balance after the transaction
* nameDest: recipient of the transaction
* oldbalanceDest: initial balance of recipient before the transaction
* newbalanceDest: the new balance of recipient after the transaction
* isFraud: fraud transaction

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier



## Loading Dataset

In [3]:
data = pd.read_csv("/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv")
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
data.shape

(6362620, 11)

## Checking for Null Values

In [5]:
data.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

No Null Value Found

## Exploratory data analysis (EDA)

In [6]:
# Exploring transaction type

data['type'].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [7]:
# Calculate value counts for the 'type' column
type_counts = data['type'].value_counts()

# Create a bar plot using Plotly Express
fig = px.bar(x=type_counts.index, y=type_counts.values, labels={'x': 'Transaction Type', 'y': 'Count'}, title='Transaction Type Distribution')

# Show the plot
fig.show()

In [8]:
# Checking correlation
correlation = data.corr()

correlation["isFraud"].sort_values(ascending=False)





isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

## Encoding

I will also transform the values of the isFraud column into No Fraud and Fraud labels to have a better understanding of the output:

In [9]:
data["type"] = data["type"].map({
    "CASH_OUT": 1, 
    "PAYMENT": 2,                              
    "CASH_IN": 3,
    "TRANSFER": 4,
    "DEBIT": 5
})

data["isFraud"] = data["isFraud"].map({
    0: "No Fraud", 
    1: "Fraud"
})

data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0


## Online Payments Fraud Detection Model

In [10]:
# splitting the data

x = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])

y = np.array(data[["isFraud"]])

In [11]:
# training a machine learning model

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42) # Spliting

model = DecisionTreeClassifier() # Model

model.fit(xtrain, ytrain) # Fitting the Model to Train

print(model.score(xtest, ytest)) # Testing the Model

0.9997296711103287
