# Introduction:

In this project, we aim to build a machine learning model for detecting fraudulent online payment transactions. The dataset used for this project contains various features related to each transaction, such as transaction type, amount, old balance, and new balance, along with a binary target variable indicating whether the transaction is fraudulent or not.

### Let's import the data and the necessary modules:

In [None]:
import os

import opendatasets as od

# Kaggle dataset holding the online payment transactions used below.
dataset = 'https://www.kaggle.com/datasets/jainilcoder/online-payment-fraud-detection'
od.download(dataset)

# Show which files ended up in the dataset folder.
# NOTE(review): this absolute path is specific to one machine - confirm it
# matches the folder od.download() wrote to before running elsewhere.
data_dir = 'C:/Users/Souptik/online-payment-fraud-detection'
os.listdir(data_dir)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [None]:
# Read the transactions CSV into a DataFrame and preview the first rows.
csv_path = "C:/Users/Souptik/online-payment-fraud-detection/onlinefraud.csv"
dataset = pd.read_csv(csv_path)
print(dataset.head())

#### Let's look at the data:

In [None]:
# (rows, columns) of the loaded DataFrame; displayed as the cell's result.
dataset.shape

In [None]:
# Number of missing values in each column.
print(dataset.isna().sum())

In [None]:
# How many transactions there are of each transaction type.
type_frequencies = dataset['type'].value_counts()
print(type_frequencies)

## EDA:

In [None]:
# Donut chart showing each transaction type's share of all transactions.
type_counts = dataset["type"].value_counts()
transactions = type_counts.index
quantity = type_counts.values

plt.figure(figsize = (6,6))
plt.pie(quantity, labels = transactions, autopct = '%1.1f%%',
        startangle = 90, pctdistance = 0.85,
        colors = sns.color_palette("Set3"), wedgeprops = {'edgecolor':
                                                         'black'})

# A white disc over the centre of the pie turns it into a donut.
centre_circle = plt.Circle((0,0), 0.70, fc = 'white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Add the title BEFORE tight_layout(): the layout pass only makes room for
# artists that already exist, so a title added afterwards can be clipped.
plt.title("Distribution of Transaction Type")
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the number of transactions in each isFraud class.
ax = sns.countplot(data=dataset, x='isFraud')
ax.set_title('Distribution of Fraudulent vs. Non-Fraudulent Transactions')
plt.show()

In [None]:
# Box plot comparing transaction amounts between the two isFraud classes.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=dataset, x='isFraud', y='amount')
ax.set_title('Transaction Amount Distribution for Fraudulent vs. Non-Fraudulent Transactions')
plt.show()

In [None]:
# Transactions per type, with one bar per isFraud class within each type.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=dataset, x='type', hue='isFraud')
ax.set_title('Transaction Type vs. Fraudulent Transactions')
plt.show()

In [None]:
# Box plot of transaction amounts, one box per transaction type.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=dataset, x='type', y='amount')
ax.set_title('Transaction Amount Distribution by Type')
plt.show()

#### Let's look at the Correlation:

In [None]:
# Correlation of every numeric column with the isFraud target.
# Restrict to numeric columns first: at this point the frame still holds
# string columns (e.g. 'type' is only label-encoded later), and on
# pandas >= 2.0 DataFrame.corr() raises on non-numeric data instead of
# silently dropping it. select_dtypes works on old and new pandas alike.
correlation = dataset.select_dtypes(include="number").corr()
print(correlation["isFraud"].sort_values(ascending = False))

#### Now, let's convert the categorical variables:

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'type' strings with the integer codes learned by the encoder.
label_encoder = LabelEncoder()
label_encoder.fit(dataset['type'])
dataset['type'] = label_encoder.transform(dataset['type'])

# Swap the 0/1 flags in 'isFraud' for readable labels.
# NOTE: Series.map() turns values missing from the mapping into NaN, so
# running this cell a second time would wipe the column - reload first.
fraud_labels = {0: "No Fraud", 1: "Fraud"}
dataset["isFraud"] = dataset["isFraud"].map(fraud_labels)

print(dataset.head())

### Now let's split the data into training and test sets and train the models:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix


def _fit_and_evaluate(model, x_fit, y_fit, x_eval, y_eval):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Args:
        model: An unfitted classifier exposing ``fit``, ``score`` and
            ``predict``.
        x_fit, y_fit: Training features and labels.
        x_eval, y_eval: Held-out features and labels.

    Returns:
        A ``(accuracy, predictions, confusion_matrix)`` tuple, where
        ``accuracy`` is whatever ``model.score`` reports on the held-out
        split and the confusion matrix compares ``y_eval`` with the
        predictions.
    """
    model.fit(x_fit, y_fit)
    accuracy = model.score(x_eval, y_eval)
    predictions = model.predict(x_eval)
    return accuracy, predictions, confusion_matrix(y_eval, predictions)


# Feature matrix: the four columns the models are trained on.
x = np.array(dataset[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
# Select the column as a Series so the target is 1-D. The double-bracket
# form yields an (n, 1) column vector, while scikit-learn style estimators
# expect a 1-D target and may warn when handed a column vector.
y = np.array(dataset["isFraud"])

# Convert 'No Fraud' to 0 and 'Fraud' to 1 in y
y_binary = (y == 'Fraud').astype(int)

# Hold out 10% of the rows for testing; the seed keeps the split repeatable.
xtrain, xtest, ytrain_binary, ytest_binary = train_test_split(x, y_binary, test_size=0.10, random_state=42)

# Seed the tree as well so its results are as repeatable as the split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_accuracy, dt_preds, dt_cm = _fit_and_evaluate(
    dt_model, xtrain, ytrain_binary, xtest, ytest_binary)

xgb_model = XGBClassifier()
xgb_accuracy, xgb_preds, xgb_cm = _fit_and_evaluate(
    xgb_model, xtrain, ytrain_binary, xtest, ytest_binary)

lgb_model = LGBMClassifier()
lgb_accuracy, lgb_preds, lgb_cm = _fit_and_evaluate(
    lgb_model, xtrain, ytrain_binary, xtest, ytest_binary)

cat_model = CatBoostClassifier()
cat_accuracy, cat_preds, cat_cm = _fit_and_evaluate(
    cat_model, xtrain, ytrain_binary, xtest, ytest_binary)

# Print the accuracy and confusion matrix for each model.
# NOTE(review): if the classes are heavily imbalanced (check the isFraud
# count plot), accuracy alone can look high for a model that misses most
# fraud - read it together with the confusion matrix.
for display_name, accuracy, cm in (
    ("DecisionTree", dt_accuracy, dt_cm),
    ("XGBoost", xgb_accuracy, xgb_cm),
    ("LGBM", lgb_accuracy, lgb_cm),
    ("CatBoost", cat_accuracy, cat_cm),
):
    print(f"{display_name} Accuracy:", accuracy)
    print(f"{display_name} Confusion Matrix:")
    print(cm)
    print()

#### Let's compare the accuracy scores to get the best model:

In [None]:
# Score each fitted classifier on the held-out split.
dt_accuracy = dt_model.score(xtest, ytest_binary)
xgb_accuracy = xgb_model.score(xtest, ytest_binary)
lgb_accuracy = lgb_model.score(xtest, ytest_binary)
cat_accuracy = cat_model.score(xtest, ytest_binary)

# Model name -> test accuracy, in the order the models were trained.
model_accuracies = {
    'DecisionTree': dt_accuracy,
    'XGBoost': xgb_accuracy,
    'LGBM': lgb_accuracy,
    'CatBoost': cat_accuracy,
}

# Report every score.
for model in model_accuracies:
    accuracy = model_accuracies[model]
    print(f"{model} Accuracy: {accuracy}")

# The winner is the name with the largest score (first one wins a tie).
best_model = max(model_accuracies.items(), key=lambda entry: entry[1])[0]
print(f"\nThe best model is: {best_model} with accuracy: {model_accuracies[best_model]}")

### Using the best model to predict a value:

In [None]:
# Pick the fitted classifier whose test accuracy is highest rather than
# assuming which one won. The keys mirror those of model_accuracies.
fitted_models = {
    'DecisionTree': dt_model,
    'XGBoost': xgb_model,
    'LGBM': lgb_model,
    'CatBoost': cat_model,
}
best_model = fitted_models[max(model_accuracies, key=model_accuracies.get)]

# One transaction, in the training column order:
# type (label-encoded), amount, oldbalanceOrg, newbalanceOrig.
features = np.array([[1, 8900.2, 8990.2, 0.0]])

# Predict using the best model
prediction = best_model.predict(features)

# Flatten first so the check works whether the estimator returns a
# (1,) or a (1, 1) array for a single row.
predicted_class = np.ravel(prediction)[0]

# Convert the prediction to the corresponding label (fraud or not fraud)
label = 'Fraud' if predicted_class == 1 else 'Not Fraud'

# Print the result
print("Prediction:", label)

# Conclusion:

In conclusion, this project demonstrates the development of an online payment fraud detection model using machine learning techniques. The selected model can effectively predict fraudulent transactions, helping to enhance security and reduce potential financial losses in online payment systems.