In [None]:
import pandas as pd
import numpy as np
# Read the transaction records from the CSV next to the notebook and preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer="dataset.csv")
data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [None]:

# Number of missing values in each column (isna is the same check as isnull)
data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

In [None]:
# How often each transaction type occurs, most frequent first
data["type"].value_counts()

type
CASH_OUT    197647
PAYMENT     180386
CASH_IN     119087
TRANSFER     44175
DEBIT         4031
Name: count, dtype: int64

In [None]:
# Donut chart of how the transactions are split across transaction types.
import plotly.express as px

# Named `type_counts` rather than `type`: binding a notebook-level variable
# called `type` would shadow the Python builtin for every later cell.
type_counts = data["type"].value_counts()
transactions = type_counts.index   # transaction type labels
quantity = type_counts.values      # number of rows for each label

figure = px.pie(
    data,
    values=quantity,
    names=transactions,
    hole=0.5,  # a non-zero hole renders the pie as a donut
    title="Distribution of Transaction Type",
)
figure.show()


In [None]:
# Keep only the numeric columns; the correlation step below needs numbers
numeric_data = data.select_dtypes(include="number")

In [None]:
# Pairwise correlation between all numeric columns
correlation = numeric_data.corr()

# Correlation of every column with the fraud label, largest value first.
# NOTE(review): a NaN entry here usually means the column is constant in this
# data set - confirm before dropping it.
print(correlation.loc[:, "isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.051534
oldbalanceOrg    -0.000591
newbalanceDest   -0.001855
oldbalanceDest   -0.005204
newbalanceOrig   -0.006216
step             -0.019785
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


In [None]:
# Encode the transaction type as an integer code and turn the 0/1 fraud flag
# into a readable class label.
TYPE_CODES = {"CASH_OUT": 1, "PAYMENT": 2,
              "CASH_IN": 3, "TRANSFER": 4,
              "DEBIT": 5}
FRAUD_LABELS = {0: "No Fraud", 1: "Fraud"}

# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded columns would wipe
# both of them out. Each mapping is therefore applied only while its column is
# still in raw form: `type` starts as text and becomes numeric, `isFraud`
# starts numeric and becomes text.
if not pd.api.types.is_numeric_dtype(data["type"]):
    data["type"] = data["type"].map(TYPE_CODES)
if pd.api.types.is_numeric_dtype(data["isFraud"]):
    data["isFraud"] = data["isFraud"].map(FRAUD_LABELS)
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0.0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0.0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0.0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0.0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0.0


In [None]:
# Train a decision tree on the encoded transaction features and report its
# accuracy on a held-out 20% split.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# The null check earlier in the notebook reports a row without an isFraud
# value. An unlabeled row cannot be used for supervised training, and the
# "else 1" branch below would silently score it as fraud, so drop it here.
labeled = data.dropna(subset=["isFraud"])

x = labeled[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]].to_numpy()
# 1-D target of shape (n_samples,); a single-column 2-D target would make
# iteration over ytest yield one-element arrays instead of plain labels.
y = labeled["isFraud"].to_numpy()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=42)

# Seed the tree as well as the split so the printed accuracy is reproducible
# on a fresh kernel.
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)

# Convert the string labels to 0/1 before scoring
y_pred = model.predict(xtest)
y_pred_numeric = [0 if label == 'No Fraud' else 1 for label in y_pred]
ytest_numeric = [0 if label == 'No Fraud' else 1 for label in ytest]

# NOTE(review): fraud looks rare in the preview rows; if the classes are
# heavily imbalanced, accuracy alone will overstate model quality - consider
# also reporting precision/recall.
accuracy = accuracy_score(ytest_numeric, y_pred_numeric)
print(accuracy)

0.9995782370307887


In [None]:
# Classify one hand-written transaction with the fitted model.
# Column order: type, amount, oldbalanceOrg, newbalanceOrig
features = np.array([
    [4, 9000.60, 9000.60, 0.0],  # type code 4 is TRANSFER in the encoding above
])
print(model.predict(features))

['Fraud']
