## Import

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
df = pd.read_csv('/content/onlinefraud.csv')

In [18]:
df.sample(15)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
72925,10,PAYMENT,6199.05,C1735349822,30284.38,24085.33,M1348090724,0.0,0.0,0,0
3727325,278,PAYMENT,12642.97,C1295849170,11773.0,0.0,M970908687,0.0,0.0,0,0
3148179,236,TRANSFER,219009.61,C1288693583,134459.09,0.0,C91333546,594399.81,813409.42,0,0
3422708,256,PAYMENT,9310.03,C362209172,29919.85,20609.82,M577105164,0.0,0.0,0,0
5484585,379,PAYMENT,7684.71,C1847846734,5830.0,0.0,M932172214,0.0,0.0,0,0
2473872,204,CASH_OUT,260638.64,C651122931,0.0,0.0,C445441670,298464.38,559103.02,0,0
5221322,370,CASH_OUT,118309.52,C119388568,151.0,0.0,C1318366375,0.0,118309.52,0,0
1426003,139,PAYMENT,4529.25,C294354084,0.0,0.0,M996936850,0.0,0.0,0,0
4886063,348,CASH_OUT,90239.36,C406376768,0.0,0.0,C1627617232,155098.87,245338.23,0,0
1344369,137,CASH_OUT,107894.34,C1175801098,53295.0,0.0,C1093062377,658552.67,766447.01,0,0


In [15]:
df.shape # shape of dataset

(246948, 11)

In [9]:
df.isnull().sum() # any null values

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,1
nameDest,1
oldbalanceDest,1
newbalanceDest,1
isFraud,1


In [19]:
df.duplicated().sum()

0

## import from sklearn library

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier # Corrected the typo from DecsionTreeClassifier to DecisionTreeClassifier0

In [21]:
import plotly.express as px # plot

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [24]:
print(df.type.value_counts()) # to check value counts type

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64


In [25]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


## Plot the Graph of Payments

In [26]:
type = df['type'].value_counts()
transactions = type.index
quantity = type.values
figure = px.pie(df,
                values=quantity,
                names=transactions,
                hole= 0.5,
                title='Distribution of Transaction Type')
figure.show()

In [27]:
# Now let’s have a look at the correlation between the features of the data with the isFraud column
# Checking correlation

# Select only numerical columns from the DataFrame
numerical_data = df.select_dtypes(include=["number"])

# Compute the correlation matrix
correlation = numerical_data.corr()

# Check if 'isFraud' exists in the numerical columns
if "isFraud" in numerical_data.columns:
    # Sort and print the correlation values
    print(correlation["isFraud"].sort_values(ascending=False))
else:
    print("Error: 'isFraud' column not found in the numerical data.")

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [28]:
# Now let’s transform the categorical features into numerical. Here we will also transform the values of the isFraud column into
# No Fraud and Fraud labels to have a better understanding of the output
# Changing CASH_OUT to 1, PAYMENT to 2, CASH_IN to 3, TRANSFER to 4 and DEBIT to 5

df["type"] = df["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5})
df["isFraud"] = df["isFraud"].map({0: "No Fraud", 1: "Fraud"})
print(df.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  


## Segregate the dataset X and y

In [29]:
x = np.array(df[['type','amount','oldbalanceOrg','newbalanceOrig']])
y = np.array(df[['isFraud']])

## Train Test split

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=42)

## shape of the segrigate dataset

In [31]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(5726358, 4)
(636262, 4)
(5726358, 1)
(636262, 1)


## Fit the model

In [32]:
model = DecisionTreeClassifier()

model.fit(x_train, y_train) # training a machine learning model

## Model Score

In [33]:
print(model.score(x_test, y_test))

0.9997375295082843


## Predication

In [34]:
# prediction
#features = [type, amount, oldbalanceOrg, newbalanceOrig]

features = np.array([[1, 8900.2, 8990.2, 0.0]])
print(model.predict(features))

['Fraud']
