In [47]:
import numpy as np 
import pandas as pd 
import plotly.express as px


In [48]:
# Load the transaction log from the CSV file that sits next to the notebook
DATA_PATH = 'PS_20174392719_1491204439457_log.csv'
data = pd.read_csv(DATA_PATH)

In [49]:
# Preview the first 100 rows of the loaded data
data.head(100) 

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95,1,TRANSFER,710544.77,C835773569,0.0,0.00,C1359044626,738531.50,16518.36,0,0
96,1,TRANSFER,581294.26,C843299092,0.0,0.00,C1590550415,5195482.15,19169204.93,0,0
97,1,TRANSFER,11996.58,C605982374,0.0,0.00,C1225616405,40255.00,0.00,0,0
98,1,PAYMENT,2875.10,C1412322831,15443.0,12567.90,M1651262695,0.00,0.00,0,0


In [50]:
# Dataset dimensions as (number of rows, number of columns)
data.shape

(6362620, 11)

In [51]:
# Count the missing values in every column
data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [52]:
# Number of transactions recorded for each transaction type
data['type'].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [53]:
# Inputs for the transaction-type pie chart.
# The counts live in `type_counts` rather than `type`, so the Python builtin
# `type()` is not shadowed for the rest of the kernel session.
type_counts = data['type'].value_counts()
transaction = type_counts.index    # category labels (one per transaction type)
quantity = type_counts.values      # number of transactions per label


In [55]:
# Donut chart (hole=0.5) of how the transactions split across types
figure = px.pie(
    data,
    values=quantity,
    names=transaction,
    hole=0.5,
    title=" Distribution Of Transaction Type",
)
figure.show()

In [56]:
# Correlation of every float/int column with the fraud label, strongest first
numeric_data = data.select_dtypes(include=[float, int])
correlation = numeric_data.corr()
fraud_correlation = correlation['isFraud'].sort_values(ascending=False)
print(fraud_correlation)

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [57]:
# Encode the categorical transaction type as the integer codes used for training.
TYPE_CODES = {
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'CASH_IN': 3,
    'TRANSFER': 4,
    'DEBIT': 5,
}

# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded column would wipe it.
# Only encode while the column still holds the original string labels, which
# makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['type']):
    data['type'] = data['type'].map(TYPE_CODES)

In [58]:
# Columns used as model inputs
FEATURE_COLUMNS = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']

# Feature matrix with one row per transaction and one column per feature
x = np.array(data[FEATURE_COLUMNS])

# Fraud labels as a 1-D vector. Selecting the column with single brackets
# avoids the (n, 1) column vector that double brackets would produce;
# 1-D is the label shape scikit-learn estimators and metrics expect.
y = np.array(data['isFraud'])

In [65]:
# scikit-learn utilities: data splitting, candidate classifiers and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split = train_test_split(x, y, test_size=0.2, random_state=42)
xtrain, xtest, ytrain, ytest = split
xtrain.shape

(5090096, 4)

In [61]:
# Candidate classifiers.
# The decision tree permutes the candidate features at every split, so without
# a fixed seed two runs on the same data can produce different trees and
# scores. Use the same seed value that is passed to train_test_split.
decision_tree = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier()
log_reg = LogisticRegression(max_iter=1000)

# scikit-learn classifiers expect a 1-D label vector. Flatten once and feed the
# same array to every model so all three are trained on identical targets.
ytrain_1d = ytrain.ravel()

# Train models
decision_tree.fit(xtrain, ytrain_1d)
knn.fit(xtrain, ytrain_1d)
log_reg.fit(xtrain, ytrain_1d)


In [66]:
# F1 score of each fitted model on the held-out test split
dt_f1, knn_f1, log_reg_f1 = (
    f1_score(ytest, model.predict(xtest))
    for model in (decision_tree, knn, log_reg)
)

In [62]:
# Accuracy of each fitted model on the held-out test split
dt_score, knn_score, log_reg_score = [
    accuracy_score(ytest, fitted.predict(xtest))
    for fitted in (decision_tree, knn, log_reg)
]

In [63]:
# Report every model's test accuracy as a percentage, one model per line
print(
    f"Decision Tree Accuracy: {dt_score * 100:.2f}%",
    f"K-Nearest Neighbors Accuracy: {knn_score * 100:.2f}%",
    f"Logistic Regression Accuracy: {log_reg_score * 100:.2f}%",
    sep="\n",
)

Decision Tree Accuracy: 99.97%
K-Nearest Neighbors Accuracy: 99.96%
Logistic Regression Accuracy: 99.95%


In [67]:
# Collect both metrics per model, expressed as percentages so they share one axis
results = pd.DataFrame({
    'Model': ['Decision Tree', 'KNN', 'Logistic Regression'],
    'Accuracy': [dt_score * 100, knn_score * 100, log_reg_score * 100],
    'F1 Score': [dt_f1 * 100, knn_f1 * 100, log_reg_f1 * 100]
})

# Reshape to long form: one row per (Model, Metric) pair, which is the layout
# Plotly Express needs to colour and group the bars by metric
results_melted = results.melt(id_vars='Model', value_vars=['Accuracy', 'F1 Score'], 
                              var_name='Metric', value_name='Score')

# Grouped bar chart: one group per model, one bar per metric
fig = px.bar(
    results_melted, 
    x='Model', 
    y='Score', 
    color='Metric',
    barmode='group',  # Group bars for Accuracy and F1 Score side by side
    title='Accuracy and F1 Score Comparison of Different Models',
    text='Score',
    color_discrete_sequence=px.colors.qualitative.Bold
)

# Layout: fixed 0-100 % axis, dark template, transparent backgrounds, compact size
fig.update_layout(
    title_font_size=20,
    xaxis_title='',
    yaxis_title='Percentage (%)',
    yaxis_range=[0, 100],
    template='plotly_dark',  # Dark theme for better contrast
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="white"
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
    legend_title_text='Metric',
    width=600,  # Set width to make the graph slimmer
    height=400  # Adjust height accordingly
)

# Print each bar's value above it with two decimals.
# The y-axis stops at 100, so labels placed outside bars that reach almost
# 100 % fall above the plotting area; cliponaxis=False keeps them from being
# clipped at the axis boundary.
fig.update_traces(
    texttemplate='%{text:.2f}%',
    textposition='outside',
    marker_line_width=2,
    cliponaxis=False,
)

# Show the figure
fig.show()
