In [6]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import plotly.express as px # type: ignore
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [7]:
# Load data
data = pd.read_csv('PS_20174392719_1491204439457_log.csv')

In [8]:
data.head(100) 


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.0,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.0,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
95,1,TRANSFER,710544.77,C835773569,0.0,0.00,C1359044626,738531.50,16518.36,0,0
96,1,TRANSFER,581294.26,C843299092,0.0,0.00,C1590550415,5195482.15,19169204.93,0,0
97,1,TRANSFER,11996.58,C605982374,0.0,0.00,C1225616405,40255.00,0.00,0,0
98,1,PAYMENT,2875.10,C1412322831,15443.0,12567.90,M1651262695,0.00,0.00,0,0


In [9]:
# Rows and cols 
data.shape

(6362620, 11)

In [10]:
# Nulls 
data.isnull().sum( )

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [31]:
# Select specific columns to display
columns_to_display = ['type', 'amount', 'oldbalanceOrg', 'newbalanceDest', 'isFlaggedFraud']

# Display the selected columns in table format
display_data = data[columns_to_display].head()  # Display first few rows
display(display_data)  # Use display() for table formatting in Jupyter-style environments

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceDest,isFlaggedFraud
0,PAYMENT,9839.64,170136.0,0.0,0
1,PAYMENT,1864.28,21249.0,0.0,0
2,TRANSFER,181.0,181.0,0.0,0
3,CASH_OUT,181.0,181.0,0.0,0
4,PAYMENT,11668.14,41554.0,0.0,0


In [11]:
# Types of payments 
data.type.value_counts() 

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [12]:
# Visual 
type = data['type'].value_counts() 
transaction = type.index 
quantity  = type.values 
figure = px.pie(data , values=quantity , names=transaction , hole=0.5, title=" Distribution Of Transaction Type")
figure.show() 


In [13]:
numeric_data = data.select_dtypes(include=[float, int])
correlation = numeric_data.corr()
print(correlation['isFraud'].sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [14]:
# Feature engineering for behavior analysis
data['balanceOrg_diff'] = data['oldbalanceOrg'] - data['newbalanceOrig']
data['balanceDest_diff'] = data['oldbalanceDest'] - data['newbalanceDest']
data['transaction_type'] = data['type'].map({
    'CASH_OUT': 1, 
    'PAYMENT': 2, 
    'CASH_IN': 3, 
    'TRANSFER': 4, 
    'DEBIT': 5
})


In [15]:
# Select the most important features based on behavior
features = ['transaction_type', 'amount', 'balanceOrg_diff', 'balanceDest_diff', 'isFlaggedFraud']
x = np.array(data[features])
y = np.array(data['isFraud'])


In [16]:
# Split into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:

# Define the models
decision_tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
log_reg = LogisticRegression(max_iter=1000)


In [18]:
# Train the models
decision_tree.fit(xtrain, ytrain)
knn.fit(xtrain, ytrain.ravel())
log_reg.fit(xtrain, ytrain.ravel())

In [19]:
# Predict and calculate accuracy and F1 score for each model
dt_score = accuracy_score(ytest, decision_tree.predict(xtest))
knn_score = accuracy_score(ytest, knn.predict(xtest))
log_reg_score = accuracy_score(ytest, log_reg.predict(xtest))

In [20]:
# F1 Score calculating
dt_f1 = f1_score(ytest, decision_tree.predict(xtest))
knn_f1 = f1_score(ytest, knn.predict(xtest))
log_reg_f1 = f1_score(ytest, log_reg.predict(xtest))

In [21]:
# Print the results
print(f"Decision Tree Accuracy: {dt_score * 100:.2f}%")
print(f"K-Nearest Neighbors Accuracy: {knn_score * 100:.2f}%")
print(f"Logistic Regression Accuracy: {log_reg_score * 100:.2f}%")
print(f"Decision Tree F1 Score: {dt_f1:.2f}")
print(f"K-Nearest Neighbors F1 Score: {knn_f1:.2f}") 
print(f"Logistic Regression F1 Score: {log_reg_f1:.2f}")
      

Decision Tree Accuracy: 99.93%
K-Nearest Neighbors Accuracy: 99.93%
Logistic Regression Accuracy: 99.91%
Decision Tree F1 Score: 0.71
K-Nearest Neighbors F1 Score: 0.68
Logistic Regression F1 Score: 0.50


In [22]:
# Graph 
results = pd.DataFrame(
    { 'Model': ['Decision Tree', 'KNN', 'Logistic Regression'],
    'Accuracy': [dt_score * 100, knn_score * 100, log_reg_score * 100], 
    'F1 Score': [dt_f1 * 100, knn_f1 * 100, log_reg_f1 * 100] })


results_melted = results.melt(id_vars='Model', value_vars=['Accuracy', 'F1 Score'], var_name='Metric', value_name='Score')

fig = px.bar( results_melted, x='Model', y='Score', color='Metric', barmode='group', 
      title='Accuracy and F1 Score Comparison of Different Models',
      text='Score', color_discrete_sequence=px.colors.qualitative.Bold )

fig.update_layout( title_font_size=20, xaxis_title='', yaxis_title='Percentage (%)', yaxis_range=[0, 100], template='plotly_dark',
font=dict( family="Arial, sans-serif", size=12, color="white" ), plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)', 
showlegend=True, legend_title_text='Metric', width=600, height=400 )

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside', marker_line_width=2)

fig.show()  

In [23]:
# FINE TUNING DECISION TREE 

xtrain_sample, _, ytrain_sample, _ = train_test_split(xtrain, ytrain, test_size=0.8, random_state=42)

# Hyperparameter tuning 
param_grid = {
    'max_depth': [10, 20, 30, 40, 50, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}


# model 
decision_tree = DecisionTreeClassifier(random_state=42)

In [24]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=decision_tree,
    param_distributions=param_grid,
    n_iter=20,  
    cv=3,       
    random_state=42,
    n_jobs=-1,  
    scoring='accuracy'
)


random_search.fit(xtrain_sample, ytrain_sample)

best_params = random_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 40, 'criterion': 'entropy'}


In [25]:

best_decision_tree = DecisionTreeClassifier(**best_params, random_state=42)
best_decision_tree.fit(xtrain, ytrain)

y_pred = best_decision_tree.predict(xtest)
accuracy = accuracy_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred)        

print(f"Tuned Decision Tree Accuracy: {accuracy * 100:.2f}%")
print(f"Tuned Decision Tree F1 Score: {f1 * 100:.2f}%")

Tuned Decision Tree Accuracy: 99.96%
Tuned Decision Tree F1 Score: 81.39%


In [26]:
# Create a DataFrame for performance comparison
results = pd.DataFrame({
    'Metric': ['Accuracy', 'F1 Score'],
    'Score': [accuracy * 100, f1 * 100]
})

fig = px.bar(
    results,
    x='Metric',
    y='Score',
    title='Tuned Decision Tree: Performance Metrics',
    text='Score',
    color='Metric',
    color_discrete_sequence=px.colors.qualitative.Bold
)

fig.update_layout(
    title_font_size=20,
    xaxis_title='Metric',
    yaxis_title='Percentage (%)',
    yaxis_range=[0, 100],
    template='plotly_dark',
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="white"
    ),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    width=600,
    height=400
)

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside', marker_line_width=2)

fig.show()


In [27]:
# Simulated data for training and validation performance over epochs
epochs = ['Epoch 1', 'Epoch 2', 'Epoch 3', 'Epoch 4', 'Epoch 5']

training_performance = [60, 75, 85, 95, 93]  
validation_performance = [55, 70, 80, 90, 88]  

results = pd.DataFrame({
    'Epoch': epochs,
    'Training Performance': training_performance,
    'Validation Performance': validation_performance
})

results_melted = results.melt(id_vars='Epoch', value_vars=['Training Performance', 'Validation Performance'],
                              var_name='Dataset', value_name='Score')

fig = px.line(
    results_melted,
    x='Epoch',  
    y='Score',  
    color='Dataset',  
    markers=True,
    title='Training vs Validation Performance Over Epochs',
    color_discrete_sequence=['#1f77b4', '#ff7f0e']  
)


fig.update_layout(
    title_font_size=20,
    xaxis_title='Epoch',
    yaxis_title='Percentage (%)',
    yaxis_range=[50, 100],  
    template='plotly_dark', 
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="white"
    ),
    plot_bgcolor='rgba(0,0,0,0)',  
    paper_bgcolor='rgba(0,0,0,0)',
    showlegend=True,
    legend_title_text='Dataset',
    width=700,
    height=500
)

fig.update_traces(texttemplate='%{y:.2f}%', textposition='top center', marker_line_width=2)

fig.show()


In [40]:
# Example set of transactions
transactions = [
    [1, 1000000, 5000000, 10000, 1],  # Fraudulent
    [2, 5000, 0, 0, 0],              # Non-Fraudulent
    [1, 200000, 10000, 2000, 1],     # Fraudulent
    [2, 7000, 100, 50, 0],           # Non-Fraudulent
    [1, 300000, 200000, 15000, 1]     # Fraudulent
]

In [39]:
def classify_transaction(transaction_data):
    # Model 
    prediction = best_decision_tree.predict([transaction_data])
    return "Fraud" if prediction[0] == 1 else "Not Fraud"

total_transactions = len(transactions)
total_fraud_transactions = sum(1 for t in transactions if t[-1] == 1) 
detected_fraud_transactions = sum(1 for t in transactions if classify_transaction(t) == "Fraud")

# Output the results
print(f"Total No of Transactions: {total_transactions}")
print(f"Total Fraud Transactions: {total_fraud_transactions}")
print(f"No of Fraud Transactions Detected: {detected_fraud_transactions}")


Total No of Transactions: 5
Total Fraud Transactions: 3
No of Fraud Transactions Detected: 2
