In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

print("Current Directory:", os.getcwd())


Current Directory: C:\Users\HP\Desktop\Nexford University\Business Analytics Capstone\Module 4 Assignment


In [3]:
# Step 2- Load dataset
df = pd.read_csv("Finalised_Dataset_paysim_cleaned.csv")
print("Dataset Shape:", df.shape)
print(df.head())


Dataset Shape: (6362620, 15)
   step    amount     nameorig  oldbalanceorg  newbalanceorig     namedest  \
0     1   9839.64  C1231006815       170136.0       160296.36  M1979787155   
1     1   1864.28  C1666544295        21249.0        19384.72  M2044282225   
2     1    181.00  C1305486145          181.0            0.00   C553264065   
3     1    181.00   C840083671          181.0            0.00    C38997010   
4     1  11668.14  C2048537720        41554.0        29885.86  M1230701703   

   oldbalancedest  newbalancedest  isfraud  isflaggedfraud  type_encoded  \
0             0.0             0.0        0               0             3   
1             0.0             0.0        0               0             3   
2             0.0             0.0        1               0             4   
3         21182.0             0.0        1               0             1   
4             0.0             0.0        0               0             3   

   error_balance_orig  error_balance_dest  or

In [4]:
# Step 3 Define Features and Target

X = df.drop(['isfraud', 'nameorig', 'namedest'], axis=1)
y = df['isfraud']
print("Step 3: Features and target defined")
print("Any NaNs in X?", X.isnull().sum().sum())


Step 3: Features and target defined
Any NaNs in X? 0


In [5]:
# Step 4 SMOTE
print("Original class distribution:", y.value_counts())
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)
print("Resampled class distribution:", y_resampled.value_counts())


Original class distribution: isfraud
0    6354407
1       8213
Name: count, dtype: int64
Resampled class distribution: isfraud
0    6354407
1    6354407
Name: count, dtype: int64


In [7]:
# Step 5 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
print("Data split")


Data split


In [8]:
# Step 6 Train Model

# Train Logistic Regression
log_model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
log_model.fit(X_train, y_train)

# Train XGBoost
from xgboost import XGBClassifier
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predictions
log_pred = log_model.predict(X_test)
log_proba = log_model.predict_proba(X_test)[:, 1]

xgb_pred = xgb_model.predict(X_test)
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]

# Evaluation
print("=== Logistic Regression ===")
print(classification_report(y_test, log_pred))
print("ROC AUC:", roc_auc_score(y_test, log_proba))

print("\n=== XGBoost ===")
print(classification_report(y_test, xgb_pred))
print("ROC AUC:", roc_auc_score(y_test, xgb_proba))


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.95      0.97      0.96   1270837
           1       0.97      0.95      0.96   1270926

    accuracy                           0.96   2541763
   macro avg       0.96      0.96      0.96   2541763
weighted avg       0.96      0.96      0.96   2541763

ROC AUC: 0.991931304340943

=== XGBoost ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270837
           1       1.00      1.00      1.00   1270926

    accuracy                           1.00   2541763
   macro avg       1.00      1.00      1.00   2541763
weighted avg       1.00      1.00      1.00   2541763

ROC AUC: 0.9999991162755614


In [1]:
# Step 7 Evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
y_proba = model.predict_proba(X_test)[:, 1]
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

# Confusion Matrix
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


NameError: name 'model' is not defined

In [None]:
# Note:
# All necessary libraries were imported, and I have to print current directory just to test/see my dataset location.
# The cleaned data from milestone 1 assignment was loaded into my notebook, ready for modelling
# Next, I defined the features.
# The dataset was highly imbalanced, so I used SMOTE to balance the class
# See the README file for more details.

In [9]:
import joblib

# Using xgb_model as my trained model
joblib.dump(xgb_model, 'model.pkl')

['model.pkl']