In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [6]:
# Load dataset
# The CSV location can be overridden with the FRAUD_DATA_PATH environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original hard-coded path is used.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_PATH",
    '/Users/sa26/Documents/GitHub/ML_Financial_Fraud_Detection/data/processed/processed_log.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,oldbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,1
3,1,CASH_OUT,181.0,181.0,21182.0,1
4,1,PAYMENT,11668.14,41554.0,0.0,0


In [11]:
# Split the frame into the label column (y) and every remaining column (X)
y = df["isFraud"]
X = df.drop(columns=["isFraud"])

In [13]:
# Column groups handed to the preprocessing step, one list per column type
numerical_features = [
    "step",
    "amount",
    "oldbalanceOrg",
    "oldbalanceDest",
]
categorical_features = ["type"]

In [15]:
# One transformer per column type:
#   - categorical columns are one-hot encoded; handle_unknown="ignore" means a
#     category not seen during fit does not raise at transform time
#   - numeric columns are standardized
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = StandardScaler()

In [16]:
# Assemble the column-wise preprocessing: the numeric transformer is applied
# to the numeric columns and the categorical transformer to the categorical
# ones. Columns that appear in neither list (e.g. the "Unnamed: 0" column
# shown by df.head()) are dropped; that is ColumnTransformer's default and is
# spelled out here so it is visible to the reader.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [17]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# label so both parts keep the same class ratio, and the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [18]:
# Apply preprocessing to training and testing data.
# The preprocessor is fitted on the training split only; the test split is
# transformed with the already-fitted preprocessor so that no statistics are
# learned from the test data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [19]:
# Convert processed data back to DataFrames (helpful for inspection).
# The row labels of X_train / X_test are carried over so these frames stay
# aligned with y_train / y_test; without index=... each frame would get a
# fresh 0..n-1 index that no longer matches the labels.
processed_feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(
    X_train_processed,
    columns=processed_feature_names,
    index=X_train.index,
)
X_test_processed_df = pd.DataFrame(
    X_test_processed,
    columns=processed_feature_names,
    index=X_test.index,
)

Initial Classifier Training and Evaluation

In [20]:
# Logistic Regression (baseline; only the random seed is set)
lr = LogisticRegression(random_state=42)
lr.fit(X_train_processed, y_train)
y_pred_lr = lr.predict(X_test_processed)

# Weighted F1 averages the per-class F1 by support. Fraud is only about 0.13%
# of the test rows, so the weighted value is almost entirely the majority-class
# score. It is kept unchanged for comparability, and the F1 of the fraud class
# (label 1) is reported next to it.
f1_lr_initial = f1_score(y_test, y_pred_lr, average='weighted')
f1_lr_initial_fraud = f1_score(y_test, y_pred_lr, average='binary', pos_label=1)
print(f"Initial Logistic Regression F1 Score: {f1_lr_initial:.4f}")
print(f"Initial Logistic Regression Fraud-Class F1 Score: {f1_lr_initial_fraud:.4f}")
print(f"Initial Logistic Regression Classification Report:\n{classification_report(y_test, y_pred_lr)}")

Initial Logistic Regression F1 Score: 0.9982
Initial Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.93      0.05      0.10      1643

    accuracy                           1.00   1272524
   macro avg       0.97      0.53      0.55   1272524
weighted avg       1.00      1.00      1.00   1272524



In [21]:
# Naive Bayes (baseline, default settings)
nb = GaussianNB()
nb.fit(X_train_processed, y_train)
y_pred_nb = nb.predict(X_test_processed)

# Weighted F1 averages the per-class F1 by support. Fraud is only about 0.13%
# of the test rows, so the weighted value is almost entirely the majority-class
# score. It is kept unchanged for comparability, and the F1 of the fraud class
# (label 1) is reported next to it.
f1_nb_initial = f1_score(y_test, y_pred_nb, average='weighted')
f1_nb_initial_fraud = f1_score(y_test, y_pred_nb, average='binary', pos_label=1)
print(f"Initial Naive Bayes F1 Score: {f1_nb_initial:.4f}")
print(f"Initial Naive Bayes Fraud-Class F1 Score: {f1_nb_initial_fraud:.4f}")
print(f"Initial Naive Bayes Classification Report:\n{classification_report(y_test, y_pred_nb)}")

Initial Naive Bayes F1 Score: 0.7216
Initial Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.57      0.72   1270881
           1       0.00      1.00      0.01      1643

    accuracy                           0.57   1272524
   macro avg       0.50      0.78      0.36   1272524
weighted avg       1.00      0.57      0.72   1272524



In [22]:
# k-Nearest Neighbors (baseline, default settings)
knn = KNeighborsClassifier()
knn.fit(X_train_processed, y_train)
y_pred_knn = knn.predict(X_test_processed)

# Weighted F1 averages the per-class F1 by support. Fraud is only about 0.13%
# of the test rows, so the weighted value is almost entirely the majority-class
# score. It is kept unchanged for comparability, and the F1 of the fraud class
# (label 1) is reported next to it.
f1_knn_initial = f1_score(y_test, y_pred_knn, average='weighted')
f1_knn_initial_fraud = f1_score(y_test, y_pred_knn, average='binary', pos_label=1)
print(f"Initial kNN F1 Score: {f1_knn_initial:.4f}")
print(f"Initial kNN Fraud-Class F1 Score: {f1_knn_initial_fraud:.4f}")
print(f"Initial kNN Classification Report:\n{classification_report(y_test, y_pred_knn)}")

Initial kNN F1 Score: 0.9993
Initial kNN Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.88      0.55      0.68      1643

    accuracy                           1.00   1272524
   macro avg       0.94      0.78      0.84   1272524
weighted avg       1.00      1.00      1.00   1272524

