In [19]:
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [3]:
# Load the pre-processed transaction log.
# The CSV location can be overridden through the FRAUD_DATA_CSV environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset, the original path is used unchanged.
DATA_PATH = os.environ.get(
    "FRAUD_DATA_CSV",
    "/Users/sa26/Documents/GitHub/ML_Financial_Fraud_Detection/data/processed/processed_log.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,oldbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.0,0.0,0
1,1,PAYMENT,1864.28,21249.0,0.0,0
2,1,TRANSFER,181.0,181.0,0.0,1
3,1,CASH_OUT,181.0,181.0,21182.0,1
4,1,PAYMENT,11668.14,41554.0,0.0,0


In [4]:
# Target vector: the fraud label column.
y = df["isFraud"]
# Feature frame: every other column of the loaded data.
X = df.drop(columns=["isFraud"])

In [5]:
# Column groups, split by the kind of preprocessing each one needs.
numerical_features = [
    "step",
    "amount",
    "oldbalanceOrg",
    "oldbalanceDest",
]
categorical_features = ["type"]

In [6]:
# One transformer per column group:
#   - categorical columns are one-hot encoded; handle_unknown="ignore" makes
#     the encoder tolerate categories at transform time that were absent
#     during fit instead of raising
#   - numerical columns are standardised
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numerical_transformer = StandardScaler()

In [7]:
# Route each column group through its own transformer.
# No `remainder` is passed, so scikit-learn's default handling applies to any
# column of X that is in neither list (e.g. the CSV's "Unnamed: 0" column).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
)

In [16]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# partitions keep the same fraud / non-fraud ratio, and the fixed seed makes
# it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [17]:
# Apply preprocessing to training and testing data.
# The preprocessor is fitted on the training split only; the test split is
# transformed with that already-fitted preprocessor, so nothing is learned
# from the held-out rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [18]:
# Convert processed data back to DataFrames (helpful for inspection).
# Each frame reuses the row index of the split it came from, so it lines up
# with y_train / y_test (which keep the index of the loaded data) when the
# two are compared, joined or filtered together.
processed_columns = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(
    X_train_processed, columns=processed_columns, index=X_train.index
)
X_test_processed_df = pd.DataFrame(
    X_test_processed, columns=processed_columns, index=X_test.index
)

Initial Classifiers Training and Evaluation

In [10]:
# Logistic Regression baseline: library-default hyperparameters, fixed seed.
lr = LogisticRegression(random_state=42).fit(X_train_processed, y_train)
y_pred_lr = lr.predict(X_test_processed)

# NOTE(review): average="weighted" weights each class's F1 by its support, so
# with this class balance the headline number mostly reflects the non-fraud
# class. Read the class-1 row of the report for fraud-detection quality.
f1_lr_initial = f1_score(y_test, y_pred_lr, average="weighted")

print(f"Initial Logistic Regression F1 Score: {f1_lr_initial:.4f}")
print(f"Initial Logistic Regression Classification Report:\n{classification_report(y_test, y_pred_lr)}")

Initial Logistic Regression F1 Score: 0.9982
Initial Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.93      0.05      0.10      1643

    accuracy                           1.00   1272524
   macro avg       0.97      0.53      0.55   1272524
weighted avg       1.00      1.00      1.00   1272524



In [11]:
# Gaussian Naive Bayes baseline with library-default settings.
nb = GaussianNB().fit(X_train_processed, y_train)
y_pred_nb = nb.predict(X_test_processed)

# Support-weighted F1 on the held-out split, followed by the per-class report.
f1_nb_initial = f1_score(y_test, y_pred_nb, average="weighted")

print(f"Initial Naive Bayes F1 Score: {f1_nb_initial:.4f}")
print(f"Initial Naive Bayes Classification Report:\n{classification_report(y_test, y_pred_nb)}")

Initial Naive Bayes F1 Score: 0.7216
Initial Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.57      0.72   1270881
           1       0.00      1.00      0.01      1643

    accuracy                           0.57   1272524
   macro avg       0.50      0.78      0.36   1272524
weighted avg       1.00      0.57      0.72   1272524



In [11]:
# k-Nearest Neighbors baseline with library-default settings.
knn = KNeighborsClassifier().fit(X_train_processed, y_train)
y_pred_knn = knn.predict(X_test_processed)

# Support-weighted F1 on the held-out split, followed by the per-class report.
f1_knn_initial = f1_score(y_test, y_pred_knn, average="weighted")

print(f"Initial kNN F1 Score: {f1_knn_initial:.4f}")
print(f"Initial kNN Classification Report:\n{classification_report(y_test, y_pred_knn)}")

Initial kNN F1 Score: 0.9993
Initial kNN Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.88      0.55      0.68      1643

    accuracy                           1.00   1272524
   macro avg       0.94      0.78      0.84   1272524
weighted avg       1.00      1.00      1.00   1272524



An SVM was also attempted, but it proved unsuitable for a dataset of this size because of its long training time.

Hyperparameter Tuning (Focus on RandomizedSearchCV due to speed)

In [13]:
# Logistic Regression: randomized search over the regularisation strength C
# (20 log-spaced values from 1e-3 to 1e3) and the penalty type. liblinear is
# the only solver offered because it accepts both 'l1' and 'l2'.
param_distributions_lr = {
    'C': np.logspace(-3, 3, 20),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 10 sampled candidates x 5 folds = 50 fits on the full training split.
random_search_lr = RandomizedSearchCV(
    LogisticRegression(random_state=42),
    param_distributions_lr,
    n_iter=10,
    cv=5,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,  # run the fits in parallel, as the kNN search does
)
random_search_lr.fit(X_train_processed, y_train)

# With the default refit behaviour, best_estimator_ is already trained on the
# whole training split using the winning parameters.
best_lr_random = random_search_lr.best_estimator_
print(f"Best Hyperparameters (RandomizedSearchCV) for Logistic Regression: {random_search_lr.best_params_}")

Best Hyperparameters (RandomizedSearchCV) for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l2', 'C': 1000.0}


In [12]:
# kNN: randomized search over neighbourhood size, vote weighting and metric.
knn_params = dict(
    n_neighbors=list(range(1, 21)),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 15 sampled candidates, 3-fold CV, scored by support-weighted F1.
# n_jobs=-1 uses all available processors.
knn_search = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_params,
    n_iter=15,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=42,
).fit(X_train_processed, y_train)

best_knn = knn_search.best_estimator_
print(f"Best kNN params: {knn_search.best_params_}")

  _data = np.array(data, dtype=dtype, copy=copy,


Best kNN params: {'weights': 'distance', 'n_neighbors': 5, 'metric': 'manhattan'}


In [18]:
# Naive Bayes: search over var_smoothing only.
# The candidate list holds 5 log-spaced values (1e-9 ... 1e-5) and n_iter is
# also 5, so the number of draws equals the number of candidates.
param_distributions_nb = {'var_smoothing': np.logspace(start=-9, stop=-5, num=5)}

random_search_nb = RandomizedSearchCV(
    estimator=GaussianNB(),
    param_distributions=param_distributions_nb,
    n_iter=5,
    cv=5,
    scoring='f1_weighted',
    random_state=42,
)
random_search_nb.fit(X_train_processed, y_train)

best_nb = random_search_nb.best_estimator_
print(f"Best Hyperparameters for Naive Bayes: {random_search_nb.best_params_}")

Best Hyperparameters for Naive Bayes: {'var_smoothing': 1e-05}


Re-train Models with Optimal Hyperparameters and Evaluate

In [23]:
# Build a Logistic Regression model from the hyperparameters selected by the
# randomized search. Reading them from the search object (instead of copying
# the printed values by hand) keeps this cell correct if the search is re-run
# and picks a different winner.
best_hyperparameters = random_search_lr.best_params_

# random_state=42 matches every other LogisticRegression in this notebook
# (the baseline and the estimator used inside the search).
tuned_lr_model = LogisticRegression(random_state=42, **best_hyperparameters)

# Train the new model on the training split
tuned_lr_model.fit(X_train_processed, y_train)

# Predict on the held-out test split
y_pred_lr_manual_tuned = tuned_lr_model.predict(X_test_processed)

# Evaluate: support-weighted F1 plus the per-class report
f1_lr_manual_tuned = f1_score(y_test, y_pred_lr_manual_tuned, average='weighted')
print(f"Manually Tuned Logistic Regression F1 Score: {f1_lr_manual_tuned:.4f}")
print(f"Manually Tuned Logistic Regression Classification Report:\n{classification_report(y_test, y_pred_lr_manual_tuned)}")

Manually Tuned Logistic Regression F1 Score: 0.9986
Manually Tuned Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.81      0.23      0.35      1643

    accuracy                           1.00   1272524
   macro avg       0.90      0.61      0.68   1272524
weighted avg       1.00      1.00      1.00   1272524



In [25]:
# Naive Bayes with the hyperparameter selected by the randomized search.
# The value is read from the search object rather than typed in by hand, so
# this cell cannot drift from the search result if the search is re-run.
tuned_nb = GaussianNB(**random_search_nb.best_params_)

# Fit the model with the tuned hyperparameter
tuned_nb.fit(X_train_processed, y_train)

# Predict on the held-out test split
y_pred_nb_tuned_manual = tuned_nb.predict(X_test_processed)

# Evaluate: support-weighted F1 plus the per-class report
f1_nb_tuned_manual = f1_score(y_test, y_pred_nb_tuned_manual, average='weighted')
print(f"Manually Tuned Naive Bayes F1 Score: {f1_nb_tuned_manual:.4f}")
print(f"Manually Tuned Naive Bayes Classification Report:\n{classification_report(y_test, y_pred_nb_tuned_manual)}")

Manually Tuned Naive Bayes F1 Score: 0.7216
Manually Tuned Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.57      0.72   1270881
           1       0.00      1.00      0.01      1643

    accuracy                           0.57   1272524
   macro avg       0.50      0.78      0.36   1272524
weighted avg       1.00      0.57      0.72   1272524



Tuning made no difference for Naive Bayes: the tuned model's F1 score and classification report are identical to those of the initial model.

In [26]:
# kNN: evaluate the estimator chosen by the randomized search.
# It is fitted on the training split again here before predicting, so the
# evaluation does not depend on what data the search cell last saw.
y_pred_knn_tuned = best_knn.fit(X_train_processed, y_train).predict(X_test_processed)

# Support-weighted F1 on the held-out split, followed by the per-class report.
f1_knn_tuned = f1_score(y_test, y_pred_knn_tuned, average="weighted")

print(f"Tuned kNN F1 Score: {f1_knn_tuned:.4f}")
print(f"Tuned kNN Classification Report:\n{classification_report(y_test, y_pred_knn_tuned)}")

Tuned kNN F1 Score: 0.9993
Tuned kNN Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.90      0.61      0.72      1643

    accuracy                           1.00   1272524
   macro avg       0.95      0.80      0.86   1272524
weighted avg       1.00      1.00      1.00   1272524

