In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# KEEP THIS CELL: THE DATA-LOADING CELL BELOW READS `mlg_ulb_creditcardfraud_path`.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub. The returned value is later joined
# with 'creditcard.csv' by the data-loading cell to locate the CSV file.
KAGGLE_DATASET_HANDLE = 'mlg-ulb/creditcardfraud'
mlg_ulb_creditcardfraud_path = kagglehub.dataset_download(KAGGLE_DATASET_HANDLE)

print('Data source import complete.')

In [None]:
import os
import pandas as pd
import numpy as np

# Locate the CSV inside the directory returned by the kagglehub cell.
data_path = os.path.join(mlg_ulb_creditcardfraud_path, 'creditcard.csv')

# Load the raw table and append the derived feature Amount / (Time + 1);
# the +1 keeps the denominator non-zero when Time is 0.
df = pd.read_csv(data_path).assign(
    TransactionDensity=lambda frame: frame['Amount'] / (frame['Time'] + 1)
)

# Replace missing numeric values with the mean of their column.
df = df.fillna(df.mean(numeric_only=True))

# Columns that get standardised later, followed by the V1..V28 columns.
features_to_scale_names = ['Amount', 'TransactionDensity']
pca_features = ['V' + str(i) for i in range(1, 29)]

# Feature matrix (an independent copy) and the label column.
X = df.loc[:, features_to_scale_names + pca_features].copy()
y = df['Class']

print(f"Full dataset ready for splitting. Total records: {len(df)}")
print(f"Fraud ratio: {y.mean():.4%}")

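***UNDERSAMPLING***

Balances the training split by randomly discarding majority-class rows; the test split is left untouched. This cell and the SMOTE cell below both assign `X_train` / `y_train`, and whichever runs last is what the models are trained on, so run only the one you want to evaluate.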


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Stratified 70/30 split so both partitions keep the original class ratio.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the scaler on the training rows only, then apply the same
# transformation to both partitions.
scaler = StandardScaler().fit(X_train[features_to_scale_names])
for _partition in (X_train, X_test):
    _partition[features_to_scale_names] = scaler.transform(
        _partition[features_to_scale_names]
    )

# Resample the training partition only; the test partition is not resampled.
rus = RandomUnderSampler(random_state=42)
X_train, y_train = rus.fit_resample(X_train, y_train)

print(f"Training set size (Balanced): {len(X_train)}")
print(f"Test set size (Real World): {len(X_test)}")

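***SMOTE***

Alternative to the undersampling cell: it re-splits the data and oversamples the training split with SMOTE. It overwrites `X_train` / `y_train`, so on a top-to-bottom run the models below are trained on the SMOTE-resampled data.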

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Stratified 70/30 split so both partitions keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
scaler.fit(X_train[features_to_scale_names])
X_train[features_to_scale_names] = scaler.transform(X_train[features_to_scale_names])
X_test[features_to_scale_names] = scaler.transform(X_test[features_to_scale_names])

# Record the fraud count of the *training* split before it is resampled.
# It has to be captured here because y_train is overwritten below, and the
# test split must not be used for a number labelled "Training".
original_train_fraud_count = int((y_train == 1).sum())

# Oversample the training partition only; the test partition is not resampled.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

X_train, y_train = X_train_smote, y_train_smote

print(f"Original Training Fraud Count: {original_train_fraud_count}")
print(f"New Training Set Size (SMOTE): {len(X_train)}")
print(f"New Fraud Count (SMOTE): {int((y_train == 1).sum())}")

In [None]:
from sklearn.ensemble import IsolationForest

iforest = IsolationForest(contamination=0.01, random_state=42)

# Build the scaled matrix from X, which has exactly one row per row of df.
# X_train cannot be used here: by this point it has been resampled, so
# concatenating it with X_test yields a different number of rows than df and
# the column assignment below would fail with a length mismatch.
X_full_scaled = X.copy()
X_full_scaled[features_to_scale_names] = scaler.transform(X[features_to_scale_names])
assert len(X_full_scaled) == len(df), "outlier input must align row-for-row with df"

# fit_predict labels each row -1 (outlier) or 1 (inlier).
df['isOutlier'] = iforest.fit_predict(X_full_scaled)

print("Total Anomaly Counts:")
print(df['isOutlier'].value_counts())

In [None]:
import time
from sklearn.metrics import recall_score, average_precision_score
import pandas as pd

# Accumulates one metrics dict per evaluated model; read later for plotting.
performance_data = []

# Column header and separator rule for the results table printed below.
table_header = f"{'Model':<25} | {'Recall':<8} | {'AUPRC':<8} | {'Train(s)':<8} | {'Inf(ms)':<8}"
table_rule = "-" * 70
print(table_header)
print(table_rule)

# Helper function to avoid repeating code for every model
def record_performance(name, model, X_train, y_train, X_test, y_test):
    """Fit ``model``, evaluate it on the test split, print a table row and return the metrics.

    Args:
        name: Label used in the printed row and in the returned dict.
        model: Estimator exposing ``fit`` and ``predict``, plus either
            ``predict_proba`` or ``decision_function`` for ranking scores.
        X_train, y_train: Data passed to ``model.fit``.
        X_test, y_test: Data used for predictions and metric computation.

    Returns:
        dict with keys "Model", "Recall", "AUPRC", "Train Time (s)" and
        "Inference Time (ms)" (the latter is the per-sample ``predict`` time).
    """
    # 1. Measure training time (perf_counter is monotonic and high resolution).
    train_start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - train_start

    # 2. Measure inference time: only the single predict() pass is timed, so
    #    the figure is not inflated by the extra scoring pass needed for AUPRC.
    inf_start = time.perf_counter()
    y_pred = model.predict(X_test)
    inf_time = time.perf_counter() - inf_start
    inf_time_ms = (inf_time / len(X_test)) * 1000  # Time per sample in ms

    # Continuous scores for AUPRC: positive-class probability when available,
    # otherwise the decision-function value.
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    else:
        y_scores = model.decision_function(X_test)

    # 3. Calculate metrics
    rec = recall_score(y_test, y_pred)
    auprc = average_precision_score(y_test, y_scores)

    # Print results
    print(f"{name:<25} | {rec:.4f}   | {auprc:.4f}   | {train_time:.3f}    | {inf_time_ms:.4f}")

    # Store for graphing
    return {
        "Model": name,
        "Recall": rec,
        "AUPRC": auprc,
        "Train Time (s)": train_time,
        "Inference Time (ms)": inf_time_ms
    }

# --- Execute Models ---
# These estimators are not imported anywhere else in the notebook, so they are
# imported here to keep the cell runnable on a fresh kernel.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# (label, estimator) pairs, evaluated in this order.
models_to_evaluate = [
    ("Logistic Regression", LogisticRegression(solver='liblinear', random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ("Linear SVC", LinearSVC(C=1, random_state=42, dual=False)),
]

for model_name, estimator in models_to_evaluate:
    performance_data.append(
        record_performance(model_name, estimator, X_train, y_train, X_test, y_test)
    )

In [None]:
import time
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.utils import set_random_seed
from sklearn.metrics import recall_score, average_precision_score

# Seed the network's random state so weight initialisation and batch shuffling
# are repeatable, matching the random_state=42 used by the other models.
set_random_seed(42)

# Define Model: two hidden ReLU layers and a sigmoid output for the binary label.
# The input width is declared with an explicit Input layer instead of passing
# input_shape to the first Dense layer.
nn_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Compile
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Measure Training Time (perf_counter is monotonic and high resolution)
start_train = time.perf_counter()
nn_model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=0)
train_time_nn = time.perf_counter() - start_train

# Measure Inference Time & Get Predictions
start_inf = time.perf_counter()
y_probs_nn = nn_model.predict(X_test, verbose=0).flatten()
y_pred_nn_binary = (y_probs_nn > 0.5).astype('int32')  # threshold probabilities at 0.5
inf_time_nn = time.perf_counter() - start_inf
inf_time_ms_nn = (inf_time_nn / len(X_test)) * 1000  # time per sample in ms

# Calculate Metrics
rec_nn = recall_score(y_test, y_pred_nn_binary)
auprc_nn = average_precision_score(y_test, y_probs_nn)

# Print results
print(f"{'Neural Network':<25} | {rec_nn:.4f}   | {auprc_nn:.4f}   | {train_time_nn:.3f}    | {inf_time_ms_nn:.4f}")

# Store in the same list as the other models, using the same keys
performance_data.append({
    "Model": "Neural Network",
    "Recall": rec_nn,
    "AUPRC": auprc_nn,
    "Train Time (s)": train_time_nn,
    "Inference Time (ms)": inf_time_ms_nn
})

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [5, 10, 15]}

# print("GridSearchCV")
# grid_search = GridSearchCV(
#     RandomForestClassifier(random_state=42),
#     param_grid,
#     cv=5,
#     scoring='recall',
#     n_jobs=-1
# )
# grid_search.fit(X_train, y_train)

# print("Grid Search Complete.")
# print("Best Hyperparameters:", grid_search.best_params_)
# print("Best Cross-Validation Recall Score:", f"{grid_search.best_score_:.4f}")

# best_rf_model = grid_search.best_estimator_
# y_pred_best_rf = best_rf_model.predict(X_test)
# print("Test Set Recall Score (using Best Model):", f"{recall_score(y_test, y_pred_best_rf):.4f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Convert the collected per-model metrics into a DataFrame, best AUPRC first
df_results = pd.DataFrame(performance_data).sort_values(by='AUPRC', ascending=False)

# Create the visualization with two side-by-side subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Performance Metrics (AUPRC and Recall)
df_results.plot(x='Model', y=['AUPRC', 'Recall'], kind='bar', ax=ax1, width=0.8, color=['#4C72B0', '#55A868'])
ax1.set_title('Detection Performance (Higher is Better)', fontsize=14, pad=15)
ax1.set_ylabel('Score')
ax1.set_ylim(0, 1.1)
ax1.legend(loc='lower right')
ax1.grid(axis='y', linestyle='--', alpha=0.6)
ax1.tick_params(axis='x', labelrotation=45)

# Efficiency (Training Time)
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to `hue` and the redundant legend is suppressed.
sns.barplot(
    data=df_results,
    x='Model',
    y='Train Time (s)',
    hue='Model',
    palette='magma',
    legend=False,
    ax=ax2,
)
ax2.set_title('Training Time (Lower is Better)', fontsize=14, pad=15)
ax2.set_ylabel('Seconds')
ax2.grid(axis='y', linestyle='--', alpha=0.6)
ax2.tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()