In [None]:
# Cleaning, splitting, and saving data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Destination folder for the processed split; created if it does not exist yet.
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

# Read the raw table and discard every row that contains a missing value.
df = (
    pd.read_csv("creditcard.csv")
    .dropna()
)

# Predictor columns are Time, V1..V28 and Amount; the target column is Class.
features = ["Time", *(f"V{i}" for i in range(1, 29)), "Amount"]
X, y = df[features], df["Class"]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training portion only, then apply it to both portions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Write the four pieces of the split to disk (no index column).
# The scaler returns plain arrays, so the column names are re-attached first.
outputs = {
    "X_train_scaled.csv": pd.DataFrame(X_train_scaled, columns=features),
    "X_test_scaled.csv": pd.DataFrame(X_test_scaled, columns=features),
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for filename, table in outputs.items():
    table.to_csv(os.path.join(output_dir, filename), index=False)

In [None]:
#Train & Evaluate Logistic Regression

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Load the scaled train/test split from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
# squeeze() collapses the label frames to Series (assumes each file holds one column).
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants to compare. Only the training data is resampled;
# the test set is used exactly as loaded.
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    "RandomOversampling": RandomOverSampler(random_state=42).fit_resample(X_train, y_train),
    "RandomUndersampling": RandomUnderSampler(random_state=42).fit_resample(X_train, y_train),
    "SMOTE": SMOTE(random_state=42).fit_resample(X_train, y_train)
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Train Logistic Regression.
    # The liblinear solver shuffles the data internally, so the seed is pinned
    # (same value as the samplers) to make repeated runs give the same model.
    model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
    model.fit(X_res, y_res)

    # Hard labels for the threshold metrics, class-1 probability for AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Evaluate on the test set.
    results.append({
        "Balancing Method": method_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        # Stored as text so the nested list fits in a single spreadsheet cell.
        "Confusion Matrix": str(confusion_matrix(y_test, y_pred).tolist())
    })

# Save results as one Excel sheet, one row per balancing method.
results_df = pd.DataFrame(results)
os.makedirs("results", exist_ok=True)
results_df.to_excel("results/logistic_regression_results.xlsx", index=False)

In [None]:
#Train & Evaluate XGBoost

import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Scaled features and labels read back from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants: the untouched split plus one resampled copy per sampler.
samplers = {
    "RandomOversampling": RandomOverSampler(random_state=42),
    "RandomUndersampling": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    **{label: sampler.fit_resample(X_train, y_train) for label, sampler in samplers.items()},
}

# Metrics computed from hard label predictions, keyed by their report column name.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Fit XGBoost on this variant of the training data.
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0)
    model.fit(X_res, y_res)

    # Hard labels and class-1 probabilities on the test set.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Assemble the row in the report's column order.
    row = {"Balancing Method": method_name}
    for column, metric in label_metrics.items():
        row[column] = metric(y_test, y_pred)
    row["AUC"] = roc_auc_score(y_test, y_proba)
    row["Confusion Matrix"] = str(confusion_matrix(y_test, y_pred).tolist())
    results.append(row)

# Write the report to an Excel file under results/.
os.makedirs("results", exist_ok=True)
excel_path = "results/xgboost_results.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(excel_path, index=False)

In [None]:
#Train & Evaluate K-Nearest Neighbours

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Load the scaled train/test split from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
# squeeze() collapses the label frames to Series (assumes each file holds one column).
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants to compare. Only the training data is resampled.
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    "RandomOversampling": RandomOverSampler(random_state=42).fit_resample(X_train, y_train),
    "RandomUndersampling": RandomUnderSampler(random_state=42).fit_resample(X_train, y_train),
    "SMOTE": SMOTE(random_state=42).fit_resample(X_train, y_train)
}

# Fast grid search parameters for KNN
param_grid = {
    'n_neighbors': [3, 5],
    'weights': ['uniform'],
    'metric': ['euclidean']
}

# Upper bound on the number of training rows given to KNN (speed trade-off).
MAX_KNN_ROWS = 10000

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Downsample for speed. The subsample is drawn per class with the same
    # fraction, so the class ratio of this variant is preserved instead of being
    # left to chance (a plain random sample of a heavily imbalanced set can end
    # up with very few minority rows). Per-class rounding means the result has
    # approximately, not exactly, MAX_KNN_ROWS rows.
    if len(X_res) > MAX_KNN_ROWS:
        keep_frac = MAX_KNN_ROWS / len(X_res)
        y_res = (
            y_res.groupby(y_res)
            .sample(frac=keep_frac, random_state=42)
            .sort_index()
        )
        # Relies on X_res and y_res sharing the same unique index.
        X_res = X_res.loc[y_res.index]

    # Grid Search with 2-fold CV, selecting on F1.
    grid = GridSearchCV(KNeighborsClassifier(), param_grid,
                        cv=2, scoring='f1', n_jobs=-1)
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels for the threshold metrics, class-1 probability for AUC.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Evaluate on the full test set.
    results.append({
        "Balancing Method": method_name,
        "Best Params": str(grid.best_params_),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        # Stored as text so the nested list fits in a single spreadsheet cell.
        "Confusion Matrix": str(confusion_matrix(y_test, y_pred).tolist())
    })

# Save results
results_df = pd.DataFrame(results)
os.makedirs("results", exist_ok=True)
excel_path = "results/knn_results.xlsx"
results_df.to_excel(excel_path, index=False)

In [None]:
#Train & Evaluate Linear Discriminant Analysis

import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Scaled features and labels read back from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants: the untouched split plus one resampled copy per sampler.
samplers = {
    "RandomOversampling": RandomOverSampler(random_state=42),
    "RandomUndersampling": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}
balancing_methods = {"Unbalanced": (X_train, y_train)}
for label, sampler in samplers.items():
    balancing_methods[label] = sampler.fit_resample(X_train, y_train)

# Only the solver is searched for LDA.
param_grid = dict(solver=['svd', 'lsqr', 'eigen'])

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # 2-fold grid search on the full variant, selecting on F1.
    # error_score='raise' makes a failing solver abort the search.
    grid = GridSearchCV(
        estimator=LinearDiscriminantAnalysis(),
        param_grid=param_grid,
        cv=2,
        scoring='f1',
        n_jobs=-1,
        error_score='raise',
    )
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels and class-1 probabilities on the test set.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Compute each metric, then assemble the row in the report's column order.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    matrix_text = str(confusion_matrix(y_test, y_pred).tolist())
    results.append({
        "Balancing Method": method_name,
        "Best Params": str(grid.best_params_),
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc,
        "Confusion Matrix": matrix_text,
    })

# Write the report to an Excel file under results/.
os.makedirs("results", exist_ok=True)
excel_path = "results/lda_results.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(excel_path, index=False)

In [None]:
#Train & Evaluate Gaussian Naive Bayes

import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Scaled features and labels read back from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants: the untouched split plus one resampled copy per sampler.
samplers = {
    "RandomOversampling": RandomOverSampler(random_state=42),
    "RandomUndersampling": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    **{label: sampler.fit_resample(X_train, y_train) for label, sampler in samplers.items()},
}

# Metrics computed from hard label predictions, keyed by their report column name.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Fit Gaussian Naive Bayes with default settings on this variant.
    model = GaussianNB()
    model.fit(X_res, y_res)

    # Hard labels and class-1 probabilities on the test set.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # "Best Params" is a placeholder: no hyperparameter search is run here.
    row = {"Balancing Method": method_name, "Best Params": "-"}
    for column, metric in label_metrics.items():
        row[column] = metric(y_test, y_pred)
    row["AUC"] = roc_auc_score(y_test, y_proba)
    row["Confusion Matrix"] = str(confusion_matrix(y_test, y_pred).tolist())
    results.append(row)

# Write the report to an Excel file under results/.
os.makedirs("results", exist_ok=True)
excel_path = "results/gaussian_nb_results.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(excel_path, index=False)

In [None]:
#Train & Evaluate AdaBoost

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Load the scaled train/test split from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
# squeeze() collapses the label frames to Series (assumes each file holds one column).
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants to compare. Only the training data is resampled.
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    "RandomOversampling": RandomOverSampler(random_state=42).fit_resample(X_train, y_train),
    "RandomUndersampling": RandomUnderSampler(random_state=42).fit_resample(X_train, y_train),
    "SMOTE": SMOTE(random_state=42).fit_resample(X_train, y_train)
}

# Grid search parameters for AdaBoost (small grid for speed)
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.5, 1.0]
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Grid Search with 2-fold CV using full data, selecting on F1.
    grid = GridSearchCV(AdaBoostClassifier(random_state=42), param_grid,
                        cv=2, scoring='f1', n_jobs=-1)
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels for the threshold metrics, class-1 probability for AUC.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Evaluate on the test set.
    results.append({
        "Balancing Method": method_name,
        "Best Params": str(grid.best_params_),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        # Stored as text so the nested list fits in a single spreadsheet cell.
        "Confusion Matrix": str(confusion_matrix(y_test, y_pred).tolist())
    })

# Save results
results_df = pd.DataFrame(results)
os.makedirs("results", exist_ok=True)
excel_path = "results/adaboost_results.xlsx"
results_df.to_excel(excel_path, index=False)

# Optional: Auto-download in Colab.
# Best effort only: outside Colab the import fails, and inside Colab the download
# itself may fail; in both cases fall back to reporting the local path.
# `except Exception` (rather than a bare `except`) keeps KeyboardInterrupt and
# SystemExit from being swallowed.
try:
    from google.colab import files
    files.download(excel_path)
except Exception:
    print(f"📁 File saved locally: {excel_path}")

print("✅ AdaBoost evaluation complete. Results saved to Excel.")

In [None]:
# Train and Evaluate Decision Tree

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Scaled features and labels read back from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants: the untouched split plus one resampled copy per sampler.
samplers = {
    "RandomOversampling": RandomOverSampler(random_state=42),
    "RandomUndersampling": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}
balancing_methods = {"Unbalanced": (X_train, y_train)}
for label, sampler in samplers.items():
    balancing_methods[label] = sampler.fit_resample(X_train, y_train)

# Small search space: only max_depth actually varies.
param_grid = dict(
    max_depth=[5, None],
    min_samples_split=[2],
    min_samples_leaf=[1],
    criterion=['gini'],
)

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # 2-fold grid search over a seeded decision tree, selecting on F1.
    grid = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),
        param_grid=param_grid,
        cv=2,
        scoring='f1',
        n_jobs=-1,
    )
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels and class-1 probabilities on the test set.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Compute each metric, then assemble the row in the report's column order.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    matrix_text = str(confusion_matrix(y_test, y_pred).tolist())
    results.append({
        "Balancing Method": method_name,
        "Best Params": str(grid.best_params_),
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "AUC": auc,
        "Confusion Matrix": matrix_text,
    })

# Write the report to an Excel file under results/.
os.makedirs("results", exist_ok=True)
excel_path = "results/decision_tree_results.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(excel_path, index=False)

In [None]:
# Train and Evaluate Random Forest

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Scaled features and labels read back from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Training-set variants: the untouched split plus one resampled copy per sampler.
samplers = {
    "RandomOversampling": RandomOverSampler(random_state=42),
    "RandomUndersampling": RandomUnderSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
}
balancing_methods = {
    "Unbalanced": (X_train, y_train),
    **{label: sampler.fit_resample(X_train, y_train) for label, sampler in samplers.items()},
}

# Small search space: only max_depth actually varies.
param_grid = dict(
    n_estimators=[50],
    max_depth=[10, None],
    min_samples_split=[2],
    min_samples_leaf=[1],
    criterion=['gini'],
)

# Metrics computed from hard label predictions, keyed by their report column name.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # 2-fold grid search over a seeded random forest, selecting on F1.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        cv=2,
        scoring='f1',
        n_jobs=-1,
    )
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels and class-1 probabilities on the test set.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Assemble the row in the report's column order.
    row = {"Balancing Method": method_name, "Best Params": str(grid.best_params_)}
    for column, metric in label_metrics.items():
        row[column] = metric(y_test, y_pred)
    row["AUC"] = roc_auc_score(y_test, y_proba)
    row["Confusion Matrix"] = str(confusion_matrix(y_test, y_pred).tolist())
    results.append(row)

# Write the report to an Excel file under results/.
os.makedirs("results", exist_ok=True)
excel_path = "results/random_forest_results.xlsx"
results_df = pd.DataFrame(results)
results_df.to_excel(excel_path, index=False)

In [None]:
#Train and Evaluate SVM

import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import os

# Load the scaled train/test split from the processed_data folder.
X_train = pd.read_csv("processed_data/X_train_scaled.csv")
X_test = pd.read_csv("processed_data/X_test_scaled.csv")
# squeeze() collapses the label frames to Series (assumes each file holds one column).
y_train = pd.read_csv("processed_data/y_train.csv").squeeze()
y_test = pd.read_csv("processed_data/y_test.csv").squeeze()

# Limit training size for faster execution. The cap is clamped to the number of
# available rows because DataFrame.sample raises ValueError when n exceeds the
# frame length (without replacement).
SVM_MAX_TRAIN_ROWS = 30000
n_train_rows = min(SVM_MAX_TRAIN_ROWS, len(X_train))
X_train_small = X_train.sample(n=n_train_rows, random_state=42)
# Pick the labels of the sampled rows; relies on X_train and y_train sharing an index.
y_train_small = y_train.loc[X_train_small.index]

# Define balancing techniques on subset
balancing_methods = {
    "Unbalanced": (X_train_small, y_train_small),
    "RandomOversampling": RandomOverSampler(random_state=42).fit_resample(X_train_small, y_train_small),
    "RandomUndersampling": RandomUnderSampler(random_state=42).fit_resample(X_train_small, y_train_small),
    "SMOTE": SMOTE(random_state=42).fit_resample(X_train_small, y_train_small)
}

# Minimal grid for fast SVM tuning.
# probability=True is required for predict_proba below.
param_grid = {
    'C': [1],
    'kernel': ['linear'],
    'probability': [True]
}

# One result row per balancing method.
results = []

for method_name, (X_res, y_res) in balancing_methods.items():
    # Grid Search with 2-fold CV for speed, selecting on F1.
    # The seed is pinned because SVC's probability estimates shuffle the data
    # internally; without it predict_proba (and therefore AUC) can change
    # between runs.
    grid = GridSearchCV(SVC(random_state=42), param_grid, cv=2, scoring='f1', n_jobs=-1)
    grid.fit(X_res, y_res)
    best_model = grid.best_estimator_

    # Hard labels for the threshold metrics, class-1 probability for AUC.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Evaluate on the full test set.
    results.append({
        "Balancing Method": method_name,
        "Best Params": str(grid.best_params_),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        # Stored as text so the nested list fits in a single spreadsheet cell.
        "Confusion Matrix": str(confusion_matrix(y_test, y_pred).tolist())
    })

# Save results
results_df = pd.DataFrame(results)
os.makedirs("results", exist_ok=True)
excel_path = "results/svm_results_subset.xlsx"
results_df.to_excel(excel_path, index=False)