In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, 
    f1_score, confusion_matrix
)
import smote_variants as sv
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

# File to log results
log_file = 'results_log.txt'

# Helper function to log results
def log_results(content):
    with open(log_file, 'a') as f:
        f.write(content + '\n')

# Clear previous log file
if os.path.exists(log_file):
    os.remove(log_file)

# Load and preprocess the dataset
data = pd.read_csv('creditcard.csv')
scaler_amount = RobustScaler()
scaler_time = StandardScaler()
data['Amount'] = scaler_amount.fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = scaler_time.fit_transform(data['Time'].values.reshape(-1, 1))
data = data.sample(frac=1, random_state=1)

# Prepare training, validation, and test datasets
X = data.drop('Class', axis=1).values
y = data['Class'].values
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

# Evaluate a model
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1] if hasattr(model, 'predict_proba') else model.predict(X)
    return {
        'ROC AUC': roc_auc_score(y, y_prob),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1 Score': f1_score(y, y_pred),
        'Confusion Matrix': confusion_matrix(y, y_pred)
    }

# Define classifiers
classifiers = [
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    GradientBoostingClassifier(),
    SVC(probability=True),
    XGBClassifier(tree_method='gpu_hist')  # Enable GPU for XGBoost
]

# Parameter grids for GridSearchCV
param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'XGBClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    }
}

# Perform evaluation and grid search
results = {}  # To store results for all classifiers
augmentation_methods = ['polynom_fit_SMOTE_mesh']

# Function to perform grid search or fit directly
def fit_model_with_gridsearch(clf, clf_name, X_train, y_train):
    if clf_name in param_grids:
        grid_search = GridSearchCV(
            estimator=clf, param_grid=param_grids[clf_name],
            cv=5, scoring='roc_auc', n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_
    else:
        clf.fit(X_train, y_train)
        return clf

# Evaluate classifiers before augmentation
log_results("Classifier Results Before and After Augmentation:\n")
for clf in classifiers:
    clf_name = clf.__class__.__name__
    log_results(f"Classifier: {clf_name}\n")
    
    # Train and evaluate before augmentation
    best_model = fit_model_with_gridsearch(clf, clf_name, X_train, y_train)
    results_before = evaluate_model(best_model, X_val, y_val)
    results[clf_name] = {'before': results_before}
    
    # Log results before augmentation
    log_results("Before Augmentation:")
    for metric, value in results_before.items():
        log_results(f"{metric}: {value}")
    log_results("-" * 50)
    
    # Perform augmentation and evaluate
    for method in augmentation_methods:
        log_results(f"Applying {method} augmentation...")
        oversampler_class = getattr(sv, method)
        oversampler = oversampler_class()
        X_aug, y_aug = oversampler.sample(X_train, y_train)
        
        best_model_aug = fit_model_with_gridsearch(clf, clf_name, X_aug, y_aug)
        results_after = evaluate_model(best_model_aug, X_val, y_val)
        results[clf_name][method] = results_after
        
        # Log results after augmentation
        log_results(f"After Augmentation ({method}):")
        for metric, value in results_after.items():
            log_results(f"{metric}: {value}")
        log_results("-" * 50)

# Define and evaluate a neural network
def create_neural_network(input_shape):
    model = Sequential([
        InputLayer(shape=(input_shape,)),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Multi-GPU strategy for the neural network
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    neural_network = create_neural_network(X_train.shape[1])
    checkpoint = ModelCheckpoint('best_nn_model.keras', save_best_only=True)

    neural_network.fit(
        X_train, y_train, validation_data=(X_val, y_val), 
        epochs=10, batch_size=64, callbacks=[checkpoint]
    )

# Evaluate neural network before augmentation
y_prob = neural_network.predict(X_val)
y_pred = (y_prob > 0.5).astype(int)
nn_metrics_before = {
    'ROC AUC': roc_auc_score(y_val, y_prob),
    'Precision': precision_score(y_val, y_pred),
    'Recall': recall_score(y_val, y_pred),
    'F1 Score': f1_score(y_val, y_pred),
    'Confusion Matrix': confusion_matrix(y_val, y_pred)
}
log_results("\nNeural Network Results Before Augmentation:")
for metric, value in nn_metrics_before.items():
    log_results(f"{metric}: {value}")
log_results("-" * 50)

# Neural network evaluation after augmentation
for method in augmentation_methods:
    log_results(f"Applying {method} augmentation to Neural Network...")
    oversampler_class = getattr(sv, method)
    oversampler = oversampler_class()
    X_aug, y_aug = oversampler.sample(X_train, y_train)
    
    with strategy.scope():
        neural_network_aug = create_neural_network(X_aug.shape[1])
        neural_network_aug.fit(
            X_aug, y_aug, validation_data=(X_val, y_val), 
            epochs=10, batch_size=64, callbacks=[checkpoint]
        )
    
    y_prob_aug = neural_network_aug.predict(X_val)
    y_pred_aug = (y_prob_aug > 0.5).astype(int)
    nn_metrics_after = {
        'ROC AUC': roc_auc_score(y_val, y_prob_aug),
        'Precision': precision_score(y_val, y_pred_aug),
        'Recall': recall_score(y_val, y_pred_aug),
        'F1 Score': f1_score(y_val, y_pred_aug),
        'Confusion Matrix': confusion_matrix(y_val, y_pred_aug)
    }
    log_results(f"\nNeural Network Results After Augmentation ({method}):")
    for metric, value in nn_metrics_after.items():
        log_results(f"{metric}: {value}")
    log_results("-" * 50)

print(f"Results have been logged to {log_file}.")


2024-11-27 19:53:03,107:INFO:polynom_fit_SMOTE_mesh: Running sampling via ('polynom_fit_SMOTE_mesh', "{'proportion': 1.0, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'deterministic', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'polynom_fit_SMOTE_mesh'}")
2024-11-27 19:53:03,118:INFO:polynom_fit_SMOTE_mesh: simplex sampling with n_dim 2
2024-11-27 19:57:59,488:INFO:polynom_fit_SMOTE_mesh: Running sampling via ('polynom_fit_SMOTE_mesh', "{'proportion': 1.0, 'ss_params': {'n_dim': 2, 'simplex_sampling': 'random', 'within_simplex_sampling': 'deterministic', 'gaussian_component': {}}, 'random_state': None, 'class_name': 'polynom_fit_SMOTE_mesh'}")
2024-11-27 19:57:59,495:INFO:polynom_fit_SMOTE_mesh: simplex sampling with n_dim 2


KeyboardInterrupt: 