# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, 
    f1_score, confusion_matrix, classification_report, roc_curve
)
import smote_variants as sv
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import scipy.stats as stats

# Load and preprocess data
data = pd.read_csv('creditcard.csv')
# Shuffle once with a fixed seed so the row order is reproducible.
data = data.sample(frac=1, random_state=1)

# Decide train/validation/test membership (70% / 15% / 15%) BEFORE scaling so
# the scalers can be fitted on training rows only; fitting them on the full
# dataset would leak validation/test statistics into the training features.
# Stratifying on 'Class' keeps the class ratio the same in every split
# (presumably the positive class is rare, since it is oversampled later).
train_idx, temp_idx = train_test_split(
    data.index.to_numpy(), test_size=0.3, random_state=42, stratify=data['Class']
)
val_idx, test_idx = train_test_split(
    temp_idx, test_size=0.5, random_state=42, stratify=data.loc[temp_idx, 'Class']
)

# Fit on training rows only, then apply the same transform to every row so
# `data` and `X` still hold scaled 'Amount' / 'Time' columns for later use.
scaler_amount = RobustScaler()
scaler_time = StandardScaler()
scaler_amount.fit(data.loc[train_idx, 'Amount'].to_numpy().reshape(-1, 1))
scaler_time.fit(data.loc[train_idx, 'Time'].to_numpy().reshape(-1, 1))
data['Amount'] = scaler_amount.transform(data['Amount'].to_numpy().reshape(-1, 1)).ravel()
data['Time'] = scaler_time.transform(data['Time'].to_numpy().reshape(-1, 1)).ravel()

X = data.drop('Class', axis=1)
y = data['Class']

# Materialise the splits from the index sets chosen above.
X_train, y_train = X.loc[train_idx], y.loc[train_idx]
X_temp, y_temp = X.loc[temp_idx], y.loc[temp_idx]  # kept for backward compatibility
X_val, y_val = X.loc[val_idx], y.loc[val_idx]
X_test, y_test = X.loc[test_idx], y.loc[test_idx]

# Visualizations

# Box plot of the 'Time' and 'Amount' feature columns.
time_and_amount = X[['Time', 'Amount']]
plt.figure(figsize=(20, 8))
sns.boxplot(data=time_and_amount)
plt.show()

# Heatmap of pairwise correlations across every column of `data`.
correlations = data.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(correlations, cmap='coolwarm')
plt.show()

# Two-component PCA projection of the features, coloured by the target.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X)
first_component, second_component = pca_result[:, 0], pca_result[:, 1]
plt.scatter(first_component, second_component, c=y, cmap='coolwarm', alpha=0.7)
plt.show()

# Same projection, coloured by a 2-cluster KMeans assignment instead.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X)
plt.figure(figsize=(10, 6))
plt.scatter(first_component, second_component, c=clusters, cmap='viridis')
plt.show()

# Probability plot of 'Amount' against a normal distribution.
stats.probplot(data['Amount'], dist="norm", plot=plt)
plt.show()

# Helper functions
def evaluate_model(model, X, y):
    """Score a fitted binary classifier on ``(X, y)``.

    Models exposing ``predict_proba`` are scored with the probability of the
    second class (column 1) for ROC AUC and with ``predict`` for the
    label-based metrics.

    Models without ``predict_proba`` are scored from ``predict`` alone. If that
    output is continuous (e.g. a network with a sigmoid output returning an
    ``(n, 1)`` array of scores), it is flattened and thresholded at 0.5 to get
    hard labels, because the label-based metrics reject continuous
    predictions. If ``predict`` already returns hard labels they are used
    unchanged for both the ranking score and the labels.

    Args:
        model: Fitted estimator with ``predict`` and optionally ``predict_proba``.
        X: Feature matrix to predict on.
        y: True binary labels aligned with ``X``.

    Returns:
        dict with keys ``'ROC AUC'``, ``'Precision'``, ``'Recall'``,
        ``'F1 Score'`` and ``'Confusion Matrix'``.
    """
    if hasattr(model, 'predict_proba'):
        y_pred = model.predict(X)
        y_prob = model.predict_proba(X)[:, 1]
    else:
        raw = np.asarray(model.predict(X))
        # Collapse (n, 1) / (n, k) outputs to one score per sample; for
        # multi-column outputs the last column is taken as the positive class.
        scores = raw if raw.ndim == 1 else raw[:, -1]
        y_prob = scores
        # Integer dtypes or integer-valued floats are already class labels;
        # anything else is treated as a continuous score to be thresholded.
        looks_like_labels = (
            not np.issubdtype(scores.dtype, np.floating)
            or np.array_equal(scores, np.round(scores))
        )
        y_pred = scores if looks_like_labels else (scores > 0.5).astype(int)
    return {
        'ROC AUC': roc_auc_score(y, y_prob),
        'Precision': precision_score(y, y_pred),
        'Recall': recall_score(y, y_pred),
        'F1 Score': f1_score(y, y_pred),
        'Confusion Matrix': confusion_matrix(y, y_pred)
    }

def apply_smote(X, y, method):
    """Oversample ``(X, y)`` with the named ``smote_variants`` oversampler.

    Args:
        X: Feature matrix (array-like or DataFrame).
        y: Target labels aligned with ``X`` (array-like or Series).
        method: Name of an oversampler class exposed by ``smote_variants``,
            e.g. ``'SMOTE'``; it is instantiated with default arguments.

    Returns:
        Whatever the oversampler's ``sample`` returns for the inputs
        (presumably the resampled ``(X, y)`` pair - confirm against the
        smote_variants documentation).

    Raises:
        AttributeError: If ``smote_variants`` has no attribute named ``method``.
    """
    oversampler = getattr(sv, method)()
    # smote_variants documents `sample` as taking NumPy arrays; pandas objects
    # are converted up front so positional indexing inside the library is not
    # reinterpreted as label/column lookup.
    return oversampler.sample(np.asarray(X), np.asarray(y))

# Classifiers and Augmentation Methods
# Each estimator below is fitted once on the raw training split and then
# re-fitted on every oversampled variant of it.
classifiers = [
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    SVC(probability=True),
    XGBClassifier(),
]

# Names of the smote_variants oversampler classes to compare.
augmentation_methods = ['SMOTE', 'SMOTE_TomekLinks', 'SMOTE_ENN']
results_before_aug = {}
results_after_aug = {}

# Baseline: fit on the untouched training split, score on the validation split.
for clf in classifiers:
    clf.fit(X_train, y_train)
    results_before_aug[type(clf).__name__] = evaluate_model(clf, X_val, y_val)

# Oversampled runs: one resampling per method, shared by every classifier.
# Validation data is never resampled.
for method in augmentation_methods:
    X_aug, y_aug = apply_smote(X_train, y_train, method)
    aug_results = {}
    for clf in classifiers:
        clf.fit(X_aug, y_aug)
        aug_results[type(clf).__name__] = evaluate_model(clf, X_val, y_val)
    results_after_aug[method] = aug_results

# Print Results
def _print_classifier_metrics(clf_name, metrics):
    """Print one classifier's metric dict followed by a dashed separator."""
    print(f"\nClassifier: {clf_name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")
    print("-" * 50)


# Baseline scores first, then one section per augmentation method.
print("Comparison of models before augmentation:")
for clf_name, metrics in results_before_aug.items():
    _print_classifier_metrics(clf_name, metrics)

for method, models in results_after_aug.items():
    print(f"\nResults for augmentation method: {method}")
    for clf_name, metrics in models.items():
        _print_classifier_metrics(clf_name, metrics)

# Neural Network Training
def create_neural_network(input_shape):
    """Build and compile a small feed-forward binary classifier.

    The network is: input of ``input_shape`` features -> Dense(16, relu) ->
    BatchNormalization -> Dense(1, sigmoid). It is compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as the tracked metric.

    Args:
        input_shape: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(InputLayer(input_shape=(input_shape,)))
    network.add(Dense(16, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dense(1, activation='sigmoid'))
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Build the network for the number of feature columns in the training split.
n_features = X_train.shape[1]
neural_network = create_neural_network(n_features)

# Checkpoint callback writing to 'best_nn_model.h5' with save_best_only=True.
checkpoint = ModelCheckpoint('best_nn_model.h5', save_best_only=True)
fit_options = {
    'validation_data': (X_val, y_val),
    'epochs': 10,
    'batch_size': 64,
    'callbacks': [checkpoint],
}
neural_network.fit(X_train, y_train, **fit_options)

# Score the in-memory model (as left by the final epoch) on the validation split.
nn_metrics = evaluate_model(neural_network, X_val, y_val)
print("\nNeural Network Results:")
for metric in nn_metrics:
    value = nn_metrics[metric]
    print(f"{metric}: {value}")
print("-" * 50)

# XGBoost with Grid Search
# 3 x 3 x 3 hyper-parameter grid, searched exhaustively with 3-fold
# cross-validation on the training split and ranked by ROC AUC.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
xgb_model = XGBClassifier()
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,  # use every available core
)
grid_search.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the validation split.
best_model = grid_search.best_estimator_
xgb_metrics = evaluate_model(best_model, X_val, y_val)
print("\nXGBoost with GridSearch Results:")
for metric in xgb_metrics:
    value = xgb_metrics[metric]
    print(f"{metric}: {value}")
print("-" * 50)
