# In [None]:
# Cell 1: Imports and environment
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix, auc
from sklearn.ensemble import RandomForestClassifier
import joblib
import warnings
# Suppress every warning category for the rest of the session (keeps cell output
# short, but also hides deprecation notices from the libraries imported above).
warnings.filterwarnings('ignore')

# Cell 2: Helper plotting functions
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        title: text shown above the plot.
    """
    counts = confusion_matrix(y_true, y_pred)
    # heatmap() draws on the current axes and hands it back, so labels can be
    # set directly on the returned object.
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    plt.show()

def plot_roc(y_true, y_scores, title="ROC Curve"):
    """Plot the ROC curve for ``y_scores`` and show its AUC in the legend.

    Args:
        y_true: ground-truth binary labels.
        y_scores: continuous scores where a higher value means "more positive".
        title: text shown above the plot.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    area = auc(false_pos_rate, true_pos_rate)

    plt.plot(false_pos_rate, true_pos_rate, label=f"AUC = {area:.4f}")
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], '--', color='grey')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend()
    plt.show()

def classification_metrics(y_true, y_pred, y_scores=None):
    """Print a classification report and, when scores are given, the ROC-AUC.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_scores: optional continuous scores; the ROC-AUC line is skipped
            when this is ``None``.
    """
    report = classification_report(y_true, y_pred, digits=4)
    print(report)
    if y_scores is None:
        return
    print("ROC-AUC:", roc_auc_score(y_true, y_scores))

import kagglehub
import pandas as pd
import os

# Fetch the "mlg-ulb/creditcardfraud" dataset through kagglehub and report
# where it was placed.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print("Dataset downloaded to:", path)

# Read creditcard.csv from the download location into a DataFrame.
csv_path = os.path.join(path, "creditcard.csv")
df = pd.read_csv(csv_path)
df.head()
# Cell 4: Quick EDA
# Class balance and per-column summary statistics (one row per column).
class_counts = df['Class'].value_counts()
print(class_counts)
summary = df.describe().transpose()
print(summary)

# Bar chart of the class balance.
class_ax = sns.countplot(x='Class', data=df)
class_ax.set_title("Class distribution (0=legit,1=fraud)")
plt.show()

# Distribution of Amount (skewed)
amount_ax = sns.histplot(df['Amount'], bins=50, kde=True)
amount_ax.set_title("Transaction Amount distribution")
plt.show()

# Cell 5: Feature engineering & preprocessing
# 'Time' holds elapsed seconds; fold it into an hour-of-day bucket (0-23).
elapsed_hours = df['Time'].floordiv(3600)
df['Hour'] = elapsed_hours.mod(24)
# log(1 + Amount) compresses the heavy right tail of the transaction amounts.
df['Amount_log'] = np.log1p(df['Amount'])

# The modelling frame keeps the engineered columns and drops their raw sources.
df_model = df.drop(columns=['Time', 'Amount'])
df_model.head()

# Cell 6: Train/test split
X = df_model.drop('Class', axis=1)
y = df_model['Class']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used for preprocessing (data leakage)
# and make the held-out metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardize numerical columns (V1..V28, Hour, Amount_log) using statistics
# learned from the training rows only; the test rows are merely transformed.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell referring to X_scaled still works. It is the
# whole dataset pushed through the train-fitted scaler and must not be used
# for fitting anything.
X_scaled = scaler.transform(X)

print("Train size:", X_train.shape, "Test size:", X_test.shape)
# Save scaler
joblib.dump(scaler, "scaler_creditcard.joblib")

# Cell 7: Handle imbalance options
# Option A: rely on class weighting inside the model (what the RandomForest below does)
# Option B: oversample the minority class with SMOTE - demonstrated here
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
# Resample the training split only; the test split is left untouched.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
resampled_counts = np.bincount(y_train_sm)
print("After SMOTE:", resampled_counts)

# Cell 8: MODEL 1 - XGBoost (or RandomForest fallback)
# XGBoost is the preferred model. Any exception raised inside the try body
# (import, fit, predict, plotting or saving) switches to a RandomForest.
try:
    from xgboost import XGBClassifier

    xgb_settings = dict(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='auc',
        # Ratio of negative to positive training labels.
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        random_state=42,
    )
    model_xgb = XGBClassifier(**xgb_settings)
    model_xgb.fit(X_train, y_train)

    preds_xgb = model_xgb.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive (fraud) class.
    proba_xgb = model_xgb.predict_proba(X_test)[:, 1]
    print("XGBoost trained")

    classification_metrics(y_test, preds_xgb, proba_xgb)
    plot_confusion_matrix(y_test, preds_xgb, "XGBoost Confusion Matrix")
    plot_roc(y_test, proba_xgb, "XGBoost ROC")
    joblib.dump(model_xgb, "xgb_creditcard.pkl")
except Exception as xgb_error:
    print("XGBoost not available or failed, falling back to RandomForest:", xgb_error)

    rf_settings = dict(
        n_estimators=200,
        class_weight='balanced_subsample',
        random_state=42,
        n_jobs=-1,
    )
    rf = RandomForestClassifier(**rf_settings)
    rf.fit(X_train, y_train)

    preds_rf = rf.predict(X_test)
    proba_rf = rf.predict_proba(X_test)[:, 1]

    classification_metrics(y_test, preds_rf, proba_rf)
    plot_confusion_matrix(y_test, preds_rf, "RandomForest Confusion Matrix")
    plot_roc(y_test, proba_rf, "RandomForest ROC")
    joblib.dump(rf, "rf_creditcard.pkl")

# Cell 9: Feature importance (if RF or XGB)
# Try the XGBoost model first; if it does not exist (or anything else goes
# wrong) fall back to the RandomForest; if that fails too, say so.
# `except Exception` replaces the bare `except:` so that KeyboardInterrupt and
# SystemExit are no longer swallowed.
try:
    imp = model_xgb.get_booster().get_score(importance_type='weight')
    feat_names = X.columns
    # The model was fitted on a plain NumPy array, so the booster is expected to
    # label features "f0", "f1", ...; map those back to the real column names.
    # Keys that do not match that pattern are kept as they are.
    xgb_labels = {f"f{idx}": col for idx, col in enumerate(feat_names)}
    # Convert to pandas
    imp_df = pd.DataFrame(
        [(xgb_labels.get(key, key), score) for key, score in imp.items()],
        columns=['feature', 'importance'],
    ).sort_values('importance', ascending=False)
    display(imp_df.head(10))
except Exception:
    try:
        imp = rf.feature_importances_
        feat_names = X.columns
        imp_df = pd.DataFrame(
            {'feature': feat_names, 'importance': imp}
        ).sort_values('importance', ascending=False)
        display(imp_df.head(10))
    except Exception:
        print("Feature importance not available.")

# Cell 10: MODEL 2 - Autoencoder (Unsupervised anomaly detection)
# Train autoencoder only on legitimate transactions from training set
import tensorflow as tf
from tensorflow.keras import layers, Sequential, callbacks

# Keep only the legitimate (Class == 0) rows of the training split; the
# autoencoder learns to reconstruct normal transactions only.
legit_mask = (y_train == 0)
X_train_legit = X_train[legit_mask]
print("Legit-only training shape:", X_train_legit.shape)

input_dim = X_train_legit.shape[1]

# Symmetric bottleneck: 32 -> 16 -> 8 -> 16 -> 32 ReLU units, followed by a
# linear layer that maps back to the input width for reconstruction.
hidden_widths = (32, 16, 8, 16, 32)
ae = Sequential(
    [layers.Input(shape=(input_dim,))]
    + [layers.Dense(width, activation='relu') for width in hidden_widths]
    + [layers.Dense(input_dim, activation='linear')]
)
ae.compile(optimizer='adam', loss='mse')

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
history = ae.fit(
    X_train_legit,
    X_train_legit,  # the target equals the input: reconstruction objective
    epochs=100,
    batch_size=2048,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1,
)

# Save model
ae.save("autoencoder_creditcard.h5")

# Cell 11: Use autoencoder to compute reconstruction error, set threshold
def _row_mse(original, reconstructed):
    """Return the mean squared reconstruction error of every row."""
    return np.mean(np.power(original - reconstructed, 2), axis=1)


# The threshold is the 99th percentile of the errors on the legit training rows.
recon_train = ae.predict(X_train_legit)
train_mse = _row_mse(X_train_legit, recon_train)
threshold = np.percentile(train_mse, 99)  # tuneable
print("Chosen threshold:", threshold)

# Evaluate on test set: a row is flagged (1) when its error exceeds the threshold.
recon_test = ae.predict(X_test)
test_mse = _row_mse(X_test, recon_test)
above_threshold = test_mse > threshold
y_pred_ae = above_threshold.astype(int)

classification_metrics(y_test, y_pred_ae)
plot_confusion_matrix(y_test, y_pred_ae, "Autoencoder Confusion Matrix")
# ROC for AE (the raw reconstruction error serves as the anomaly score)
plot_roc(y_test, test_mse, "Autoencoder ROC (higher score => anomaly)")

# Cell 12: Compare model performance summary
# Build a small JSON summary from the scores computed in the cells above
# (nothing is reloaded from disk here).
import json
results = {}
# XGB/RF results
try:
    results['xgb'] = {
        'roc_auc': float(roc_auc_score(y_test, proba_xgb)),
        'precision_recall': None
    }
except NameError:
    # proba_xgb is only defined when the XGBoost branch of Cell 8 got as far as
    # predicting; otherwise report the RandomForest fallback. Only NameError is
    # caught so that genuine metric errors are no longer silently hidden.
    results['rf'] = {
        'roc_auc': float(roc_auc_score(y_test, proba_rf)),
        'precision_recall': None
    }
# For the autoencoder the reconstruction error itself is the ranking score.
results['autoencoder'] = {
    'roc_auc': float(roc_auc_score(y_test, test_mse))
}
print(json.dumps(results, indent=2))

# Cell 13: Save scaler & a function to preprocess single transaction for demo
# Writes the fitted scaler to "scaler_creditcard.joblib" (the same file name the
# train/test split cell writes to) - presumably so a demo app can reload it.
joblib.dump(scaler, "scaler_creditcard.joblib")
# Prepare a convenience function: expects raw features similar to original dataset (V1..V28, Amount, Time)
def preprocess_transaction(row_dict, fitted_scaler=None):
    """Convert one raw transaction into the scaled feature row the models expect.

    Args:
        row_dict: mapping with keys 'V1'...'V28', 'Amount' and 'Time'
            ('Time' in elapsed seconds, as in the training data).
        fitted_scaler: optional object exposing ``transform``; when omitted
            the module-level ``scaler`` is used. This lets a caller pass a
            scaler reloaded from disk.

    Returns:
        The result of ``transform`` applied to a ``(1, 30)`` float array whose
        columns are ordered V1..V28, Hour, Amount_log.

    Raises:
        KeyError: if ``row_dict`` lacks 'Time', 'Amount' or any of 'V1'..'V28'.
    """
    # Same feature engineering as the training pipeline.
    engineered = {
        'Hour': (row_dict['Time'] // 3600) % 24,
        'Amount_log': np.log1p(row_dict['Amount']),
    }
    feat_order = [f'V{i}' for i in range(1, 29)] + ['Hour', 'Amount_log']
    # Engineered columns must come from `engineered`: the raw input has no
    # 'Hour' / 'Amount_log' keys, so reading them from row_dict (as the code
    # used to) raised KeyError for every raw transaction.
    values = [
        engineered[name] if name in engineered else row_dict[name]
        for name in feat_order
    ]
    arr = np.array(values, dtype=float).reshape(1, -1)
    active_scaler = scaler if fitted_scaler is None else fitted_scaler
    return active_scaler.transform(arr)

# Cell 14: Load secondary dataset (IEEE-CIS) - minimal alignment & testing
# This dataset is large and complex; only a subset of numeric features would be used for a quick compatibility test.
# train_transaction.csv (and train_identity.csv) are expected in the working directory.
ieee_transactions_csv = "train_transaction.csv"
if not os.path.exists(ieee_transactions_csv):
    print("No IEEE-CIS transaction file found. Download from Kaggle and place train_transaction.csv here for domain generalization tests.")
else:
    # Row cap keeps loading fast; drop nrows to read the full file.
    trans = pd.read_csv(ieee_transactions_csv, nrows=200000)
    print("IEEE-CIS sample shape:", trans.shape)
    # Rough correspondence: 'TransactionAmt' ~ Amount; 'TransactionDT' ~ Time; plus a few numeric columns if present.
    # The feature sets differ from the credit-card data, so a real evaluation needs explicit
    # feature mapping / engineering so that the columns match the model input.

# Cell 15: Quick note & instructions (save)
# Closing reminder of the remaining work items.
next_steps_note = "Notebook progress saved. Next: tune hyperparameters, perform cross-validation, produce final plots, prepare model packaging (Gradio app and requirements.txt)."
print(next_steps_note)
