# Combined Credit Card Fraud Training (ULB + 2023)

This notebook merges the two datasets, reuses the same preparation techniques (feature engineering, stratified splits, scaling, class imbalance handling), and trains the same baseline models (Logistic Regression, Random Forest). We report Accuracy, Precision, Recall, F1, and ROC-AUC on a held-out test set. Target: ≥95% accuracy.

Notes
- Reads: `data/creditcard.csv/creditcard.csv` (ULB) and `data/creditcard.csv/creditcard_2023.csv` (2023)
- Handles class imbalance with SMOTE on train only (falls back to random oversampling of the minority class when `imblearn` is not installed)
- Scaler: StandardScaler on train, applied to val/test
- Models: Random Forest (balanced) is trained in this run; Logistic Regression (balanced) is imported but left out of the training cell for speed
- Metrics: train/val/test; plus cross-validation on train
- Saves key artifacts under `results/`

In [1]:
# 1) Import Libraries and Set Seed
import os
import json
import math
import random
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Optional dependency: imbalanced-learn provides SMOTE. The flag below records
# whether the import worked so later cells can pick a fallback.
try:
    from imblearn.over_sampling import SMOTE  # type: ignore
except Exception:
    # Any import-time failure disables SMOTE instead of stopping the notebook.
    SMOTE = None  # type: ignore
    IMBLEARN_AVAILABLE = False
else:
    IMBLEARN_AVAILABLE = True

import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# One seed shared by the stdlib and NumPy global generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

print("Environment ready. Seed set to", SEED)
print("imblearn available:", IMBLEARN_AVAILABLE)

Environment ready. Seed set to 42
imblearn available: False


In [2]:
# 2) Configure Paths and Hyperparameters
from pathlib import Path
import os

# Dataset locations: search the Kaggle input tree when it exists, otherwise
# point at the local data folder.
if os.path.exists('/kaggle/input'):
    kaggle_base = Path('/kaggle/input')

    def find_file(root: Path, name: str):
        """Return the first path under `root` matching `name` (recursive), or None."""
        return next(root.rglob(name), None)

    # Adjust these file names if your Kaggle Dataset uses different ones.
    ULB_PATH = find_file(kaggle_base, 'creditcard.csv')
    Y2023_PATH = find_file(kaggle_base, 'creditcard_2023.csv')
    if not Y2023_PATH:
        # Alternative file name for the 2023 dataset.
        Y2023_PATH = find_file(kaggle_base, 'credit_card_fraud_2023.csv')
else:
    DATA_DIR = Path('data')
    ULB_PATH = DATA_DIR / 'creditcard.csv' / 'creditcard.csv'
    Y2023_PATH = DATA_DIR / 'creditcard.csv' / 'creditcard_2023.csv'

# Split fractions. The split cell divides VAL_SIZE by (1 - TEST_SIZE) before the
# second split, so VAL_SIZE ends up being a fraction of the full dataset.
TEST_SIZE = 0.2
VAL_SIZE = 0.2

# Remaining knobs.
TARGET_ACCURACY = 0.95
SCALER_TYPE = 'standard'
BALANCE_METHOD = 'smote'  # on train only

# Artifact folder, created up front (parents included, no error if present).
OUTPUT_DIR = Path('results') / 'combined_training'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print('ULB path:', ULB_PATH)
print('2023 path:', Y2023_PATH)
print('Output dir:', OUTPUT_DIR)

ULB path: /kaggle/input/creditcardfraudtraining/creditcard.csv
2023 path: /kaggle/input/creditcardfraudtraining/creditcard_2023.csv
Output dir: results/combined_training


In [3]:
# 3) Load both datasets (handle large file) and inspect
from zipfile import ZipFile


def find_file(root: Path, names):
    """Locate a file below `root` by name.

    `names` is either a single glob pattern or a list/tuple of patterns, tried
    in the given order. The first recursive match of the first pattern that
    matches anything is returned; None when no pattern matches.
    """
    candidates = names if isinstance(names, (list, tuple)) else [names]
    for pattern in candidates:
        for match in root.rglob(pattern):
            return match
    return None


def read_csv_maybe_zip(path: Path, nrows=None):
    """Load a CSV into a DataFrame, reading through a .zip archive if needed.

    A path whose suffix is `.zip` (case-insensitive) is opened as an archive and
    the first member ending in `.csv` is parsed; any other path is handed to
    `pd.read_csv` directly. `nrows` is forwarded to `pd.read_csv`.

    Raises FileNotFoundError when the archive holds no `.csv` member.
    """
    if path.suffix.lower() != '.zip':
        return pd.read_csv(path, nrows=nrows)

    with ZipFile(path, 'r') as archive:
        members = [m for m in archive.namelist() if m.lower().endswith('.csv')]
        if not members:
            raise FileNotFoundError(f'No CSV inside {path}')
        # Only the first CSV member is used.
        with archive.open(members[0]) as handle:
            return pd.read_csv(handle, nrows=nrows)


def safe_read_csv(path: Path, nrows=None):
    """Read `path` in full, retrying with a row cap if the full read fails.

    The first attempt always reads every row (`nrows` is not applied to it).
    If that attempt raises, a message is printed and the read is retried with
    `nrows` rows, or 200000 rows when `nrows` is falsy. An error in the retry
    propagates to the caller.
    """
    try:
        frame = read_csv_maybe_zip(path, nrows=None)
    except Exception as e:
        print(f"Full read failed for {path.name}: {e}\nFalling back to reading first {nrows or 200000} rows.")
        return read_csv_maybe_zip(path, nrows=nrows or 200000)
    return frame

# Second-chance lookup: when a configured path is unset or missing on disk,
# search the Kaggle input tree, or the current directory if that tree is absent.
if not (ULB_PATH is not None and Path(ULB_PATH).exists()):
    base = Path('/kaggle/input')
    if not base.exists():
        base = Path('.')
    ULB_PATH = find_file(base, ['creditcard.csv', 'creditcard.csv.zip'])
if not (Y2023_PATH is not None and Path(Y2023_PATH).exists()):
    base = Path('/kaggle/input')
    if not base.exists():
        base = Path('.')
    Y2023_PATH = find_file(base, [
        'creditcard_2023.csv',
        'credit_card_fraud_2023.csv',
        'creditcard_2023_cleaned.csv',
        'credit_card_fraud_2023_cleaned.csv'
    ])

print('Resolved ULB path:', ULB_PATH)
print('Resolved 2023 path:', Y2023_PATH)
# Stop here with a readable message if either dataset is still missing.
assert ULB_PATH is not None and Path(ULB_PATH).exists(), f"Missing ULB dataset at {ULB_PATH}"
assert Y2023_PATH is not None and Path(Y2023_PATH).exists(), f"Missing 2023 dataset at {Y2023_PATH}"

# Load both datasets (full read first, capped read on failure).
df_ulb = safe_read_csv(Path(ULB_PATH))
df_2023 = safe_read_csv(Path(Y2023_PATH))

# Quick look at size and the leading column names.
print('ULB shape:', df_ulb.shape)
print('2023 shape:', df_2023.shape)
print('ULB columns (head):', df_ulb.columns[:10].tolist())
print('2023 columns (head):', df_2023.columns[:10].tolist())

# 4) Harmonize target and align feature space
# Identify likely target columns
possible_targets = ['Class','class','fraud','Fraud','is_fraud','target','label']

def find_target(df):
    """Return the first name in `possible_targets` that is a binary column of `df`.

    A candidate qualifies when it is present in `df.columns` and holds exactly
    two distinct values. Candidates are checked in list order; None is returned
    when none qualifies.
    """
    binary_candidates = (
        name for name in possible_targets
        if name in df.columns and df[name].nunique() == 2
    )
    return next(binary_candidates, None)

tgt_ulb, tgt_2023 = find_target(df_ulb), find_target(df_2023)
print('Target ULB:', tgt_ulb, '| Target 2023:', tgt_2023)
assert tgt_ulb is not None and tgt_2023 is not None, 'Could not identify target in one or both datasets.'

# Give the label column the same name ('target') in both frames.
df_ulb = df_ulb.rename(columns={tgt_ulb: 'target'})
df_2023 = df_2023.rename(columns={tgt_2023: 'target'})

# Numeric feature columns of each frame, with the label excluded.
num_ulb = [
    name for name in df_ulb.select_dtypes(include=[np.number]).columns.tolist()
    if name != 'target'
]
num_2023 = [
    name for name in df_2023.select_dtypes(include=[np.number]).columns.tolist()
    if name != 'target'
]

# Columns found in only one dataset drop out of the intersection.
common_numeric = sorted(set(num_ulb) & set(num_2023))
print('Common numeric features:', len(common_numeric))

# Minimal feature engineering similar to previous scripts
def add_engineered_features(df):
    """Return a copy of `df` with derived Amount, Time and V-statistic columns.

    The input frame is never modified. Columns are added only when their source
    columns exist:

    - `Amount` -> `Amount_log` (log1p) and `Amount_sqrt`. Negative amounts are
      clamped to 0 for both transforms, so neither yields NaN/-inf.
    - `Time` -> `Hour` (Time/3600 mod 24), `Day` (Time/86400 mod 7) and
      `Time_normalized` (min-max scaled over this frame; 0.0 when the range
      is zero).
    - `V<digits>` columns (V1, V2, ...) -> row-wise `V_mean`, `V_std`, `V_sum`,
      `V_max`, `V_min`.

    Only names of the exact form `V<digits>` count as V-features. The derived
    `V_*` columns and unrelated columns that start with "V" are excluded, so
    calling this function on its own output gives the same V-statistics.
    """
    df_eng = df.copy()

    if 'Amount' in df_eng.columns:
        # Clamp once and reuse, so log and sqrt treat negatives the same way.
        amount_nonneg = df_eng['Amount'].clip(lower=0)
        df_eng['Amount_log'] = np.log1p(amount_nonneg)
        df_eng['Amount_sqrt'] = np.sqrt(amount_nonneg)

    if 'Time' in df_eng.columns:
        df_eng['Hour'] = ((df_eng['Time'] / 3600) % 24)
        df_eng['Day'] = ((df_eng['Time'] / (3600*24)) % 7)
        t_min = df_eng['Time'].min()
        rng = df_eng['Time'].max() - t_min
        if rng != 0:
            df_eng['Time_normalized'] = (df_eng['Time'] - t_min) / rng
        else:
            # A constant Time column would otherwise divide by zero.
            df_eng['Time_normalized'] = 0.0

    # V-stats if available: strictly V<digits>, see docstring.
    v_cols = [
        c for c in df_eng.columns
        if isinstance(c, str) and c.startswith('V') and c[1:].isdigit()
    ]
    if len(v_cols) > 0:
        v_block = df_eng[v_cols]
        df_eng['V_mean'] = v_block.mean(axis=1)
        df_eng['V_std'] = v_block.std(axis=1)
        df_eng['V_sum'] = v_block.sum(axis=1)
        df_eng['V_max'] = v_block.max(axis=1)
        df_eng['V_min'] = v_block.min(axis=1)
    return df_eng

ulb_eng = add_engineered_features(df_ulb)
y2023_eng = add_engineered_features(df_2023)

# Numeric columns again, now that the engineered ones exist (label excluded).
num_ulb2 = [name for name in ulb_eng.select_dtypes(include=[np.number]) if name != 'target']
num_2023_2 = [name for name in y2023_eng.select_dtypes(include=[np.number]) if name != 'target']
common_numeric2 = sorted(set(num_ulb2) & set(num_2023_2))
print('Common numeric features after engineering:', len(common_numeric2))

# Restrict both frames to the shared features plus the label.
ulb_final = ulb_eng[[*common_numeric2, 'target']].copy()
y2023_final = y2023_eng[[*common_numeric2, 'target']].copy()

# Stack the two frames, then drop rows with missing values and exact duplicates.
combined = (
    pd.concat([ulb_final, y2023_final], axis=0, ignore_index=True)
    .dropna()
    .drop_duplicates()
)
print('Combined shape:', combined.shape)
print('Fraud rate (overall):', combined['target'].mean()*100, '%')

Resolved ULB path: /kaggle/input/creditcardfraudtraining/creditcard.csv
Resolved 2023 path: /kaggle/input/creditcardfraudtraining/creditcard_2023.csv
ULB shape: (284807, 31)
2023 shape: (568630, 31)
ULB columns (head): ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
2023 columns (head): ['id', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
Target ULB: Class | Target 2023: Class
Common numeric features: 29
Common numeric features after engineering: 36
Combined shape: (844292, 37)
Fraud rate (overall): 33.73086562468909 %


In [4]:
# 5) Stratified Train/Val/Test Split
# Every column except the label is a model input.
features = [col for col in combined.columns if col != 'target']
X = combined[features].to_numpy()
y = combined['target'].to_numpy()

# First cut: hold out the test set, keeping the class ratio.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=SEED,
)

# Second cut: VAL_SIZE is rescaled to the remaining rows, so validation is
# VAL_SIZE of the full dataset.
val_size_adj = VAL_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_size_adj,
    stratify=y_temp,
    random_state=SEED,
)

print('Split sizes:', X_train.shape, X_val.shape, X_test.shape)
print('Train fraud rate:', y_train.mean()*100, '%')
print('Val fraud rate:', y_val.mean()*100, '%')
print('Test fraud rate:', y_test.mean()*100, '%')

# 6) Scale + Balance (train only)
# The scaler is fitted on the training split only, then applied to val/test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Balance training data (validation and test keep their natural class ratio)
if IMBLEARN_AVAILABLE and SMOTE is not None:
    sm = SMOTE(random_state=SEED)
    X_train_bal, y_train_bal = sm.fit_resample(X_train_scaled, y_train)
    print('Used SMOTE for balancing.')
else:
    # Simple random oversampling fallback: duplicate minority rows (with
    # replacement) until both classes have the same count.
    print('SMOTE not available; using simple random oversampling fallback.')
    y_series = pd.Series(y_train)
    count_0 = int((y_series == 0).sum())
    count_1 = int((y_series == 1).sum())
    maj_label = 0 if count_0 >= count_1 else 1
    min_label = 1 - maj_label

    X_m = X_train_scaled[y_train == maj_label]
    X_n = X_train_scaled[y_train == min_label]
    y_m = y_train[y_train == maj_label]
    y_n = y_train[y_train == min_label]

    deficit = len(y_m) - len(y_n)
    if deficit > 0 and len(X_n) == 0:
        # Nothing to duplicate; fail with a readable message.
        raise ValueError('Cannot oversample: the training split contains no minority-class rows.')

    # A generator seeded here, not the global NumPy state, so re-running this
    # cell draws the same rows every time.
    # NOTE(review): on a fresh run this should match the draws the global
    # np.random.seed(SEED) stream would give, provided nothing consumed that
    # stream earlier; confirm if exact parity with old runs matters.
    oversample_rng = np.random.RandomState(SEED)
    idx = oversample_rng.choice(len(X_n), size=deficit, replace=True)
    X_syn = X_n[idx]
    y_syn = y_n[idx]

    # Duplicated rows are appended after the original training rows.
    X_train_bal = np.vstack([X_train_scaled, X_syn])
    y_train_bal = np.hstack([y_train, y_syn])

print('Balanced train shapes:', X_train_bal.shape, len(y_train_bal))
print('Balanced class distribution:', pd.Series(y_train_bal).value_counts().to_dict())

Split sizes: (506574, 36) (168859, 36) (168859, 36)
Train fraud rate: 33.73070864276493 %
Val fraud rate: 33.731101096180836 %
Test fraud rate: 33.731101096180836 %
SMOTE not available; using simple random oversampling fallback.
Balanced train shapes: (671406, 36) 671406
Balanced class distribution: {1: 335703, 0: 335703}


In [5]:
# 7-10) Train models and evaluate

def evaluate_split(name, y_true, y_pred, y_proba=None):
    """Compute, print and return classification metrics for one data split.

    Parameters
    ----------
    name : label used in the printed header.
    y_true : ground-truth labels.
    y_pred : predicted labels.
    y_proba : optional positive-class scores; used for ROC-AUC when given,
        otherwise ROC-AUC is computed from `y_pred`.

    Returns
    -------
    dict with keys 'accuracy', 'precision', 'recall', 'f1', 'roc_auc' and
    'cm' (the confusion matrix).
    """
    auc_input = y_pred if y_proba is None else y_proba
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, auc_input),
        'cm': confusion_matrix(y_true, y_pred),
    }
    acc, prec, rec, f1, roc, cm = (
        metrics[key] for key in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'cm')
    )

    print(f"\n{name} Metrics:")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | ROC-AUC: {roc:.4f}")
    print('Confusion Matrix:\n', cm)
    print('\nClassification Report:\n', classification_report(y_true, y_pred, digits=4))
    return metrics

# Train only Random Forest for speed
models = {
    'Random_Forest': RandomForestClassifier(n_estimators=150, random_state=SEED, class_weight='balanced', n_jobs=-1, max_depth=12)
}

results = {}


def _labels_and_scores(model, X, threshold=0.5):
    """Return `(labels, scores)` for `X` from a single inference pass.

    When the model exposes `predict_proba`, scores are the second probability
    column and labels are `scores >= threshold` cast to int, so every split is
    judged by the same rule and metrics receive 0/1 labels, not booleans.
    Otherwise labels come from `model.predict` and scores are None.
    """
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(X)[:, 1]
        return (scores >= threshold).astype(int), scores
    return model.predict(X), None


for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    model.fit(X_train_bal, y_train_bal)

    # Train metrics (on the balanced training set the model was fitted on)
    ytr_pred, ytr_proba = _labels_and_scores(model, X_train_bal)
    train_metrics = evaluate_split(f'{model_name} - Train', y_train_bal, ytr_pred, ytr_proba)

    # Val metrics (validation split is not rebalanced)
    yv_pred, yv_proba = _labels_and_scores(model, X_val_scaled)
    val_metrics = evaluate_split(f'{model_name} - Val', y_val, yv_pred, yv_proba)

    # Test metrics (test split is not rebalanced)
    yt_pred, yt_proba = _labels_and_scores(model, X_test_scaled)
    test_metrics = evaluate_split(f'{model_name} - Test', y_test, yt_pred, yt_proba)

    # The fitted estimator is stored next to its metrics for later cells.
    results[model_name] = {
        'train': train_metrics,
        'val': val_metrics,
        'test': test_metrics,
        'model': model
    }

    # Optional early exit: honoured only if STOP_AFTER_TEST was defined
    # somewhere in the session and is truthy.
    if 'STOP_AFTER_TEST' in globals() and STOP_AFTER_TEST:
        print('\nSTOP_AFTER_TEST is True: skipping cross-validation, threshold tuning, and file saving.')
        break



Training Random_Forest...

Random_Forest - Train Metrics:
Accuracy: 0.9960 | Precision: 0.9985 | Recall: 0.9936 | F1: 0.9960 | ROC-AUC: 0.9999
Confusion Matrix:
 [[335199    504]
 [  2164 333539]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9936    0.9985    0.9960    335703
           1     0.9985    0.9936    0.9960    335703

    accuracy                         0.9960    671406
   macro avg     0.9960    0.9960    0.9960    671406
weighted avg     0.9960    0.9960    0.9960    671406


Random_Forest - Val Metrics:
Accuracy: 0.9958 | Precision: 0.9965 | Recall: 0.9910 | F1: 0.9937 | ROC-AUC: 0.9998
Confusion Matrix:
 [[111700    201]
 [   510  56448]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9955    0.9982    0.9968    111901
           1     0.9965    0.9910    0.9937     56958

    accuracy                         0.9958    168859
   macro avg     0.9960    0.9946    0

In [6]:
# Export trained model, scaler, features, and threshold for deployment
import os, json, pickle, zipfile
from pathlib import Path
from datetime import datetime, timezone
from IPython.display import FileLink


def _json_fallback(o):
    """`json.dump` hook for objects the encoder cannot handle natively.

    Anything exposing `tolist()` is converted through it. Everything else raises
    TypeError; returning the object unchanged would make the encoder report a
    circular reference instead of naming the offending type.
    """
    if hasattr(o, 'tolist'):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')


# Pick best model (prefer Random Forest if available)
export_model_name = None
if 'results' in globals() and isinstance(results, dict) and results:
    if 'Random_Forest' in results:
        export_model_name = 'Random_Forest'
    else:
        # fall back to the highest test accuracy
        export_model_name = max(results.keys(), key=lambda k: results[k]['test']['accuracy'])
else:
    # Also covers an empty `results` dict, which would otherwise fail in max().
    raise RuntimeError('No trained results found. Please run the training cells first.')

model_obj = results[export_model_name]['model']
# Scaler and feature list are exported only if earlier cells defined them.
scaler_obj = scaler if 'scaler' in globals() else None
feature_list = features if 'features' in globals() else None

# Decision threshold: 0.5 unless `threshold_results` holds one for this model.
thr = 0.5
if 'threshold_results' in globals() and export_model_name in threshold_results:
    thr = float(threshold_results[export_model_name]['threshold'])

# Prep export directory (Kaggle-friendly)
work_base = Path('/kaggle/working') if Path('/kaggle/working').exists() else OUTPUT_DIR
export_dir = work_base / 'combined_model'
export_dir.mkdir(parents=True, exist_ok=True)

# Save artifacts
# NOTE: pickle files must only be loaded from trusted sources; unpickling can
# execute arbitrary code.
with open(export_dir / 'model.pkl', 'wb') as f:
    pickle.dump(model_obj, f)

if scaler_obj is not None:
    with open(export_dir / 'scaler.pkl', 'wb') as f:
        pickle.dump(scaler_obj, f)

if feature_list is not None:
    with open(export_dir / 'features.json', 'w') as f:
        json.dump({'features': feature_list}, f, indent=2)

with open(export_dir / 'threshold.json', 'w') as f:
    json.dump({'threshold': thr}, f, indent=2)

# Some metadata + scores
meta = {
    # Same '<naive ISO timestamp>Z' format as before, built from an aware UTC
    # datetime because datetime.utcnow() is deprecated.
    'exported_at': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z',
    'model_name': export_model_name,
    'val_metrics': results[export_model_name]['val'],
    'test_metrics': results[export_model_name]['test'],
}
with open(export_dir / 'metadata.json', 'w') as f:
    json.dump(meta, f, indent=2, default=_json_fallback)

# Zip the folder for easy download (every file currently in export_dir)
zip_path = work_base / 'combined_model_artifacts.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as z:
    for p in export_dir.iterdir():
        z.write(p, arcname=f'combined_model/{p.name}')

print('Exported artifacts to:', export_dir)
print('Zipped at:', zip_path)
try:
    # Best-effort download link; a failure here is deliberately ignored.
    display(FileLink(zip_path))
except Exception:
    pass

print('\nTo download on Kaggle:')
print('- Use the link above or open the “Output” panel and download combined_model_artifacts.zip.')
print('- You can also “Save Version” and fetch files from the notebook version outputs.')

Exported artifacts to: /kaggle/working/combined_model
Zipped at: /kaggle/working/combined_model_artifacts.zip



To download on Kaggle:
- Use the link above or open the “Output” panel and download combined_model_artifacts.zip.
- You can also “Save Version” and fetch files from the notebook version outputs.
