In [4]:
# Data Preparation Script
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import os

# Directory that receives every artifact written below. It is defined once so
# that the folder we create is the same folder we save into (previously the
# script created "data/processed" relative to the working directory but wrote
# to "../data/processed", which fails when the latter does not exist yet).
PROCESSED_DIR = "../data/processed"

# Ensure processed folder exists
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Load raw dataset
print("Loading raw data...")
df = pd.read_csv("../data/raw/creditcard.csv")
print(f"Raw shape: {df.shape}")

# Separate features and target: every column except "Class" is a feature.
X = df.drop("Class", axis=1).values
y = df["Class"].values
print(f"Features shape: {X.shape}")
print(f"Target distribution: {np.bincount(y)}")

# Scale features.
# The split is computed first (on the raw features) and the scaler is fitted
# on the training rows only. Fitting on the full dataset would let the test
# set's mean/variance influence the transformation, i.e. leak test
# information into training and into the persisted scaler.
print("\nScaling features...")
scaler = StandardScaler()

# Train-test split (stratified so both parts keep the class ratio of y)
print("Splitting data...")
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
X_train = scaler.fit_transform(X_train_raw)  # learn statistics from train only
X_test = scaler.transform(X_test_raw)        # reuse the train statistics

print(f"Train shape: {X_train.shape}, {y_train.shape}")
print(f"Test shape: {X_test.shape}, {y_test.shape}")

# Handle class imbalance (SMOTE).
# Resampling is applied to the training split only; the test split keeps its
# original class distribution.
print(f"\nBefore SMOTE: {np.bincount(y_train)}")
sm = SMOTE(random_state=42, sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
print(f"After SMOTE: {np.bincount(y_train_res)}")

# Save processed datasets (resampled train, untouched test, fitted scaler)
print("\nSaving processed files...")
np.save(os.path.join(PROCESSED_DIR, "X_train.npy"), X_train_res)
np.save(os.path.join(PROCESSED_DIR, "X_test.npy"), X_test)
np.save(os.path.join(PROCESSED_DIR, "y_train.npy"), y_train_res)
np.save(os.path.join(PROCESSED_DIR, "y_test.npy"), y_test)
joblib.dump(scaler, os.path.join(PROCESSED_DIR, "scaler.pkl"))

print("\n✅ Data preparation complete!")
print("Files saved in data/processed/:")
print("  - X_train.npy")
print("  - X_test.npy")
print("  - y_train.npy")
print("  - y_test.npy")
print("  - scaler.pkl")

Loading raw data...
Raw shape: (284807, 31)
Features shape: (284807, 30)
Target distribution: [284315    492]

Scaling features...
Splitting data...
Train shape: (227845, 30), (227845,)
Test shape: (56962, 30), (56962,)

Before SMOTE: [227451    394]
After SMOTE: [227451 227451]

Saving processed files...

✅ Data preparation complete!
Files saved in data/processed/:
  - X_train.npy
  - X_test.npy
  - y_train.npy
  - y_test.npy
  - scaler.pkl
