In [1]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import os

# ---------------------------------------------------------------------------
# Preprocessing pipeline for the South German credit data:
#   read CSV -> split features/target -> one-hot encode -> standard-scale
#   -> oversample with SMOTE -> persist the resampled arrays as .npy files.
#
# NOTE(review): the scaler and SMOTE are both fit on the *entire* dataset
# here. If a later step splits the saved arrays into train/test, scaling
# statistics and synthetic samples would span that split — confirm against
# the notebook that consumes these files.
# ---------------------------------------------------------------------------

# 1) Load the raw table (path is relative to the notebook's working directory).
data_path = os.path.join("..", "data", "south_german_credit.csv")
df = pd.read_csv(data_path)
print("✅ Data loaded. Shape:", df.shape)

# 2) Pull out the target column and keep every other column as a feature.
y = df["Credit_Risk"]
X = df.drop(columns=["Credit_Risk"])

# 3) Expand non-numeric feature columns into indicator (dummy) columns.
X_encoded = pd.get_dummies(X)
print("✅ One-hot encoding complete. Shape:", X_encoded.shape)

# 4) Standardise every encoded column (indicator columns included, since the
#    whole encoded frame is passed to the scaler).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)
print("✅ Scaling complete. Shape:", X_scaled.shape)

# 5) Oversample with SMOTE; the fixed seed keeps the synthetic rows
#    reproducible across runs.
# NOTE(review): SMOTE runs on the scaled one-hot matrix, so synthetic rows
# presumably carry interpolated (non-binary) indicator values — verify this
# is acceptable for the downstream model.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
print("✅ SMOTE complete. Resampled shape:", X_resampled.shape)

# Report how many samples each class has after resampling.
class_counts = pd.Series(y_resampled).value_counts()
print("Class distribution after SMOTE:\n", class_counts)

# 6) Persist the resampled features and labels next to the source CSV.
output_dir = os.path.join("..", "data")
os.makedirs(output_dir, exist_ok=True)

for _file_name, _array in (
    ("X_resampled_german.npy", X_resampled),
    ("y_resampled_german.npy", y_resampled),
):
    np.save(os.path.join(output_dir, _file_name), _array)

print("✅ Saved resampled arrays.")


✅ Data loaded. Shape: (1000, 21)
✅ One-hot encoding complete. Shape: (1000, 61)
✅ Scaling complete. Shape: (1000, 61)
✅ SMOTE complete. Resampled shape: (1400, 61)
Class distribution after SMOTE:
 Credit_Risk
1    700
2    700
Name: count, dtype: int64
✅ Saved resampled arrays.
