In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Step 1 - Load the feature-engineered dataset (the final version saved by
# the preceding feature-engineering step).
ENGINEERED_CSV = '../data/processed/fraud_data_feature_engineered.csv'
df = pd.read_csv(ENGINEERED_CSV)
print(f"✅ Data loaded. Shape: {df.shape}")

# Step 2 - Separate the label vector (y) from the feature matrix (X).
# The 'class' column is the target; every other column is kept as a feature.
TARGET_COLUMN = 'class'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Step 3 - Stratified 80/20 train/test split.
# This has to happen BEFORE any resampling so that no synthetic sample derived
# from a test row can end up in the training data (data leakage).
# Passing stratify=y keeps the class proportions the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition.
print(
    "\n--- Split Status ---",
    f"Training Set: {X_train.shape}",
    f"Test Set:     {X_test.shape}",
    sep="\n",
)

# Step 4 - Handle class imbalance with SMOTE.
# Only the training partition (X_train, y_train) is resampled; the test
# partition is not passed to SMOTE.
# NOTE(review): the ImportError recorded for this cell mentions a private
# sklearn helper ('_is_pandas_df') that this cell never imports itself, so it
# presumably comes from the imblearn import -- i.e. an imbalanced-learn /
# scikit-learn version mismatch in the environment. Confirm installed versions.

# Class counts and fraud share before resampling.
print("\n--- Class Balance BEFORE SMOTE (Training Set) ---")
print(y_train.value_counts())
fraud_share_before = y_train.value_counts(normalize=True)[1]
print(f"Fraud Ratio: {fraud_share_before:.4f}")

print("\n⚡ Applying SMOTE... (This may take a moment)")
smote = SMOTE(random_state=42)  # fixed seed so the synthetic rows are repeatable
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 5 - Document the outcome: class counts and fraud share after resampling.
print("\n--- Class Balance AFTER SMOTE (Training Set) ---")
print(y_train_resampled.value_counts())
fraud_share_after = y_train_resampled.value_counts(normalize=True)[1]
print(f"Fraud Ratio: {fraud_share_after:.4f}")

# Step 6 - Justification plot (optional, useful for the report).
# A bar per class makes it easy to see whether the resampled training labels
# are now evenly split (expected to be roughly 50/50 after SMOTE).
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y_train_resampled, ax=ax)
ax.set_title('Class Distribution After SMOTE')
ax.set_xlabel('Class (0=Legit, 1=Fraud)')
plt.show()

# Step 7 - Persist the resampled training split (optional).
# The variables could simply stay in memory, but writing them to disk lets
# later steps reuse exactly the same resampled data.
# index=False keeps the pandas row index out of the CSV files.
for frame, target_path in (
    (X_train_resampled, '../data/processed/X_train_smote.csv'),
    (y_train_resampled, '../data/processed/y_train_smote.csv'),
):
    frame.to_csv(target_path, index=False)

ImportError: cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (/Users/admin/week5_6/Fraud-detection(ADEY)/.venv/lib/python3.11/site-packages/sklearn/utils/validation.py)