# Notebook 2: Data Engineering

**Purpose**: Data type casting, feature normalization, and train-test split creation.

**Inputs**:
- `cleaned_data.csv` from Notebook 1 (read from `data/processed/`)

**Outputs**:
- `engineered_train.csv` → `data/splits/`
- `engineered_test.csv` → `data/splits/`
- `split_metadata.json` → `results/`
- `scaler.pkl` → `models/`
- `scaling_comparison.png` → `figures/`

---

In [None]:
# Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib

# Fix the seed once so NumPy's global RNG is repeatable; the same value is
# reused as the random_state of the train/test split further down
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Project folders, all anchored at the parent of the current working directory
BASE_DIR = Path('.').resolve().parent
PROCESSED_DIR = BASE_DIR.joinpath('data', 'processed')
SPLITS_DIR = BASE_DIR.joinpath('data', 'splits')
MODELS_DIR = BASE_DIR.joinpath('models')
RESULTS_DIR = BASE_DIR.joinpath('results')
FIGURES_DIR = BASE_DIR.joinpath('figures')

# Make sure every output folder exists before anything is written to it
# (PROCESSED_DIR is only read from in this notebook, so it is not created)
for output_dir in (SPLITS_DIR, MODELS_DIR, RESULTS_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Split configuration: label column name and held-out fraction
TARGET_COLUMN = 'Class'
TEST_SIZE = 0.2

print(f"Random Seed: {RANDOM_SEED}")
print(f"Test Size: {TEST_SIZE}")

## 1. Load Cleaned Data

In [None]:
# Read the cleaned dataset that Notebook 1 wrote to the processed-data folder
input_path = PROCESSED_DIR.joinpath('cleaned_data.csv')
df = pd.read_csv(input_path)

# Report the table size, then how many rows fall into each class
print(f"Loaded data shape: {df.shape}")
print("\nClass distribution:")
class_counts = df[TARGET_COLUMN].value_counts()
print(class_counts)

In [None]:
# Preview the top five rows (rendered as the cell's output)
df.head(n=5)

## 2. Data Type Casting

In [None]:
# Show the dtype pandas inferred for each column before any casting
print("Current Data Types:", df.dtypes, sep="\n")

In [None]:
# Every column other than the target is treated as a feature
feature_cols = [name for name in df.columns if name != TARGET_COLUMN]

# Cast in a single pass: integer target, float64 for each feature column
dtype_map = {TARGET_COLUMN: int}
dtype_map.update(dict.fromkeys(feature_cols, np.float64))
df = df.astype(dtype_map)

# Summarise how many columns ended up with each dtype
print("\nData types after casting:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

## 3. Feature-Target Separation

In [None]:
# Pull out the label series, then keep every remaining column as the feature matrix
y = df[TARGET_COLUMN]
X = df.drop(TARGET_COLUMN, axis=1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {X.columns.tolist()}")

## 4. Stratified Train-Test Split

In [None]:
# Hold out TEST_SIZE of the rows; stratifying on y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable
split_options = {
    "test_size": TEST_SIZE,
    "random_state": RANDOM_SEED,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Partition sizes
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Per-class row counts in each partition
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print("\nTraining class distribution:")
print(train_counts)
print("\nTest class distribution:")
print(test_counts)

In [None]:
# Mean of the label column, expressed as a percentage, for each partition
train_ratio = 100 * y_train.mean()
test_ratio = 100 * y_test.mean()

print(f"Training anomaly ratio: {train_ratio:.2f}%")
print(f"Test anomaly ratio: {test_ratio:.2f}%")

# The split passes when the two percentages differ by less than one point
ratio_gap = abs(train_ratio - test_ratio)
if ratio_gap < 1:
    print("\n✅ Stratification successful!")
else:
    print("⚠️ Stratification mismatch!")

## 5. Feature Normalization (StandardScaler)

In [None]:
# Per-feature mean and standard deviation of the raw training data;
# only the extremes across features are reported
raw_means = X_train.mean()
raw_stds = X_train.std()

print("Feature Statistics (Before Scaling):")
print(f"Mean range: [{raw_means.min():.4f}, {raw_means.max():.4f}]")
print(f"Std range: [{raw_stds.min():.4f}, {raw_stds.max():.4f}]")

In [None]:
def _to_frame(values, template):
    """Wrap a scaled array in a DataFrame carrying *template*'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# The scaler learns its statistics from the training rows only; the test rows
# are transformed with those same statistics
scaler = StandardScaler()
X_train_scaled = _to_frame(scaler.fit_transform(X_train), X_train)
X_test_scaled = _to_frame(scaler.transform(X_test), X_test)

# Per-feature mean and standard deviation of the scaled training data
scaled_means = X_train_scaled.mean()
scaled_stds = X_train_scaled.std()

print("\nFeature Statistics (After Scaling):")
print(f"Mean range: [{scaled_means.min():.4f}, {scaled_means.max():.4f}]")
print(f"Std range: [{scaled_stds.min():.4f}, {scaled_stds.max():.4f}]")

In [None]:
# Visualize scaling effect: box plots of the first five features,
# raw training data on the left and scaled training data on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sample_cols = X_train.columns[:5]

panels = [
    (axes[0], X_train, 'Before Scaling (First 5 Features)', 'Value'),
    (axes[1], X_train_scaled, 'After StandardScaler (First 5 Features)', 'Standardized Value'),
]
for ax, frame, title, ylabel in panels:
    ax.boxplot([frame[col] for col in sample_cols])
    # Name the boxes after drawing rather than through boxplot's `labels=`
    # keyword: Matplotlib 3.9 deprecated that keyword (renamed `tick_labels=`),
    # whereas set_xticklabels works on both older and newer releases.
    ax.set_xticklabels(list(sample_cols))
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig(FIGURES_DIR / 'scaling_comparison.png', dpi=150)
plt.show()

## 6. Save Outputs

In [None]:
# Re-attach the labels as a trailing column; .values is used so the labels are
# placed positionally rather than aligned on the index
train_df = X_train_scaled.assign(**{TARGET_COLUMN: y_train.values})
test_df = X_test_scaled.assign(**{TARGET_COLUMN: y_test.values})

print(f"Training DataFrame shape: {train_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")

In [None]:
# Destination files for the two engineered partitions
train_path = SPLITS_DIR.joinpath('engineered_train.csv')
test_path = SPLITS_DIR.joinpath('engineered_test.csv')

# Write both tables without their row index
for frame, destination in ((train_df, train_path), (test_df, test_path)):
    frame.to_csv(destination, index=False)

print(f"✅ Saved training data to: {train_path}")
print(f"✅ Saved test data to: {test_path}")

In [None]:
# Persist the fitted scaler object to the models folder with joblib
scaler_path = MODELS_DIR.joinpath('scaler.pkl')
joblib.dump(scaler, scaler_path)

print(f"✅ Saved scaler to: {scaler_path}")

In [None]:
def _class_summary(labels):
    """Count rows labelled 0 and 1 and give the label mean as a percentage."""
    return {
        "normal": int((labels == 0).sum()),
        "anomaly": int((labels == 1).sum()),
        "anomaly_ratio": round(labels.mean() * 100, 2),
    }


# Record how the split and scaling were produced, alongside the resulting sizes
split_metadata = {
    "random_seed": RANDOM_SEED,
    "test_size": TEST_SIZE,
    "stratified": True,
    "target_column": TARGET_COLUMN,
    "training_samples": len(train_df),
    "test_samples": len(test_df),
    "training_class_distribution": _class_summary(y_train),
    "test_class_distribution": _class_summary(y_test),
    "scaler": {
        "type": "StandardScaler",
        "fit_on": "training_data_only",
    },
    "feature_count": len(feature_cols),
    "feature_names": feature_cols,
}

# Serialise to indented JSON in the results folder
metadata_path = RESULTS_DIR / 'split_metadata.json'
metadata_path.write_text(json.dumps(split_metadata, indent=2))

print(f"✅ Saved split metadata to: {metadata_path}")

In [None]:
# Echo the metadata as indented JSON under a 50-character rule
metadata_text = json.dumps(split_metadata, indent=2)
print("\nSplit Metadata:", "=" * 50, metadata_text, sep="\n")

## 7. Verification

In [None]:
# Reload each saved artefact from disk to confirm it can be read back
train_verify, test_verify = (
    pd.read_csv(SPLITS_DIR / filename)
    for filename in ('engineered_train.csv', 'engineered_test.csv')
)
scaler_verify = joblib.load(MODELS_DIR / 'scaler.pkl')

# Summarise what was read back
scaler_kind = type(scaler_verify).__name__
total_rows = len(train_verify) + len(test_verify)

print("Verification:")
print(f"  Training shape: {train_verify.shape}")
print(f"  Test shape: {test_verify.shape}")
print(f"  Scaler type: {scaler_kind}")
print(f"  Total samples: {total_rows}")
print("\n✅ Notebook 2 Complete!")