# Notebook 3: Feature Engineering

**Purpose**: Create statistical and interaction features, then perform feature-importance screening and correlation analysis.

**Inputs**:
- `engineered_train.csv` from Notebook 2 (read from `data/splits/`)
- `engineered_test.csv` from Notebook 2 (read from `data/splits/`)

**Outputs**:
- `feature_engineered_train.csv` → `data/features/`
- `feature_engineered_test.csv` → `data/features/`
- `feature_report.json` → `results/`

---

In [None]:
# Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
import warnings
warnings.filterwarnings('ignore')

# Set random seed
# Seeds NumPy's global RNG; the random forests below also receive RANDOM_SEED
# explicitly through their random_state argument.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Paths
# BASE_DIR is the parent of the current working directory.
BASE_DIR = Path('.').resolve().parent
SPLITS_DIR = BASE_DIR / 'data' / 'splits'
FEATURES_DIR = BASE_DIR / 'data' / 'features'
RESULTS_DIR = BASE_DIR / 'results'
FIGURES_DIR = BASE_DIR / 'figures'

# Create directories
# Every directory this notebook writes to must exist up front: the CSVs go to
# FEATURES_DIR, the JSON report to RESULTS_DIR and the PNG figures to
# FIGURES_DIR. Creating only FEATURES_DIR makes the later savefig()/open()
# calls fail on a fresh checkout.
for output_dir in (FEATURES_DIR, RESULTS_DIR, FIGURES_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Configuration
# Name of the label column in the engineered train/test CSVs.
TARGET_COLUMN = 'Class'

print(f"Random Seed: {RANDOM_SEED}")

## 1. Load Engineered Data

In [None]:
# Read the engineered train/test splits from SPLITS_DIR in one pass
train_df, test_df = (
    pd.read_csv(SPLITS_DIR / csv_name)
    for csv_name in ('engineered_train.csv', 'engineered_test.csv')
)

# Report the (rows, columns) shape of each split
print(f"Training set: {train_df.shape}")
print(f"Test set: {test_df.shape}")

In [None]:
# Pull the label column out of each split; everything else is a feature
y_train = train_df[TARGET_COLUMN]
X_train = train_df.drop(columns=[TARGET_COLUMN])

y_test = test_df[TARGET_COLUMN]
X_test = test_df.drop(columns=[TARGET_COLUMN])

# Remember the incoming feature names so the report can count them later
original_features = X_train.columns.tolist()
print(f"Original features: {len(original_features)}")

## 2. Statistical Feature Creation

In [None]:
def create_statistical_features(df):
    """
    Build per-row summary statistics over the columns whose name starts with 'V'.

    Args:
        df: DataFrame holding the 'V'-prefixed feature columns (other columns
            are ignored).

    Returns:
        DataFrame sharing ``df``'s index with the columns V_mean, V_std, V_min,
        V_max, V_range, V_skew, V_kurtosis, V_q25, V_q75, V_iqr,
        V_positive_count and V_negative_count, in that order.
    """
    # Slice the V-block once and reuse it for every statistic
    v_block = df[[name for name in df.columns if name.startswith('V')]]

    row_min = v_block.min(axis=1)
    row_max = v_block.max(axis=1)
    lower_q = v_block.quantile(0.25, axis=1)
    upper_q = v_block.quantile(0.75, axis=1)

    # Insertion order here defines the output column order
    derived = {
        # Moments and extremes
        'V_mean': v_block.mean(axis=1),
        'V_std': v_block.std(axis=1),
        'V_min': row_min,
        'V_max': row_max,
        'V_range': row_max - row_min,
        'V_skew': v_block.skew(axis=1),
        'V_kurtosis': v_block.kurtosis(axis=1),
        # Quartiles and their spread
        'V_q25': lower_q,
        'V_q75': upper_q,
        'V_iqr': upper_q - lower_q,
        # Sign counts (exact zeros fall in neither bucket)
        'V_positive_count': (v_block > 0).sum(axis=1),
        'V_negative_count': (v_block < 0).sum(axis=1),
    }

    stats_df = pd.DataFrame(index=df.index)
    for label, values in derived.items():
        stats_df[label] = values

    return stats_df

In [None]:
# Derive the row-wise statistics for both splits with the same function
stats_train, stats_test = map(create_statistical_features, (X_train, X_test))

print(f"Statistical features created: {stats_train.shape[1]}")
print(f"Features: {list(stats_train.columns)}")

## 3. Interaction Features

In [None]:
def create_interaction_features(df, top_features, n_interactions=5):
    """
    Create pairwise interaction features between the leading top features.

    For every unordered pair (f1, f2) drawn from the first ``n_interactions``
    entries of ``top_features``, two columns are produced: the product
    ``{f1}_x_{f2}`` and the guarded ratio ``{f1}_div_{f2}``.

    Args:
        df: DataFrame containing every selected feature as a column.
        top_features: Feature names, most important first.
        n_interactions: How many leading features to pair up. This is the
            number of *features* combined, not the number of output columns;
            k features yield k * (k - 1) columns.

    Returns:
        DataFrame sharing ``df``'s index and holding the interaction columns.
    """
    eps = 1e-8
    interaction_df = pd.DataFrame(index=df.index)

    # Select top features for interactions
    features_to_interact = top_features[:n_interactions]

    for i, f1 in enumerate(features_to_interact):
        for f2 in features_to_interact[i+1:]:
            # Multiplication
            interaction_df[f'{f1}_x_{f2}'] = df[f1] * df[f2]

            # Ratio. The denominator is pushed away from zero in the direction
            # of its own sign. Adding +eps unconditionally only protects
            # non-negative data: it moves small negative values *towards* zero
            # and maps -eps exactly onto 0, which yields inf. Non-negative
            # denominators get the same +eps shift as before.
            denominator = df[f2]
            guarded = (denominator + eps).where(denominator >= 0, denominator - eps)
            interaction_df[f'{f1}_div_{f2}'] = df[f1] / guarded

    return interaction_df

In [None]:
# Fit a random forest on the incoming features only; its importances decide
# which features get paired up for interactions
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# One row per feature, most important first
importance_table = {
    'feature': X_train.columns,
    'importance': rf.feature_importances_,
}
feature_importance = pd.DataFrame(importance_table).sort_values(
    by='importance', ascending=False
)

print("Top 10 Features by Importance:")
print(feature_importance.head(10))

In [None]:
# Take the five highest-ranked feature names and build their pairwise
# interactions on both splits
top_features = feature_importance['feature'].iloc[:5].tolist()
print(f"Top features for interaction: {top_features}")

interaction_train, interaction_test = (
    create_interaction_features(frame, top_features)
    for frame in (X_train, X_test)
)

print(f"\nInteraction features created: {interaction_train.shape[1]}")

## 4. Combine All Features

In [None]:
# Column-wise stack per split: original features, then statistical features,
# then interaction features
X_train_enhanced, X_test_enhanced = (
    pd.concat(parts, axis=1)
    for parts in (
        [X_train, stats_train, interaction_train],
        [X_test, stats_test, interaction_test],
    )
)

print(f"Enhanced training features: {X_train_enhanced.shape}")
print(f"Enhanced test features: {X_test_enhanced.shape}")

## 5. Feature Importance Screening

In [None]:
# Refit the same random-forest configuration on the enhanced feature set
rf_full = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)
rf_full.fit(X_train_enhanced, y_train)

# One row per enhanced feature, most important first
full_importance_table = {
    'feature': X_train_enhanced.columns,
    'importance': rf_full.feature_importances_,
}
full_importance = pd.DataFrame(full_importance_table).sort_values(
    by='importance', ascending=False
)

print("Top 15 Features (All):")
print(full_importance.head(15))

In [None]:
# Horizontal bar chart of the 20 most important features (largest on top)
top_20 = full_importance.head(20)
bar_positions = range(len(top_20))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_20['importance'].values, color='steelblue')
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_20['feature'].values)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 20 Feature Importances (Random Forest)')
# barh draws position 0 at the bottom; flip so rank 1 sits at the top
ax.invert_yaxis()
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'feature_importance.png', dpi=150)
plt.show()

## 6. Correlation Analysis

In [None]:
# Correlate every enhanced feature with the label, then rank by magnitude
# (the sign is discarded, so strong negative relationships rank high too)
signed_correlations = X_train_enhanced.corrwith(y_train)
correlations = signed_correlations.abs().sort_values(ascending=False)

print("Top 15 Features by Correlation with Target:")
print(correlations.head(15))

In [None]:
# Pairwise correlation among the 15 features most correlated with the target
top_corr_features = correlations.index[:15].tolist()
corr_matrix = X_train_enhanced.loc[:, top_corr_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix (Top 15 Features)')
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'correlation_matrix.png', dpi=150)
plt.show()

## 7. Select Final Features

In [None]:
# Combine importance and correlation for feature selection
# full_importance is already sorted by descending importance, so the
# positional range below assigns rank 1 to the most important feature.
selection_df = pd.DataFrame({
    'feature': full_importance['feature'],
    'importance_rank': range(1, len(full_importance) + 1),
    'importance': full_importance['importance']
})

# Add correlation ranking
# correlations is already sorted by descending absolute correlation, so the
# same positional-rank trick applies (rank 1 = strongest correlation).
corr_df = pd.DataFrame({
    'feature': correlations.index,
    'correlation': correlations.values,
    'correlation_rank': range(1, len(correlations) + 1)
})

# Join the two rankings on the feature name and average them; a lower
# combined_rank means the feature scores well on both criteria.
selection_df = selection_df.merge(corr_df, on='feature')
selection_df['combined_rank'] = (selection_df['importance_rank'] + selection_df['correlation_rank']) / 2
selection_df = selection_df.sort_values('combined_rank')

print("Top 20 Features by Combined Ranking:")
print(selection_df.head(20))

In [None]:
# No pruning happens here: every enhanced column is kept (dimensionality
# reduction is deferred to Notebook 4). Indexing both splits with the same
# name list guarantees identical columns in identical order.
selected_features = list(X_train_enhanced.columns)

X_train_final = X_train_enhanced.loc[:, selected_features]
X_test_final = X_test_enhanced.loc[:, selected_features]

print(f"Final feature count: {len(selected_features)}")

## 8. Save Outputs

In [None]:
# Re-attach the label as the last column; .values assigns positionally,
# ignoring the label Series' index
train_final = X_train_final.assign(**{TARGET_COLUMN: y_train.values})
test_final = X_test_final.assign(**{TARGET_COLUMN: y_test.values})

# Destination files inside FEATURES_DIR
train_path = FEATURES_DIR / 'feature_engineered_train.csv'
test_path = FEATURES_DIR / 'feature_engineered_test.csv'

# Write without the index so the files contain only features + label
for frame, destination in ((train_final, train_path), (test_final, test_path)):
    frame.to_csv(destination, index=False)

print(f"✅ Saved feature-engineered training data to: {train_path}")
print(f"✅ Saved feature-engineered test data to: {test_path}")

In [None]:
# Assemble a JSON summary of this notebook's feature engineering and save it
top_corr = correlations.head(10)

feature_report = {
    # Reproducibility and feature counts
    "random_seed": RANDOM_SEED,
    "original_features": len(original_features),
    "statistical_features": stats_train.shape[1],
    "interaction_features": interaction_train.shape[1],
    "total_features": len(selected_features),
    "feature_names": selected_features,
    # Rankings from the enhanced-feature random forest and target correlation
    "top_10_by_importance": full_importance.head(10).to_dict('records'),
    "top_10_by_correlation": [
        {"feature": name, "correlation": round(value, 4)}
        for name, value in zip(top_corr.index, top_corr.values)
    ],
    # Human-readable log of what was done
    "engineering_steps": [
        "Created row-wise statistical features (mean, std, skew, kurtosis, etc.)",
        "Created interaction features (multiplication, division) for top 5 features",
        "Computed Random Forest feature importance",
        "Computed correlation with target"
    ]
}

report_path = RESULTS_DIR / 'feature_report.json'
with report_path.open('w') as handle:
    json.dump(feature_report, handle, indent=2)

print(f"✅ Saved feature report to: {report_path}")

## 9. Verification

In [None]:
# Read the saved CSVs back from disk and report their shapes
train_verify, test_verify = (
    pd.read_csv(FEATURES_DIR / csv_name)
    for csv_name in ('feature_engineered_train.csv', 'feature_engineered_test.csv')
)

print("Verification:")
print(f"  Training shape: {train_verify.shape}")
print(f"  Test shape: {test_verify.shape}")
# One column is the label, so it is excluded from the feature count
print(f"  Feature columns: {len(train_verify.columns) - 1}")
print("\n✅ Notebook 3 Complete!")