# 🔧 Phase 3: Feature Engineering

This notebook demonstrates comprehensive feature engineering techniques for MLOps, covering feature extraction, selection, and transformation.

## Table of Contents
1. [Feature Extraction](#1-feature-extraction)
2. [Feature Selection](#2-feature-selection)
3. [Feature Transformation](#3-feature-transformation)
4. [Advanced Feature Engineering](#4-advanced-feature-engineering)

---

## Prerequisites
Make sure you have the required libraries installed:
```bash
pip install pandas numpy scikit-learn matplotlib seaborn plotly pyarrow
```


In [None]:
# Import required libraries
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

# --- Notebook-wide configuration -------------------------------------------
# NOTE(review): this silences *all* warnings, including sklearn/pandas
# deprecation notices; narrow the filter if warnings become relevant.
warnings.filterwarnings('ignore')

RANDOM_SEED = 42
N_SAMPLES = 1000
PROCESSED_DATA_DIR = Path('data/processed')
PROCESSED_DATA_GLOB = 'dataset_v*.parquet'

np.random.seed(RANDOM_SEED)

print("✅ Libraries imported successfully!")


def make_sample_dataset(n_samples=N_SAMPLES, seed=RANDOM_SEED):
    """Build a synthetic loan-application dataset.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    seed : int
        Value passed to ``np.random.seed`` before sampling. This re-seeds
        NumPy's *global* generator, so the output is identical for identical
        arguments.

    Returns
    -------
    pandas.DataFrame
        One row per synthetic customer with 18 columns (identifier,
        demographics, credit attributes, the ``default_risk`` target and a
        daily ``application_date`` starting 2020-01-01).
    """
    np.random.seed(seed)

    # Column order matters: each draw consumes the global RNG stream in turn.
    data = {
        'customer_id': range(1, n_samples + 1),
        'age': np.random.normal(35, 12, n_samples).astype(int),
        'income': np.random.lognormal(10, 0.5, n_samples),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
        'employment_status': np.random.choice(['Employed', 'Unemployed', 'Self-employed', 'Retired'], n_samples, p=[0.6, 0.1, 0.2, 0.1]),
        'credit_score': np.random.normal(650, 100, n_samples).astype(int),
        'loan_amount': np.random.exponential(50000, n_samples),
        'default_risk': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'application_date': pd.date_range('2020-01-01', periods=n_samples, freq='D'),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_samples, p=[0.2, 0.15, 0.15, 0.15, 0.35]),
        'marital_status': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], n_samples, p=[0.4, 0.4, 0.15, 0.05]),
        'dependents': np.random.poisson(1.5, n_samples),
        'previous_loans': np.random.poisson(2, n_samples),
        'late_payments': np.random.poisson(0.5, n_samples),
        'debt_to_income_ratio': np.random.beta(2, 5, n_samples),
        'credit_utilization': np.random.beta(3, 2, n_samples),
        'home_ownership': np.random.choice(['Rent', 'Own', 'Mortgage'], n_samples, p=[0.4, 0.2, 0.4]),
        'purpose': np.random.choice(['Debt Consolidation', 'Home Improvement', 'Business', 'Education'], n_samples, p=[0.4, 0.2, 0.2, 0.2])
    }
    return pd.DataFrame(data)


# Load data (create sample if not available).
# pd.read_parquet does not expand wildcards, so the versioned file name has to
# be resolved first; the most recently modified match is used.
# NOTE(review): "latest" is decided by modification time, not by the version
# number in the file name -- confirm this matches the pipeline's convention.
df = None
candidate_files = list(PROCESSED_DATA_DIR.glob(PROCESSED_DATA_GLOB))
if candidate_files:
    latest_file = max(candidate_files, key=lambda path: path.stat().st_mtime)
    try:
        df = pd.read_parquet(latest_file)
        print("✅ Loaded processed data")
    except (ImportError, OSError, ValueError) as exc:
        # Missing parquet engine, unreadable file, or corrupt content:
        # report the reason instead of silently hiding it, then fall back.
        print(f"⚠️  Could not read {latest_file}: {exc}")

if df is None:
    print("⚠️  Creating sample data...")
    df = make_sample_dataset()
    print("✅ Sample dataset created")

print(f"📊 Dataset shape: {df.shape}")


## 1. Feature Extraction

**Purpose**: Extract meaningful features from raw data.


In [None]:
# 1.1 Extract Date/Time Features
# This cell adds derived columns to `df` in place; re-running it overwrites
# the same columns with the same values.
print("🔧 Step 10: Feature Engineering")
print("=" * 50)

# Calendar components are only derived when the timestamp column exists.
if 'application_date' in df.columns:
    application_ts = df['application_date']
    df['year'] = application_ts.dt.year
    df['month'] = application_ts.dt.month
    df['day_of_week'] = application_ts.dt.dayofweek
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are Saturday/Sunday.
    df['is_weekend'] = df['day_of_week'].isin([5, 6])
    df['quarter'] = application_ts.dt.quarter
    print("✅ Extracted date/time features")

# 1.2 Extract Text Features
if 'city' in df.columns:
    df['city_length'] = df['city'].str.len()
    df['is_major_city'] = df['city'].isin(['New York', 'Los Angeles', 'Chicago'])
    print("✅ Extracted text features")

# 1.3 Extract Numerical Features
# The outermost bin edges are open-ended (-inf / +inf). With closed edges such
# as 0..850, any value outside the range (e.g. a credit score above 850 or an
# age of 0) is assigned NaN by pd.cut instead of the first/last category.
# Interior edges and labels are unchanged, so in-range values bin as before.
NEG_INF, POS_INF = float('-inf'), float('inf')

# Create age groups
df['age_group'] = pd.cut(
    df['age'],
    bins=[NEG_INF, 25, 35, 45, 55, 65, POS_INF],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
)

# Create income brackets
df['income_bracket'] = pd.cut(
    df['income'],
    bins=[NEG_INF, 30000, 50000, 75000, 100000, POS_INF],
    labels=['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High'],
)

# Create credit score categories
df['credit_rating'] = pd.cut(
    df['credit_score'],
    bins=[NEG_INF, 580, 670, 740, 800, POS_INF],
    labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
)

print("✅ Extracted numerical features")

# 1.4 Create Derived Features
# Loan to income ratio
# NOTE(review): an income of exactly 0 would yield inf here -- confirm the
# upstream data guarantees positive income.
df['loan_to_income_ratio'] = df['loan_amount'] / df['income']

# Credit utilization impact
df['credit_utilization_impact'] = df['credit_utilization'] * df['credit_score']

# Risk score: late payments and debt ratio push it up, credit score pulls it down.
df['risk_score'] = (df['late_payments'] * 10) + (df['debt_to_income_ratio'] * 100) - df['credit_score']

print("✅ Created derived features")
print(f"New dataset shape: {df.shape}")


## 2. Feature Selection

**Purpose**: Select the most relevant features for model training.


In [None]:
# 2.1 Statistical Feature Selection
print("\n🎯 Feature Selection")
print("=" * 50)

# Prepare features and target; identifier, target and raw timestamp are not
# used as predictors (errors='ignore' tolerates any of them being absent).
X = df.drop(['customer_id', 'default_risk', 'application_date'], axis=1, errors='ignore')
y = df['default_risk']

# Handle categorical variables for feature selection
X_encoded = X.copy()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Label encode categorical variables. Values are cast to str first, so a
# missing value becomes the literal label 'nan'. The fitted encoders are kept
# in `le_dict`, keyed by column name.
le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
    le_dict[col] = le

print(f"Features before selection: {X_encoded.shape[1]}")

# 2.2 Univariate Feature Selection
# SelectKBest raises if k exceeds the number of available columns, so the
# requested top-10 is capped at the actual feature count.
top_k = min(10, X_encoded.shape[1])

# Select top features using f_classif
selector_f = SelectKBest(score_func=f_classif, k=top_k)
X_selected_f = selector_f.fit_transform(X_encoded, y)

# Get selected feature names
selected_features_f = X_encoded.columns[selector_f.get_support()].tolist()
print(f"Top 10 features (f_classif): {selected_features_f}")

# Mutual information feature selection
selector_mi = SelectKBest(score_func=mutual_info_classif, k=top_k)
X_selected_mi = selector_mi.fit_transform(X_encoded, y)

selected_features_mi = X_encoded.columns[selector_mi.get_support()].tolist()
print(f"Top 10 features (mutual_info): {selected_features_mi}")

# 2.3 Tree-based Feature Selection
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y)

# Get feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nTop 10 features (Random Forest):")
print(feature_importance.head(10))

# Select top features based on importance threshold
importance_threshold = 0.01
selected_features_rf = feature_importance[feature_importance['importance'] > importance_threshold]['feature'].tolist()
print(f"Features with importance > {importance_threshold}: {len(selected_features_rf)}")

# 2.4 Correlation-based Feature Selection
correlation_matrix = X_encoded.corr()
corr_columns = correlation_matrix.columns
abs_corr = correlation_matrix.abs().to_numpy()

# Walk the upper triangle only, so every unordered pair is visited once.
# NaN correlations compare False against the threshold and are skipped.
high_corr_pairs = [
    (corr_columns[i], corr_columns[j], abs_corr[i, j])
    for i in range(len(corr_columns))
    for j in range(i + 1, len(corr_columns))
    if abs_corr[i, j] > 0.8  # High correlation threshold
]

print(f"\nHigh correlation pairs (>0.8): {len(high_corr_pairs)}")
for pair in high_corr_pairs[:5]:  # Show first 5
    print(f"{pair[0]} - {pair[1]}: {pair[2]:.3f}")

# Remove highly correlated features: from each pair drop the one with the
# lower Random Forest importance. A pair is skipped when one of its members
# has already been dropped -- the redundancy is then already resolved, and
# dropping the partner as well would discard more features than necessary.
importance_by_feature = feature_importance.set_index('feature')['importance']
features_to_remove = set()
for feat_a, feat_b, _ in high_corr_pairs:
    if feat_a in features_to_remove or feat_b in features_to_remove:
        continue
    if importance_by_feature[feat_a] < importance_by_feature[feat_b]:
        features_to_remove.add(feat_a)
    else:
        features_to_remove.add(feat_b)

print(f"Features to remove due to high correlation: {len(features_to_remove)}")

# Final feature selection: importance-filtered features minus the pruned ones,
# kept in descending importance order.
final_features = [f for f in selected_features_rf if f not in features_to_remove]
print(f"Final selected features: {len(final_features)}")
print(f"Features: {final_features}")


## 3. Feature Transformation

**Purpose**: Transform features to improve model performance.


In [None]:
# 3.1 One-Hot Encoding for Categorical Variables
print("\n🔄 Feature Transformation")
print("=" * 50)


def _scale_columns(frame, scaler, columns):
    """Return a copy of `frame` whose `columns` hold `scaler.fit_transform` output.

    When `columns` is empty the copy is returned untouched and `scaler` is
    left unfitted, because the scalers reject input with zero features.
    """
    scaled = frame.copy()
    if len(columns) > 0:
        scaled[columns] = scaler.fit_transform(scaled[columns])
    return scaled


# Create final dataset with selected features (un-encoded values from X)
X_final = X[final_features].copy()

# Split the selected columns by dtype. Boolean columns match neither filter,
# so they are passed through without encoding or scaling.
categorical_features = X_final.select_dtypes(include=['object', 'category']).columns
numerical_features = X_final.select_dtypes(include=[np.number]).columns

print(f"Categorical features to encode: {list(categorical_features)}")
print(f"Numerical features: {list(numerical_features)}")

# Apply one-hot encoding; each dummy column is prefixed with its source column name
X_encoded_final = pd.get_dummies(
    X_final,
    columns=list(categorical_features),
    prefix=list(categorical_features),
)
print(f"Features after one-hot encoding: {X_encoded_final.shape[1]}")

# 3.2 Scaling Numerical Features
print("\n📏 Scaling Numerical Features")
print("-" * 30)

# Standard Scaling
scaler_standard = StandardScaler()
X_standard = _scale_columns(X_encoded_final, scaler_standard, numerical_features)

print("✅ Applied Standard Scaling")

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
X_minmax = _scale_columns(X_encoded_final, scaler_minmax, numerical_features)

print("✅ Applied Min-Max Scaling")

# 3.3 Polynomial Features
print("\n🔢 Polynomial Features")
print("-" * 30)

# Create pairwise interaction terms for numerical columns
# (interaction_only=True: no squared terms; include_bias=False: no constant column).
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
if len(numerical_features) > 0:
    X_poly = poly.fit_transform(X_encoded_final[numerical_features])
    # Reuse the source index: without it the frame gets a fresh RangeIndex and
    # no longer aligns row-for-row with X_encoded_final when the data carries
    # a non-default index.
    X_poly_df = pd.DataFrame(
        X_poly,
        columns=poly.get_feature_names_out(numerical_features),
        index=X_encoded_final.index,
    )
else:
    # Nothing numeric survived selection: keep an empty, index-aligned frame.
    X_poly_df = pd.DataFrame(index=X_encoded_final.index)

print(f"Polynomial features created: {X_poly_df.shape[1]}")
print(f"Original numerical features: {len(numerical_features)}")

# 3.4 Feature Engineering Summary
print("\n📊 Feature Engineering Summary")
print("-" * 30)

# Note: `df` already contains the columns added during feature extraction.
print(f"Original dataset shape: {df.shape}")
print(f"Features after extraction: {X.shape[1]}")
print(f"Features after selection: {X_final.shape[1]}")
print(f"Features after encoding: {X_encoded_final.shape[1]}")
print(f"Features after polynomial: {X_poly_df.shape[1]}")

# Collect every intermediate artifact in one in-memory dict.
# Nothing is written to disk in this cell.
feature_engineering_results = {
    'X_original': X,
    'X_selected': X_final,
    'X_encoded': X_encoded_final,
    'X_standard': X_standard,
    'X_minmax': X_minmax,
    'X_polynomial': X_poly_df,
    'selected_features': final_features,
    'categorical_features': list(categorical_features),
    'numerical_features': list(numerical_features),
    'scalers': {
        'standard': scaler_standard,
        'minmax': scaler_minmax
    },
    'label_encoders': le_dict
}

print("✅ Feature engineering completed successfully!")
print("✅ All transformations saved for model training")
