# Feature Engineering

This notebook creates features for predictive modeling of job displacement risk.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

sys.path.append('../src')

from data_loader import load_mckinsey_data, load_global_ai_adoption
from feature_engineering import (
    create_risk_features,
    create_time_series_features,
    prepare_ml_features,
    create_interaction_features,
    encode_categorical_features
)

sns.set_style("whitegrid")

## Load Data

In [None]:
# Read both raw datasets through the project's data_loader helpers
mckinsey = load_mckinsey_data()
global_ai = load_global_ai_adoption()

# Report the size and column names of the McKinsey frame before any features are added
print(f"Original McKinsey Data Shape: {mckinsey.shape}")
print(f"\nColumns: {mckinsey.columns.tolist()}")

## Create Risk Features

In [None]:
# Derive the risk-related columns from the raw McKinsey frame
mckinsey_features = create_risk_features(mckinsey)

# Column names this notebook reports as added by create_risk_features
risk_feature_names = [
    'composite_risk',
    'skill_level_numeric',
    'education_numeric',
    'protection_score',
    'vulnerability_score',
    'employment_category',
    'wage_category',
]

print("New features created:")
for feature_name in risk_feature_names:
    print(f"- {feature_name}")

# Preview the headline scores for the first ten rows
preview_columns = ['occupation_name', 'composite_risk', 'protection_score', 'vulnerability_score']
display(mckinsey_features[preview_columns].head(10))

In [None]:
# Visualize new features: a 2x2 overview of the engineered risk columns.
# Every panel gets a title and both axis labels so each one can be read on its own.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: histogram of protection_score
ax = axes[0, 0]
mckinsey_features['protection_score'].hist(bins=20, ax=ax, color='steelblue', edgecolor='black')
ax.set(title='Distribution of Protection Score', xlabel='Protection Score', ylabel='Count')

# Top-right: histogram of vulnerability_score
ax = axes[0, 1]
mckinsey_features['vulnerability_score'].hist(bins=20, ax=ax, color='coral', edgecolor='black')
ax.set(title='Distribution of Vulnerability Score', xlabel='Vulnerability Score', ylabel='Count')

# Bottom-left: mean composite_risk per sector, sorted ascending
ax = axes[1, 0]
sector_risk = mckinsey_features.groupby('sector')['composite_risk'].mean().sort_values()
sector_risk.plot(kind='barh', ax=ax, color='teal')
# Labels are set after plotting so they are not overwritten by the pandas plot call
ax.set(title='Average Composite Risk by Sector', xlabel='Average Composite Risk', ylabel='Sector')

# Bottom-right: number of rows at each skill_level_numeric value, in level order
ax = axes[1, 1]
mckinsey_features['skill_level_numeric'].value_counts().sort_index().plot(kind='bar', ax=ax, color='forestgreen')
ax.set(title='Distribution of Skill Levels', xlabel='Skill Level (numeric)', ylabel='Count')

plt.tight_layout()
plt.show()

## Time Series Features

In [None]:
# Add the time-series columns to the global AI adoption data
global_features = create_time_series_features(global_ai)

# Column names this notebook reports as added by create_time_series_features
time_series_feature_names = [
    'ai_adoption_growth',
    'investment_growth',
    'ai_adoption_lag1',
    'ai_adoption_lag2',
    'ai_adoption_ma3',
]

print("Time series features created:")
for feature_name in time_series_feature_names:
    print(f"- {feature_name}")

# Keep only the United States rows and preview the adoption columns
is_united_states = global_features['country'] == 'United States'
us_data = global_features.loc[is_united_states]
us_preview_columns = ['year', 'ai_adoption_rate', 'ai_adoption_growth', 'ai_adoption_ma3']
display(us_data[us_preview_columns].head(10))

## Interaction Features

In [None]:
# Create interaction features on top of the risk features
mckinsey_interactions = create_interaction_features(mckinsey_features)

print("Interaction features created:")
for feature_name in ('automation_wage_interaction', 'skill_automation_interaction', 'employment_risk'):
    print(f"- {feature_name}")

# One entry per scatter panel: the x/y values, the interaction column used to
# colour the points, the colormap, and the axis/title text.
interaction_panels = [
    {
        'x': mckinsey_interactions['automation_potential'],
        # +1 keeps the log defined if a wage of 0 is present
        'y': np.log(mckinsey_interactions['median_wage_usd'] + 1),
        'color_by': 'automation_wage_interaction',
        'cmap': 'viridis',
        'xlabel': 'Automation Potential',
        'ylabel': 'Log(Median Wage)',
        'title': 'Automation vs Wage Interaction',
    },
    {
        'x': mckinsey_interactions['skill_level_numeric'],
        'y': mckinsey_interactions['automation_potential'],
        'color_by': 'skill_automation_interaction',
        'cmap': 'plasma',
        'xlabel': 'Skill Level',
        'ylabel': 'Automation Potential',
        'title': 'Skill vs Automation Interaction',
    },
    {
        'x': mckinsey_interactions['current_employment_us_millions'],
        'y': mckinsey_interactions['vulnerability_score'],
        'color_by': 'employment_risk',
        'cmap': 'coolwarm',
        'xlabel': 'Employment (Millions)',
        'ylabel': 'Vulnerability Score',
        'title': 'Employment vs Risk Interaction',
    },
]

# Visualize interactions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, panel in zip(axes, interaction_panels):
    points = ax.scatter(
        panel['x'],
        panel['y'],
        c=mckinsey_interactions[panel['color_by']],
        cmap=panel['cmap'],
        alpha=0.6,
    )
    ax.set_xlabel(panel['xlabel'])
    ax.set_ylabel(panel['ylabel'])
    ax.set_title(panel['title'])
    # The point colour encodes the interaction feature; without a colorbar
    # that encoding cannot be read off the figure.
    fig.colorbar(points, ax=ax, label=panel['color_by'])

plt.tight_layout()
plt.show()

## Prepare ML Features

In [None]:
# Build the modelling inputs; the helper's result is unpacked as
# (feature matrix, target, feature column names)
X, y, feature_cols = prepare_ml_features(mckinsey_interactions)

print(f"Feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")
print("\nFeatures used:")
for position, feature_name in enumerate(feature_cols, start=1):
    print(f"{position}. {feature_name}")

# Pairwise correlations between the feature columns, drawn as an annotated heatmap
correlation_matrix = X.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## Feature Importance Analysis

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, keeping the class proportions of y in both splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit a random forest on the training split only; it is used here solely to
# rank features, not to report predictive performance
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)

# Pair each feature column with its importance, highest first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20 features.
# DataFrame.plot opens its own figure unless it is given an Axes, so the
# figure is created explicitly and passed in; calling plt.figure() first
# would leave one blank figure and draw the bars on a second, default-sized one.
fig, ax = plt.subplots(figsize=(10, 8))
feature_importance.head(20).plot(x='feature', y='importance', kind='barh', color='steelblue', ax=ax)
ax.set_title('Top 20 Feature Importances')
ax.set_xlabel('Importance')
# Rows are sorted descending, so flip the y-axis to put the most important feature on top
ax.invert_yaxis()
fig.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
display(feature_importance.head(10))

## Save Features

In [None]:
# Save processed features
import os
import pickle

features_path = '../data/processed/features.pkl'

# open(..., 'wb') does not create missing parent folders, so make sure the
# output directory exists (e.g. on a fresh checkout) before writing
os.makedirs(os.path.dirname(features_path), exist_ok=True)

# Save the feature matrix, target, feature names and the full engineered frame.
# NOTE: pickle files can execute code when loaded; only load this file from a trusted location.
with open(features_path, 'wb') as f:
    pickle.dump({
        'X': X,
        'y': y,
        'feature_names': X.columns.tolist(),
        'mckinsey_features': mckinsey_interactions
    }, f)

print(f"Features saved to {features_path}")
print(f"\nTotal features: {X.shape[1]}")
print(f"Total samples: {X.shape[0]}")