# Feature Importance Analysis with SHAP

This notebook analyzes feature importance for the Random Forest model, first with the model's built-in feature importances and then with SHAP (SHapley Additive exPlanations) values, to understand which features contribute most to movie rating predictions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
import sys
import os

# Add project root to path so the `src` package imported below resolves.
# '..' is relative to the current working directory, so this assumes the
# notebook is run from a directory one level below the project root.
sys.path.append('..')

from src.preprocessor import DataPreprocessor
from src.model import RecommendationModel
from sklearn.model_selection import train_test_split

# Plot styling: reset matplotlib to its built-in default style and make
# seaborn's 'husl' palette the default color cycle for later plots.
plt.style.use('default')
sns.set_palette('husl')

## Load Data and Models

In [None]:
# Read the combined ratings table that the later cells work from
df = pd.read_csv('../data/ml100k_combined.csv')
print(f"Dataset shape: {df.shape}")

# Reuse the persisted preprocessor and model when both files exist;
# a missing file for either one triggers a full retrain of both.
try:
    preprocessor = DataPreprocessor().load('../models/preprocessor.pkl')
    ml_model = RecommendationModel('rf').load('../models/ml_model.pkl')
except FileNotFoundError:
    print("Models not found. Training new models...")

    # Build training features from the full dataset (is_training=True)
    preprocessor = DataPreprocessor()
    X, y = preprocessor.prepare_features(df, is_training=True)

    # 80/20 split with a fixed seed; only the training part is used here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    ml_model = RecommendationModel('rf')
    ml_model.train(X_train, y_train)

    print("✓ New models trained successfully!")
else:
    print("✓ Pre-trained models loaded successfully!")

## Prepare Data for SHAP Analysis

In [None]:
# Build the feature matrix with the preprocessor in non-training mode
X, y = preprocessor.prepare_features(df, is_training=False)
print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")

# Cap the explained subset at 1000 rows (SHAP can be slow on large datasets);
# the fixed seed keeps the chosen rows reproducible.
sample_size = min(len(X), 1000)
X_sample = X.sample(n=sample_size, random_state=42)
# Pick the targets for the same rows by index label
y_sample = y.loc[X_sample.index]

print(f"Using {sample_size} samples for SHAP analysis")

## Random Forest Feature Importance

In [None]:
# Pair every feature column with the model's feature_importances_ score,
# then order the table from most to least important.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': ml_model.model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 highest-scoring features
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Random Forest Feature Importances')
plt.gca().invert_yaxis()  # put the largest bar at the top
plt.tight_layout()
plt.show()

# Text listing of the ten leading features
print("Top 10 Most Important Features:")
top_ten = feature_importance.head(10)
for rank, (name, score) in enumerate(zip(top_ten['feature'], top_ten['importance']), 1):
    print(f"{rank:2d}. {name:<30} {score:.4f}")

## SHAP Analysis

In [None]:
# Initialize SHAP explainer on the underlying estimator (ml_model.model),
# not on the RecommendationModel wrapper itself.
print("Initializing SHAP explainer...")
explainer = shap.TreeExplainer(ml_model.model)

# Calculate SHAP values for the sampled rows only (X_sample), not the full X
print("Calculating SHAP values...")
shap_values = explainer.shap_values(X_sample)

# NOTE(review): `.shape` assumes shap_values is a single array (one model
# output) rather than a list of per-output arrays — confirm for this model.
print(f"SHAP values shape: {shap_values.shape}")
print("✓ SHAP analysis complete!")

## SHAP Visualizations

In [None]:
# Summary plot (bar): one bar per feature, mean absolute SHAP value.
# summary_plot resizes the current figure according to its plot_size
# argument (default "auto"), which overrides the figsize given to
# plt.figure; pass the intended size explicitly so it takes effect.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_sample, plot_type="bar", plot_size=(12, 8), show=False)
plt.title('SHAP Feature Importance (Mean Absolute SHAP Values)')
plt.tight_layout()
plt.show()

In [None]:
# Detailed summary plot: per-sample SHAP values for each feature.
# summary_plot resizes the current figure according to its plot_size
# argument (default "auto"), which overrides the figsize given to
# plt.figure; pass the intended size explicitly so it takes effect.
plt.figure(figsize=(12, 10))
shap.summary_plot(shap_values, X_sample, plot_size=(12, 10), show=False)
plt.title('SHAP Summary Plot (Feature Impact on Model Output)')
plt.tight_layout()
plt.show()

## Feature Categories Analysis

In [None]:
# Categorize features
def categorize_feature(feature_name):
    """Map a feature column name to a coarse category label.

    Rules are checked in order and the first match wins:

    1. contains both 'user_avg' and 'rating'   -> 'User Preferences'
    2. contains 'movie_avg_rating'             -> 'Movie Popularity'
    3. contains both 'global_avg' and 'rating' -> 'Global Trends'
    4. is exactly 'age' or 'user_age_at_release' -> 'Demographics'
    5. is exactly one of the genre column names  -> 'Genre Binary'
    6. contains 'tfidf' (case-insensitive)       -> 'TF-IDF Features'

    Anything else is labelled 'Other'.
    """
    demographic_columns = ('age', 'user_age_at_release')
    genre_columns = (
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    )

    # Substring rules for the aggregated rating statistics
    if 'user_avg' in feature_name and 'rating' in feature_name:
        return 'User Preferences'
    if 'movie_avg_rating' in feature_name:
        return 'Movie Popularity'
    if 'global_avg' in feature_name and 'rating' in feature_name:
        return 'Global Trends'

    # Exact-name rules (case-sensitive)
    if feature_name in demographic_columns:
        return 'Demographics'
    if feature_name in genre_columns:
        return 'Genre Binary'

    # Case-insensitive catch for text-derived columns
    if 'tfidf' in feature_name.lower():
        return 'TF-IDF Features'

    return 'Other'

# Per-feature importance: mean of |SHAP| along axis 0
# (the sampled rows, assuming shap_values is samples x features)
mean_shap = np.mean(np.abs(shap_values), axis=0)
feature_categories = pd.DataFrame({
    'feature': X_sample.columns,
    'mean_abs_shap': mean_shap,
    'category': list(map(categorize_feature, X_sample.columns)),
})

# Total importance per category, largest first
category_totals = feature_categories.groupby('category')['mean_abs_shap'].sum()
category_importance = category_totals.sort_values(ascending=False)

# Bar chart of the per-category totals
plt.figure(figsize=(10, 6))
category_importance.plot(kind='bar')
plt.title('Feature Importance by Category (Sum of Mean Absolute SHAP Values)')
plt.xlabel('Feature Category')
plt.ylabel('Total SHAP Importance')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Same numbers as text
print("Feature Importance by Category:")
for label, total in category_importance.items():
    print(f"{label:<20} {total:.4f}")

## Top Features Analysis

In [None]:
# Get top 15 features by SHAP importance
top_shap_features = feature_categories.nlargest(15, 'mean_abs_shap')

# The Category column has to fit the longest label categorize_feature can
# return ('User Preferences' / 'Movie Popularity', 16 characters); a narrower
# field pushes the SHAP column out of alignment on those rows.
category_width = 18
header = f"{'Rank':<5} {'Feature':<35} {'Category':<{category_width}} {'SHAP Value':<10}"
# Derive the rule from the header so the two always have the same width
rule = "=" * len(header)

print("Top 15 Features by SHAP Importance:")
print(rule)
print(header)
print(rule)

for rank, (_, row) in enumerate(top_shap_features.iterrows(), 1):
    print(f"{rank:<5} {row['feature']:<35} {row['category']:<{category_width}} {row['mean_abs_shap']:<10.4f}")

## Individual Prediction Analysis

In [None]:
# Pick one sampled row (by position) and its SHAP values
sample_idx = 0
sample_prediction = X_sample.iloc[sample_idx:sample_idx+1]  # one-row frame
sample_shap = shap_values[sample_idx]

# Compare the model's prediction for that row with the recorded target
print(f"Sample Prediction Analysis (Index {sample_idx}):")
predicted_rating = ml_model.predict(sample_prediction)[0]
print(f"Predicted Rating: {predicted_rating:.3f}")
actual_rating = y_sample.iloc[sample_idx]
print(f"Actual Rating: {actual_rating:.3f}")

# One row per feature: its SHAP value and its raw value for this sample,
# ordered by absolute SHAP value so the strongest effects come first
feature_contributions = pd.DataFrame({
    'feature': X_sample.columns,
    'shap_value': sample_shap,
    'feature_value': sample_prediction.iloc[0].values
})
feature_contributions = feature_contributions.sort_values(
    'shap_value', key=abs, ascending=False
)

print("\nTop 10 Contributing Features:")
print(f"{'Feature':<30} {'SHAP Value':<12} {'Feature Value':<12}")
print("-" * 55)
top_contributions = feature_contributions.head(10)
for name, contribution, value in zip(
    top_contributions['feature'],
    top_contributions['shap_value'],
    top_contributions['feature_value'],
):
    print(f"{name:<30} {contribution:<12.4f} {value:<12.4f}")

## Waterfall Plot for Individual Prediction

In [None]:
# Create waterfall plot for the sample prediction: wrap this row's SHAP
# values, the explainer's expected value and the row's feature values in a
# single Explanation object.
sample_explanation = shap.Explanation(
    values=sample_shap,
    base_values=explainer.expected_value,
    data=sample_prediction.iloc[0],
)

plt.figure(figsize=(12, 8))
# show=False keeps the figure open: with the default show=True, waterfall()
# renders the plot immediately, so the title and tight_layout below would be
# applied to a new, empty figure instead of the waterfall plot.
shap.plots.waterfall(sample_explanation, show=False)
plt.title(f'SHAP Waterfall Plot - Sample Prediction {sample_idx}')
plt.tight_layout()
plt.show()

## Key Insights

Based on the SHAP analysis, we can draw the following insights:

1. **Most Important Feature Categories**: User preferences and movie popularity tend to be the strongest predictors
2. **Individual Features**: Statistical features (user/movie averages) typically contribute more to predictions than demographic and genre features
3. **Model Interpretability**: SHAP values show both positive and negative contributions to predictions
4. **Feature Engineering Success**: The engineered statistical features provide significant predictive power

This analysis helps explain why certain movies are recommended and validates our feature engineering approach.