# Phase 4: LIME Explainability Analysis (Google Colab)

**Objective**: Use LIME (Local Interpretable Model-agnostic Explanations) to explain individual predictions.

**Key Goals:**
- Generate LIME explanations for specific predictions
- Compare with SHAP analysis results
- Validate feature importance at instance level

**Expected Runtime**: ~3-5 minutes on Colab GPU

## 0. Colab Setup

In [None]:
# Install the two third-party packages imported below (catboost, lime).
# The leading "!" hands the line to the notebook's shell instead of Python.
!pip install catboost lime

# NOTE(review): the shell command's exit status is not checked here, so this
# message is printed whether or not pip actually succeeded.
print("\nLibraries installed successfully!")

In [None]:
# Attach Google Drive at /content/drive so the cells below can read inputs
# from, and write outputs to, the lottery_analyzer folder.
from google.colab import drive

drive.mount('/content/drive')

print(
    "\nGoogle Drive mounted!",
    "Files will be loaded from: /content/drive/MyDrive/lottery_analyzer/",
    sep="\n",
)

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# CatBoost
import catboost
from catboost import CatBoostClassifier

# LIME
import lime
import lime.lime_tabular

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults applied to every figure created later in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print(
    "Libraries imported successfully",
    f"CatBoost version: {catboost.__version__}",
    sep="\n",
)

## 2. Path Configuration

In [None]:
# Choose your data source: True reads/writes on the mounted Google Drive,
# False uses the Colab VM's local /content folder.
USE_GOOGLE_DRIVE = True

# (data, output, model) directories for each choice of USE_GOOGLE_DRIVE.
_DIRECTORY_LAYOUTS = {
    True: (
        '/content/drive/MyDrive/lottery_analyzer/data/splits',
        '/content/drive/MyDrive/lottery_analyzer/outputs/explainability/lime',
        '/content/drive/MyDrive/lottery_analyzer/models',
    ),
    False: (
        '/content/data/splits',
        '/content/outputs/explainability/lime',
        '/content/models',
    ),
}

# Work with Path objects from here on so paths can be joined with "/".
DATA_DIR, OUTPUT_DIR, MODEL_DIR = map(Path, _DIRECTORY_LAYOUTS[bool(USE_GOOGLE_DRIVE)])

# Only the output folder is created; data and model folders must already exist.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Model directory: {MODEL_DIR}")

## 3. Load Best Model

In [None]:
# Restore the trained classifier from its serialized CatBoost file in MODEL_DIR.
model_path = MODEL_DIR / 'best_model.cbm'

best_model = CatBoostClassifier()
best_model.load_model(str(model_path))  # CatBoost is given a plain string path

print("Best model loaded successfully")
print(f"Model iterations: {best_model.tree_count_}")

## 4. Load Test Data

In [None]:
# Discover every per-lottery test split ("<lottery>_test.csv") in DATA_DIR.
test_files = sorted(DATA_DIR.glob('*_test.csv'))
if not test_files:
    # Fail here with a clear message instead of letting pd.concat raise
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No '*_test.csv' files found in {DATA_DIR}")

# Strip only the trailing "_test" marker. str.replace would also remove
# "_test" from the middle of a lottery name and yield a wrong name.
lottery_names = [f.stem[:-len('_test')] for f in test_files]

print(f"Found {len(lottery_names)} lotteries")

# Read the files that were actually found rather than rebuilding each path
# from the derived lottery name, then stack them into one frame.
test_dfs = [pd.read_csv(test_file) for test_file in test_files]
test_data = pd.concat(test_dfs, ignore_index=True)

print(f"Test data shape: {test_data.shape}")
print(f"\nClass distribution:")
print(test_data['appeared'].value_counts())

## 5. Prepare Features

In [None]:
# Columns left out of the feature matrix: the target ('appeared') and three
# columns that are not fed to the model.
exclude_cols = ['appeared', 'draw_date', 'lottery', 'number']

# Keep the remaining columns in their original file order.
feature_cols = test_data.columns[~test_data.columns.isin(exclude_cols)].tolist()

print(f"Feature columns ({len(feature_cols)}):")
print(feature_cols)

# Feature matrix and target vector used by all later cells.
y_test = test_data['appeared']
X_test = test_data[feature_cols]

print(f"\nX_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

## 6. Sample Data for LIME Analysis

In [None]:
# Seed NumPy's global RNG so the same rows are selected on every run.
np.random.seed(42)

# Positional row indices of each class within y_test.
positive_indices = np.where(y_test == 1)[0]
negative_indices = np.where(y_test == 0)[0]

# Sample up to 5 rows per class. The size is capped at the number of rows
# available because np.random.choice(..., replace=False) raises when asked
# for more samples than the population holds.
samples_per_class = 5
sample_positive = np.random.choice(
    positive_indices,
    min(samples_per_class, len(positive_indices)),
    replace=False,
)
sample_negative = np.random.choice(
    negative_indices,
    min(samples_per_class, len(negative_indices)),
    replace=False,
)

# Positives first, then negatives; later cells rely on this ordering.
sample_indices = np.concatenate([sample_positive, sample_negative])

print(f"Selected {len(sample_indices)} instances for LIME analysis:")
print(f"  - {len(sample_positive)} positive (Appeared)")
print(f"  - {len(sample_negative)} negative (Not Appeared)")

## 7. Create LIME Explainer

In [None]:
print("Creating LIME explainer...\n")

# Columns stored as text or pandas categoricals; all other columns are
# handed to LIME unchanged.
categorical_cols = list(X_test.select_dtypes(include=['object', 'category']).columns)
print(f"Categorical columns found: {categorical_cols}")

# LIME's tabular explainer works on a numeric matrix, so each categorical
# column is replaced by integer codes. The fitted encoders are kept so the
# codes can be turned back into the original labels before the model is called.
from sklearn.preprocessing import LabelEncoder

X_test_encoded = X_test.copy()
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    X_test_encoded[col] = le.fit_transform(X_test[col].astype(str))

print(f"Encoded {len(categorical_cols)} categorical columns")

# Positional indices of the categorical columns, the form LIME expects.
categorical_features_indices = list(map(X_test_encoded.columns.get_loc, categorical_cols))
print(f"Categorical feature indices: {categorical_features_indices}")

# Create wrapper function to decode for CatBoost predictions
def predict_fn_wrapper(X_encoded):
    """
    Predict class probabilities for label-encoded rows supplied by LIME.

    The categorical columns of ``X_encoded`` hold integer codes produced by
    the fitted ``label_encoders``. They are mapped back to the original
    labels before the rows are passed to ``best_model``.

    Args:
        X_encoded: 2-D numpy array whose columns follow ``feature_cols``,
            with categorical columns stored as label-encoder codes.

    Returns:
        The array returned by ``best_model.predict_proba`` for the decoded
        rows.
    """
    X_decoded = pd.DataFrame(X_encoded, columns=feature_cols)

    for col in categorical_cols:
        classes = label_encoders[col].classes_
        codes = X_decoded[col].astype(int).to_numpy()
        # Codes outside the encoder's range fall back to the first class.
        in_range = (codes >= 0) & (codes < len(classes))
        # One vectorized lookup per column: LIME scores thousands of
        # perturbed rows per explanation, so decoding element by element
        # through inverse_transform is needlessly slow.
        X_decoded[col] = classes[np.where(in_range, codes, 0)]

    return best_model.predict_proba(X_decoded)

# Background sample LIME uses to learn per-feature statistics.
# NOTE: it is drawn from the encoded *test* set (at most 5000 rows); this
# notebook does not load a training split.
train_sample_size = 5000
train_sample = X_test_encoded.sample(n=min(train_sample_size, len(X_test_encoded)), random_state=42)

# Map each categorical column index to its list of labels so explanations
# show the original category (e.g. "col=label") instead of an integer code.
categorical_names = {
    X_test_encoded.columns.get_loc(col): label_encoders[col].classes_.tolist()
    for col in categorical_cols
}

explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=train_sample.values,
    feature_names=feature_cols,
    categorical_features=categorical_features_indices,
    categorical_names=categorical_names,
    class_names=['Not Appeared', 'Appeared'],
    mode='classification',
    random_state=42
)

print("LIME explainer created successfully")

## 8. Generate LIME Explanations

In [None]:
print("Generating LIME explanations...\n")

# One record per sampled row, kept in the same order as sample_indices.
explanations = []

for idx, instance_idx in enumerate(sample_indices):
    true_label = y_test.iloc[instance_idx]

    # LIME receives the label-encoded row and scores its perturbations
    # through predict_fn_wrapper; the 10 strongest features are reported.
    exp = explainer.explain_instance(
        data_row=X_test_encoded.iloc[instance_idx].values,
        predict_fn=predict_fn_wrapper,
        num_features=10
    )

    # The model's own prediction is taken on the original (un-encoded) row.
    pred_proba = best_model.predict_proba(X_test.iloc[[instance_idx]])[0]
    pred_class = int(pred_proba[1] > 0.5)

    explanations.append(dict(
        index=instance_idx,
        true_label=true_label,
        predicted_label=pred_class,
        predicted_proba=pred_proba[1],
        explanation=exp,
    ))

    print(f"Instance {idx+1}/{len(sample_indices)}: True={true_label}, Pred={pred_class}, Proba={pred_proba[1]:.4f}")

print(f"\nGenerated {len(explanations)} LIME explanations")

## 9. Visualize LIME Explanations (Positive Examples)

In [None]:
# Plot up to three of the positive samples. They occupy the first
# len(sample_positive) slots of `explanations` because sample_indices was
# built with the positives first.
print("LIME Explanations for Positive Predictions (Appeared):\n")

n_positive_plots = min(3, len(sample_positive))

for i in range(n_positive_plots):
    exp_data = explanations[i]
    print(f"\nExample {i+1}:")
    print(f"  True Label: {exp_data['true_label']} (Appeared)")
    print(f"  Predicted: {exp_data['predicted_label']} (Probability: {exp_data['predicted_proba']:.4f})")

    # Render the explanation as a bar chart, retitle it, and save a copy.
    fig = exp_data['explanation'].as_pyplot_figure()
    fig.gca().set_title(f"LIME Explanation - Positive Example {i+1}")
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / f'lime_positive_{i+1}.png', dpi=300, bbox_inches='tight')
    plt.show()

print(f"\nSaved positive example plots to: {OUTPUT_DIR}")

## 10. Visualize LIME Explanations (Negative Examples)

In [None]:
# Plot up to three of the negative samples. They follow the positives in
# `explanations`, so their slots start at len(sample_positive).
print("LIME Explanations for Negative Predictions (Not Appeared):\n")

n_negative_plots = min(3, len(sample_negative))

for i in range(n_negative_plots):
    exp_idx = i + len(sample_positive)
    exp_data = explanations[exp_idx]
    print(f"\nExample {i+1}:")
    print(f"  True Label: {exp_data['true_label']} (Not Appeared)")
    print(f"  Predicted: {exp_data['predicted_label']} (Probability: {exp_data['predicted_proba']:.4f})")

    # Render the explanation as a bar chart, retitle it, and save a copy.
    fig = exp_data['explanation'].as_pyplot_figure()
    fig.gca().set_title(f"LIME Explanation - Negative Example {i+1}")
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / f'lime_negative_{i+1}.png', dpi=300, bbox_inches='tight')
    plt.show()

print(f"\nSaved negative example plots to: {OUTPUT_DIR}")

## 11. Extract Feature Importance from LIME

In [None]:
# Aggregate feature importance across all explanations.
#
# Features are identified by the column index from exp.as_map() rather than
# by parsing the text from exp.as_list(): that text can start with a number
# ("3.00 < feature <= 7.00") or carry a value ("feature=3"), so taking its
# first word does not reliably give the feature name.
feature_importance = {}

for exp_data in explanations:
    exp = exp_data['explanation']
    # Label 1 ('Appeared') is the label as_list() reports by default.
    for feature_idx, weight in exp.as_map()[1]:
        feature_name = feature_cols[int(feature_idx)]
        feature_importance.setdefault(feature_name, []).append(abs(weight))

# Mean absolute weight per feature, strongest first. Explicit columns keep
# the frame well-formed (and sortable) even when nothing was explained.
lime_importance_df = pd.DataFrame(
    [
        {'feature': feature, 'mean_abs_importance': np.mean(weights)}
        for feature, weights in feature_importance.items()
    ],
    columns=['feature', 'mean_abs_importance'],
).sort_values('mean_abs_importance', ascending=False)

print("\nTop 10 Features by LIME Importance:")
print(lime_importance_df.head(10).to_string(index=False))

# Save to CSV
lime_importance_df.to_csv(OUTPUT_DIR / 'lime_feature_importance.csv', index=False)
print(f"\nSaved LIME feature importance to: {OUTPUT_DIR / 'lime_feature_importance.csv'}")

## 12. Compare LIME with SHAP Importance

In [None]:
# SHAP results are looked up next to the model folder, under
# outputs/explainability/shap; the comparison is skipped if they are absent.
shap_importance_path = MODEL_DIR.parent / 'outputs' / 'explainability' / 'shap' / 'shap_feature_importance.csv'


def _scale_to_top(scores):
    """Rescale a score column so that its largest value becomes 100."""
    return scores / scores.max() * 100


if not shap_importance_path.exists():
    print("\nSHAP results not found. Run SHAP notebook first for comparison.")
else:
    shap_importance_df = pd.read_csv(shap_importance_path)

    # Put both methods on the same 0-100 scale (relative to each one's top feature).
    lime_importance_df['importance_normalized'] = _scale_to_top(lime_importance_df['mean_abs_importance'])
    shap_importance_df['importance_normalized'] = _scale_to_top(shap_importance_df['mean_abs_shap'])

    # Outer join keeps features seen by only one method; their missing score becomes 0.
    merged_scores = lime_importance_df.merge(
        shap_importance_df[['feature', 'importance_normalized']],
        on='feature',
        suffixes=('_lime', '_shap'),
        how='outer',
    )
    comparison_df = merged_scores.fillna(0).sort_values('importance_normalized_lime', ascending=False)

    print("\nComparison: LIME vs SHAP (Top 10):")
    print(comparison_df[['feature', 'importance_normalized_lime', 'importance_normalized_shap']].head(10).to_string(index=False))

    # Save comparison
    comparison_df.to_csv(OUTPUT_DIR / 'lime_shap_comparison.csv', index=False)

    # Side-by-side horizontal bars for the ten features ranked highest by LIME.
    top_10 = comparison_df.head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(top_10))
    width = 0.35

    bar_specs = (
        (-width / 2, 'importance_normalized_lime', 'LIME', 'coral'),
        (width / 2, 'importance_normalized_shap', 'SHAP', 'steelblue'),
    )
    for offset, score_column, method_label, bar_color in bar_specs:
        ax.barh(x + offset, top_10[score_column], width, label=method_label, color=bar_color, alpha=0.8)

    ax.set_yticks(x)
    ax.set_yticklabels(top_10['feature'])
    ax.set_xlabel('Normalized Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title('Feature Importance: LIME vs SHAP (Top 10)', fontweight='bold')
    ax.legend()
    ax.invert_yaxis()  # highest-ranked feature at the top
    ax.grid(axis='x', alpha=0.3)

    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / 'lime_shap_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\nSaved comparison plot to: {OUTPUT_DIR / 'lime_shap_comparison.png'}")

## 13. Generate LIME Analysis Report

In [None]:
# Generate summary report.
#
# The list of outputs is built from what this run can have produced instead
# of a fixed list: at most three plots are written per class (fewer when
# fewer instances were sampled), and the SHAP comparison files only exist
# when SHAP results were available.
candidate_outputs = (
    [f'lime_positive_{k}.png' for k in range(1, min(3, len(sample_positive)) + 1)]
    + [f'lime_negative_{k}.png' for k in range(1, min(3, len(sample_negative)) + 1)]
    + [
        'lime_feature_importance.csv',
        'lime_shap_comparison.csv',
        'lime_shap_comparison.png',
    ]
)
# Keep only files that are really on disk. Files left over from an earlier
# run in the same folder will still be listed.
outputs_generated = [name for name in candidate_outputs if (OUTPUT_DIR / name).exists()]
# The report itself is written just below, so it is always included.
outputs_generated.append('lime_analysis_report.json')

report = {
    'analysis_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_file': 'best_model.cbm',
    'instances_explained': len(explanations),
    'positive_examples': len(sample_positive),
    'negative_examples': len(sample_negative),
    'top_5_features_lime': lime_importance_df.head(5)['feature'].tolist(),
    'outputs_generated': outputs_generated,
}

# Save report
with open(OUTPUT_DIR / 'lime_analysis_report.json', 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print("\n" + "="*60)
print("LIME ANALYSIS COMPLETE")
print("="*60)
print(f"\nAnalysis Date: {report['analysis_date']}")
print(f"Instances Explained: {report['instances_explained']}")
print(f"\nTop 5 Features (LIME):")
for i, feature in enumerate(report['top_5_features_lime'], 1):
    print(f"  {i}. {feature}")
print(f"\nOutputs saved to: {OUTPUT_DIR}")
print(f"Total files generated: {len(report['outputs_generated'])}")
print(f"\nReport saved to: {OUTPUT_DIR / 'lime_analysis_report.json'}")

## 14. Key Insights

In [None]:
# Closing summary. NOTE: this text is static; none of it is computed from
# the explanations or importance tables produced above.
banner = "=" * 60

summary_lines = [
    "\n" + banner,
    "KEY INSIGHTS FROM LIME ANALYSIS",
    banner,
    "\n1. LOCAL EXPLANATIONS:",
    "   LIME provides instance-specific explanations showing",
    "   which features contributed to individual predictions.",
    "\n2. FEATURE CONSISTENCY:",
    "   Top features align with SHAP analysis (temporal and frequency).",
    "   This validates the model's learning across different methods.",
    "\n3. PREDICTION DRIVERS:",
    "   - Positive predictions: Low current_gap, high appearance_rate",
    "   - Negative predictions: High current_gap, low appearance_rate",
    "\n4. MODEL INTERPRETABILITY:",
    "   LIME confirms the model makes intuitive decisions based",
    "   on temporal patterns and historical frequency.",
    "\n" + banner,
]

for summary_line in summary_lines:
    print(summary_line)