In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import lime
import lime.lime_tabular

# Seed for reproducibility: passed as `random_state` to the train/test split,
# to every estimator built below, and to the LIME explainer.
RANDOM_STATE = 42

def load_and_split_data():
    """
    Fetch the Iris data and carve it into train and test partitions.

    30% of the samples are held out for testing. The split is seeded with
    RANDOM_STATE and stratified on the class labels.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, feature_names, class_names)
    """
    dataset = load_iris()

    # Stratifying on the target keeps the class proportions the same in
    # both partitions.
    partitions = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.3,
        random_state=RANDOM_STATE,
        stratify=dataset.target,
    )
    features_train, features_test, labels_train, labels_test = partitions

    return (
        features_train,
        features_test,
        labels_train,
        labels_test,
        dataset.feature_names,
        dataset.target_names,
    )

# --- A1. Implement Stacking Classifier ---
def create_stacking_classifier(base_models, final_estimator=None):
    """
    Creates a StackingClassifier (Task A1).

    Args:
        base_models (list): List of (name, estimator) tuples for base models.
        final_estimator (estimator, optional): The meta-model. When omitted,
            a LogisticRegression seeded with RANDOM_STATE is used.

    Returns:
        StackingClassifier: The configured (unfitted) stacking classifier.
    """
    if final_estimator is None:
        # `multi_class` is deliberately not passed: 'auto' is already the
        # default, and the parameter is deprecated since scikit-learn 1.5,
        # so spelling it out only triggers a FutureWarning (and an error once
        # the parameter is removed).
        final_estimator = LogisticRegression(solver='lbfgs', random_state=RANDOM_STATE)

    return StackingClassifier(
        estimators=base_models,
        final_estimator=final_estimator,
        cv=5,  # 5-fold cross-validated base predictions train the meta-model
        n_jobs=-1,  # Use all available cores
    )

# --- A2. Implement Pipeline ---
def create_pipeline(classifier):
    """
    Wrap a classifier in a two-step Pipeline: scaling, then prediction (Task A2).

    Args:
        classifier (estimator): The model placed at the end of the pipeline
            (e.g., a StackingClassifier).

    Returns:
        Pipeline: An unfitted scikit-learn Pipeline with the steps
            'preprocessor' (StandardScaler) and 'classifier'.
    """
    stages = [
        ('preprocessor', StandardScaler()),  # standardise the features first
        ('classifier', classifier),  # then hand them to the model
    ]
    return Pipeline(steps=stages)

# --- A3. Using LIME explainer, explain the outcomes of pipeline ---
def explain_pipeline_outcome_with_lime(pipeline, X_train, X_test, feature_names, class_names, instance_idx=0):
    """
    Explain one prediction of a fitted Pipeline with LIME (Task A3).

    Args:
        pipeline (Pipeline): The fitted scikit-learn Pipeline.
        X_train (np.array): Un-scaled training features given to LIME as its
            reference data.
        X_test (np.array): Testing features.
        feature_names (list): Names of the features.
        class_names (list): Names of the target classes.
        instance_idx (int): Row of X_test to explain. Defaults to 0.

    Returns:
        tuple: (instance_to_explain, LIME_explanation)
    """
    # The explainer is given raw (un-scaled) data on purpose: the pipeline's
    # own predict_proba runs the StandardScaler step before classifying, so
    # LIME's perturbed samples must live in the original feature space.
    explainer_options = dict(
        training_data=X_train,
        feature_names=feature_names,
        class_names=class_names,
        mode='classification',
        random_state=RANDOM_STATE,
    )
    lime_explainer = lime.lime_tabular.LimeTabularExplainer(**explainer_options)

    # Pick the requested row of the test set.
    row = X_test[instance_idx]

    # Request weights for every feature, but only for the single most
    # probable class according to the pipeline.
    lime_result = lime_explainer.explain_instance(
        data_row=row,
        predict_fn=pipeline.predict_proba,
        num_features=len(feature_names),
        top_labels=1,
    )

    return row, lime_result

# --- Main Program Execution ---
if __name__ == '__main__':
    # End-to-end demo: load data, build and fit the stacking pipeline,
    # report test metrics, then print a LIME explanation for one test row.

    # 1. Load Data
    X_train, X_test, y_train, y_test, feature_names, class_names = load_and_split_data()
    print("🤖 Data Loaded and Split.")
    print("-" * 50)

    # Base models for the StackingClassifier (Task A1).
    # SVC needs probability=True so that it exposes predict_proba.
    base_models = [
        ('dt', DecisionTreeClassifier(random_state=RANDOM_STATE)),
        ('svc', SVC(kernel='linear', probability=True, random_state=RANDOM_STATE)),
    ]

    # Meta-model (Task A1) - a simple Logistic Regression.
    # `multi_class` is not passed: 'auto' is the default and the parameter is
    # deprecated since scikit-learn 1.5.
    meta_model = LogisticRegression(solver='lbfgs', random_state=RANDOM_STATE)

    # 2. Create Stacking Classifier (Task A1)
    stacking_clf = create_stacking_classifier(base_models, final_estimator=meta_model)
    print("✅ Stacking Classifier created.")

    # 3. Create and Fit Pipeline (Task A2)
    full_pipeline = create_pipeline(stacking_clf)
    print("✅ Pipeline created: StandardScaler -> StackingClassifier")

    print("⚙️ Training the Pipeline...")
    full_pipeline.fit(X_train, y_train)
    print("Training complete.")
    print("-" * 50)

    # 4. Evaluate the Pipeline on the held-out test set
    y_pred = full_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("📊 Pipeline Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))
    print("-" * 50)

    # 5. LIME Explanation (Task A3)
    instance_idx_to_explain = 5  # Choosing an arbitrary instance from the test set
    instance, explanation = explain_pipeline_outcome_with_lime(
        pipeline=full_pipeline,
        X_train=X_train,
        X_test=X_test,
        feature_names=feature_names,
        class_names=class_names,
        instance_idx=instance_idx_to_explain,
    )

    print(f"🔬 LIME Explanation for Test Instance Index: {instance_idx_to_explain}")
    print(f"Instance Features: {instance}")
    predicted_class = full_pipeline.predict([instance])[0]
    predicted_class_name = class_names[predicted_class]
    print(f"Pipeline Predicted Class: {predicted_class_name}")

    # Get the explanation for the predicted class
    top_label_exp = explanation.as_list(label=predicted_class)

    # The count in the heading comes from the explanation itself, so it stays
    # correct if the dataset or `num_features` changes.
    print(f"\nTop {len(top_label_exp)} Features Contributing to Prediction ({predicted_class_name}):")
    # Print the explanation (feature contributions)
    for feature, weight in top_label_exp:
        print(f"   - {feature}: {weight:.4f}")

    # Optional: Display the explanation plot (requires an environment that can render plots/HTML, like Jupyter/Colab)
    # explanation.show_in_notebook(show_table=True)
    # explanation.save_to_file('lime_explanation.html')

🤖 Data Loaded and Split.
--------------------------------------------------
✅ Stacking Classifier created.
✅ Pipeline created: StandardScaler -> StackingClassifier
⚙️ Training the Pipeline...
Training complete.
--------------------------------------------------
📊 Pipeline Evaluation:
Accuracy: 0.9333

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.88      0.93      0.90        15
   virginica       0.93      0.87      0.90        15

    accuracy                           0.93        45
   macro avg       0.93      0.93      0.93        45
weighted avg       0.93      0.93      0.93        45

--------------------------------------------------
🔬 LIME Explanation for Test Instance Index: 5
Instance Features: [6.7 3.1 5.6 2.4]
Pipeline Predicted Class: virginica

Top 4 Features Contributing to Prediction (virginica):
   - petal width (cm) > 1.90: 0.3882
   - petal length (cm) > 5.10:

