<a href="https://colab.research.google.com/github/Saisathwika-08/Explainable-AI/blob/main/Lab_Assignment_2_Explainabke_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# ==============================================
# Olympics Medal Prediction + SHAP (Safe Version)
# ==============================================
# - 80/20 split
# - RandomForestClassifier
# - Metrics: Accuracy, Precision, Recall, F1, ROC-AUC
# - SHAP: summary plot only (50 samples, fast)
# ==============================================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Paths

CSV_PATH = "/content/dataset_olympics.csv"
OUTPUT_DIR = "."
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load and preprocess

df = pd.read_csv(CSV_PATH)

# Draw a reproducible 500-row sample (or copy everything when the dataset
# has 500 rows or fewer).
if len(df) > 500:
    df_small = df.sample(n=500, random_state=42)
else:
    df_small = df.copy()

# Save the reduced dataset under OUTPUT_DIR, like the SHAP plot written
# further down, instead of relying on the current working directory.
reduced_path = os.path.join(OUTPUT_DIR, "dataset_olympics_reduced.csv")
df_small.to_csv(reduced_path, index=False)

print("Original rows:", len(df))
print("Reduced rows:", len(df_small))

# NOTE(review): df_small is only exported above; the split, the model and the
# SHAP analysis below all keep using the full `df`. If the 500-row sample was
# meant to be the training data, continue from `df_small` here instead.
# Confirm the intent before changing it, since the reported metrics depend on it.

# Drop ID / Name when present; errors="ignore" skips whichever is absent.
# (Presumably these are per-row identifiers with no predictive value.)
df = df.drop(columns=["ID", "Name"], errors="ignore")

# Target: Medal (a missing value is treated as its own "No Medal" class)
df["Medal"] = df["Medal"].fillna("No Medal")
X = df.drop(columns=["Medal"])
y = df["Medal"]

# Split categorical/numeric
# Partition the feature columns by dtype: numeric ones vs. everything else.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
numeric_lookup = set(num_cols)
cat_cols = [col for col in X.columns if col not in numeric_lookup]

# Numeric branch: fill gaps with the column median.
numeric_branch = SimpleImputer(strategy="median")

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(transformers=[
    ("num", numeric_branch, num_cols),
    ("cat", categorical_branch, cat_cols),
])


# 80/20 train/test split, stratified on the target with a fixed seed

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# Model: preprocessing followed by a random forest, fitted on the training part

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[("preprocess", preprocess), ("model", rf)])
pipe.fit(X_train, y_train)

# Metrics

# Hard predictions for the label-based scores, class probabilities for ROC AUC.
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)

# Precision / recall / F1 share the same averaging options.
weighted_opts = {"average": "weighted", "zero_division": 0}

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted_opts)
rec = recall_score(y_test, y_pred, **weighted_opts)
f1 = f1_score(y_test, y_pred, **weighted_opts)
roc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")

# Emit the whole report in one call; one line per metric, four decimals each.
report_lines = [
    "=== Classification Metrics ===",
    f"Accuracy : {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall   : {rec:.4f}",
    f"F1-score : {f1:.4f}",
    f"ROC AUC  : {roc:.4f}",
]
print("\n".join(report_lines))


# Feature names

def get_feature_names(ct, features):
    """Collect the output feature names of a fitted column transformer.

    Walks ``ct.transformers_`` (``(name, transformer, columns)`` triples) in
    order and concatenates the names each entry contributes, so the result
    lines up with the columns of the transformed matrix.

    Args:
        ct: Fitted object exposing a ``transformers_`` list of triples.
        features: Input feature names. Currently unused; kept so existing
            callers do not have to change.

    Returns:
        A list of ``str`` feature names.
    """
    names = []
    for _name, trans, cols in ct.transformers_:
        # Dropped columns never reach the output matrix, so they must not
        # contribute names (otherwise names and matrix columns misalign).
        if isinstance(trans, str) and trans == "drop":
            continue

        # An empty column selection contributes nothing; asking such a
        # transformer for names may fail if it was never actually fitted.
        try:
            if len(cols) == 0:
                continue
        except TypeError:
            # Selections without a length (e.g. a slice) are left as-is.
            pass

        if isinstance(trans, str):
            # Remaining string marker ("passthrough"): names are unchanged.
            out = cols
        elif hasattr(trans, "get_feature_names_out"):
            out = trans.get_feature_names_out(cols)
        elif isinstance(trans, Pipeline):
            # Pipeline without its own name support: ask its final step.
            last = trans.steps[-1][1]
            if hasattr(last, "get_feature_names_out"):
                out = last.get_feature_names_out(cols)
            else:
                out = cols
        else:
            out = cols
        names.extend(str(x) for x in out)
    return names

feature_names = get_feature_names(pipe.named_steps["preprocess"], X.columns)

# SHAP (tiny sample, 50 rows)

X_test_trans = pipe.named_steps["preprocess"].transform(X_test)

# The transformed matrix may be sparse (it then offers .toarray()) or already
# dense. Test for the capability explicitly rather than wrapping the call in a
# bare `except:`, which would hide real failures such as MemoryError and fall
# back to np.asarray on a sparse matrix, which does not yield a usable array.
if hasattr(X_test_trans, "toarray"):
    X_test_np = X_test_trans.toarray()
else:
    X_test_np = np.asarray(X_test_trans)

X_shap = X_test_np[:50]  # only 50 rows → very fast
# The same 50 rows are passed to the explainer and then explained.
explainer = shap.Explainer(rf, X_shap)
shap_values = explainer(X_shap)

# Save summary plot
summary_path = os.path.join(OUTPUT_DIR, "shap_summary_fast.png")

plt.figure()
shap.summary_plot(
    shap_values,
    features=X_shap,
    feature_names=feature_names,
    show=False,  # do not display; the figure is written to disk below
)
plt.tight_layout()
plt.savefig(summary_path, dpi=150, bbox_inches="tight")
plt.close()

print(f"\nSaved SHAP summary plot to: {summary_path}")

# Top 5 SHAP features

# Absolute SHAP values averaged over axis 0, then over axis 1 of what remains.
# NOTE(review): presumably the values are (samples, features, classes), which
# leaves one score per feature; confirm against the installed shap version.
abs_shap = np.abs(shap_values.values)
per_feature = abs_shap.mean(axis=0)
abs_mean_shap = per_feature.mean(axis=1)

shap_importance = pd.Series(abs_mean_shap, index=feature_names)
shap_importance = shap_importance.sort_values(ascending=False)

print("\n=== Top 5 Features by SHAP Importance ===")
print(shap_importance.head(5))

Original rows: 70000
Reduced rows: 500
=== Classification Metrics ===
Accuracy : 0.9061
Precision: 0.9004
Recall   : 0.9061
F1-score : 0.8874
ROC AUC  : 0.8545





Saved SHAP summary plot to: ./shap_summary_fast.png

=== Top 5 Features by SHAP Importance ===
Year               0.009213
Height             0.007832
Weight             0.007613
Sport_Hockey       0.007284
Sport_Athletics    0.006516
dtype: float64
