<a href="https://colab.research.google.com/github/Sameera326/EXPLAINABLE-AI-Assignment/blob/main/EX_AI_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Imports and Data Loading
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
df = pd.read_csv('credit_card_default.csv')

# Data Cleaning: remove exact duplicate rows, then every row with a missing value.
df = df.drop_duplicates()
df = df.dropna()

# Categorical Encoding (if needed)
# Every object-dtype column is replaced by the integer codes of its own LabelEncoder.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature & Target Selection
# NOTE(review): every non-target column is used as a feature; if the CSV carries
# a row-identifier column it will be treated as a predictor -- confirm.
X = df.drop('default.payment.next.month', axis=1)
y = df['default.payment.next.month']

# Train/Test Split
# Split the raw features BEFORE scaling so the scaler never sees test rows.
# The partition depends only on the row count and random_state, so it is the
# same partition the previous scale-then-split order produced.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalization
# Fit mean/std on the training rows only, then apply them to the test rows;
# fitting on the full matrix would leak test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics, kept so that any
# later code referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# 2. Model Training
# Random forest with a fixed seed so repeated runs give the same model.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
# Hard labels feed the threshold-based metrics; the probability of the second
# entry in clf.classes_ (the positive class for a 0/1 target) feeds ROC AUC.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
# ROC AUC ranks samples by score; passing hard 0/1 labels collapses the curve
# to a single operating point and misreports the model's ranking quality.
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# 3. SHAP Analysis
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Select the SHAP values for class index 1 regardless of the return layout:
# older SHAP releases return a list with one (n_samples, n_features) array per
# class, newer releases return one (n_samples, n_features, n_classes) array.
# Plain shap_values[1] on the newer layout would pick sample 1, not class 1.
if isinstance(shap_values, list):
    class1_shap = shap_values[1]
else:
    class1_shap = shap_values[:, :, 1]

class1_base_value = explainer.expected_value[1]
feature_names = list(X.columns)

# Summary Plot (all test samples)
shap.summary_plot(class1_shap, X_test, feature_names=feature_names)

# Force Plot (first sample)
# The matplotlib renderer draws exactly one explanation, so pass row 0 only.
shap.force_plot(class1_base_value, class1_shap[0], X_test[0], feature_names=feature_names, matplotlib=True)

# Waterfall Plot (first sample)
# A waterfall plot takes a single-row Explanation: 1-D values and a scalar base value.
shap.waterfall_plot(shap.Explanation(values=class1_shap[0], base_values=class1_base_value, data=X_test[0], feature_names=feature_names))

# 4. Feature Importance Comparison
# Impurity-based importances reported by the fitted forest.
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'RandomForestImportance': importances})

# Class-1 SHAP matrix of shape (n_samples, n_features), whichever layout the
# installed SHAP version returned (list per class, or one 3-D array).
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
else:
    positive_class_shap = np.asarray(shap_values)[:, :, 1]

# Global SHAP importance is the mean of the ABSOLUTE values per feature.
# Taking the absolute value of the mean lets positive and negative
# contributions cancel, under-ranking features that push both ways.
shap_feature_importance = pd.DataFrame({'Feature': X.columns, 'SHAPValue': np.mean(np.abs(positive_class_shap), axis=0)})

# Join both rankings on the feature name and order by SHAP importance.
merged_importance = feature_importance_df.merge(shap_feature_importance, on='Feature').sort_values('SHAPValue', ascending=False)
print("Top 5 features by SHAP:\n", merged_importance.head(5))
