<a href="https://colab.research.google.com/github/Shivamani162/EAI_lab/blob/main/Assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dice-ml
!pip install xgboost
!pip install shap

Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: raiutils, dice-ml
Successfully installed dice-ml-0.12 raiutils-0.4.2


In [2]:

# -----------------------------
# Step 1: Imports
# -----------------------------
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from numpy.linalg import norm

# DiCE
import dice_ml
from dice_ml import Dice

# -----------------------------
# Step 2: Load dataset
# -----------------------------
raw_path = "/content/loan_approval_dataset.csv"
df_raw = pd.read_csv(raw_path)
print("Original shape:", df_raw.shape)
print(df_raw.columns.tolist())
# display(df_raw.head()) # Not supported in all environments, will comment out

# -----------------------------
# Step 3: Clean names & target
# -----------------------------
df_raw.columns = df_raw.columns.str.lower().str.strip()
possible_targets = [c for c in df_raw.columns if ('loan' in c and 'status' in c)]
if not possible_targets:
    fallback = [c for c in df_raw.columns if c in ['target', 'status', 'label', 'approved']]
    possible_targets = fallback
if not possible_targets:
    raise ValueError("Could not find target column automatically.")
target_col = possible_targets[0]
print("Using target column:", target_col)
id_cols = [c for c in df_raw.columns if 'id' in c]
if id_cols:
    df_raw = df_raw.drop(columns=id_cols)

# -----------------------------
# Step 4: Missing values
# -----------------------------
for c in df_raw.columns:
    if c != target_col:
        df_raw[c] = pd.to_numeric(df_raw[c], errors='ignore')
feature_cols = [c for c in df_raw.columns if c != target_col]
num_cols = df_raw[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]
df_imputed = df_raw.copy()
for c in num_cols:
    df_imputed[c] = df_imputed[c].fillna(df_imputed[c].median())
for c in cat_cols:
    mode_val = df_imputed[c].mode(dropna=True)
    df_imputed[c] = df_imputed[c].fillna(mode_val[0] if not mode_val.empty else "missing")

# -----------------------------
# Step 5: Encode target
# -----------------------------
le_target = LabelEncoder()
y_raw = df_imputed[target_col].astype(str)
y_encoded = le_target.fit_transform(y_raw)
print("Target classes mapping:", dict(zip(le_target.classes_, range(len(le_target.classes_)))))
df_dice = pd.concat([df_imputed[feature_cols].reset_index(drop=True),
                      pd.Series(y_encoded, name=target_col)], axis=1)
continuous_features = num_cols.copy()

# -----------------------------
# Step 6: Preprocessing pipeline
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

# -----------------------------
# Step 7: Train-test split
# -----------------------------
X = df_imputed[feature_cols].copy()
y = y_encoded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# -----------------------------
# Step 8: Train classifiers
# -----------------------------
pipe_lr = Pipeline(steps=[('preprocessor', preprocessor),
                            ('clf', LogisticRegression(max_iter=2000, random_state=42))])
pipe_rf = Pipeline(steps=[('preprocessor', preprocessor),
                            ('clf', RandomForestClassifier(n_estimators=200, random_state=42))])
pipe_lr.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train)
models = {'Logistic Regression': pipe_lr, 'Random Forest': pipe_rf}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Performance:")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Precision:", round(precision_score(y_test, y_pred, zero_division=0), 4))
    print("Recall:", round(recall_score(y_test, y_pred, zero_division=0), 4))
    print("F1-score:", round(f1_score(y_test, y_pred, zero_division=0), 4))
    print(classification_report(y_test, y_pred, zero_division=0))
bb_pipeline = pipe_rf  # final model for DiCE

# -----------------------------
# Step 9: Wrap data + model
# -----------------------------
d = dice_ml.Data(dataframe=df_dice,
                  continuous_features=continuous_features,
                  outcome_name=target_col)
m = dice_ml.Model(model=bb_pipeline, backend="sklearn", model_type='classifier')
exp = Dice(d, m, method="random")

# -----------------------------
# Step 10: Pick rejected instance
# -----------------------------
neg_label = 0  # usually "Rejected"
query_instance = None
for idx in X_test.index:
    pred = bb_pipeline.predict(X_test.loc[[idx]])[0]
    if pred == neg_label:
        query_instance = X_test.loc[[idx]].reset_index(drop=True)
        chosen_index = idx
        break
print("\nSelected instance (Rejected):")
# display(query_instance)
print(query_instance)
print("Predicted label:", neg_label, "==>", le_target.inverse_transform([neg_label])[0])

# -----------------------------
# Step 11: Generate CFs → Approved
# -----------------------------
pos_label = 1 if neg_label == 0 else 0
dice_exp = exp.generate_counterfactuals(query_instance,
                                        total_CFs=3,
                                        desired_class=pos_label,
                                        features_to_vary="all")
cf_df = dice_exp.cf_examples_list[0].final_cfs_df.reset_index(drop=True)
print("\nCounterfactuals generated:")
# display(cf_df)
print(cf_df)

# -----------------------------
# Step 12: Show BEFORE vs AFTER
# -----------------------------
orig_pred = bb_pipeline.predict(query_instance)[0]
cf_preds = bb_pipeline.predict(cf_df[feature_cols])
print("\n=== Loan Decision Status ===")
print("Original Instance:", le_target.inverse_transform([orig_pred])[0], "(Rejected)")
for i, p in enumerate(cf_preds):
    print(f"CF_{i+1}:", le_target.inverse_transform([p])[0], "(Approved)")

# Build comparison table
compare_table = pd.concat([
    query_instance.assign(example="Original (Rejected)"),
    cf_df[feature_cols].assign(example=[f"CF_{i+1} (Approved)" for i in range(len(cf_df))])
], ignore_index=True)
cols = ['example'] + [c for c in compare_table.columns if c != 'example']
compare_table = compare_table[cols]
# display(compare_table)
print("\nComparison Table:")
print(compare_table)

# -----------------------------
# Step 12b: Compute Euclidean and Manhattan distance
# -----------------------------
# We need to apply the same preprocessor to the original and counterfactual instances
X_orig_scaled = preprocessor.transform(query_instance)
X_cf_scaled = preprocessor.transform(cf_df[feature_cols])

# Compute Euclidean distance (L2 norm) for each counterfactual
euclidean_distances = [norm(X_cf_scaled[i] - X_orig_scaled[0]) for i in range(len(cf_df))]
cf_df['euclidean_distance'] = euclidean_distances

# Compute Manhattan distance (L1 norm) for each counterfactual
manhattan_distances = [norm(X_cf_scaled[i] - X_orig_scaled[0], ord=1) for i in range(len(cf_df))]
cf_df['manhattan_distance'] = manhattan_distances

print("\nCounterfactuals with Euclidean and Manhattan distances:")
# display(cf_df[['loan_status', 'euclidean_distance', 'manhattan_distance'] + feature_cols])
print(cf_df[['loan_status', 'euclidean_distance', 'manhattan_distance'] + feature_cols])

# -----------------------------
# Step 13: Reflections
# -----------------------------
print("\n--- REFLECTIONS ---")
print("✔ Original instance was REJECTED.")
print("✔ Counterfactuals flipped decision to APPROVED with minimal changes.")
print("✔ This shows how small, actionable changes (like income, loan amount, credit history) can alter outcomes.")
print("✔ Counterfactuals increase trust by answering 'what-if' questions for end-users.")

Original shape: (4269, 13)
['loan_id', ' no_of_dependents', ' education', ' self_employed', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value', ' loan_status']
Using target column: loan_status
Target classes mapping: {' Approved': 0, ' Rejected': 1}

Logistic Regression Performance:
Accuracy: 0.9239
Precision: 0.9188
Recall: 0.8762
F1-score: 0.897
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       531
           1       0.92      0.88      0.90       323

    accuracy                           0.92       854
   macro avg       0.92      0.91      0.92       854
weighted avg       0.92      0.92      0.92       854


Random Forest Performance:
Accuracy: 0.9789
Precision: 0.9841
Recall: 0.9598
F1-score: 0.9718
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       531
      

100%|██████████| 1/1 [00:00<00:00,  2.82it/s]


Counterfactuals generated:
   no_of_dependents  education self_employed  income_annum  loan_amount  \
0                 3   Graduate            No       8000000     26200000   
1                 3   Graduate            No       8000000     26200000   
2                 3   Graduate            No       8000000     26200000   

   loan_term  cibil_score  commercial_assets_value  luxury_assets_value  \
0         16          400                  1930064             25000000   
1         16          454                  4300000             25000000   
2         16          514                  4300000             25000000   

   bank_asset_value  loan_status  
0           4000000            1  
1           4000000            1  
2           4000000            1  

=== Loan Decision Status ===
Original Instance:  Approved (Rejected)
CF_1:  Rejected (Approved)
CF_2:  Rejected (Approved)
CF_3:  Rejected (Approved)

Comparison Table:
               example  no_of_dependents  education self_emp


