In [3]:
pip install shap



In [1]:

# ========================== IMPORTS & LOAD DATA ==========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import shap
import lime
import lime.lime_tabular

# ========================== LOAD & PREPROCESS DATA ==========================
# Mount Google Drive so the workbook stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset and drop the 'id' column (presumably a row identifier with
# no predictive meaning -- confirm against the data dictionary).
# Rebinding instead of `inplace=True` keeps this step safe to re-run.
df = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Cancer_Data.xlsx")
df = df.drop(columns=['id'])

# Encode the target label. LabelEncoder assigns integers in sorted label order,
# so if the column holds only 'B' and 'M' the mapping is B=0, M=1.
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

# Feature matrix and target vector.
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Hold out 30% for testing. `stratify=y` keeps the class proportions the same
# in both partitions, so the reported accuracies are not skewed by an
# unlucky class mix in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# ========================== A2: RANDOMIZED SEARCH CV ==========================
# RandomForestClassifier is already imported with the other estimators at the
# top of the notebook, so it is not re-imported here.

# Candidate hyper-parameter values; the search samples combinations from these.
param_dist = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Try 10 sampled combinations, each scored with 5-fold cross-validation on the
# training split only. Both the forest and the sampler are seeded for
# repeatable results; n_jobs=-1 uses all available cores.
rfc = RandomForestClassifier(random_state=42)
rscv = RandomizedSearchCV(
    rfc,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
rscv.fit(X_train, y_train)

# `rscv.score` evaluates the estimator refit with the best parameters.
print("Best Parameters from RandomizedSearchCV:", rscv.best_params_)
print("Training Accuracy:", rscv.score(X_train, y_train))
print("Testing Accuracy:", rscv.score(X_test, y_test))

# ========================== A3: COMPARING MULTIPLE CLASSIFIERS ==========================
# Every estimator with a stochastic component is seeded so the comparison table
# is reproducible across runs. Models that are sensitive to feature scale
# (Perceptron, Logistic Regression, SVM, MLP) are wrapped in a pipeline that
# standardizes the features; the scaler is fitted on the training split only,
# inside `fit`, so no test-set statistics leak into training.
# "Random Forest" is deliberately left as a bare estimator because later
# sections pass it directly to SHAP and LIME.
# `use_label_encoder` is no longer passed to XGBClassifier: the target is
# already integer-encoded and recent XGBoost releases dropped that argument.
models = {
    "Perceptron": make_pipeline(StandardScaler(), Perceptron(random_state=42)),
    "Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000)),
    "SVM": make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=42),
    "Naive Bayes": GaussianNB(),
    "MLP": make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000, random_state=42))
}

# One record per model; turned into a DataFrame once after the loop.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    train_acc = model.score(X_train, y_train)
    # Predict on the test split once and reuse the predictions for both
    # accuracy and the weighted F1 score.
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    f1 = classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']
    results.append({
        'Model': name,
        'Train Accuracy': round(train_acc, 4),
        'Test Accuracy': round(test_acc, 4),
        'F1 Score': round(f1, 4)
    })

results_df = pd.DataFrame(results)
print("\n=== Model Comparison Table ===")
print(results_df)

# ========================== O1: SHAP EXPLAINABILITY ==========================
# Note: SHAP may take time with full dataset. Limit for demo.
# Slice the rows to explain once so the SHAP values and the feature frame
# passed to the plot are guaranteed to refer to the same rows.
X_explain = X_test.iloc[:100]

# The training split is supplied as the background data for the explainer.
explainer = shap.Explainer(models["Random Forest"], X_train)
shap_values = explainer(X_explain)

# For a classifier the explainer can return one set of SHAP values per class,
# i.e. values shaped (rows, features, classes). The bar summary expects a 2-D
# (rows, features) array, so keep only the positive class (index 1) when a
# class axis is present.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

shap.summary_plot(shap_values, X_explain, plot_type="bar")

# ========================== O2: LIME EXPLAINABILITY ==========================
# LIME perturbs the instance at random, so the explainer is seeded to make the
# explanation repeatable. Feature names are passed as a plain list.
# NOTE(review): class_names assumes encoded label 0 is benign and 1 is
# malignant -- confirm against the label encoding used for `diagnosis`.
explainer_lime = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(),
    feature_names=X.columns.tolist(),
    class_names=["Benign", "Malignant"],
    mode="classification",
    random_state=42
)


def _rf_predict_proba(rows):
    """Return Random Forest class probabilities for LIME's perturbed samples.

    LIME hands over a bare NumPy array, while the forest was fitted on a
    DataFrame. Re-attaching the column names keeps the input consistent with
    what the model saw during fitting and avoids scikit-learn's
    feature-name mismatch warning.
    """
    return models["Random Forest"].predict_proba(pd.DataFrame(rows, columns=X.columns))


i = 10  # Positional index of the test-set row to explain
exp = explainer_lime.explain_instance(X_test.iloc[i].to_numpy(), _rf_predict_proba, num_features=10)
exp.show_in_notebook(show_all=False)

# ========================== CONFUSION MATRIX FOR BEST MODEL ==========================
# The Random Forest is hard-coded as the "best" model here; it is not selected
# from the comparison table.
best_model = models["Random Forest"]
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Draw the counts as an annotated heatmap on an explicit Axes object.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


ModuleNotFoundError: No module named 'lime'