<a href="https://colab.research.google.com/github/Sanarazaaa/Deep-Learning-Classification-of-Parkinson-s-Dataset-with-SHAP-Interpretability/blob/main/Deep_Learning_Classification_of_Parkinson%E2%80%99s_Dataset_with_SHAP_Interpretability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import matplotlib.pyplot as plt

# --- Data loading and preprocessing ---------------------------------------
# Ask the user for a CSV through the Colab upload widget and read the first
# uploaded file.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded.")
file_name = next(iter(uploaded))

data = pd.read_csv(file_name)

# Discard columns that contain no values at all.
data = data.dropna(axis=1, how='all')

target_col = 'Replication'

# Always map the labels to 0..n_classes-1. The network in this notebook is
# compiled with sparse_categorical_crossentropy and sizes its softmax layer
# from the number of unique labels, so class ids must be contiguous integers
# starting at zero even when the raw column is already numeric.
le = LabelEncoder()
y = le.fit_transform(data[target_col])

# Build the feature table from every column EXCEPT the target. Selecting
# numeric columns from the full frame would leak a numeric target into X.
features = data.drop(columns=[target_col])

categorical_cols = features.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the text columns; keep integer/float columns unchanged.
X_cat = pd.get_dummies(features[categorical_cols])
X_num = features.select_dtypes(include=['int64', 'float64'])

X_processed = pd.concat([X_num, X_cat], axis=1)
feature_names = X_processed.columns.tolist()

X = X_processed.values
X = X.astype('float32')
y = y.astype('int')

print("Feature shape:", X.shape)
print("Target shape:", y.shape)
print("Target unique values:", np.unique(y))

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Network definition and initial training ------------------------------
# Input width follows the feature matrix; output width follows the number of
# distinct labels.
n_features = X.shape[1]
n_classes = len(np.unique(y))

# Two hidden ReLU layers followed by a softmax over the classes.
model = Sequential([
    Input(shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# 20% of the training rows are set aside by Keras for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=16,
    epochs=50,
)

# --- Training curves --------------------------------------------------------
# Draw one figure per tracked quantity (accuracy first, then loss), each
# showing the training and validation series from the fit history.
for metric_key, metric_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.figure(figsize=(10, 5))
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'Model {metric_label}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.show()

# --- Held-out evaluation ----------------------------------------------------
# Loss and accuracy on the test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')

# Class probabilities -> hard labels (index of the largest probability).
predictions = model.predict(X_test)
predicted_classes = predictions.argmax(axis=1)

cm = confusion_matrix(y_test, predicted_classes)
print("Confusion Matrix:\n", cm)

cr = classification_report(y_test, predicted_classes)
print("Classification Report:\n", cr)

# Side-by-side peek at the first few predictions and their true labels.
print("First 10 Predictions:", predicted_classes[:10])
print("First 10 True labels:", y_test[:10])

**This code plots multi-class ROC curves for the trained neural network.
It binarizes the test labels, predicts class probabilities, and computes
the false positive rate (FPR), true positive rate (TPR), and area under
the curve (AUC) for each class. The ROC curves visualize the model's
discriminative ability across all classes.**



In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# One-vs-rest ROC curves for the trained network on the test split.
classes = np.unique(y)
y_test_bin = label_binarize(y_test, classes=classes)

# With exactly two classes label_binarize returns a single column that marks
# membership of classes[1]. Expand it to one indicator column per class so
# that column i lines up with probability column i of the softmax output.
if len(classes) == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

pred_prob = model.predict(X_test)

plt.figure(figsize=(10,6))
for i, class_label in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], pred_prob[:, i])
    # Label each curve with the class value it belongs to, not its position.
    plt.plot(fpr, tpr, label=f'Class {class_label} (AUC = {auc(fpr,tpr):.2f})')

# Diagonal = performance of a random classifier.
plt.plot([0,1],[0,1],'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi-class ROC Curves')
plt.legend()
plt.show()


**This code performs a t-SNE projection of the dataset to visualize high-dimensional features
in 2D space. Each class is plotted with a different color to explore clustering patterns
and separability between classes.**

In [None]:
from sklearn.manifold import TSNE

# Embed the full feature matrix into two dimensions (seeded for repeatability).
tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X)

labels = np.asarray(y)

plt.figure(figsize=(8,6))
# One scatter call per class so each class gets its own colour and legend entry.
for class_val in np.unique(y):
    members = labels == class_val
    plt.scatter(
        X_embedded[members, 0],
        X_embedded[members, 1],
        label=f'Class {class_val}',
    )
plt.legend()
plt.title('t-SNE Projection of Parkinson\'s Features')
plt.show()


**This code trains a Random Forest classifier on the dataset to compute feature importance.
It identifies which features contribute most to the model's predictions and optionally
visualizes them with a horizontal bar plot.**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree forest on the training split purely to read off its
# impurity-based feature importances.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

importances = rf.feature_importances_

# Text listing: one "name: score" line per feature, in column order.
for name, score in zip(feature_names, importances):
    print(f"{name}: {score:.4f}")

# Same numbers as a horizontal bar chart.
import matplotlib.pyplot as plt
plt.barh(feature_names, importances)
plt.xlabel("Feature Importance")
plt.title("Random Forest Feature Importance")
plt.show()


**This code applies SHAP (SHapley Additive exPlanations) to interpret the neural network's
predictions. It computes SHAP values for a subset of test samples and visualizes the
impact of each feature on the model's outputs using a summary plot.**

In [None]:
import shap

# KernelExplainer is model-agnostic but slow, so only a small background
# sample and a handful of test rows are used.
background = X_train[:50]
rows_to_explain = X_test[:20]

# KernelExplainer calls the prediction function many times; verbose=0 stops
# Keras from printing a progress bar for every one of those calls.
explainer = shap.KernelExplainer(
    lambda batch: model.predict(batch, verbose=0),
    background,
)
shap_values = explainer.shap_values(rows_to_explain)

# Depending on the installed SHAP version, a multi-output model yields either
# a list with one (samples, features) array per class or a single
# (samples, features, classes) array. summary_plot only recognises the list
# form as multi-class, so normalise to that.
# NOTE(review): the 3-D layout is assumed to be (samples, features, classes) - confirm
# against the SHAP version in use.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[-1])]

shap.summary_plot(shap_values, rows_to_explain, feature_names=feature_names)


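**This code trains the neural network with early stopping to prevent overfitting.
Training stops if the validation loss does not improve for a specified number of epochs,
and the model restores the best weights observed during training. Because `model` was
already fitted in an earlier cell, this call continues training from the current weights
rather than starting from a freshly initialised network.**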

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation loss has gone 10 epochs without improving, then roll
# the weights back to the best epoch seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# NOTE: this fits the existing `model` object; if the earlier training cell
# has run, training resumes from those weights.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stop],
)


**This code prepares the true and predicted labels for multi-class ROC/AUC evaluation.
It binarizes the test labels and obtains predicted probabilities from the neural network
to compute ROC curves and AUC scores for each class.**

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

from sklearn.preprocessing import label_binarize

# One indicator column per class for the true test labels.
y_test_bin = label_binarize(y_test, classes=np.unique(y))

# With exactly two classes label_binarize returns a single column that marks
# membership of the second class. Expand it to two columns so that column i
# of the indicators matches column i of the predicted probabilities.
if len(np.unique(y)) == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# Per-class probabilities from the network (one column per softmax unit).
y_pred_bin = model.predict(X_test)


8/8 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import auc

# Draw one ROC curve per indicator column of y_test_bin, scored with the
# matching probability column of y_pred_bin.
fig, ax = plt.subplots(figsize=(8,6))
for i, true_column in enumerate(y_test_bin.T):
    fpr, tpr, _ = roc_curve(true_column, y_pred_bin[:, i])
    area = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=f'Class {i} (AUC = {area:.2f})')

# Chance-level reference line.
ax.plot([0,1], [0,1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Multi-class ROC Curves')
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

# Binarize the test labels: one indicator column per class.
classes = np.unique(y)
y_test_bin = label_binarize(y_test, classes=classes)

# With exactly two classes label_binarize returns a single column (indicator
# of classes[1]); indexing column 1 would then fail. Expand to two columns so
# every class has its own indicator aligned with its probability column.
if len(classes) == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

y_pred_bin = model.predict(X_test)

# Compute one-vs-rest AUC for each class.
for i, class_label in enumerate(classes):
    true_column = y_test_bin[:, i]
    # AUC is undefined unless the test split has both members and
    # non-members of this class; report that instead of failing.
    if np.unique(true_column).size < 2:
        print(f"Class {class_label} AUC: undefined (test set lacks positive or negative samples)")
        continue
    auc_score = roc_auc_score(true_column, y_pred_bin[:, i])
    print(f"Class {class_label} AUC: {auc_score:.4f}")


**This code cleans the notebook file itself. It looks for an `.ipynb` file in `/content/`,
loads it with `nbformat`, removes the `widgets` entry from the notebook metadata if present,
and writes the cleaned notebook back to the same path.**

In [None]:
import nbformat
import os

# Directory that is searched for the notebook file.
content_dir = '/content/'

# Find the notebook file. Names are sorted so the choice is deterministic
# when several notebooks are present; a missing directory is treated the same
# as "no notebook found" instead of raising.
notebook_filename = None
if os.path.isdir(content_dir):
    for file in sorted(os.listdir(content_dir)):
        if file.endswith('.ipynb'):
            notebook_filename = content_dir + file
            break

if notebook_filename:
    # Load the current notebook. Version 4 is the current notebook format;
    # requesting a version that nbformat does not know makes the read fail.
    nb = nbformat.read(notebook_filename, as_version=4)

    # Remove broken widget metadata (no-op when the key is absent).
    nb['metadata'].pop('widgets', None)

    # Save cleaned notebook
    nbformat.write(nb, notebook_filename)
    print("Cleaned notebook saved.")
else:
    print("Notebook file not found in /content/")