<a href="https://colab.research.google.com/github/RafkaAS/Genetic_Disorder_Prediction_XAI/blob/main/Genetic_Disorder_Prediction_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Case Study: Genetic Disorder Prediction Using XAI**

### Load the dataset.

In [None]:
# Download the dataset CSV from the Google-hosted link and store it under the
# local file name that the loading cell reads.
!wget "https://docs.google.com/uc?export=download&id=11XypQz1fEKj82bv29fO_7JjwDNPEv5PE" -O "Genetic_Disorder_Dataset.csv"

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the downloaded CSV into the raw working frame and preview its first rows.
dataset_csv = 'Genetic_Disorder_Dataset.csv'
df = pd.read_csv(dataset_csv)

df.head()

### Encode categorical data.

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance, refitted for every column/target it is applied to below.
le = LabelEncoder()

# Categorical columns. The two targets ('Disorder Subclass', 'Genetic Disorder')
# are kept last on purpose: later cells pick features and targets by position.
categorical_columns = [
    "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min', 'Follow-up', 'Gender',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies', 'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Disorder Subclass', 'Genetic Disorder',
]
numerical_columns = [
    'Patient Age', 'Blood cell count (mcL)', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)',
]

# .copy() yields frames that are independent of df, so the in-place encoding done
# on cat_data afterwards does not write into a selection of df (which makes pandas
# emit SettingWithCopyWarning).
cat_data = df[categorical_columns].copy()
num_data = df[numerical_columns].copy()

cat_data.head() #check

In [None]:
# Detach cat_data from the frame it was selected from before writing to it;
# assigning columns into a selection triggers pandas' SettingWithCopyWarning.
cat_data = cat_data.copy()

# Replace every categorical column by its integer label codes. The shared encoder
# is refitted per column, so it ends up fitted on the last column only.
for column in cat_data.columns:
    cat_data[column] = le.fit_transform(cat_data[column])

cat_data.head() #check

In [None]:
# Put the numeric columns first and the encoded categorical columns after them,
# side by side, so the two target columns stay at the far right.
encoded_parts = (num_data, cat_data)
df_encoded = pd.concat(encoded_parts, axis="columns")
df_encoded.head()

### Balance the data.

In [None]:
#scale every feature column by its column maximum (max-scaling, not z-score standardization)
#the last two columns (the encoded targets) are left untouched
feature_cols = df_encoded.columns[:-2]
df_max = df_encoded[feature_cols].max()
# A column whose maximum is 0 would turn into 0/0 = NaN; dividing by 1 keeps it at 0.
df_max = df_max.replace(0, 1)
# Cast to float and replace the columns as a whole, so integer (label-encoded)
# columns are never asked to hold fractional values in place.
df_encoded[feature_cols] = df_encoded[feature_cols].astype('float64').divide(df_max)

df_encoded.describe()

In [None]:
# Class distribution of the 'Genetic Disorder' target before any resampling.
disorder_labels = df_encoded['Genetic Disorder']
plt.hist(disorder_labels)
plt.show()

In [None]:
#balance the data
from imblearn.over_sampling import SMOTE

# Features are every column except the last one ('Genetic Disorder'), which is the
# class label being balanced.
# NOTE(review): the feature block still contains the encoded 'Disorder Subclass'
# column, so SMOTE treats it like any numeric feature; synthetic rows may end up
# with non-integer subclass codes - confirm this is acceptable for the second run.
x_new = df_encoded.iloc[:,0:-1]
y_new = df_encoded.iloc[:,-1]

# Fixed seed so the synthetic samples are the same on every Restart & Run All.
over = SMOTE(random_state=42)

x_new, y_new = over.fit_resample(x_new, y_new)

#visualize after data balancing
plt.hist([y_new])
plt.show()

In [None]:
# Re-attach the resampled labels to the resampled features; the label column
# becomes the last column of the balanced frame.
balanced_parts = (x_new, y_new)
df_balanced = pd.concat(balanced_parts, axis="columns")
df_balanced.info()

In [None]:
from keras.utils import np_utils

# Two chained steps: map the class labels to consecutive integers, then one-hot
# encode those integers. The one-hot step must consume the encoder's output
# (not the raw y_new), otherwise the label-encoding line has no effect.
y_new_encoded = le.fit_transform(y_new)
y_new_encoded = np_utils.to_categorical(y_new_encoded)

y_new_encoded #check

### Build the model.

In [None]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_dim=26, n_classes=3):
  """Build and compile the fully connected classifier used in this notebook.

  The network is a stack of ReLU Dense layers (100, 64, 48, 32, 16, 8, 4 units)
  followed by a softmax output layer.

  Args:
    input_dim: Number of input features. Defaults to 26, the feature count of
      the first model run.
    n_classes: Width of the softmax output layer. Defaults to 3.

  Returns:
    A Sequential model compiled with categorical cross-entropy, the Adam
    optimizer and the accuracy metric.
  """
  hidden_units = (100, 64, 48, 32, 16, 8, 4)

  model = Sequential()
  # Only the first layer declares the input size; the rest infer it.
  model.add(Dense(hidden_units[0], input_dim = input_dim, activation = 'relu'))
  for units in hidden_units[1:]:
    model.add(Dense(units, activation = 'relu'))
  model.add(Dense(n_classes, activation = "softmax"))
  model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
  return model

# scikit-learn style wrapper around the builder so it can be passed to GridSearchCV.
# Called without arguments, the builder falls back to its defaults (26 inputs, 3 classes).
model = KerasClassifier(build_fn=create_model)

### Split the dataset into training and testing.

In [None]:
from sklearn.model_selection import train_test_split

# x_new still ends with the encoded 'Disorder Subclass' column; dropping the last
# column leaves the 26 input features the first model expects.
x = x_new.iloc[:,0:-1]
y = y_new_encoded

# Fixed seed so the same rows land in the 10% test split on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

### Perform hyperparameter tuning.

In [None]:
#apply grid search to find the best combination of parameters
from sklearn.model_selection import GridSearchCV

# Keras' fit() argument is named 'epochs'. The legacy name 'nb_epoch' is tolerated
# by the scikit-learn wrapper but never forwarded to fit(), so every candidate
# would train for the default single epoch and the epoch search would be meaningless.
params={'batch_size':[50, 100],
        'epochs':[100, 150, 300]
        }

search = GridSearchCV(estimator=model, param_grid=params, cv=3)
search.fit(x.values, y)

print("Best params:", search.best_params_)
print("Best score:", search.best_score_)

# Later cells read the epoch count via best_params_.get('nb_epoch'); expose the
# tuned value under that legacy key too so they keep working unchanged.
search.best_params_['nb_epoch'] = search.best_params_['epochs']

### Perform first model run.

In [None]:
#first model run has "Genetic Disorder" as target variable

In [None]:
# Classifier for the first run: 26 inputs, seven ReLU hidden layers of shrinking
# width, and a 3-way softmax output.
model = Sequential([
    Dense(100, input_dim = 26, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(48, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(8, activation = 'relu'),
    Dense(4, activation = 'relu'),
    Dense(3, activation = "softmax"),
])

tracked_metrics = ['accuracy', 'Precision', 'Recall']
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = tracked_metrics)

# Reuse the hyperparameters picked by the grid search.
best_params = search.best_params_
epochs = best_params.get('nb_epoch')
batch_size = best_params.get('batch_size')

# Train on the full feature matrix, holding out 20% of it for validation.
model.fit(x.values, y, validation_split=0.2, epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
#plot the model's accuracies
run1_history = model.history.history
# The accuracy metric is logged as 'accuracy' by current Keras and as 'acc' by
# older releases; accept either instead of failing with a KeyError.
acc_key = 'accuracy' if 'accuracy' in run1_history else 'acc'
accuracy1 = run1_history[acc_key]
val_accuracy1 = run1_history['val_' + acc_key]
# Derive the x-axis from the number of epochs actually trained; a hard-coded
# range(1, 301) only matches when exactly 300 epochs were run.
epoch_axis = range(1, len(accuracy1) + 1)
plt.plot(epoch_axis, accuracy1, 'g', label='Training Accuracy')
plt.plot(epoch_axis, val_accuracy1, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracies')
plt.xlabel('Epochs')
plt.ylabel('Accuracies')
plt.legend()
plt.show()

### Evaluate feature importance.

In [None]:
#feature importance for neural network using SHAP

In [None]:
!pip install lifelines
!pip install shap

In [None]:
import matplotlib.pyplot as plt
import sklearn
import lifelines
import shap


# Display names for the 26 model inputs: the six numeric measurements first,
# followed by the twenty encoded categorical fields.
numeric_feature_names = [
    'Patient Age', 'Blood cell count (mcL)', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)',
]
categorical_feature_names = [
    "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min', 'Follow-up', 'Gender',
    'Folic acid details (peri-conceptional)', 'H/O serious maternal illness', 'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies', 'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5',
]
features = numeric_feature_names + categorical_feature_names

# Only the first 100 training rows serve as SHAP background data, to avoid crashing.
background = x_train.iloc[:100, :]
e = shap.DeepExplainer(model, background)

In [None]:
#plot bar graph of features' shap values
# Explain the first 10 test rows and hand those same rows to the plot, so the SHAP
# matrix and the feature matrix describe identical samples (previously the plot
# received all of x_test alongside SHAP values for only 10 rows).
x_explained = x_test.iloc[:10,:]
shap_values = e.shap_values(x_explained.values)
# shap_values[0]: first entry of the returned SHAP values - presumably the
# attributions for output class 0 (TODO confirm for the installed shap version).
shap.summary_plot(shap_values[0],x_explained,feature_names=features, plot_type="bar",show=False)
plt.savefig('summary.png')

In [None]:
# Initialise SHAP's notebook JavaScript before building the interactive plot.
shap.initjs()
# Force plot across all explained samples, built from the first entry of the
# explainer's expected values and of the SHAP values computed earlier.
base_value = e.expected_value[0]
column_names = x_train.columns
r = shap.force_plot(base_value, shap_values[0], feature_names=column_names, show=False)
# Keep a standalone HTML copy next to the notebook.
shap.save_html("all_features.html", r)

r

In [None]:
# Initialise SHAP's notebook JavaScript before building the interactive plot.
shap.initjs()
# Explain the first test sample only: SHAP values are computed on a one-row
# matrix, and the same row is passed along as the feature values to display.
first_sample = x_test.iloc[0,:]
vals = e.shap_values(x_test.iloc[:1].values)
features_importance = shap.force_plot(e.expected_value[0], vals[0], first_sample, show=False)
# Keep a standalone HTML copy next to the notebook.
shap.save_html("features_importance.html", features_importance)

features_importance

### Perform second model run.

In [None]:
#second model run has "Disorder Subclass" as target variable and uses the "Genetic Disorder" label column (the resampled labels, not the first model's predictions) as an additional feature

In [None]:
# Column order for the second run: the 26 features, then 'Genetic Disorder' as an
# extra feature, and 'Disorder Subclass' last because the target is taken by position.
second_run_columns = [
    'Patient Age', 'Blood cell count (mcL)', "Mother's age", "Father's age",
    'No. of previous abortion',
    'White Blood cell count (thousand per microliter)',
    "Genes in mother's side", "Inherited from father", 'Maternal gene',
    'Paternal gene', 'Status', 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min', 'Follow-up', 'Gender',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies', 'Birth defects',
    'Blood test result', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4',
    'Symptom 5', 'Genetic Disorder', 'Disorder Subclass',
]
df_balanced = df_balanced[second_run_columns]

df_balanced.head()

In [None]:
#scale "Genetic Disorder" values by the column maximum
# Dividing by the maximum instead of a hard-coded 2 gives the same result for the
# encoded labels 0/1/2, and re-running the cell no longer halves the values again.
genetic_disorder = df_balanced['Genetic Disorder']
scale = genetic_disorder.max() or 1  # guard against an all-zero column (0/0 = NaN)
# .assign builds a new frame instead of writing into a column selection, which
# avoids pandas' SettingWithCopyWarning.
df_balanced = df_balanced.assign(**{'Genetic Disorder': genetic_disorder.divide(scale)})
df_balanced['Genetic Disorder']

In [None]:
# Distribution of the 'Disorder Subclass' column in the balanced frame.
subclass_values = df_balanced['Disorder Subclass']
plt.hist(subclass_values)
plt.show()

In [None]:
# Target of the second run: the last column of df_balanced, label-encoded to
# integers and then expanded to one-hot rows.
subclass_labels = df_balanced.iloc[:,-1]
y = np_utils.to_categorical(le.fit_transform(subclass_labels))

y

In [None]:
#split the dataset into training and testing
# Every column except the last ('Disorder Subclass') is a feature: 27 inputs.
x = df_balanced.iloc[:,0:-1]

# Fixed seed so the train/test partition is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
# Second run: 27 inputs, the same seven ReLU hidden layers, and a 9-way softmax
# output.
model2 = Sequential([
    Dense(100, input_dim = 27, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(48, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(16, activation = 'relu'),
    Dense(8, activation = 'relu'),
    Dense(4, activation = 'relu'),
    Dense(9, activation = "softmax"),
])

tracked_metrics = ['accuracy', 'Precision', 'Recall']
model2.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = tracked_metrics)

# Reuse the hyperparameters picked by the grid search.
best_params = search.best_params_
epochs = best_params.get('nb_epoch')
batch_size = best_params.get('batch_size')

# Train on the full feature matrix, holding out 20% of it for validation.
model2.fit(x.values, y, validation_split=0.2, epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
#plot the model's accuracies
run2_history = model2.history.history
# The accuracy metric is logged as 'accuracy' by current Keras and as 'acc' by
# older releases; accept either instead of failing with a KeyError.
acc_key = 'accuracy' if 'accuracy' in run2_history else 'acc'
accuracy2 = run2_history[acc_key]
val_accuracy2 = run2_history['val_' + acc_key]
# Derive the x-axis from the number of epochs actually trained; a hard-coded
# range(1, 301) only matches when exactly 300 epochs were run.
epoch_axis = range(1, len(accuracy2) + 1)
plt.plot(epoch_axis, accuracy2, 'g', label='Training Accuracy')
plt.plot(epoch_axis, val_accuracy2, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracies')
plt.xlabel('Epochs')
plt.ylabel('Accuracies')
plt.legend()
plt.show()

In [None]:
#apply SHAP
# The first 50 training rows serve as background data for the explainer.
e = shap.DeepExplainer(model2,  x_train.iloc[:50, :])
#plot bar graph of features' shap values
# Explain the first 10 test rows and pass those same rows to the plot.
x_explained = x_test.iloc[:10,:]
shap_values = e.shap_values(x_explained.values)
# Take the feature names from the explained frame itself. df_encoded.columns[:-1]
# ends with 'Disorder Subclass', whereas the 27th input of this run is
# 'Genetic Disorder', so the last feature would be labelled incorrectly.
shap.summary_plot(shap_values[0],x_explained,feature_names=list(x_explained.columns), plot_type="bar",show=False)
plt.savefig('summary_subclass.png')

In [None]:
# Initialise SHAP's notebook JavaScript before building the interactive plot.
shap.initjs()
# Explain the first test sample only: SHAP values are computed on a one-row
# matrix, and the same row is passed along as the feature values to display.
first_sample = x_test.iloc[0,:]
vals = e.shap_values(x_test.iloc[:1].values)
features_importance_subclass = shap.force_plot(e.expected_value[0], vals[0], first_sample, show=False)
# Keep a standalone HTML copy next to the notebook.
shap.save_html("features_importance_subclass.html", features_importance_subclass)

features_importance_subclass

### ***For testing purposes: run a model on "Disorder Subclass" alone, dropping the "Genetic Disorder" column altogether.***

In [None]:
# Work on a copy of the encoded frame without the 'Genetic Disorder' column;
# 'Disorder Subclass' thereby becomes the last column.
column_to_drop = 'Genetic Disorder'
df_subclass = df_encoded.drop(column_to_drop, axis=1)

df_subclass.head()

In [None]:
#balance the data for "Disorder Subclass", the target variable
# Last column is the target; everything before it is a feature.
x_subclass = df_subclass.iloc[:,0:-1]
y_subclass = df_subclass.iloc[:,-1]

# Fixed seed so the synthetic samples are the same on every Restart & Run All.
over = SMOTE(random_state=42)

x_subclass, y_subclass = over.fit_resample(x_subclass, y_subclass)

#visualize after data balancing
plt.hist([y_subclass])
plt.show()

In [None]:
# Label-encode the resampled subclass labels to integers, then expand them to
# one-hot rows for the softmax output.
subclass_codes = le.fit_transform(y_subclass)
y_subclass = np_utils.to_categorical(subclass_codes)

y_subclass

In [None]:
#split the dataset into training and testing
# Fixed seed so the train/test partition is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(x_subclass, y_subclass, random_state=42)

In [None]:
#perform the model run
# Subclass-only run: 26 inputs ('Genetic Disorder' is dropped), the same seven
# ReLU hidden layers, and a 9-way softmax output.
model3 = Sequential()
model3.add(Dense(100, input_dim = 26, activation = 'relu'))
model3.add(Dense(64, activation = 'relu'))
model3.add(Dense(48, activation = 'relu'))
model3.add(Dense(32, activation = 'relu'))
model3.add(Dense(16, activation = 'relu'))
model3.add(Dense(8, activation = 'relu'))
model3.add(Dense(4, activation = 'relu'))
model3.add(Dense(9, activation = "softmax"))

model3.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'Precision', 'Recall'])

# Reuse the hyperparameters picked by the grid search.
epochs = search.best_params_.get('nb_epoch')
batch_size = search.best_params_.get('batch_size')

# batch_size must be the tuned batch size; passing the epoch count here made the
# run use a different batch size than the one selected by the grid search.
model3.fit(x_subclass.values, y_subclass, validation_split=0.2, epochs=epochs,
          batch_size=batch_size, shuffle=True)

In [None]:
#plot the model's accuracies
run3_history = model3.history.history
# The accuracy metric is logged as 'accuracy' by current Keras and as 'acc' by
# older releases; accept either instead of failing with a KeyError.
acc_key = 'accuracy' if 'accuracy' in run3_history else 'acc'
accuracy3 = run3_history[acc_key]
val_accuracy3 = run3_history['val_' + acc_key]
# Derive the x-axis from the number of epochs actually trained; a hard-coded
# range(1, 301) only matches when exactly 300 epochs were run.
epoch_axis = range(1, len(accuracy3) + 1)
plt.plot(epoch_axis, accuracy3, 'g', label='Training Accuracy')
plt.plot(epoch_axis, val_accuracy3, 'b', label='Validation Accuracy')
plt.title('Training and Validation Accuracies')
plt.xlabel('Epochs')
plt.ylabel('Accuracies')
plt.legend()
plt.show()

In [None]:
#apply SHAP
# The first 50 training rows serve as background data for the explainer.
e = shap.DeepExplainer(model3,  x_train.iloc[:50, :])
#plot bar graph of features' shap values
# Explain the first 10 test rows and pass those same rows to the plot.
x_explained = x_test.iloc[:10,:]
shap_values = e.shap_values(x_explained.values)
# Take the feature names from the explained frame itself. df_encoded.columns[:-1]
# holds 27 names (it still includes the 'Disorder Subclass' target) while this
# run has only 26 input features.
shap.summary_plot(shap_values[0],x_explained,feature_names=list(x_explained.columns), plot_type="bar",show=False)
plt.savefig('summary_subclass_alone.png')

In [None]:
# Model-agnostic explanation of the third model: the explainer wraps the model's
# predict function and uses the first 50 training rows as background data.
background = x_train.head(50)
e = shap.KernelExplainer(model3.predict, background)

# SHAP values for the first test sample only.
shap_values = e.shap_values(x_test.iloc[0,:])
shap.initjs()
#visualize the feature importance for the first test sample
first_sample = x_test.iloc[0,:]
shap.force_plot(e.expected_value[0], shap_values[0], first_sample)