<a href="https://colab.research.google.com/github/RafkaAS/Genetic_Disorder_Prediction_XAI/blob/main/Genetic_Disorder_Prediction_Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Case Study: Genetic Disorder Prediction Using XAI**

## **EDA & Data Preprocessing**

### Import libraries.

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### Load the dataset.

In [None]:
#install Kaggle
!wget "https://docs.google.com/uc?export=download&id=1Nrj8zK6bpHkdoi11gcWWLJSsSCCuMPRY" -O "kaggle.json"  

! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Smoke-test the Kaggle CLI: listing datasets only works when the CLI is
# installed and the API token is accepted.
! kaggle datasets list

In [None]:
# Download the dataset archive "imsparsh/of-genomes-and-genetics-hackerearth-ml"
# from Kaggle (it is unzipped in the next cell).
! kaggle datasets download imsparsh/of-genomes-and-genetics-hackerearth-ml

In [None]:
#unzip the dataset folder
! unzip of-genomes-and-genetics-hackerearth-ml.zip

### Read and visualize the dataset.

In [None]:
# Load the training split into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')

df.head(n=5)

### Perform exploratory data analysis (EDA).

In [None]:
# Report how many rows and columns the dataset has.
nRow = len(df.index)
nCol = len(df.columns)
print(f'There are {nRow} rows and {nCol} columns.')

In [None]:
# Print a per-column summary: dtype and non-null count (which reveals missing
# values), plus overall memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical
# columns; non-numeric columns are left out by default.
df.describe()

In [None]:
# Summary (count, unique, top, freq) restricted to object-dtype columns,
# i.e. the categorical/text features.
df.describe(include='O')

### Drop unnecessary features.

In [None]:
# Columns the notebook treats as unnecessary for the analysis.
unneeded_columns = [
    'Patient Id', 'Patient First Name', 'Family Name', "Father's name",
    'Institute Name', 'Location of Institute', 'Parental consent',
    'Place of birth',
    'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5',
]
df = df.drop(unneeded_columns, axis=1)

df.info()  # confirm which columns remain

### Visualize the features, and identify any anomalies.

In [None]:
# Histogram of patient age, then a bar chart of how often each gender value occurs.
df['Patient Age'].plot.hist()
plt.show()
df['Gender'].value_counts().plot.bar()
plt.show()

In [None]:
# One histogram per parental-age column.
for column in ("Mother's age", "Father's age"):
    df[column].plot.hist()
    plt.show()

In [None]:
# Bar charts of value frequencies for the two inheritance-side columns.
for column in ("Genes in mother's side", "Inherited from father"):
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Bar charts of value frequencies for the maternal and paternal gene columns.
for column in ('Maternal gene', 'Paternal gene'):
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Histograms for the two blood-count measurements.
for column in ('Blood cell count (mcL)',
               'White Blood cell count (thousand per microliter)'):
    df[column].plot.hist()
    plt.show()

# Bar charts of value frequencies for the blood test result and folic acid columns.
for column in ('Blood test result',
               'Folic acid details (peri-conceptional)'):
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Bar charts of value frequencies for respiratory rate and heart rate.
for column in ('Respiratory Rate (breaths/min)', 'Heart Rate (rates/min'):
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Bar chart of how often each 'Status' value occurs.
df['Status'].value_counts().plot.bar()

In [None]:
# Bar chart of how often each 'Follow-up' value occurs.
df['Follow-up'].value_counts().plot.bar()

In [None]:
# Bar chart of value frequencies for each birth-related column, one figure per column.
birth_columns = (
    'Birth asphyxia',
    'Birth defects',
    'Autopsy shows birth defect (if applicable)',
)
for column in birth_columns:
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Bar chart of value frequencies for each maternal/pregnancy history column,
# one figure per column, in the listed order.
history_columns = (
    'H/O serious maternal illness',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    'No. of previous abortion',
)
for column in history_columns:
    df[column].value_counts().plot.bar()
    plt.show()

In [None]:
# Bar chart of value frequencies for 'Symptom 1' through 'Symptom 5'.
for number in range(1, 6):
    df[f'Symptom {number}'].value_counts().plot.bar()
    plt.show()

In [None]:
# Class balance of the two target columns: disorder class, then disorder subclass.
for column in ('Genetic Disorder', 'Disorder Subclass'):
    df[column].value_counts().plot.bar()
    plt.show()

### Perform cross-tabulation to identify potential associations between each feature and the target.

In [None]:
# If any feature shows a strong association with the class ('Genetic Disorder'),
# check it against the subclass ('Disorder Subclass') next.

In [None]:
# Counts of each disorder class per patient age, drawn as grouped bars.
age = pd.crosstab(index=df['Patient Age'], columns=df['Genetic Disorder'])
age.plot.bar()

In [None]:
# Counts of each disorder class per gender value, drawn as grouped bars.
gender = pd.crosstab(index=df['Gender'], columns=df['Genetic Disorder'])
gender.plot.bar()

In [None]:
# Counts of each disorder class per mother's age, drawn as grouped bars.
mom_age = pd.crosstab(index=df["Mother's age"], columns=df['Genetic Disorder'])
mom_age.plot.bar()

In [None]:
# Counts of each disorder class per father's age, drawn as grouped bars.
dad_age = pd.crosstab(index=df["Father's age"], columns=df['Genetic Disorder'])
dad_age.plot.bar()

In [None]:
# Counts of each disorder class per "Genes in mother's side" value, as grouped bars.
mom_side_genes = pd.crosstab(index=df["Genes in mother's side"], columns=df['Genetic Disorder'])
mom_side_genes.plot.bar()

In [None]:
# Counts of each disorder class per "Inherited from father" value, as grouped bars.
dad_side_genes = pd.crosstab(index=df["Inherited from father"], columns=df['Genetic Disorder'])
dad_side_genes.plot.bar()

In [None]:
# Counts of each disorder class per 'Maternal gene' value, as grouped bars.
maternal_gene = pd.crosstab(index=df['Maternal gene'], columns=df['Genetic Disorder'])
maternal_gene.plot.bar()

In [None]:
# Counts of each disorder class per 'Paternal gene' value, as grouped bars.
paternal_gene = pd.crosstab(index=df['Paternal gene'], columns=df['Genetic Disorder'])
paternal_gene.plot.bar()

In [None]:
# Disabled cell, kept for reference.
# NOTE(review): presumably skipped because a crosstab over this measurement
# yields one bar group per distinct value - confirm, or delete the cell.
# blood_cell_count = pd.crosstab(df['Blood cell count (mcL)'],df['Genetic Disorder'])
# blood_cell_count.plot(kind='bar')

In [None]:
# Disabled cell, kept for reference.
# NOTE(review): presumably skipped because a crosstab over this measurement
# yields one bar group per distinct value - confirm, or delete the cell.
# white_blood = pd.crosstab(df['White Blood cell count (thousand per microliter)'],df['Genetic Disorder'])
# white_blood.plot(kind='bar')

In [None]:
# Counts of each disorder class per blood test result, as grouped bars.
blood_test = pd.crosstab(index=df['Blood test result'], columns=df['Genetic Disorder'])
blood_test.plot.bar()

In [None]:
# Counts of each disorder class per folic acid value, as grouped bars.
folic_acid = pd.crosstab(index=df['Folic acid details (peri-conceptional)'], columns=df['Genetic Disorder'])
folic_acid.plot.bar()

In [None]:
# Counts of each disorder class per respiratory rate value, as grouped bars.
respiratory_rate = pd.crosstab(index=df['Respiratory Rate (breaths/min)'], columns=df['Genetic Disorder'])
respiratory_rate.plot.bar()

In [None]:
# Counts of each disorder class per heart rate value, as grouped bars.
heart_rate = pd.crosstab(index=df['Heart Rate (rates/min'], columns=df['Genetic Disorder'])
heart_rate.plot.bar()

In [None]:
# Counts of each disorder class per 'Status' value, as grouped bars.
status = pd.crosstab(index=df['Status'], columns=df['Genetic Disorder'])
status.plot.bar()

In [None]:
# Counts of each disorder class per 'Follow-up' value, as grouped bars.
follow_up = pd.crosstab(index=df['Follow-up'], columns=df['Genetic Disorder'])
follow_up.plot.bar()

In [None]:
# Counts of each disorder class per 'Birth asphyxia' value, as grouped bars.
birth_asphyxia = pd.crosstab(index=df['Birth asphyxia'], columns=df['Genetic Disorder'])
birth_asphyxia.plot.bar()

In [None]:
# Counts of each disorder class per 'Birth defects' value, as grouped bars.
birth_defects = pd.crosstab(index=df['Birth defects'], columns=df['Genetic Disorder'])
birth_defects.plot.bar()

In [None]:
# Counts of each disorder class per autopsy birth-defect value, as grouped bars.
autopsy_birth_defects = pd.crosstab(index=df['Autopsy shows birth defect (if applicable)'], columns=df['Genetic Disorder'])
autopsy_birth_defects.plot.bar()

In [None]:
# Counts of each disorder class per 'H/O serious maternal illness' value, as grouped bars.
history_maternal_illness = pd.crosstab(index=df['H/O serious maternal illness'], columns=df['Genetic Disorder'])
history_maternal_illness.plot.bar()

In [None]:
# Counts of each disorder class per 'H/O radiation exposure (x-ray)' value, as grouped bars.
history_radiation = pd.crosstab(index=df['H/O radiation exposure (x-ray)'], columns=df['Genetic Disorder'])
history_radiation.plot.bar()

In [None]:
# Counts of each disorder class per 'H/O substance abuse' value, as grouped bars.
history_substance_abuse = pd.crosstab(index=df['H/O substance abuse'], columns=df['Genetic Disorder'])
history_substance_abuse.plot.bar()

In [None]:
# Counts of each disorder class per 'Assisted conception IVF/ART' value, as grouped bars.
ivf = pd.crosstab(index=df['Assisted conception IVF/ART'], columns=df['Genetic Disorder'])
ivf.plot.bar()

In [None]:
# Counts of each disorder class per 'History of anomalies in previous pregnancies' value, as grouped bars.
anom_prev_preg = pd.crosstab(index=df['History of anomalies in previous pregnancies'], columns=df['Genetic Disorder'])
anom_prev_preg.plot.bar()

In [None]:
# Counts of each disorder class per number of previous abortions, as grouped bars.
abortion = pd.crosstab(index=df['No. of previous abortion'], columns=df['Genetic Disorder'])
abortion.plot.bar()

In [None]:
# One contingency table (symptom value x disorder class) per symptom column,
# each drawn as its own grouped bar chart.
disorder_class = df['Genetic Disorder']

symptom_1 = pd.crosstab(index=df['Symptom 1'], columns=disorder_class)
symptom_1.plot.bar()

symptom_2 = pd.crosstab(index=df['Symptom 2'], columns=disorder_class)
symptom_2.plot.bar()

symptom_3 = pd.crosstab(index=df['Symptom 3'], columns=disorder_class)
symptom_3.plot.bar()

symptom_4 = pd.crosstab(index=df['Symptom 4'], columns=disorder_class)
symptom_4.plot.bar()

symptom_5 = pd.crosstab(index=df['Symptom 5'], columns=disorder_class)
symptom_5.plot.bar()

In [None]:
# Contingency table of disorder class (rows) against disorder subclass (columns);
# margins=True appends the row and column totals.
subclass_disorder = pd.crosstab(
    index=df["Genetic Disorder"],
    columns=df["Disorder Subclass"],
    margins=True,
)
subclass_disorder

### Treat missing data.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the columns judged to be of low importance.
low_importance_columns = [
    'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)',
    'H/O radiation exposure (x-ray)',
    'H/O substance abuse',
]
df = df.drop(low_importance_columns, axis=1)

df.info()

In [None]:
# Missing-value strategy: drop rows for one column, impute every other gap.
# Rows without an 'Inherited from father' value are removed outright.
df = df.dropna(subset=['Inherited from father'])

# Columns whose gaps are filled with the column mean.
mean_filled_columns = [
    'Patient Age',
    "Mother's age",
    "Father's age",
    "Blood cell count (mcL)",
    "White Blood cell count (thousand per microliter)",
]
for column in mean_filled_columns:
    df[column] = df[column].fillna(df[column].mean())

# The previous-abortion count is filled with the median instead.
df["No. of previous abortion"] = df["No. of previous abortion"].fillna(
    df["No. of previous abortion"].median()
)

# Columns whose gaps are filled with the most frequent value (first mode).
mode_filled_columns = [
    'Symptom 1',
    'Symptom 2',
    'Symptom 3',
    'Symptom 4',
    'Symptom 5',
    'Gender',
    'Maternal gene',
    'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min',
    'Follow-up',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness',
    'Assisted conception IVF/ART',
    'History of anomalies in previous pregnancies',
    'Birth defects',
    'Blood test result',
]
for column in mode_filled_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

df.isnull().sum()  # remaining missing values per column

In [None]:
# Recover a missing "Genetic Disorder" class from the row's "Disorder Subclass".
# Each known subclass maps to exactly one class; rows whose subclass is missing
# or not listed here keep a missing class.
SUBCLASS_TO_DISORDER = {
    "Leber's hereditary optic neuropathy": 'Mitochondrial genetic inheritance disorders',
    "Leigh syndrome": 'Mitochondrial genetic inheritance disorders',
    "Mitochondrial myopathy": 'Mitochondrial genetic inheritance disorders',
    "Alzheimer's": 'Multifactorial genetic inheritance disorders',
    "Cancer": 'Multifactorial genetic inheritance disorders',
    "Diabetes": 'Multifactorial genetic inheritance disorders',
    "Cystic fibrosis": 'Single-gene inheritance diseases',
    "Hemochromatosis": 'Single-gene inheritance diseases',
    "Tay-Sachs": 'Single-gene inheritance diseases',
}

# Boolean mask of the rows whose class is missing, and a snapshot of those rows.
index_null = df['Genetic Disorder'].isna()
null_disorders = df[index_null]

# Write through a single .loc call rather than df[col][label] = value: chained
# assignment raises SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, modifies a temporary object and leaves df unchanged.
df.loc[index_null, 'Genetic Disorder'] = (
    df.loc[index_null, 'Disorder Subclass'].map(SUBCLASS_TO_DISORDER)
)

# Rows whose class is still missing after the back-fill.
df[df['Genetic Disorder'].isna()]

In [None]:
# Temporarily remove the rows that still lack a class or a subclass label,
# then show the missing-value count per column.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
df = df.dropna(subset=target_columns)
df.isna().sum()

In [None]:
# Final check: True if any column still contains a missing value.
df.isna().sum().any()

### Save dataset.

In [None]:
# Persist the cleaned dataset for later use; the index is not written out.
df.to_csv(path_or_buf='Genetic_Disorder_Dataset.csv', index=False)