In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset; the path is resolved relative to the notebook's working directory.
df = pd.read_csv('dataset.csv')

# Encoder reused (re-fitted) for each categorical column in the cells below.
label_encoder = LabelEncoder()
# Seed the forest so the fitted model, and therefore the feature importances
# reported later, are identical on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Treat the literal string 'Unknown' in either target column as a missing value.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which pandas 2.x warns about and which has no effect on
# `df` once Copy-on-Write is enabled.
for target_column in ('Genetic Disorder', 'Disorder Subclass'):
    df[target_column] = df[target_column].replace('Unknown', np.nan)

# Drop every row that still has a missing value in any column.
df = df.dropna()

In [None]:
# Feature columns that are candidates for label encoding.
all_x_columns= [
    'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Blood cell count (mcL)', "Mother's age", "Father's age", 'Status', 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

# Work on a copy so the cleaned frame `df` stays available unencoded.
df_encoded=df.copy()
for column in all_x_columns:
    column_values = df_encoded[column]
    # Leave genuinely numeric columns as they are. Casting them to strings and
    # label-encoding would assign codes in lexicographic order ("10.0" sorts
    # before "2.0"), which scrambles the numeric ordering that the ANOVA F-test
    # and the correlation analysis rely on.
    if pd.api.types.is_numeric_dtype(column_values) and not pd.api.types.is_bool_dtype(column_values):
        continue
    # Text / boolean columns: map each distinct value to an integer code.
    df_encoded[column] = label_encoder.fit_transform(column_values.astype(str))

In [None]:
# Feature matrix: everything in the encoded frame except the two target columns.
X = df_encoded.drop(['Genetic Disorder', 'Disorder Subclass'], axis='columns')
# Prediction target for the feature-selection steps below.
y = df_encoded.loc[:, 'Disorder Subclass']

In [None]:
# Hold out half of the rows, stratified on the target, and fit the forest on the
# training half. The split is seeded so the same rows land in the training set
# on every run; without it the reported feature importances change per execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, stratify=y, random_state=42
)
rf.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt

# Importances reported by the fitted forest, one value per column of X.
feature_importances_cat = rf.feature_importances_
feature_names = X.columns

print (feature_importances_cat)
print(feature_names)

# Horizontal bar chart: one bar per feature, bar length = importance.
fig, ax = plt.subplots()
ax.barh(feature_names, feature_importances_cat)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature Name')
ax.set_title('Random Forest Disorder Subclass Feature Importances')
plt.show()

ANOVA

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Single-column frame holding the feature names, in the same order as X's columns.
dfcolumns = pd.DataFrame({0: list(X.columns)})
# Target for the ANOVA scoring below.
y = df_encoded.loc[:, 'Disorder Subclass']

In [None]:
# Score every feature against the target with f_classif and select the top five.
bestFeaturesANOVA = SelectKBest(score_func=f_classif, k=5)
fitANOVA = bestFeaturesANOVA.fit(X, y)

# Positions (within X's columns) of the selected features.
selected_positions = fitANOVA.get_support(indices=True)
print(selected_positions)

# Pair each feature name with its score in a two-column frame.
scoresANOVA = pd.DataFrame(fitANOVA.scores_)
featureScoresANOVA = pd.concat([dfcolumns, scoresANOVA], axis=1)
featureScoresANOVA.columns = ['Feature', 'ANOVA Score']

# Text report, highest score first.
print("ANOVA:")
scores_descending = featureScoresANOVA.sort_values('ANOVA Score', ascending=False)
print(scores_descending)

# Bar chart of the same scores; ascending sort puts the highest score at the top.
scores_ascending = featureScoresANOVA.sort_values('ANOVA Score', ascending=True)
scores_ascending.plot(kind='barh', x='Feature', color='tab:blue')
plt.title("ANOVA Disorder Subclass")

In [None]:
import seaborn as sns

In [None]:
# Replace both target columns with integer codes so they can take part in the
# correlation analysis. The encoder is re-fitted for each column in turn.
for target_column in ('Genetic Disorder', 'Disorder Subclass'):
    target_as_text = df_encoded[target_column].astype(str)
    df_encoded[target_column] = label_encoder.fit_transform(target_as_text)

In [None]:
#3. Correlation Matrix

def plot_target_correlation(frame, target):
    """Plot and print how strongly each column of `frame` correlates with `target`.

    Produces, in order: a horizontal bar chart of the absolute correlation of
    every other column with `target`, an annotated heat map of the full
    correlation matrix, and a printed ranking (strongest first).

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to correlate; must contain `target`.
    target : str
        Name of the column the other columns are compared against.

    Returns
    -------
    tuple
        (correlation matrix of `frame`, absolute correlations with `target`
        excluding `target` itself, the Axes returned by `sns.heatmap`).
    """
    corr_matrix = frame.corr()

    # Only the strength of the relationship matters here, not its sign; the
    # target's correlation with itself is dropped.
    target_correlation = corr_matrix[target].abs().drop([target])

    # Bar chart; ascending sort puts the strongest correlation at the top.
    plt.figure()
    target_correlation.sort_values(ascending=True).plot(kind='barh', color='tab:green')
    plt.title("Correlation with " + target)

    # Heat map of the full matrix. The matrix computed above is reused rather
    # than calling .corr() a second time on the same columns.
    plt.figure(figsize=(20,20))
    heatmap_axes = sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn")

    print("\n\nCorrelation with " + target + ":")
    print(target_correlation.sort_values(ascending=False))

    return corr_matrix, target_correlation, heatmap_axes


# Genetic Disorder: correlate against every column of the encoded frame.
corrmat, correlation, g = plot_target_correlation(df_encoded, 'Genetic Disorder')
top_corr_features = corrmat.index

# Disorder Subclass: same analysis with the other target column removed first.
corrmat, correlation, g = plot_target_correlation(
    df_encoded.drop('Genetic Disorder',axis=1), 'Disorder Subclass'
)
top_corr_features = corrmat.index