In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 


import copy
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
unneeded_data = ['Place_of_isolation','Patient_In','Patient_out']
catigorical_data=['Blood_Group','CONTROL_blood_group','Risk_Factor']
labels = ['Severeity','Outcome']
droped_col = unneeded_data+catigorical_data+labels

In [3]:
df = pd.read_csv("Covid-19_dataset.csv",header=1)


df['is_Male'] = pd.get_dummies(df['Gender'])['MALE']#one hot encoding gender so male = #1 and female = 0

blood_types = pd.get_dummies(df['Blood_Group']).columns
blood_group_columns = pd.get_dummies(df[['Blood_Group','CONTROL_blood_group']]).columns


df = pd.concat([df, pd.get_dummies(df[['Blood_Group','CONTROL_blood_group','Risk_Factor','Infection_type']])], axis=1)



df = df.drop(['Blood_Group','CONTROL_blood_group','Risk_Factor','Infection_type','Gender','Place_of_isolation','Patient_In','Patient_out'],axis=1)

columns = df.columns



# getting rid of blank values
print("before",df.shape)
for name in columns:
    df = df[df[name] != " "]
    df = df.dropna()
    
label_Severeity = df['Severeity'] #Severeity vs Outcome
label_Outcome = df['Outcome']
df = df.drop(['Severeity','Outcome'],axis=1)


df = df.applymap(lambda x: pd.to_numeric(x, errors='coerce'))#strings to numeric
all_columns = df.columns
# df = df.reset_index()

print("after",df.shape)


df_no_blood_data = copy.deepcopy(df.drop(blood_group_columns,axis=1))
df_no_blood_data


before (5668, 57)
after (5641, 55)


Unnamed: 0,Age,Incubation period (days),Fever,Chills,Cough,Dyspnea,Anosmia_Ageusia,Loss_of_appetite,Asthenia,Headache,...,Risk_Factor_DMCKD,Risk_Factor_DMHT,Risk_Factor_HT,Risk_Factor_HT.1,Risk_Factor_NONE,Risk_Factor_RF,Infection_type_Asymptomatic,Infection_type_Asymtomatic,Infection_type_Symptomatic,Infection_type_Symtomatic
0,61.0,14,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
1,49.0,28,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
2,88.0,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,56.0,17,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
4,41.0,1,0,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5663,65.0,14,1,0,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
5664,32.0,7,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
5665,28.0,4,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
5666,40.0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# X = df
# y = label_Severeity

def borutaSupportFeatures(X,y,verbose_level=1,print_fail=True,print_pass=True,tentive=False):
    np.random.seed(seed=1)
    # let's initialize a RF model 
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5,random_state=1)

    # let's initialize Boruta
    feat_selector = BorutaPy(
            verbose=verbose_level,
            estimator=model,
            n_estimators='auto',
            alpha=0.1,
            max_iter=250  # number of iterations to perform
        )

    # train Boruta



    
    
    feat_selector.fit(np.array(X), np.array(y))
    support = [] #saves only conformed  features
    
    if(print_pass):
        for i in range(len(feat_selector.support_)):
            if feat_selector.support_[i]:
                print("Passes the test: ", X.columns[i],
                      " - Ranking: ", feat_selector.ranking_[i])
                support.append(X.columns[i])

    return support
    


In [5]:
use_features = df.columns #all features 

In [6]:
possiable_features = borutaSupportFeatures(df[use_features],label_Outcome,verbose_level=0,print_fail=False,tentive=True)

Passes the test:  Age  - Ranking:  1
Passes the test:   Incubation period (days)  - Ranking:  1
Passes the test:  Cyanosis  - Ranking:  1
Passes the test:  Risk_Factor_AT  - Ranking:  1
Passes the test:  Risk_Factor_COPD  - Ranking:  1
Passes the test:  Risk_Factor_DM  - Ranking:  1
Passes the test:  Risk_Factor_DMHT  - Ranking:  1
Passes the test:  Risk_Factor_HT  - Ranking:  1
Passes the test:  Risk_Factor_NONE  - Ranking:  1


In [7]:
possiable_features

['Age',
 ' Incubation period (days)',
 'Cyanosis',
 'Risk_Factor_AT',
 'Risk_Factor_COPD',
 'Risk_Factor_DM',
 'Risk_Factor_DMHT',
 'Risk_Factor_HT',
 'Risk_Factor_NONE']

In [8]:
possiable_features = borutaSupportFeatures(df[use_features],label_Outcome,verbose_level=0,print_fail=False,tentive=True)

Passes the test:  Age  - Ranking:  1
Passes the test:   Incubation period (days)  - Ranking:  1
Passes the test:  Cyanosis  - Ranking:  1
Passes the test:  Risk_Factor_AT  - Ranking:  1
Passes the test:  Risk_Factor_COPD  - Ranking:  1
Passes the test:  Risk_Factor_DM  - Ranking:  1
Passes the test:  Risk_Factor_DMHT  - Ranking:  1
Passes the test:  Risk_Factor_HT  - Ranking:  1
Passes the test:  Risk_Factor_NONE  - Ranking:  1


In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

# load the iris dataset
X = df[possiable_features]
y = label_Outcome



In [10]:
# create a Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0)

# train the classifier
clf.fit(X, y)

# get feature importance scores
importance_scores = clf.feature_importances_

# get feature ranks
feature_ranks = sorted(range(len(importance_scores)), key=lambda k: importance_scores[k], reverse=True)

# print feature ranks
print("Feature importance ranks:")
for rank, idx in enumerate(feature_ranks):
    print(f"{rank+1}. Feature {df.columns[idx]}: {importance_scores[idx]}")


Feature importance ranks:
1. Feature  Incubation period (days): 0.5281007026666698
2. Feature Asthenia: 0.296215944080971
3. Feature Loss_of_appetite: 0.06453112706829803
4. Feature Age: 0.04852907578178657
5. Feature Dyspnea: 0.03153150973276426
6. Feature Cough: 0.01325957561262236
7. Feature Chills: 0.0106809551548222
8. Feature Anosmia_Ageusia: 0.003762608797615857
9. Feature Fever : 0.00338850110445009


In [11]:
df[possiable_features]

Unnamed: 0,Age,Incubation period (days),Cyanosis,Risk_Factor_AT,Risk_Factor_COPD,Risk_Factor_DM,Risk_Factor_DMHT,Risk_Factor_HT,Risk_Factor_NONE
0,61.0,14,0,0,0,0,0,0,1
1,49.0,28,0,0,0,0,0,0,1
2,88.0,1,1,0,1,0,0,0,0
3,56.0,17,0,0,0,0,0,0,1
4,41.0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
5663,65.0,14,0,0,0,0,0,0,1
5664,32.0,7,1,0,0,0,0,0,0
5665,28.0,4,1,0,0,0,0,0,0
5666,40.0,1,1,0,0,0,0,0,0


In [12]:
possiable_features

['Age',
 ' Incubation period (days)',
 'Cyanosis',
 'Risk_Factor_AT',
 'Risk_Factor_COPD',
 'Risk_Factor_DM',
 'Risk_Factor_DMHT',
 'Risk_Factor_HT',
 'Risk_Factor_NONE']