# Basic Boruta Check

In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 


import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# Load the raw export; header=1 uses the second line of the file as the column header row.
df = pd.read_csv("Covid-19_dataset.csv", header=1)

# NOTE(review): the raw category values contain stray whitespace and spelling
# variants (the encoded frame ends up with both 'HT' and 'HT ', and with
# 'Asymptomatic'/'Asymtomatic', 'Symptomatic'/'Symtomatic'), so the one-hot
# step below produces near-duplicate indicator columns. Consider normalising
# those values in the source data before encoding.

# One-hot encode gender so that male = 1 and female = 0.
# dtype=int keeps the indicators integer-valued on pandas >= 2.0, where
# get_dummies returns bool columns by default.
df['is_Male'] = pd.get_dummies(df['Gender'], dtype=int)['MALE']

# Encode each categorical column once, then append all indicators in a single
# concat (same column order as appending them one after another).
# CONTROL_blood_group is deliberately not encoded; it is dropped below.
blood_group_dummies = pd.get_dummies(df['Blood_Group'], dtype=int)
risk_factor_dummies = pd.get_dummies(df['Risk_Factor'], dtype=int)
infection_type_dummies = pd.get_dummies(df['Infection_type'], dtype=int)
df = pd.concat(
    [df, blood_group_dummies, risk_factor_dummies, infection_type_dummies],
    axis=1,
)

# Names of the blood-group indicator columns.
blood_types = blood_group_dummies.columns

# Remove the raw categorical columns (now encoded) and the columns not used as features.
df = df.drop(
    [
        'Blood_Group',
        'CONTROL_blood_group',
        'Risk_Factor',
        'Infection_type',
        'Gender',
        'Place_of_isolation',
        'Patient_In',
        'Patient_out',
    ],
    axis=1,
)

# Get rid of blank values: drop every row that holds a single-space placeholder
# in any column, then drop rows with missing values. One vectorised mask and a
# single dropna() keep exactly the rows the per-column loop kept.
print("before", len(df))
no_blank_cells = (df != " ").all(axis=1)
df = df[no_blank_cells].dropna()

# Split the targets off after filtering so labels and features share the same index.
label_Severeity = df['Severeity']  # Severeity vs Outcome
label_Outcome = df['Outcome']
df = df.drop(['Severeity', 'Outcome'], axis=1)

# Coerce every feature column to a numeric dtype; values that cannot be parsed become NaN.
# Column-wise to_numeric replaces DataFrame.applymap, which is deprecated since pandas 2.1.
df = df.apply(pd.to_numeric, errors='coerce')
columns = df.columns


print("after", len(df))

before 5668
after 5641


In [3]:
# Show the dtype of each remaining feature column.
df.dtypes

Age                          float64
 Incubation period (days)      int64
Fever                          int64
Chills                         int64
Cough                          int64
Dyspnea                        int64
Anosmia_Ageusia                int64
Loss_of_appetite               int64
Asthenia                       int64
Headache                       int64
Muscle_ache                    int64
Cyanosis                       int64
Rhinorrhea                     int64
Sore_throat                    int64
Diarrhea                       int64
Nausea_vomiting                int64
is_Male                        int64
A+                             int64
A-                             int64
AB+                            int64
AB-                            int64
B+                             int64
B-                             int64
O+                             int64
O-                             int64
AP                             int64
ARF                            int64
A

In [4]:
#df = pd.DataFrame(df["Age"])
# y = label_Severeity

# print(len(X),len(y))
#

In [5]:
# Display the feature frame.
df

Unnamed: 0,Age,Incubation period (days),Fever,Chills,Cough,Dyspnea,Anosmia_Ageusia,Loss_of_appetite,Asthenia,Headache,...,DMCKD,DMHT,HT,HT.1,NONE,RF,Asymptomatic,Asymtomatic,Symptomatic,Symtomatic
0,61.0,14,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
1,49.0,28,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
2,88.0,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,56.0,17,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
4,41.0,1,0,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5663,65.0,14,1,0,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
5664,32.0,7,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
5665,28.0,4,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
5666,40.0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0


# Strength of each predictor, measured by the ROC AUC of a single-feature logistic regression


In [6]:

from sklearn.linear_model import LogisticRegression


def rank(predictor):
    """Score one predictor by the ROC AUC of a single-feature logistic regression.

    Fits a ``LogisticRegression`` on the column ``predictor`` of the
    notebook-level ``df`` against ``label_Outcome`` using a fixed 75/25
    train/test split (``random_state=1``), then computes the ROC AUC on the
    test part with ``'Recovered'`` as the positive class.

    Parameters
    ----------
    predictor : str
        Name of the column in ``df`` to evaluate.

    Returns
    -------
    float
        Area under the ROC curve on the held-out rows.

    Raises
    ------
    ValueError
        If ``'Recovered'`` is not among the classes seen during fitting.
    """
    positive_label = 'Recovered'

    X = df[[predictor]]
    y = label_Outcome
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

    my_logreg = LogisticRegression()
    my_logreg.fit(X_train, y_train)

    # Probabilities for every patient for every label; columns follow my_logreg.classes_.
    y_predict_probs = my_logreg.predict_proba(X_test)

    # Look the positive class up by label instead of assuming it sits in column 1,
    # so the scores passed to roc_curve always belong to pos_label.
    positive_column = list(my_logreg.classes_).index(positive_label)

    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, y_predict_probs[:, positive_column], pos_label=positive_label
    )
    return metrics.auc(fpr, tpr)



# Score every feature column on its own and collect one record per feature.
list_of_AUC = []
for feature in df.columns:
    # To score only a hand-picked subset, iterate over a list such as:
    # ["Age"," Incubation period (days)","Cyanosis","AP","ARF","AT","COPD","DM","DMHT","HT","NONE"]
    record = {"Feature": feature, "Rank": rank(feature)}
    list_of_AUC.append(record)
    print(record["Feature"], "   ", record["Rank"])

# One row per feature: its name and its single-feature AUC.
df_temp = pd.DataFrame(list_of_AUC)
#https://towardsdatascience.com/feature-selection-with-boruta-in-python-676e3877e596



Age     0.8039894887720975
 Incubation period (days)     0.9988294314381271
Fever      0.5399187768752987
Chills     0.5650581302755215
Cough     0.5285873546743112
Dyspnea     0.5168498168498169
Anosmia_Ageusia     0.4774008600095557
Loss_of_appetite     0.4975553432075171
Asthenia     0.5133062589584328
Headache     0.49632903328555505
Muscle_ache     0.4807214524605829
Cyanosis     0.6863752189839146
Rhinorrhea     0.5067287784679089
Sore_throat     0.5009635292243988
Diarrhea     0.49550883898709985
Nausea_vomiting     0.5098264054785794
is_Male     0.5159181398311832
A+     0.4412884217232043
A-     0.5282130912565695
AB+     0.5096591813983118
AB-     0.5117216117216117
B+     0.5320831342570473
B-     0.4904682274247492
O+     0.461498646281255
O-     0.4833890746934225
AP     0.5108695652173914
ARF     0.5217391304347826
AT     0.5543478260869565
CA     0.5217391304347826
CKD     0.5108695652173914
CLD     0.5
COPD     0.5434782608695652
COPDDM     0.5217391304347826
COPDHT    

In [7]:
# Show up to 25 features ordered by their single-feature AUC ("Rank"), highest first.
df_temp.sort_values(by=['Rank'], ascending=False).head(25)

Unnamed: 0,Feature,Rank
1,Incubation period (days),0.998829
41,NONE,0.943454
0,Age,0.803989
11,Cyanosis,0.686375
39,HT,0.651808
34,DM,0.575721
3,Chills,0.565058
27,AT,0.554348
31,COPD,0.543478
2,Fever,0.539919


In [8]:
# List the feature column names.
df.columns

Index(['Age', ' Incubation period (days)', 'Fever ', 'Chills', 'Cough',
       'Dyspnea', 'Anosmia_Ageusia', 'Loss_of_appetite', 'Asthenia',
       'Headache', 'Muscle_ache', 'Cyanosis', 'Rhinorrhea', 'Sore_throat',
       'Diarrhea', 'Nausea_vomiting', 'is_Male', 'A+', 'A-', 'AB+', 'AB-',
       'B+', 'B-', 'O+', 'O-', 'AP', 'ARF', 'AT', 'CA', 'CKD', 'CLD', 'COPD',
       'COPDDM', 'COPDHT', 'DM', 'DMARF', 'DMAT', 'DMCKD', 'DMHT', 'HT', 'HT ',
       'NONE', 'RF', 'Asymptomatic', 'Asymtomatic', 'Symptomatic',
       'Symtomatic'],
      dtype='object')

In [9]:
# create a sample DataFrame
# for feature in df.columns:
    #["Age"," Incubation period (days)","Cyanosis","AP","ARF","AT","COPD","DM","DMHT","HT","NONE"]:
#     list_of_AUC.append({"Feature":feature, "Rank" : rank(feature)})
#     print(label_Outcome.corr(df[feature]))


In [10]:
# Display the 'Outcome' label series that was split off from the cleaned frame.
label_Outcome

0       Recovered
1       Recovered
2            Died
3       Recovered
4       Recovered
          ...    
5663    Recovered
5664         Died
5665         Died
5666         Died
5667         Died
Name: Outcome, Length: 5641, dtype: object

NameError: name 'my_logreg' is not defined