#

In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# Column names that are dropped from the feature frame before modelling.
# Note the leading space in ' Incubation period (days)': it is part of the name.
blacklist = [
    'NONE',
    ' Incubation period (days)',
]

In [3]:
# ---- Load ------------------------------------------------------------------
# header=1: column names are taken from the second line of the CSV file.
df = pd.read_csv("Covid-19_dataset.csv",header=1)

# ---- Encode categoricals ---------------------------------------------------
# Gender indicator: the 'MALE' dummy column (set where Gender == 'MALE').
df['is_Male'] = pd.get_dummies(df['Gender'])['MALE']

# One indicator column per distinct Risk_Factor / Infection_type value,
# appended to the right of the existing columns in that order.
# NOTE(review): the recorded output shows 'Asymtomatic' / 'Symtomatic' next to
# 'Asymptomatic' / 'Symptomatic' (and 'HT' / 'HT.1'), which looks like spelling
# variants in the raw data producing separate dummies - confirm and normalise
# upstream if so.
df = pd.concat(
    [df, pd.get_dummies(df['Risk_Factor']), pd.get_dummies(df['Infection_type'])],
    axis=1,
)

# Blood-group labels as found in the raw (not yet cleaned) data.
blood_types = pd.get_dummies(df['Blood_Group']).columns

# The raw categorical / administrative columns are no longer needed.
df = df.drop(
    columns=[
        'CONTROL_blood_group',
        'Risk_Factor',
        'Infection_type',
        'Gender',
        'Place_of_isolation',
        'Patient_In',
        'Patient_out',
    ]
)
columns = df.columns

# ---- Drop incomplete rows --------------------------------------------------
# A row is discarded when any column holds the blank placeholder " " or a NaN.
# Done with a single mask plus one dropna() instead of re-running dropna()
# once per column; the surviving rows are the same.
print("before",len(df))
df = df[~df.isin([" "]).any(axis=1)].dropna()

# ---- Split labels from features -------------------------------------------
label_Severeity = df['Severeity'] #Severeity vs Outcome
# Blood_Group is kept next to Outcome so labels can be filtered per blood group.
label_Outcome = df[["Outcome","Blood_Group"]]

df = df.drop(columns=['Severeity','Outcome'])

# Blood_Group is a string column: set it aside so the numeric coercion below
# does not turn it into NaN, then join it back on the row index.
blood = df[["Blood_Group"]]
df = df.drop(columns=['Blood_Group'])

# Column-wise numeric conversion (unparseable values become NaN). This replaces
# the element-wise DataFrame.applymap call, which is deprecated in recent pandas.
df = df.apply(pd.to_numeric, errors='coerce')

df = df.join(blood)
columns = df.columns


print("after",len(df))

df

before 5668
after 5641


Unnamed: 0,Age,Incubation period (days),Fever,Chills,Cough,Dyspnea,Anosmia_Ageusia,Loss_of_appetite,Asthenia,Headache,...,DMHT,HT,HT.1,NONE,RF,Asymptomatic,Asymtomatic,Symptomatic,Symtomatic,Blood_Group
0,61.0,14,1,1,1,1,1,1,1,1,...,0,0,0,1,0,0,0,1,0,A+
1,49.0,28,1,0,1,0,1,1,1,1,...,0,0,0,1,0,0,0,1,0,O+
2,88.0,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,1,0,A+
3,56.0,17,1,0,1,0,1,1,1,1,...,0,0,0,1,0,0,0,1,0,A+
4,41.0,1,0,0,1,0,1,1,1,1,...,0,0,0,1,0,0,0,1,0,B+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5663,65.0,14,1,0,1,1,1,1,1,1,...,0,0,0,1,0,0,0,1,0,B+
5664,32.0,7,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,B+
5665,28.0,4,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,1,0,B+
5666,40.0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,1,0,A+


In [4]:
# Display the dtype of every column of the cleaned feature frame.
df.dtypes

Age                          float64
 Incubation period (days)      int64
Fever                          int64
Chills                         int64
Cough                          int64
Dyspnea                        int64
Anosmia_Ageusia                int64
Loss_of_appetite               int64
Asthenia                       int64
Headache                       int64
Muscle_ache                    int64
Cyanosis                       int64
Rhinorrhea                     int64
Sore_throat                    int64
Diarrhea                       int64
Nausea_vomiting                int64
is_Male                        int64
AP                             int64
ARF                            int64
AT                             int64
CA                             int64
CKD                            int64
CLD                            int64
COPD                           int64
COPDDM                         int64
COPDHT                         int64
DM                             int64
D

In [5]:
# Feature groups used for the per-blood-group ranking.
# NOTE(review): "Tear" presumably means "Tier" - names kept as-is because other
# cells may reference them.
TearOneFeature = [
    ' Incubation period (days)',
    'Age',
    'Cyanosis',
    'AT',
    'COPD',
    'DM',
    'DMHT',
    'HT',
    'NONE',
]
TearTwoFeature = [
    'Sore_throat',
    'Diarrhea',
    'AP',
]
TearThreeFeature = [
    'CA',
    'Headache',
    'ARF',
    'CKD',
    'COPDDM',
    'Cough',
]

# All three groups in order, followed by the grouping column 'Blood_Group'.
coreFeatures = [*TearOneFeature, *TearTwoFeature, *TearThreeFeature, 'Blood_Group']

In [6]:
# Keep only the core feature columns (plus 'Blood_Group'), in coreFeatures order.
df = df.loc[:, coreFeatures]

In [7]:
# Remove the blacklisted columns from the feature frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=blacklist, errors='ignore')

# Strength of each single-feature logistic regression, measured by ROC AUC on a held-out test split


In [8]:

def rank(predictor,blood_type):
    """Score one predictor for one blood group by single-feature ROC AUC.

    Reads the notebook-level ``df`` (feature frame containing a
    ``Blood_Group`` column) and ``label_Outcome`` (``Outcome`` and
    ``Blood_Group`` columns). The rows belonging to ``blood_type`` are split
    75/25 with ``random_state=1``, a default ``LogisticRegression`` is fitted
    on the single column ``predictor``, and the held-out quarter is scored.

    Parameters
    ----------
    predictor : str
        Name of the column of ``df`` used as the only model input.
    blood_type : str
        Value of ``Blood_Group`` that selects the patient subset.

    Returns
    -------
    float
        Area under the ROC curve with ``'Recovered'`` as the positive class,
        or ``nan`` when the fitted model has no ``'Recovered'`` class.
    """
    positive_label = 'Recovered'

    # Feature matrix: the rows of this blood group, one predictor column.
    df_temp = df[df['Blood_Group'] == blood_type]
    X = df_temp.drop(columns=['Blood_Group'])[[predictor]]

    # Matching outcome labels, selected with the same blood-group filter.
    y = label_Outcome.loc[label_Outcome['Blood_Group'] == blood_type, "Outcome"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

    my_logreg = LogisticRegression()
    my_logreg.fit(X_train,y_train)

    # predict_proba columns follow my_logreg.classes_, so the positive-class
    # column is looked up by label instead of assuming it is column 1.
    classes = list(my_logreg.classes_)
    if positive_label not in classes:
        return float('nan')
    positive_probs = my_logreg.predict_proba(X_test)[:, classes.index(positive_label)]

    fpr, tpr, thresholds = metrics.roc_curve(y_test, positive_probs, pos_label=positive_label)
    return metrics.auc(fpr, tpr)





# Blood groups to analyse; each contributes a "<group> Feature" / "<group> AUC"
# column pair to BloodType_df, in this order.
blood_types = ["A+","A-","AB+","B+","B-","O+","O-"]

dataframes = []                 # one ranked frame per blood group
BloodType_df = pd.DataFrame()   # side-by-side ranking table

for blood_type in blood_types:
    print(blood_type,"-----------------------")
    feature_col = blood_type + " Feature"
    auc_col = blood_type + " AUC"

    # Single-feature AUC for every column except the grouping column itself.
    list_of_AUC = [
        {feature_col: feature, auc_col: rank(feature, blood_type)}
        for feature in df.columns
        if feature != "Blood_Group"
    ]

    # Best AUC first; ties are ordered by feature name, also descending.
    df_temp = (
        pd.DataFrame(list_of_AUC)
        .sort_values(by=[auc_col, feature_col], ascending=False)
        .reset_index()
    )
    dataframes.append(df_temp)

    # After reset_index the row position is the rank, so the columns of the
    # different blood groups line up row by row.
    BloodType_df[feature_col] = df_temp[feature_col]
    BloodType_df[auc_col] = df_temp[auc_col]



#https://towardsdatascience.com/feature-selection-with-boruta-in-python-676e3877e596



A+ -----------------------
A- -----------------------
AB+ -----------------------
B+ -----------------------
B- -----------------------
O+ -----------------------
O- -----------------------


In [9]:
# Top ten rows of the ranking table: each blood group's column pair is sorted
# independently by descending AUC, so row k holds the k-th best feature per group.
BloodType_df.head(10)

Unnamed: 0,A+ Feature,A+ AUC,A- Feature,A- AUC,AB+ Feature,AB+ AUC,B+ Feature,B+ AUC,B- Feature,B- AUC,O+ Feature,O+ AUC,O- Feature,O- AUC
0,Age,0.8076,Age,0.888889,Age,0.867944,Age,0.851122,DM,1.0,Age,0.891319,Cyanosis,1.0
1,HT,0.714286,Headache,0.75,Cyanosis,0.733871,DM,0.7,Diarrhea,0.848214,HT,0.625,Age,0.666667
2,DM,0.642857,AT,0.75,HT,0.625,HT,0.65,Age,0.580357,DM,0.583333,Diarrhea,0.611111
3,AT,0.571429,Sore_throat,0.5,DM,0.625,COPD,0.6,Cough,0.553571,COPD,0.583333,Cough,0.555556
4,Sore_throat,0.559889,HT,0.5,AT,0.625,Cyanosis,0.583791,Sore_throat,0.544643,DMHT,0.581944,HT,0.5
5,COPD,0.535714,DMHT,0.5,Sore_throat,0.556452,Cough,0.5601,HT,0.5,Cyanosis,0.568056,DMHT,0.5
6,Headache,0.526462,DM,0.5,COPDDM,0.5,AT,0.55,DMHT,0.5,Sore_throat,0.515278,DM,0.5
7,Cyanosis,0.512037,Cough,0.5,COPD,0.5,Headache,0.527431,COPDDM,0.5,COPDDM,0.5,COPDDM,0.5
8,COPDDM,0.5,COPDDM,0.5,CKD,0.5,Sore_throat,0.50611,COPD,0.5,CKD,0.5,COPD,0.5
9,CKD,0.5,COPD,0.5,CA,0.5,COPDDM,0.5,CKD,0.5,CA,0.5,CKD,0.5


In [10]:
# NOTE(review): identical to the previous cell (same call, same output) - candidate for removal.
BloodType_df.head(10)

Unnamed: 0,A+ Feature,A+ AUC,A- Feature,A- AUC,AB+ Feature,AB+ AUC,B+ Feature,B+ AUC,B- Feature,B- AUC,O+ Feature,O+ AUC,O- Feature,O- AUC
0,Age,0.8076,Age,0.888889,Age,0.867944,Age,0.851122,DM,1.0,Age,0.891319,Cyanosis,1.0
1,HT,0.714286,Headache,0.75,Cyanosis,0.733871,DM,0.7,Diarrhea,0.848214,HT,0.625,Age,0.666667
2,DM,0.642857,AT,0.75,HT,0.625,HT,0.65,Age,0.580357,DM,0.583333,Diarrhea,0.611111
3,AT,0.571429,Sore_throat,0.5,DM,0.625,COPD,0.6,Cough,0.553571,COPD,0.583333,Cough,0.555556
4,Sore_throat,0.559889,HT,0.5,AT,0.625,Cyanosis,0.583791,Sore_throat,0.544643,DMHT,0.581944,HT,0.5
5,COPD,0.535714,DMHT,0.5,Sore_throat,0.556452,Cough,0.5601,HT,0.5,Cyanosis,0.568056,DMHT,0.5
6,Headache,0.526462,DM,0.5,COPDDM,0.5,AT,0.55,DMHT,0.5,Sore_throat,0.515278,DM,0.5
7,Cyanosis,0.512037,Cough,0.5,COPD,0.5,Headache,0.527431,COPDDM,0.5,COPDDM,0.5,COPDDM,0.5
8,COPDDM,0.5,COPDDM,0.5,CKD,0.5,Sore_throat,0.50611,COPD,0.5,CKD,0.5,COPD,0.5
9,CKD,0.5,COPD,0.5,CA,0.5,COPDDM,0.5,CKD,0.5,CA,0.5,CKD,0.5


In [11]:
# Names of the "<group> Feature" columns (the AUC columns are left out).
feature_names = [col for col in BloodType_df.columns if 'Feature' in col]

# Five best-ranked features per blood group, names only.
df_topFive = BloodType_df[feature_names].head(5)
df_topFive

Unnamed: 0,A+ Feature,A- Feature,AB+ Feature,B+ Feature,B- Feature,O+ Feature,O- Feature
0,Age,Age,Age,Age,DM,Age,Cyanosis
1,HT,Headache,Cyanosis,DM,Diarrhea,HT,Age
2,DM,AT,HT,HT,Age,DM,Diarrhea
3,AT,Sore_throat,DM,COPD,Cough,COPD,Cough
4,Sore_throat,HT,AT,Cyanosis,Sore_throat,DMHT,HT


In [12]:
# Print each blood group's top-five feature list on its own line.
for column_name, top_features in df_topFive.items():
    print(column_name," : ",list(top_features),",")

A+ Feature  :  ['Age', 'HT', 'DM', 'AT', 'Sore_throat'] ,
A- Feature  :  ['Age', 'Headache', 'AT', 'Sore_throat', 'HT'] ,
AB+ Feature  :  ['Age', 'Cyanosis', 'HT', 'DM', 'AT'] ,
B+ Feature  :  ['Age', 'DM', 'HT', 'COPD', 'Cyanosis'] ,
B- Feature  :  ['DM', 'Diarrhea', 'Age', 'Cough', 'Sore_throat'] ,
O+ Feature  :  ['Age', 'HT', 'DM', 'COPD', 'DMHT'] ,
O- Feature  :  ['Cyanosis', 'Age', 'Diarrhea', 'Cough', 'HT'] ,
