# Basic Baruta Check

In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve 
from sklearn.feature_selection import SelectFromModel

import copy
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [2]:
# !pip install Boruta


In [3]:
unneeded_data = ['Place_of_isolation','Patient_In','Patient_out']
catigorical_data=['Blood_Group','CONTROL_blood_group','Risk_Factor']
labels = ['Severeity','Outcome']
droped_col = unneeded_data+catigorical_data+labels

In [4]:
df = pd.read_csv("Covid-19_dataset.csv",header=1)


df['is_Male'] = pd.get_dummies(df['Gender'])['MALE']#one hot encoding gender so male = #1 and female = 0

blood_types = pd.get_dummies(df['Blood_Group']).columns
blood_group_columns = pd.get_dummies(df[['Blood_Group','CONTROL_blood_group']]).columns


df = pd.concat([df, pd.get_dummies(df[['Blood_Group','CONTROL_blood_group','Risk_Factor','Infection_type']])], axis=1)



df = df.drop(['Blood_Group','CONTROL_blood_group','Risk_Factor','Infection_type','Gender','Place_of_isolation','Patient_In','Patient_out'],axis=1)

columns = df.columns



# getting rid of blank values
print("before",df.shape)
for name in columns:
    df = df[df[name] != " "]
    df = df.dropna()
    
label_Severeity = df['Severeity'] #Severeity vs Outcome
label_Outcome = df['Outcome']
df = df.drop(['Severeity','Outcome'],axis=1)


df = df.applymap(lambda x: pd.to_numeric(x, errors='coerce'))#strings to numeric
all_columns = df.columns
# df = df.reset_index()

print("after",df.shape)


df_no_blood_data = copy.deepcopy(df.drop(blood_group_columns,axis=1))
df_no_blood_data


before (5668, 57)
after (5641, 55)


Unnamed: 0,Age,Incubation period (days),Fever,Chills,Cough,Dyspnea,Anosmia_Ageusia,Loss_of_appetite,Asthenia,Headache,...,Risk_Factor_DMCKD,Risk_Factor_DMHT,Risk_Factor_HT,Risk_Factor_HT.1,Risk_Factor_NONE,Risk_Factor_RF,Infection_type_Asymptomatic,Infection_type_Asymtomatic,Infection_type_Symptomatic,Infection_type_Symtomatic
0,61.0,14,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
1,49.0,28,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
2,88.0,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,56.0,17,1,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
4,41.0,1,0,0,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5663,65.0,14,1,0,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
5664,32.0,7,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0
5665,28.0,4,1,1,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
5666,40.0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,0


In [5]:
# X = df
# y = label_Severeity

def borutaSupportFeatures(X,y,verbose_level=1,print_fail=True,print_pass=True,tentive=False):
    np.random.seed(seed=1)
    # let's initialize a RF model 
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5,random_state=1)

    # let's initialize Boruta
    feat_selector = BorutaPy(
            verbose=verbose_level,
            estimator=model,
            n_estimators='auto',
            max_iter=250  # number of iterations to perform
        )

    # train Boruta


    feat_selector.fit(np.array(X), np.array(y))
    support = [] #saves only conformed  features

    if(print_fail):
        # print support and ranking for each feature
        print("\n------Support and Ranking for each feature------")
        for i in range(len(feat_selector.support_)):
            if not feat_selector.support_[i]:
                print("Doesn't pass the test: ", X.columns[i],
                      " - Ranking: ", feat_selector.ranking_[i])
            
            


    if(print_pass):
        for i in range(len(feat_selector.support_)):
            if feat_selector.support_[i]:
                print("Passes the test: ", X.columns[i],
                      " - Ranking: ", feat_selector.ranking_[i])
                support.append(X.columns[i])
                
    if(tentive):
        for i in range(len(feat_selector.support_weak_)):
            if feat_selector.support_weak_[i]:
                print("May pass the test: ", X.columns[i],
                      " - Ranking: ", feat_selector.ranking_[i])
                support.append(X.columns[i])
        
            
    return support
    


# All Features : Outcome

In [6]:
# all_confirmed_first_itteration_outcome_features = borutaSupportFeatures(df,label_Outcome,verbose_level=0,print_fail=False)

# Second set without known values

In [7]:
# second_itteration_features_Outcome = list(set(df.columns) - set(all_confirmed_first_itteration_outcome_features)) 
# all_confirmed_second_itteration_outcome_features=borutaSupportFeatures(df[second_itteration_features_Outcome],label_Outcome,verbose_level=0,print_fail=False)

In [8]:
# all_confirmed_second_itteration_outcome_features

# Third set without known values

In [9]:
# third_itteration_features_Outcome = list(set(df.columns) 
#                                           - set(all_confirmed_first_itteration_outcome_features)
#                                          -set(all_confirmed_second_itteration_outcome_features)) 
# all_confirmed_third_itteration_outcome_features=borutaSupportFeatures(df[third_itteration_features_Outcome],label_Outcome,verbose_level=0,print_fail=False)

In [10]:
# all_confirmed_third_itteration_outcome_features

# Forth set without known values

In [11]:
# forth_itteration_features_Outcome = list(set(df.columns) 
#                                           - set(all_confirmed_first_itteration_outcome_features)
#                                          - set(all_confirmed_second_itteration_outcome_features)
#                                         -set(all_confirmed_third_itteration_outcome_features)) 
# all_confirmed_forth_itteration_outcome_features=borutaSupportFeatures(df[forth_itteration_features_Outcome],label_Outcome,verbose_level=0,print_fail=False)

In [12]:
# all_confirmed_forth_itteration_outcome_features

# Fifth set without known values

In [13]:
# fifth_itteration_features_Outcome = list(set(df.columns) 
#                                           - set(all_confirmed_first_itteration_outcome_features)
#                                          - set(all_confirmed_second_itteration_outcome_features)
#                                         -set(all_confirmed_third_itteration_outcome_features)
#                                         -set(all_confirmed_forth_itteration_outcome_features)) 
# all_confirmed_fifth_itteration_outcome_features=borutaSupportFeatures(df[fifth_itteration_features_Outcome],label_Outcome,verbose_level=0,print_fail=False)

In [14]:
use_features = df.columns
iteration = 1
saved={}
useful = []
tentive = []
while(True):
    print("iteration: ",iteration)
    if(iteration > 4):
        v=1
    else:
        v=0
    
    positive_features = borutaSupportFeatures(df[use_features],label_Outcome,verbose_level=1,print_fail=False)
    if(len(positive_features)==0):
        print("No new features")
        temp = borutaSupportFeatures(df[use_features+tentive],label_Outcome,verbose_level=1,print_fail=False,tentive=True)
        tentive = tentive + temp
        if(len(temp) == 0):
            break
        else:
            print("temp: ",temp)
    else:
        useful = useful+ positive_features
        saved[iteration] = positive_features
        print(positive_features)
        use_features = list(set(use_features) -set(positive_features))
    iteration = iteration+1
    

iteration:  1
Iteration: 1 / 250
Iteration: 2 / 250
Iteration: 3 / 250
Iteration: 4 / 250
Iteration: 5 / 250
Iteration: 6 / 250
Iteration: 7 / 250
Iteration: 8 / 250
Iteration: 9 / 250
Iteration: 10 / 250
Iteration: 11 / 250
Iteration: 12 / 250


BorutaPy finished running.

Iteration: 	13 / 250
Confirmed: 	9
Tentative: 	0
Rejected: 	46
Passes the test:  Age  - Ranking:  1
Passes the test:   Incubation period (days)  - Ranking:  1
Passes the test:  Cyanosis  - Ranking:  1
Passes the test:  Risk_Factor_AT  - Ranking:  1
Passes the test:  Risk_Factor_COPD  - Ranking:  1
Passes the test:  Risk_Factor_DM  - Ranking:  1
Passes the test:  Risk_Factor_DMHT  - Ranking:  1
Passes the test:  Risk_Factor_HT  - Ranking:  1
Passes the test:  Risk_Factor_NONE  - Ranking:  1
['Age', ' Incubation period (days)', 'Cyanosis', 'Risk_Factor_AT', 'Risk_Factor_COPD', 'Risk_Factor_DM', 'Risk_Factor_DMHT', 'Risk_Factor_HT', 'Risk_Factor_NONE']
iteration:  2
Iteration: 1 / 250
Iteration: 2 / 250
Iteration: 3 / 

Iteration: 90 / 250
Iteration: 91 / 250
Iteration: 92 / 250
Iteration: 93 / 250
Iteration: 94 / 250
Iteration: 95 / 250
Iteration: 96 / 250
Iteration: 97 / 250
Iteration: 98 / 250
Iteration: 99 / 250
Iteration: 100 / 250
Iteration: 101 / 250
Iteration: 102 / 250
Iteration: 103 / 250
Iteration: 104 / 250
Iteration: 105 / 250
Iteration: 106 / 250
Iteration: 107 / 250
Iteration: 108 / 250
Iteration: 109 / 250
Iteration: 110 / 250
Iteration: 111 / 250
Iteration: 112 / 250
Iteration: 113 / 250
Iteration: 114 / 250
Iteration: 115 / 250
Iteration: 116 / 250
Iteration: 117 / 250
Iteration: 118 / 250
Iteration: 119 / 250
Iteration: 120 / 250
Iteration: 121 / 250
Iteration: 122 / 250
Iteration: 123 / 250
Iteration: 124 / 250
Iteration: 125 / 250
Iteration: 126 / 250
Iteration: 127 / 250
Iteration: 128 / 250
Iteration: 129 / 250
Iteration: 130 / 250
Iteration: 131 / 250
Iteration: 132 / 250
Iteration: 133 / 250
Iteration: 134 / 250
Iteration: 135 / 250
Iteration: 136 / 250
Iteration: 137 / 250
It

Iteration: 30 / 250
Iteration: 31 / 250
Iteration: 32 / 250
Iteration: 33 / 250
Iteration: 34 / 250
Iteration: 35 / 250
Iteration: 36 / 250
Iteration: 37 / 250
Iteration: 38 / 250
Iteration: 39 / 250
Iteration: 40 / 250
Iteration: 41 / 250
Iteration: 42 / 250
Iteration: 43 / 250
Iteration: 44 / 250
Iteration: 45 / 250
Iteration: 46 / 250
Iteration: 47 / 250
Iteration: 48 / 250
Iteration: 49 / 250
Iteration: 50 / 250
Iteration: 51 / 250
Iteration: 52 / 250
Iteration: 53 / 250
Iteration: 54 / 250
Iteration: 55 / 250
Iteration: 56 / 250
Iteration: 57 / 250
Iteration: 58 / 250
Iteration: 59 / 250
Iteration: 60 / 250
Iteration: 61 / 250
Iteration: 62 / 250
Iteration: 63 / 250
Iteration: 64 / 250
Iteration: 65 / 250
Iteration: 66 / 250
Iteration: 67 / 250
Iteration: 68 / 250
Iteration: 69 / 250
Iteration: 70 / 250
Iteration: 71 / 250
Iteration: 72 / 250
Iteration: 73 / 250
Iteration: 74 / 250
Iteration: 75 / 250
Iteration: 76 / 250
Iteration: 77 / 250
Iteration: 78 / 250
Iteration: 79 / 250


In [15]:
use_features

['Risk_Factor_RF',
 'Blood_Group_O+',
 'CONTROL_blood_group_B-',
 'Risk_Factor_HT ',
 'Blood_Group_A+',
 'Risk_Factor_CLD',
 'CONTROL_blood_group_O-',
 'Risk_Factor_COPDHT',
 'Headache',
 'Risk_Factor_DMCKD',
 'Infection_type_Asymtomatic',
 'Fever ',
 'Dyspnea',
 'Loss_of_appetite',
 'Blood_Group_B+',
 'Rhinorrhea',
 'Risk_Factor_DMARF',
 'Infection_type_Symptomatic',
 'Infection_type_Asymptomatic',
 'Anosmia_Ageusia',
 'Nausea_vomiting',
 'Blood_Group_B-',
 'CONTROL_blood_group_AB-',
 'Cough',
 'Muscle_ache',
 'Asthenia',
 'CONTROL_blood_group_A-',
 'Blood_Group_A-',
 'Risk_Factor_DMAT',
 'Blood_Group_O-',
 'CONTROL_blood_group_A+',
 'Infection_type_Symtomatic',
 'Blood_Group_AB-',
 'is_Male']

In [16]:
len(useful)


21

In [17]:
useful

['Age',
 ' Incubation period (days)',
 'Cyanosis',
 'Risk_Factor_AT',
 'Risk_Factor_COPD',
 'Risk_Factor_DM',
 'Risk_Factor_DMHT',
 'Risk_Factor_HT',
 'Risk_Factor_NONE',
 'CONTROL_blood_group_B+',
 'Risk_Factor_AP',
 'Diarrhea',
 'Sore_throat',
 'Risk_Factor_CA',
 'CONTROL_blood_group_AB+',
 'Risk_Factor_COPDDM',
 'Risk_Factor_ARF',
 'CONTROL_blood_group_O+',
 'Risk_Factor_CKD',
 'Blood_Group_AB+',
 'Chills']

Same outcome over multiple itterations

In [18]:
# make a comparison of just the different blood groups
# Single Var pred : Only Blood Type and no other features-
# Pick the 5 and 10 best features and 

In [19]:
STOP

NameError: name 'STOP' is not defined

In [None]:
ascxcv

# Comparison RandomForestClassifier Severeity

## With Blood Data

In [None]:
X = df
y = label_Severeity
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC = 0

all_auc = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC = metrics.auc(fpr, tpr)
    
    avg_AUC = avg_AUC + AUC

    
    all_auc.append([classes[class_index],fpr,tpr,AUC])

for label in all_auc:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

In [None]:
X = df_no_blood_data
y = label_Severeity
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC_no_blood = 0

all_auc_no_blood = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC_no_blood = metrics.auc(fpr, tpr)
    
    avg_AUC_no_blood = avg_AUC_no_blood + AUC_no_blood

    
    all_auc_no_blood.append([classes[class_index],fpr,tpr,AUC_no_blood])

for label in all_auc_no_blood:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

## Without Blood Data

In [None]:
print("No Blood - Blood: \n\n")

for nb, b in zip(all_auc_no_blood, all_auc):
    print("Diff of AUC for ",nb[0],b[0]," ",nb[3]-b[3])

print("Diff of avg AUC",avg_AUC_no_blood - avg_AUC)

# Comparison RandomForestClassifier Outcome

## With Blood Data

In [None]:
X = df
y = label_Outcome
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC = 0

all_auc = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC = metrics.auc(fpr, tpr)
    
    avg_AUC = avg_AUC + AUC

    
    all_auc.append([classes[class_index],fpr,tpr,AUC])

for label in all_auc:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

## Without Blood Data

In [None]:
X = df_no_blood_data
y = label_Outcome
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC_no_blood = 0

all_auc_no_blood = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC_no_blood = metrics.auc(fpr, tpr)
    
    avg_AUC_no_blood = avg_AUC_no_blood + AUC_no_blood

    
    all_auc_no_blood.append([classes[class_index],fpr,tpr,AUC_no_blood])

for label in all_auc_no_blood:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

In [None]:
print("No Blood - Blood: \n\n")

for nb, b in zip(all_auc_no_blood, all_auc):
    print("Diff of AUC for ",nb[0],b[0]," ",nb[3]-b[3])

print("Diff of avg AUC",avg_AUC_no_blood - avg_AUC)

# Comparison Logistic Regression Outcome

In [None]:
from sklearn.linear_model import LogisticRegression

## Without Blood Data

In [None]:
X = df_no_blood_data
y = label_Outcome
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = LogisticRegression(random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC_no_blood = 0

all_auc_no_blood = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC_no_blood = metrics.auc(fpr, tpr)
    
    avg_AUC_no_blood = avg_AUC_no_blood + AUC_no_blood

    
    all_auc_no_blood.append([classes[class_index],fpr,tpr,AUC_no_blood])

for label in all_auc_no_blood:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

## With Blood Data

In [None]:
X = df
y = label_Outcome
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = LogisticRegression(random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC = 0

all_auc = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC = metrics.auc(fpr, tpr)
    
    avg_AUC = avg_AUC + AUC

    
    all_auc.append([classes[class_index],fpr,tpr,AUC])

for label in all_auc:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

In [None]:
print("No Blood - Blood: \n\n")

for nb, b in zip(all_auc_no_blood, all_auc):
    print("Diff of AUC for ",nb[0],b[0]," ",nb[3]-b[3])

print("Diff of avg AUC",avg_AUC_no_blood - avg_AUC)

# Comparison Logistic Regression Severeity

## Without Blood Data

In [None]:
X = df_no_blood_data
y = label_Severeity
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = LogisticRegression(random_state=1,max_iter=500,C=0.5)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC_no_blood = 0

all_auc_no_blood = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC_no_blood = metrics.auc(fpr, tpr)
    
    avg_AUC_no_blood = avg_AUC_no_blood + AUC_no_blood

    
    all_auc_no_blood.append([classes[class_index],fpr,tpr,AUC_no_blood])

for label in all_auc_no_blood:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

## With Blood Data

In [None]:
X = df
y = label_Severeity
np.random.seed(0)
X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
model = LogisticRegression(random_state=1,max_iter=500,C=0.1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_

avg_AUC = 0

all_auc = []


for class_index in range(len(classes)):
    labelProb = pred_prob[:,class_index]
    fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

    AUC = metrics.auc(fpr, tpr)
    
    avg_AUC = avg_AUC + AUC

    
    all_auc.append([classes[class_index],fpr,tpr,AUC])

for label in all_auc:
    label_name = label[0]
    fpr = label[1]
    tpr = label[2]
    AUC = label[3]

    plt.plot(fpr, tpr, lw=2, label= str.title(label_name)+' Curve (area = %0.5f)' % AUC)
    
    plt.xlim([-0.005, 1.005])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('Receiver operating characteristic')

In [None]:
print("No Blood - Blood: \n\n")

for nb, b in zip(all_auc_no_blood, all_auc):
    print("Diff of AUC for ",nb[0],b[0]," ",nb[3]-b[3])

print("Diff of avg AUC",avg_AUC_no_blood - avg_AUC)

In [None]:
blood_types

# Per BloodType

In [None]:
X = df
y = label_Outcome
np.random.seed(0)
model = LogisticRegression(random_state=1)
model.fit(X_Train,y_Train)
predict = model.predict(X_Test)
# accuracy_score(y_Test,predict)
pred_prob = model.predict_proba(X_Test)
classes=model.classes_



for bt in blood_types:
    
    avg_AUC_no_blood = 0

    all_auc_no_blood = []

    X = df[df[blood_types[0]]==1]
    y = label_Outcome[df[blood_types[0]]==1]
    X_Train,X_Test,y_Train,y_Test = train_test_split(X,y,test_size=0.9,random_state=1)
    
    for class_index in range(len(classes)):
        labelProb = pred_prob[:,class_index]
        fpr, tpr, thresholds = metrics.roc_curve(y_Test, labelProb, pos_label=classes[class_index])

        AUC_no_blood = metrics.auc(fpr, tpr)

        avg_AUC_no_blood = avg_AUC_no_blood + AUC_no_blood


        all_auc_no_blood.append([classes[class_index],fpr,tpr,AUC_no_blood])

    for label in all_auc_no_blood:
        label_name = label[0]
        fpr = label[1]
        tpr = label[2]
        AUC = label[3]

        plt.plot(fpr, tpr, lw=2, label= str.title(,label_name)+' Curve (area = %0.5f)' % AUC)

        plt.xlim([-0.005, 1.005])
        plt.ylim([-0.01, 1.01])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc="lower right")
        plt.title('Receiver operating characteristic')