# Construction of a prediction model for Covid19
## Part 2 of the notebook (for part one look at the fork of covidclinicaldata

In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from colorama import Fore #To pain the terminal with different colors
import warnings
warnings.filterwarnings('ignore') #To disable warnings
from IPython.display import display #To print a dataframe like the cell does

In [47]:
all_data = pd.read_csv("covid_clinical_data.csv", index_col=0)
all_data

Unnamed: 0,covid19_test_results,age,high_risk_exposure_occupation,high_risk_interactions,diabetes,chd,htn,cancer,asthma,autoimmune_dis,...,sob,sob_severity,diarrhea,fatigue,headache,loss_of_smell,loss_of_taste,runny_nose,muscle_sore,sore_throat
0,Negative,4,True,,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False
1,Negative,2,False,,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False
2,Negative,1,,,False,False,False,False,False,False,...,,,,,,,,,,
3,Negative,3,True,True,False,False,False,False,False,False,...,True,2.0,False,True,False,False,False,False,False,True
4,Negative,1,False,,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93989,Negative,3,False,True,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False
93990,Negative,3,False,True,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,True,False,True
93991,Negative,3,False,False,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False
93993,Negative,3,False,False,False,False,False,False,False,False,...,False,0.0,False,False,False,False,False,False,False,False


In [22]:
X = all_data.iloc[:, 1:]
y = all_data["covid19_test_results"]

In [18]:
def create_different_datasets(data_x):
    e_factors = data_x.iloc[:,:3]
    comorbidities_data = data_x.iloc[:, 3:10] #The columns that contain the comorbidities
    vitals_data = data_x.iloc[:, 10-16] #The columns that contain the vitals
    assesed_symptoms_data = data_x.iloc[:, 1-21] #The columns that contain the assesed symptoms
    reported_symptoms_data = data_x.iloc[:,21:] #The columns that have the patient reported symptoms
    
    return e_factors, comorbidities_data, vitals_data, assesed_symptoms_data, reported_symptoms_data

In [28]:
e_factors, comorb, vitals, a_symptoms, r_symptoms = create_different_datasets(X)

<u>high_risk_exposure_occupation:</u> The most obvious thing to do is to just fill the 169 missing values with the most frequent value. However, it occurred to me that since we ultimately are willing to have (maybe even encourage) false positives, it might be better to just put the ones that tested positive as True and the rest as False. This would require that I split the dataset into training, validation and testing set before I do any imputation to avoid any target leakage (the effect when knowing the result before hand affects how are we imputing the variables). The step to be taken (if I were to follow that route) would be then to impute the training data that tested positive as True, the rest as False, and any incoming unknown data (validation and test data) as True if we dont know the value of the feature. I might need to run an experiment when I do both.

<u>high_risk_interaction:</u> This is easier than the previous one. We will just assign True to anything that has a 'high_risk_expossure_occupation' as True. The reasoning behind is is that if we don't know if the patient has had a high risk interaction, makes sense to say they did if their occupation is of high risk exposure.

In [23]:
def high_risk_exposure_imputation(data_x, data_y, target_imputation=False):
    if target_imputation:
        mode = data_x[data_y == "Positive"].high_risk_exposure_occupation.mode()[0]
    else:
        mode = data_x.high_risk_exposure_occupation.mode()[0]
    data_x.loc[data_x["high_risk_exposure_occupation"].isna(), "high_risk_exposure_occupation"] = mode
    
    data_x.loc[data_x["high_risk_interactions"].isna(), "high_risk_interactions"] = data_x["high_risk_exposure_occupation"]

In [24]:
def reduce_training_set(data_x, data_y):
    selected_negatives = data_x[data_y[data_x.index] == "Negative"].sample((data_y[data_x.index] == "Positive").sum())
    all_positives = data_x[data_y == "Positive"]
    #reduced_train_comorb = pd.merge(all_positives, selected_negatives) #Doesn't work well for reasons...
    reduced_data = pd.concat([all_positives, selected_negatives])

    return reduced_data

def get_metrics(*values):
    accuracy = accuracy_score(*values)
    recall = recall_score(*values, pos_label="Positive")
    precision = precision_score(*values, pos_label="Positive")
    f1 = f1_score(*values, pos_label="Positive")
    return accuracy, recall, precision, f1


In [49]:
class XGBAdapter:
    
    def __init__(self, **params):
        self.model = XGBClassifier(**params, verbosity=0)
        
    def fit(self, data_x, data_y):
        self.model.fit(self.xgb_adapter(data_x), data_y)
        
    def predict(self, data_x):
        return self.model.predict(self.xgb_adapter(data_x))
        
    #This function is because xgb complains about 'object' type columns so I convert them all to boolean
    def xgb_adapter(self, data_x):
        result = data_x.copy()
        for column in result:
            result.loc[:,column] = result[column].astype("bool")
        return result

In [63]:
def cross_validation_normal(data_x, data_y, model, folds, hr_imputation=False):
    results = np.zeros((folds,2,4)) #Shape of folds, training and validation, and number of metrics
    
    fold_size = data_x.shape[0] // folds
    reminder = data_x.shape[0] % folds    
    start = 0    
    for i in range(folds):
        end = start + fold_size + (1 if reminder > 0 else 0)
        reminder-=1
        train_x = pd.concat([data_x.iloc[:start], data_x.iloc[end:]], axis=0)
        train_y = data_y.iloc[:start].append(data_y.iloc[end:])
        valid_x = data_x.iloc[start:end]
        valid_y = data_y.iloc[start:end]
        
        if hr_imputation:
            high_risk_exposure_imputation(train_x, train_y, target_imputation = True)
            high_risk_exposure_imputation(valid_x, valid_y)
            
        model.fit(train_x, train_y)
        train_v = (train_y, model.predict(train_x))
        valid_v = (valid_y, model.predict(valid_x))
        results[i,0] += get_metrics(*train_v)
        results[i,1] += get_metrics(*valid_v)
        start = end
        
        progress_bar = "[" + str("*" * i) + str(" " * (folds -(i+1))) + "]"
        print(progress_bar, end="\r")
    print("")
    return results.mean(axis=0)
        
def run_models_cross_val(data_x, data_y, models, folds=10, hr_imputation=False):
    for model in models:
        print(Fore.RED, model, Fore.BLACK, sep="")
        results = cross_validation_normal(data_x, data_y, models[model], folds, hr_imputation)       
        #print(Fore.BLUE, results, Fore.BLUE, sep="")
        display(pd.DataFrame(results, index=["Training", "Validation"], columns=["Accuracy", "Recall", "Precision", "F1"]))
        print("")

In [67]:
models ={
    "MultinomialNB" : MultinomialNB(alpha=0),
    "Random Forest (100 estimators)" : RandomForestClassifier(min_samples_leaf=30),
    "Random Forest (500 estimators)" : RandomForestClassifier(n_estimators = 500),
    "XGBoost Classifier" : XGBAdapter(n_estimators=500),
    "SVM" : SVC()    
}

In [60]:
run_models_cross_val(comorb, y[comorb.index], models)

[31mMultinomialNB[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.904953,0.083474,0.026178,0.039854
Validation,0.905138,0.082413,0.024478,0.037117



[31mRandom Forest (100 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0



[31mRandom Forest (500 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0



[31mXGBoost Classifier[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0



[31mSVM[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0





In [64]:
comorb_ef = pd.concat([comorb, e_factors], axis=1)
run_models_cross_val(comorb_ef, y, models, hr_imputation=True)

[31mMultinomialNB[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.604175,0.459017,0.029377,0.053355
Validation,0.591113,0.405234,0.023402,0.042329



[31mRandom Forest (100 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0



[31mRandom Forest (500 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976462,0.004278,1.0,0.008518
Validation,0.976323,0.0,0.0,0.0



[31mXGBoost Classifier[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976398,0.001618,0.975,0.003229
Validation,0.976343,0.0,0.0,0.0



[31mSVM[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.976361,0.0,0.0,0.0
Validation,0.976362,0.0,0.0,0.0





In [68]:
reduced_comorb_ef = reduce_training_set(comorb_ef, y)
run_models_cross_val(reduced_comorb_ef, y[reduced_comorb_ef.index], models, hr_imputation=True)

[31mMultinomialNB[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.56683,0.523504,0.558319,0.441566
Validation,0.09369,0.049096,0.5,0.089393



[31mRandom Forest (100 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.565601,0.475173,0.577319,0.384887
Validation,0.079476,0.011887,0.4,0.022992



[31mRandom Forest (500 estimators)[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.587788,0.56006,0.590006,0.513505
Validation,0.177165,0.08506,0.5,0.142236



[31mXGBoost Classifier[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.576513,0.529816,0.571605,0.47812
Validation,0.153496,0.060162,0.5,0.106912



[31mSVM[30m
[*********]


Unnamed: 0,Accuracy,Recall,Precision,F1
Training,0.57324,0.529179,0.572752,0.467447
Validation,0.131375,0.056886,0.5,0.100675





Next step to either add more features or... implement a cross validation method that makes up for the difference between positives and negatives