In [1]:
# Load necessary python packages
import numpy as np
import pandas as pd

# Load train, test datasets
train = pd.read_csv("covid_19_train.csv")
test = pd.read_csv("covid_19_test.csv")
sub = pd.read_csv("covid_19_submission.csv")

In [2]:
# Extract labelled column from train dataset
y = train.covid_19.values

# Drops labelled column, to be ready for model training/ data cleaning
train_test = train.drop("covid_19", axis='columns')

# combines training and testing dataset
x = pd.concat([train_test, test])


# Check for NA or NAN presence in each column
x_check_na = x.isna().any()


x_summary = x.describe()
x_summary = x_summary.T

# Select columns that consist of less than 10% of NA/ NAN
x_cleared = x.loc[:, x.isnull().mean() < 0.9]

# Fill in NA/ NAN fields with each column's median value
x_cleared = x_cleared.fillna(x.median(0)).copy(deep=True)

# Check that all columns do not have NA/ NAN values
x_check_na = x_cleared.isna().any()

# Get correlation matrix, for purpose of removing highly correlated rows
x_cleared1 = x_cleared.corr()
x_cleared1.reset_index(inplace=True)

# Modify correlation matrix to tabular format
x_cleared2 = pd.melt(x_cleared1, id_vars=['index'])
# Remove same column correlations of 1.0
x_cleared2 = x_cleared2[x_cleared2['index']!=x_cleared2['variable']]
# Order correlation in descending value
x_cleared2 = x_cleared2.sort_values(by=['value'], ascending=False)

# Highlight column correlations above 0.85, to remove due to high correlation value
x_cleared2 = x_cleared2[abs(x_cleared2['value'])>0.85]
x_cleared2 = x_cleared2.iloc[::2, :]

# Remove columns that are highly correlated
x_cleared.drop(x_cleared2['index'], axis=1, inplace=True)
x_cleared = x_cleared.drop('id', axis='columns')

# Force remaining columns to numeric type
x_cleared = x_cleared.apply(pd.to_numeric, errors = 'coerce') 
x_check_na = x_cleared.isna().any()

# As train dataset was originally 4000 rows
x_train1 = x_cleared.iloc[:4000, :]
x_test1 = x_cleared.iloc[4000:, :]


In [17]:
x_train1

Unnamed: 0,age_quantile,Patient addmited to regular ward,Patient addmited to semi-intensive unit,Patient addmited to intensive care unit,Hemoglobin,Platelets,Mean platelet volume,Lymphocytes,Mean corpuscular hemoglobin concentration (MCHC),Leukocytes,Basophils,Mean corpuscular hemoglobin (MCH),Eosinophils,Monocytes,Red blood cell distribution width (RDW),Respiratory Syncytial Virus,Influenza A,Influenza B,Parainfluenza 1,CoronavirusNL63,Rhinovirus/Enterovirus,Coronavirus HKU1,Parainfluenza 3,Chlamydophila pneumoniae,Adenovirus,Parainfluenza 4,Coronavirus229E,CoronavirusOC43,Inf A H1N1 2009,Bordetella pertussis,Metapneumovirus,Parainfluenza 2,"Influenza B, rapid test","Influenza A, rapid test"
0,15,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0,0,0,2.045308,-0.768649,1.469188,-0.236022,0.244149,-0.211488,0.081693,-0.030911,-0.245556,2.248498,0.436405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,0,0,0,0.416252,-0.630469,-0.999063,-0.943933,-0.452899,-0.495332,0.387152,0.491805,-0.498393,2.406077,-0.271247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,19,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996,3,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,7,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3998,14,0,0,0,0.040316,-0.121716,-0.101517,-0.014267,-0.054585,-0.212879,-0.223767,0.125903,-0.329835,-0.115191,-0.182790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(x_train1, y)
confusion_matrix(y, model.predict(x_train1))

array([[3589,    5],
       [ 371,   35]])

In [4]:
print(classification_report(y, model.predict(x_train1)))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3594
           1       0.88      0.09      0.16       406

    accuracy                           0.91      4000
   macro avg       0.89      0.54      0.55      4000
weighted avg       0.90      0.91      0.87      4000



In [5]:
sub['covid_19'] = model.predict_proba(x_test1)
sub

Unnamed: 0,id,covid_19
0,4b8e26ec9ac6a73,0.901119
1,72ca7ed61c2a196,0.898198
2,400aecdd7f1a825,0.885736
3,b1ae71a4fe1788a,0.903966
4,654c1ae408ea3ac,0.898198
...,...,...
1639,5ba48fd9d5dd05e,0.864565
1640,731a1b637d73fac,0.912070
1641,30fe0388aafc474,0.898198
1642,5c1421eb95ab55c,0.951444


In [6]:
sub.to_csv("logistic_run.csv", index=False)

In [7]:
# Improvement to Logistic Regression
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(x_train1, y)
print(classification_report(y, model.predict(x_train1)))
sub['covid_19'] = model.predict_proba(x_test1)
sub.to_csv("logistic_run2.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3594
           1       0.85      0.10      0.18       406

    accuracy                           0.91      4000
   macro avg       0.88      0.55      0.56      4000
weighted avg       0.90      0.91      0.87      4000



In [8]:
# Trying Naive Bayes 
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(x_train1, y)
print(classification_report(y, model.predict(x_train1)))
sub['covid_19'] = model.predict_proba(x_test1)
sub.to_csv("naivebayes1.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      3594
           1       0.58      0.09      0.16       406

    accuracy                           0.90      4000
   macro avg       0.75      0.54      0.55      4000
weighted avg       0.87      0.90      0.87      4000



In [9]:
# Trying Decision Tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = clf.predict_proba(x_test1)
sub.to_csv("decisiontree1.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.96      3594
           1       1.00      0.17      0.30       406

    accuracy                           0.92      4000
   macro avg       0.96      0.59      0.63      4000
weighted avg       0.92      0.92      0.89      4000



In [10]:
# Trying Deep Learning
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, activation = 'relu', learning_rate ='adaptive', hidden_layer_sizes=(100, 2), random_state=1, max_iter = 200)
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = clf.predict_proba(x_test1)
sub.to_csv("deeplearning1.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3594
           1       0.94      0.14      0.25       406

    accuracy                           0.91      4000
   macro avg       0.92      0.57      0.60      4000
weighted avg       0.91      0.91      0.88      4000



In [11]:
# Trying Support Vector Machines
from sklearn import svm
clf = svm.SVC(probability = True)
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = clf.predict_proba(x_test1)
sub.to_csv("svm1.csv", index=False)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      3594
           1       0.00      0.00      0.00       406

    accuracy                           0.90      4000
   macro avg       0.45      0.50      0.47      4000
weighted avg       0.81      0.90      0.85      4000



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Trying xgb
import xgboost as xgb
xg_class = xgb.XGBClassifier(
    learning_rate=0.02, 
    max_delta_step=0, 
    max_depth=10,
    min_child_weight=0.2, 
    missing=None, 
    n_estimators=300, 
    nthread=4,
    objective='binary:logistic', 
    reg_alpha=0.01, 
    reg_lambda = 0.01,
    scale_pos_weight=1, 
    seed=0, 
    silent=False, 
    subsample=0.9)

xg_fit=xg_class.fit(x_train1, y)
print(classification_report(y, xg_class.predict(x_train1)))


sub['covid_19'] = xg_class.predict_proba(x_test1)[:,1]
sub.to_csv("xgb1.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.96      3594
           1       1.00      0.17      0.29       406

    accuracy                           0.92      4000
   macro avg       0.96      0.59      0.62      4000
weighted avg       0.92      0.92      0.89      4000



In [13]:
# Trying Support Vector Machines 2
from sklearn import svm
clf = svm.SVC(probability = True)
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = 1 - clf.predict_proba(x_test1)
sub.to_csv("svm2.csv", index=False)

              precision    recall  f1-score   support

           0       0.90      1.00      0.95      3594
           1       0.00      0.00      0.00       406

    accuracy                           0.90      4000
   macro avg       0.45      0.50      0.47      4000
weighted avg       0.81      0.90      0.85      4000



  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Trying Deep Learning 2
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, activation = 'relu', learning_rate ='adaptive', hidden_layer_sizes=(100, 2), random_state=1, max_iter = 200)
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = 1 - clf.predict_proba(x_test1)
sub.to_csv("deeplearning2.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      3594
           1       0.94      0.14      0.25       406

    accuracy                           0.91      4000
   macro avg       0.92      0.57      0.60      4000
weighted avg       0.91      0.91      0.88      4000



In [15]:
# Trying Decision Tree 2
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train1, y)
print(classification_report(y, clf.predict(x_train1)))
sub['covid_19'] = 1 - clf.predict_proba(x_test1)
sub.to_csv("decisiontree2.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      1.00      0.96      3594
           1       1.00      0.17      0.30       406

    accuracy                           0.92      4000
   macro avg       0.96      0.59      0.63      4000
weighted avg       0.92      0.92      0.89      4000



In [16]:
# Trying Naive Bayes 
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(x_train1, y)
print(classification_report(y, model.predict(x_train1)))
sub['covid_19'] = 1 - model.predict_proba(x_test1)
sub.to_csv("naivebayes2.csv", index=False)

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      3594
           1       0.58      0.09      0.16       406

    accuracy                           0.90      4000
   macro avg       0.75      0.54      0.55      4000
weighted avg       0.87      0.90      0.87      4000

