In [1]:
import pandas as pd  # First, we'll import Pandas, a data processing and CSV file I/O library
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df_covid = pd.read_csv('corona_tested_individuals_ver_006_updated.csv')

In [3]:
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication
0,4/30/2020,0.0,0.0,0.0,0.0,0.0,negative,,female,Other
1,4/30/2020,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
2,4/30/2020,0.0,1.0,0.0,0.0,0.0,negative,,male,Other
3,4/30/2020,1.0,0.0,0.0,0.0,0.0,negative,,female,Other
4,4/30/2020,1.0,0.0,0.0,0.0,0.0,negative,,male,Other


In [4]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278848 entries, 0 to 278847
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   test_date            278848 non-null  object 
 1   cough                278596 non-null  float64
 2   fever                278596 non-null  float64
 3   sore_throat          278847 non-null  float64
 4   shortness_of_breath  278847 non-null  float64
 5   head_ache            278847 non-null  float64
 6   corona_result        278848 non-null  object 
 7   age_60_and_above     278848 non-null  object 
 8   gender               278848 non-null  object 
 9   test_indication      278848 non-null  object 
dtypes: float64(5), object(5)
memory usage: 21.3+ MB


In [5]:
df_covid.describe()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache
count,278596.0,278596.0,278847.0,278847.0,278847.0
mean,0.151574,0.078077,0.006907,0.005655,0.008657
std,0.358608,0.268294,0.082821,0.07499,0.09264
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [6]:
df_covid.isnull().sum()

test_date                0
cough                  252
fever                  252
sore_throat              1
shortness_of_breath      1
head_ache                1
corona_result            0
age_60_and_above         0
gender                   0
test_indication          0
dtype: int64

In [7]:
df_covid = df_covid.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [8]:
df_covid.isnull().sum()

test_date              0
cough                  0
fever                  0
sore_throat            0
shortness_of_breath    0
head_ache              0
corona_result          0
age_60_and_above       0
gender                 0
test_indication        0
dtype: int64

In [9]:
df_covid['test_date'] = pd.to_datetime(df_covid['test_date'], format = '%m/%d/%Y')

In [10]:
df_covid['test_date']

0        2020-04-30
1        2020-04-30
2        2020-04-30
3        2020-04-30
4        2020-04-30
            ...    
278843   2020-03-11
278844   2020-03-11
278845   2020-03-11
278846   2020-03-11
278847   2020-03-11
Name: test_date, Length: 278848, dtype: datetime64[ns]

In [11]:
df_covid['year'] = df_covid['test_date'].dt.year
df_covid['month'] = df_covid['test_date'].dt.month
df_covid['day'] = df_covid['test_date'].dt.day

In [12]:
df_covid.loc[:,'corona_result'] = df_covid['corona_result'].apply(lambda x: 0 if x=='negative' else 1)
df_covid['corona_result'].value_counts(0)

0    260227
1     18621
Name: corona_result, dtype: int64

In [13]:
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication,year,month,day
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0,,male,Other,2020,4,30
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,male,Other,2020,4,30


In [14]:
data_dummy = pd.get_dummies(df_covid[['age_60_and_above','gender', 'test_indication']], drop_first=True)
data_dummy.head()

Unnamed: 0,age_60_and_above_None,age_60_and_above_Yes,gender_female,gender_male,test_indication_Contact with confirmed,test_indication_Other
0,1,0,1,0,0,1
1,1,0,1,0,0,1
2,1,0,0,1,0,1
3,1,0,1,0,0,1
4,1,0,0,1,0,1


In [15]:
df_covid = pd.concat([df_covid,data_dummy], axis=1)
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above,gender,test_indication,year,month,day,age_60_and_above_None,age_60_and_above_Yes,gender_female,gender_male,test_indication_Contact with confirmed,test_indication_Other
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30,1,0,1,0,0,1
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30,1,0,1,0,0,1
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0,,male,Other,2020,4,30,1,0,0,1,0,1
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,female,Other,2020,4,30,1,0,1,0,0,1
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,,male,Other,2020,4,30,1,0,0,1,0,1


In [16]:
df_covid = df_covid.drop(['age_60_and_above','gender', 'test_indication'], axis=1)
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,year,month,day,age_60_and_above_None,age_60_and_above_Yes,gender_female,gender_male,test_indication_Contact with confirmed,test_indication_Other
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0,2020,4,30,1,0,1,0,0,1
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,2020,4,30,1,0,1,0,0,1
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0,2020,4,30,1,0,0,1,0,1
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,2020,4,30,1,0,1,0,0,1
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,2020,4,30,1,0,0,1,0,1


In [17]:
df_covid = df_covid.drop(['month','day','year'], axis=1)
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above_None,age_60_and_above_Yes,gender_female,gender_male,test_indication_Contact with confirmed,test_indication_Other
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0,1,0,0,1,0,1
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,0,1,0,1


In [18]:
df_covid.rename(columns = { 'age_60_and_above_Yes': 'age_60_and_above', 'gender_male':'gender', 'test_indication_Contact with confirmed': 'contact_with_confirmed'}, inplace = True) 
df_covid.head()

Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age_60_and_above_None,age_60_and_above,gender_female,gender,contact_with_confirmed,test_indication_Other
0,2020-04-30,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
1,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
2,2020-04-30,0.0,1.0,0.0,0.0,0.0,0,1,0,0,1,0,1
3,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,1
4,2020-04-30,1.0,0.0,0.0,0.0,0.0,0,1,0,0,1,0,1


In [21]:
Feature = df_covid[['cough','fever','sore_throat','shortness_of_breath','head_ache','age_60_and_above','gender','contact_with_confirmed']]
Feature.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,age_60_and_above,gender,contact_with_confirmed
0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,1.0,0.0,0.0,0.0,0.0,0,0,0
2,0.0,1.0,0.0,0.0,0.0,0,1,0
3,1.0,0.0,0.0,0.0,0.0,0,0,0
4,1.0,0.0,0.0,0.0,0.0,0,1,0


In [22]:
Feature.astype('int64').dtypes

cough                     int64
fever                     int64
sore_throat               int64
shortness_of_breath       int64
head_ache                 int64
age_60_and_above          int64
gender                    int64
contact_with_confirmed    int64
dtype: object

In [23]:
X = Feature

In [24]:
y = df_covid['corona_result']
y[0:5]

0    0
1    0
2    0
3    0
4    0
Name: corona_result, dtype: int64

In [25]:
from sklearn import preprocessing
X= preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-0.42244924, -0.29087201, -0.0833968 , -0.07541602, -0.09344862,
        -0.31947742, -0.92868231, -0.19916533],
       [ 2.36714831, -0.29087201, -0.0833968 , -0.07541602, -0.09344862,
        -0.31947742, -0.92868231, -0.19916533],
       [-0.42244924,  3.43793818, -0.0833968 , -0.07541602, -0.09344862,
        -0.31947742,  1.0767945 , -0.19916533],
       [ 2.36714831, -0.29087201, -0.0833968 , -0.07541602, -0.09344862,
        -0.31947742, -0.92868231, -0.19916533],
       [ 2.36714831, -0.29087201, -0.0833968 , -0.07541602, -0.09344862,
        -0.31947742,  1.0767945 , -0.19916533]])

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (223078, 8) (223078,)
Test set: (55770, 8) (55770,)


In [37]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# Initialising the ANN
classifier = keras.Sequential()

# Adding the input layer and the first hidden layer
classifier.add(keras.layers.Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu', input_dim = 8))

# Adding the second hidden layer
classifier.add(keras.layers.Dense(units = 4, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(keras.layers.Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = keras.optimizers.Adam(0.0001), loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 10, epochs = 100)

Train on 223078 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1f83960fba8>

In [38]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [39]:
def confusion_metrics (conf_matrix):
# save confusion matrix and slice into four pieces
    TP = conf_matrix[1][1]
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    print('True Positives:', TP)
    print('True Negatives:', TN)
    print('False Positives:', FP)
    print('False Negatives:', FN)
    print(" ")
    
    # calculate accuracy
    conf_accuracy = (float (TP+TN) / float(TP + TN + FP + FN))
    
    # calculate mis-classification
    conf_misclassification = 1- conf_accuracy
    
    # calculate the sensitivity
    conf_sensitivity = (TP / float(TP + FN))
    # calculate the specificity
    conf_specificity = (TN / float(TN + FP))
    
    # calculate precision
    conf_precision = (TN / float(TN + FP))
    # calculate f_1 score
    conf_f1 = 2 * ((conf_precision * conf_sensitivity) / (conf_precision + conf_sensitivity))
    #print('-'*50)
    print(f'Accuracy: {round(conf_accuracy,2)}') 
    print(f'Mis-Classification: {round(conf_misclassification,2)}') 
    print(f'Sensitivity: {round(conf_sensitivity,2)}') 
    print(f'Specificity: {round(conf_specificity,2)}') 
    print(f'Precision: {round(conf_precision,2)}')
    print(f'f_1 Score: {round(conf_f1,2)}')

In [40]:
confusion_metrics(cm)

True Positives: 1676
True Negatives: 51568
False Positives: 484
False Negatives: 2042
 
Accuracy: 0.95
Mis-Classification: 0.05
Sensitivity: 0.45
Specificity: 0.99
Precision: 0.99
f_1 Score: 0.62


In [41]:
classifier.save("covid_prediction_model_2.h5")