In [None]:
# Importing the libraries
import numpy as np
import pandas as pd

In [None]:
# Load the raw test records. low_memory=False makes pandas parse the file in
# one pass instead of in chunks, which avoids mixed-type column inference.
covid = pd.read_csv(
    'corona_tested_006.csv',
    low_memory=False,
)

In [None]:
# Consolidating the cases of True TRUE False FALSE
def normalise_cell(value):
    """Return the cell as a trimmed string with only its first letter upper-cased."""
    return str(value).strip().capitalize()


# Applied column by column with Series.map (element-wise, same result as the
# deprecated DataFrame.applymap). Every column, including the first two,
# becomes a column of strings.
covid = covid.apply(lambda col: col.map(normalise_cell))

# Changing non binary values to NaN
# 'None' and 'Other' are the placeholder strings found in the data. 'Nan' is
# what normalise_cell() produces for a genuine missing value (str(nan) -> 'nan'
# -> 'Nan'), e.g. when read_csv has already parsed a placeholder as NaN. If it
# were left in place it would be treated as a real category by the encoder.
# NOTE(review): the loop covers every column from the third onwards, so an
# 'Other' test result is blanked as well - confirm that is intended.
missing_markers = ['None', 'Other', 'Nan']
for column in covid.columns[2:]:
    covid[column] = covid[column].mask(covid[column].isin(missing_markers), np.nan)

In [None]:
# Creating pipeline to automate the process
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Columns that are imputed and one-hot encoded, in the order the encoder emits them
categorical_columns = ['Cough_symptoms', 'Fever', 'Sore_throat', 'Shortness_of_breath',
                       'Headache', 'Corona', 'Age_60_above', 'Sex', 'Known_contact']

# Names given to the encoded columns. With drop='first' the encoder keeps one
# indicator per two-valued column (the category that sorts last), so there is
# exactly one name per entry of categorical_columns.
encoded_columns = ['Cough_symptoms_True', 'Fever_True', 'Sore_throat_True', 'Shortness_of_breath_True',
                   'Headache_True', 'Corona_Positive', 'Age_60_above_Yes', 'Sex_Male',
                   'Known_contact_Contact with confirmed']

# Pipeline object: fill missing values with the column mode, then one-hot encode
# NOTE(review): 'Corona' (used later as the label) goes through the same mode
# imputation as the features, and the pipeline is fitted on all rows before the
# date split - confirm both are intended.
pl = Pipeline(steps=[('mode_imputation', SimpleImputer(strategy='most_frequent')),
                     ('One_hot_encode', OneHotEncoder(drop='first'))])

# Column transformer object
# sparse_threshold=0 makes the transformer always return a dense numpy array.
# With the default threshold the result is sparse only when its density is
# below 0.3, so an unconditional .toarray() call can fail on denser data.
ct = ColumnTransformer(transformers=[('col_trans', pl, categorical_columns)],
                       remainder='passthrough', sparse_threshold=0)

covid_encoded = ct.fit_transform(covid.iloc[:, 2:])

# Fail with a readable message if the encoder produced a different number of
# columns than there are names (e.g. an unexpected extra category).
if covid_encoded.shape[1] != len(encoded_columns):
    raise ValueError(
        f'Expected {len(encoded_columns)} encoded columns but the transformer '
        f'produced {covid_encoded.shape[1]}; check the categories in the input data.'
    )

# Creating dataframe out of transformed data; reusing covid's index keeps the
# rows aligned with the original frame for the concat below
covid_transformed = pd.DataFrame(covid_encoded, columns=encoded_columns, index=covid.index)

# Concatenating the first two columns
covid_transformed_final = pd.concat([covid_transformed, covid.iloc[:, :2]], axis=1)


In [None]:
# Splitting the dataset chronologically into a training/validation set and a test set

# Parse 'Test_date' from day-month-year strings into datetime values
covid_transformed_final['Test_date'] = pd.to_datetime(
    covid_transformed_final['Test_date'],
    format='%d-%m-%Y',
)

test_dates = covid_transformed_final['Test_date']

# Training + validation rows: 2020-03-11 through 2020-04-16, both ends included
on_or_after_start = test_dates >= '2020-03-11'
on_or_before_end = test_dates <= '2020-04-16'
train_validate_set = covid_transformed_final[on_or_after_start & on_or_before_end]

# Test rows: 2020-04-17 onwards
from_holdout_start = test_dates >= '2020-04-17'
test_set = covid_transformed_final[from_holdout_start]

In [None]:
from joblib import dump, load
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Restore the previously saved model from disk.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
identifier_model = load('Covid Identifier.joblib')

# Features: every column of the test set except the ID, the date, the label
# and three encoded columns that are not passed to the model
excluded_columns = ['Ind_ID', 'Test_date', 'Corona_Positive',
                    'Cough_symptoms_True', 'Age_60_above_Yes', 'Sex_Male']
X_holdout = test_set.drop(columns=excluded_columns)

# Label
y_holdout = test_set['Corona_Positive']

# Predictions for the hold-out rows
y_holdout_pred = identifier_model.predict(X_holdout)

# Report each metric rounded to two decimals (label text is printed exactly as before)
print('Model performance ')

holdout_metrics = (
    ('Acuuracy', accuracy_score),
    ('F1 Score', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
)
for metric_label, metric_fn in holdout_metrics:
    print(f'- {metric_label}: {round(metric_fn(y_holdout, y_holdout_pred),2)}')

Model performance 
- Acuuracy: 0.99
- F1 Score: 0.65
- Precision: 0.76
- Recall: 0.56
