In [40]:
# ====== packages =======
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from tqdm import tqdm

# ---- data manipulators ----
from sklearn.model_selection import StratifiedShuffleSplit

# ---- evaluation ------
from sklearn.metrics import accuracy_score, f1_score, recall_score

# ---- algorithms ----
from sklearn.ensemble import RandomForestClassifier

# ---- user defined modules ----
import pre_proccessing_helpers as hel

# This line makes matplotlib plot the figures inside the norebook
%matplotlib inline

plt.rcParams['figure.figsize'] = (8.0, 8.0)
plt.rcParams['axes.grid'] = True
plt.rcParams['font.size'] = 10

In [41]:
# Load the pre-processed frame written by an earlier pipeline step.
# NOTE(review): unpickling executes arbitrary code, so this file must only
# ever come from a trusted source.
pickle_path = 'pre_proccessed_data.pkl'
dataset = pd.read_pickle(pickle_path)

# Display the loaded frame (pandas truncates the rendering of long frames)
dataset

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,test_indication,gender,corona_result
0,no cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,female,0
1,cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,female,0
2,no cough,fever,no sore_throat,no shortness_of_breath,no head_ache,Other,male,0
3,cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,female,0
4,cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,male,0
...,...,...,...,...,...,...,...,...
278842,no cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,male,0
278843,no cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,female,0
278844,no cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Other,female,0
278845,no cough,no fever,no sore_throat,no shortness_of_breath,no head_ache,Contact with confirmed,female,1


# MODEL

## The first algorithm we'll use is RandomForest (because all features are categorical), so the encoding will be as follows: (label1, label2, label3, ...) -> (0, 1, 2, ...).
## This way we don't add more features (as one-hot encoding would), and tree-based algorithms are not sensitive to the magnitude of the labels.


### Encoding:

In [42]:
# Integer code for every label of each categorical feature column.
# Columns that are not listed here (e.g. the target `corona_result`) are
# left untouched by the encoding below.
encode_dict = {
    'cough': {'cough': 1, 'no cough': 0},
    'fever': {'fever': 1, 'no fever': 0},
    'sore_throat': {'sore_throat': 1, 'no sore_throat': 0},
    'shortness_of_breath': {'shortness_of_breath': 1, 'no shortness_of_breath': 0},
    'head_ache': {'head_ache': 1, 'no head_ache': 0},
    'test_indication': {'Abroad': 2, 'Contact with confirmed': 1, 'Other': 0},
    'gender': {'male': 1, 'female': 0},
}

# Encode column by column with `Series.map` instead of `DataFrame.replace`:
#   * `replace` silently keeps any label that is missing from `encode_dict`
#     as a string, so a bad label only surfaces later, when the model is fit;
#   * `replace` relies on implicit object -> int downcasting, which pandas 2.2
#     deprecates (FutureWarning).
# `dataset` itself is not modified, so this cell can be re-run safely.
data_for_tree = dataset.copy()
for column, mapping in encode_dict.items():
    # Missing values are not treated as labels; they stay missing after the map.
    unexpected = set(dataset[column].dropna().unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains labels with no encoding: "
            f"{sorted(map(str, unexpected))}"
        )
    data_for_tree[column] = dataset[column].map(mapping)

data_for_tree

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,test_indication,gender,corona_result
0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
278842,0,0,0,0,0,0,1,0
278843,0,0,0,0,0,0,0,0
278844,0,0,0,0,0,0,0,0
278845,0,0,0,0,0,1,0,1


### Split the data into train and test sets (we will use cross-validation instead of a separate validation set because of the small number of positive samples).
### Because the data is imbalanced we'll use stratified random sampling so each set will represent the whole data.

In [43]:
# Single stratified 80/20 split: both parts keep the class ratio of `y`.
# `random_state` is fixed so that Restart & Run All reproduces the exact same
# train/test partition (without it every run draws a different split and the
# reported metrics cannot be compared between runs).
spliter = StratifiedShuffleSplit(
    n_splits=1, test_size=0.2, train_size=0.8, random_state=42
)

# Features are every column except the target; the target is `corona_result`.
X = data_for_tree.drop('corona_result', axis=1)
y = data_for_tree['corona_result']

# `split` yields positional indices, hence `.iloc` below.
train_index, test_index = next(spliter.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

### Due to the imbalanced classes we'll oversample the minority class:

In [45]:
# Train a small random forest (10 trees) on the training split.
# `random_state` is fixed because the forest bootstraps rows and samples
# features at random; without a seed the scores below change on every run.
# NOTE(review): the heading above this cell announces oversampling of the
# minority class, but no resampling is applied here - the model is fit on
# X_train / y_train as they are. Confirm whether that step was removed on purpose.
rfc = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, y_train)

# Predict on the held-out test split
rfc_pred = rfc.predict(X_test)

# Printed in order: accuracy, F1 and recall. With sklearn's defaults F1 and
# recall are computed for the positive class (label 1).
print(accuracy_score(y_test, rfc_pred))
print(f1_score(y_test, rfc_pred))
print(recall_score(y_test, rfc_pred))





0.9663275149706049
0.6443675509419454
0.5702619938754678
