# Reading data

In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import shuffle

In [5]:
PATH = os.path.join("../data", "train_scaled_2.0.csv")
data = pd.read_csv(PATH)
data = shuffle(data)
data = data.dropna()

In [6]:
# data["gender"] = data["gender"].map({"male": 1, "female": 0})

In [7]:
# data["age_60_and_above"] = data["age_60_and_above"].map({"Yes": 1, "No": 0})

In [8]:
# data["test_indication"] = data["test_indication"].map({"Contact with confirmed": 2, "Abroad": 1, "Other": 0})

In [9]:
data

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,corona_result,age,gender,test_indication
91175,0.0,1.0,0.0,0.0,1.0,0.0,18.0,1.0,0.0
90702,0.0,1.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0
90113,4.0,4.0,0.0,0.0,4.0,1.0,70.0,1.0,0.0
100558,0.0,0.0,0.0,1.0,0.0,0.0,71.0,1.0,0.0
80373,0.0,1.0,1.0,0.0,1.0,0.0,41.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
24841,0.0,0.0,0.0,0.0,0.0,0.0,85.0,0.0,0.0
28890,0.0,1.0,0.0,1.0,0.0,0.0,63.0,1.0,0.0
10104,2.0,1.0,4.0,2.0,4.0,1.0,43.0,0.0,0.0
49758,1.0,1.0,0.0,0.0,1.0,0.0,46.0,0.0,1.0


In [10]:
data["corona_result"].value_counts(dropna = False)

0.0    98586
1.0     8956
Name: corona_result, dtype: int64

In [11]:
y = data["corona_result"]
X = data.drop(["corona_result"], axis = 1)

In [12]:
X["gender"].unique()

array([1., 0.])

In [13]:
y.unique()

array([0., 1.])

In [14]:
X.shape, y.shape, data.shape, data["corona_result"].shape

((107542, 8), (107542,), (107542, 9), (107542,))

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.22, random_state = 11)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((83882, 8), (23660, 8), (83882,), (23660,))

# Logistic Regression

In [1]:
from sklearn.linear_model import LogisticRegressionCV

In [3]:
Cs = [0.0001, 0.00001]
logit = LogisticRegressionCV(Cs = Cs, class_weight = "balanced", \
                             n_jobs = -1, verbose = 5, random_state = 11)

In [17]:
logit.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.2s finished


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=11, solver='lbfgs', tol=0.0001, verbose=5,
                   warm_start=False)

In [19]:
logit.C_

In [20]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
accuracy_score(y_test, logit.predict(X_test)), recall_score(y_test, logit.predict(X_test)), \
            precision_score(y_test, logit.predict(X_test))

(0.8983516483516484, 0.8154311649016641, 0.4422866520787746)

In [21]:
y_test.value_counts()

0.0    21677
1.0     1983
Name: corona_result, dtype: int64

### not the best result

# Random forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier(n_estimators = 201, max_depth = 14, \
                            oob_score = True, n_jobs = -1, class_weight = "balanced", random_state = 17)

# params = {
#     "max_depth": np.arange(15, 19)
# }

# # depth = 11
# # depth = 18

# rf = GridSearchCV(rf, params, n_jobs = -1, verbose = 30)

In [44]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=14, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=201,
                       n_jobs=-1, oob_score=True, random_state=17, verbose=0,
                       warm_start=False)

In [45]:
# rf.best_params_

In [46]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
accuracy = accuracy_score(y_test, rf.predict(X_test))
recall = recall_score(y_test, rf.predict(X_test))
precision = precision_score(y_test, rf.predict(X_test))

print("Общая точность: {}".format(accuracy))
print("Полнота (определение больного человека): {}".format(recall))
print("Точность положительного диагноза: {}".format(precision))

Общая точность: 0.9691885038038884
Полнота (определение больного человека): 0.7984455958549223
Точность положительного диагноза: 0.8192450824029771


### Very good

### Writing model into file

In [47]:
import pickle
filename = "Random_Forest.sav"
pickle.dump(rf, open(filename, "wb"))

### Loading from model

In [48]:
import pickle
filename = "Random_Forest.sav"
loaded_model = pickle.load(open(filename, "rb"))

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, loaded_model.predict(X_test))

0.9691885038038884

In [50]:
lst = [2, 3, 1, 1, 0, 0, 0, 0]
loaded_model.predict_proba([lst])

array([[0.09704826, 0.90295174]])

# KNN

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier(n_neighbors = 23, n_jobs = -1)

# params = {
#     "n_neighbors": np.arange(7, 14, 2),
# }
# knn = GridSearchCV(knn, params, n_jobs = -1, verbose = 10)

In [53]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=23, p=2,
                     weights='uniform')

In [55]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
accuracy_score(y_test, knn.predict(X_test)), recall_score(y_test, knn.predict(X_test)), precision_score(y_test, knn.predict(X_test))

(0.9617497886728656, 0.5532022188603126, 0.9829749103942652)

# SVM

In [101]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm = SVC(verbose = 10, random_state = 11)

# params = {
#     "C": [10, 1, 0.1, 0.01],
#     "degree": np.arange(2, 5)
# }

# svm = GridSearchCV(svm, params, verbose = 10, n_jobs = -1)

In [None]:
svm.fit(X_train, y_train)

[LibSVM]

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
accuracy_score(y_test, svm.predict(X_test)), recall_score(y_test, svm.predict(X_test)), \
         precision_score(y_test, svm.predict(X_test))

# XGBClassifier

In [15]:
import xgboost as xgb
from xgboost import XGBClassifier

params = {'booster': 'gbtree', 'max_depth': 3, 'eta': 0.1, 
    'silent': 1, 'objective': 'binary:logistic', 'nthread': 1, "learning_rate": 0.1}

model = XGBClassifier(**params)
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)

In [16]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

print("Общая точность: {}".format(accuracy))
print("Полнота (определение больного человека): {}".format(recall))
print("Точность положительного диагноза: {}".format(precision))

Общая точность: 0.9730346576500423
Полнота (определение больного человека): 0.7686567164179104
Точность положительного диагноза: 0.8993015133876601


## Saving model

In [18]:
import pickle
filename = "boosting.sav"
pickle.dump(model, open(filename, "wb"))

## Loading Model

In [19]:
import pickle
filename = "boosting.sav"
loaded_model = pickle.load(open(filename, "rb"))

In [23]:
import xgboost as xgb

lst = [1, 2, 2, 1, 0, 0, 0, 0]

columns = ['cough', 'fever', 'sore_throat', 'shortness_of_breath', 'head_ache',
       'age', 'gender', 'test_indication']
lst = pd.DataFrame(lst, index = columns).T

loaded_model.predict_proba(lst)

array([[0.6034082 , 0.39659178]], dtype=float32)

# Deep Net

### Reading csv

In [18]:
import torch
import torchvision
from torchvision import datasets, transforms
import pandas as pd
import numpy as np
import helper

In [19]:
X_train_tensor = torch.tensor(X_train.values)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
# dtype=torch.long

### Net

In [20]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, hidden_layers = 32, hidden_size = 32):
        super().__init__()
        self.fc1 = nn.Linear(8, hidden_size)
        self.layers = [nn.Linear(hidden_size, hidden_size)] * hidden_layers
        self.fc12 = nn.Linear(hidden_size, 2)
        
    def forward(self, X):
        X = X.view(-1, 8).float()
        
        X = F.relu(self.fc1(X))
        for layer in self.layers:
            X = F.relu(layer(X))
            
        X = self.fc12(X)
        
        return F.log_softmax(X, dim = 1)

In [41]:
import torch.optim as optim
net = Net(hidden_layers = 1, hidden_size = 32)

In [66]:
optimizer = optim.Adam(net.parameters(), lr = 0.001)
# optimizer = optim.SGD(net.parameters(), lr = 0.34, momentum = 0.038)

In [69]:
%%time
from tqdm import tqdm
EPOCHS = 100

for epoch in tqdm(range(EPOCHS)):
#     for i in range(0, len(X_train_tensor), 64):
    X = X_train_tensor
    y = y_train_tensor
    net.zero_grad()
    output = torch.FloatTensor(net(X))
    loss = F.nll_loss(output, y)
    loss.backward()
    optimizer.step()
    print(loss)

  2%|█                                                   | 2/100 [00:00<00:11,  8.57it/s]

tensor(0.1124, grad_fn=<NllLossBackward>)
tensor(0.1124, grad_fn=<NllLossBackward>)


  3%|█▌                                                  | 3/100 [00:00<00:11,  8.59it/s]

tensor(0.1124, grad_fn=<NllLossBackward>)
tensor(0.1124, grad_fn=<NllLossBackward>)


  5%|██▌                                                 | 5/100 [00:00<00:10,  8.66it/s]

tensor(0.1123, grad_fn=<NllLossBackward>)
tensor(0.1123, grad_fn=<NllLossBackward>)


  7%|███▋                                                | 7/100 [00:00<00:10,  8.91it/s]

tensor(0.1123, grad_fn=<NllLossBackward>)
tensor(0.1123, grad_fn=<NllLossBackward>)


  9%|████▋                                               | 9/100 [00:01<00:09,  9.11it/s]

tensor(0.1123, grad_fn=<NllLossBackward>)
tensor(0.1122, grad_fn=<NllLossBackward>)


 11%|█████▌                                             | 11/100 [00:01<00:09,  9.24it/s]

tensor(0.1122, grad_fn=<NllLossBackward>)
tensor(0.1122, grad_fn=<NllLossBackward>)
tensor(0.1122, grad_fn=<NllLossBackward>)

 14%|███████▏                                           | 14/100 [00:01<00:09,  9.30it/s]


tensor(0.1122, grad_fn=<NllLossBackward>)


 16%|████████▏                                          | 16/100 [00:01<00:09,  8.87it/s]

tensor(0.1122, grad_fn=<NllLossBackward>)
tensor(0.1121, grad_fn=<NllLossBackward>)


 18%|█████████▏                                         | 18/100 [00:01<00:09,  8.67it/s]

tensor(0.1121, grad_fn=<NllLossBackward>)
tensor(0.1121, grad_fn=<NllLossBackward>)


 20%|██████████▏                                        | 20/100 [00:02<00:09,  8.80it/s]

tensor(0.1121, grad_fn=<NllLossBackward>)
tensor(0.1121, grad_fn=<NllLossBackward>)


 22%|███████████▏                                       | 22/100 [00:02<00:08,  8.83it/s]

tensor(0.1120, grad_fn=<NllLossBackward>)
tensor(0.1120, grad_fn=<NllLossBackward>)


 24%|████████████▏                                      | 24/100 [00:02<00:08,  8.59it/s]

tensor(0.1120, grad_fn=<NllLossBackward>)
tensor(0.1120, grad_fn=<NllLossBackward>)


 26%|█████████████▎                                     | 26/100 [00:02<00:08,  8.63it/s]

tensor(0.1119, grad_fn=<NllLossBackward>)
tensor(0.1119, grad_fn=<NllLossBackward>)


 28%|██████████████▎                                    | 28/100 [00:03<00:08,  8.46it/s]

tensor(0.1119, grad_fn=<NllLossBackward>)
tensor(0.1119, grad_fn=<NllLossBackward>)


 30%|███████████████▎                                   | 30/100 [00:03<00:08,  8.34it/s]

tensor(0.1119, grad_fn=<NllLossBackward>)
tensor(0.1118, grad_fn=<NllLossBackward>)


 32%|████████████████▎                                  | 32/100 [00:03<00:07,  8.62it/s]

tensor(0.1118, grad_fn=<NllLossBackward>)
tensor(0.1118, grad_fn=<NllLossBackward>)


 34%|█████████████████▎                                 | 34/100 [00:03<00:07,  8.51it/s]

tensor(0.1117, grad_fn=<NllLossBackward>)
tensor(0.1117, grad_fn=<NllLossBackward>)


 36%|██████████████████▎                                | 36/100 [00:04<00:07,  8.49it/s]

tensor(0.1117, grad_fn=<NllLossBackward>)
tensor(0.1117, grad_fn=<NllLossBackward>)


 38%|███████████████████▍                               | 38/100 [00:04<00:07,  8.39it/s]

tensor(0.1116, grad_fn=<NllLossBackward>)
tensor(0.1116, grad_fn=<NllLossBackward>)


 40%|████████████████████▍                              | 40/100 [00:04<00:06,  8.87it/s]

tensor(0.1116, grad_fn=<NllLossBackward>)
tensor(0.1116, grad_fn=<NllLossBackward>)


 42%|█████████████████████▍                             | 42/100 [00:04<00:06,  8.87it/s]

tensor(0.1115, grad_fn=<NllLossBackward>)
tensor(0.1115, grad_fn=<NllLossBackward>)


 44%|██████████████████████▍                            | 44/100 [00:04<00:06,  8.89it/s]

tensor(0.1115, grad_fn=<NllLossBackward>)
tensor(0.1114, grad_fn=<NllLossBackward>)


 46%|███████████████████████▍                           | 46/100 [00:05<00:06,  8.87it/s]

tensor(0.1114, grad_fn=<NllLossBackward>)
tensor(0.1114, grad_fn=<NllLossBackward>)


 48%|████████████████████████▍                          | 48/100 [00:05<00:05,  8.86it/s]

tensor(0.1113, grad_fn=<NllLossBackward>)
tensor(0.1113, grad_fn=<NllLossBackward>)


 50%|█████████████████████████▌                         | 50/100 [00:05<00:05,  8.78it/s]

tensor(0.1113, grad_fn=<NllLossBackward>)
tensor(0.1112, grad_fn=<NllLossBackward>)


 52%|██████████████████████████▌                        | 52/100 [00:05<00:05,  8.48it/s]

tensor(0.1112, grad_fn=<NllLossBackward>)
tensor(0.1112, grad_fn=<NllLossBackward>)


 54%|███████████████████████████▌                       | 54/100 [00:06<00:05,  8.72it/s]

tensor(0.1111, grad_fn=<NllLossBackward>)
tensor(0.1111, grad_fn=<NllLossBackward>)


 56%|████████████████████████████▌                      | 56/100 [00:06<00:05,  8.60it/s]

tensor(0.1111, grad_fn=<NllLossBackward>)
tensor(0.1110, grad_fn=<NllLossBackward>)


 58%|█████████████████████████████▌                     | 58/100 [00:06<00:04,  8.72it/s]

tensor(0.1110, grad_fn=<NllLossBackward>)
tensor(0.1109, grad_fn=<NllLossBackward>)


 60%|██████████████████████████████▌                    | 60/100 [00:06<00:04,  8.72it/s]

tensor(0.1109, grad_fn=<NllLossBackward>)
tensor(0.1109, grad_fn=<NllLossBackward>)


 62%|███████████████████████████████▌                   | 62/100 [00:07<00:04,  8.76it/s]

tensor(0.1108, grad_fn=<NllLossBackward>)
tensor(0.1108, grad_fn=<NllLossBackward>)


 64%|████████████████████████████████▋                  | 64/100 [00:07<00:04,  8.60it/s]

tensor(0.1108, grad_fn=<NllLossBackward>)
tensor(0.1107, grad_fn=<NllLossBackward>)


 66%|█████████████████████████████████▋                 | 66/100 [00:07<00:03,  8.58it/s]

tensor(0.1107, grad_fn=<NllLossBackward>)
tensor(0.1106, grad_fn=<NllLossBackward>)


 68%|██████████████████████████████████▋                | 68/100 [00:07<00:03,  8.76it/s]

tensor(0.1106, grad_fn=<NllLossBackward>)
tensor(0.1106, grad_fn=<NllLossBackward>)


 70%|███████████████████████████████████▋               | 70/100 [00:07<00:03,  8.63it/s]

tensor(0.1105, grad_fn=<NllLossBackward>)
tensor(0.1105, grad_fn=<NllLossBackward>)


 72%|████████████████████████████████████▋              | 72/100 [00:08<00:03,  8.20it/s]

tensor(0.1104, grad_fn=<NllLossBackward>)
tensor(0.1104, grad_fn=<NllLossBackward>)


 74%|█████████████████████████████████████▋             | 74/100 [00:08<00:03,  8.65it/s]

tensor(0.1104, grad_fn=<NllLossBackward>)
tensor(0.1103, grad_fn=<NllLossBackward>)


 76%|██████████████████████████████████████▊            | 76/100 [00:08<00:02,  8.62it/s]

tensor(0.1103, grad_fn=<NllLossBackward>)
tensor(0.1103, grad_fn=<NllLossBackward>)


 78%|███████████████████████████████████████▊           | 78/100 [00:08<00:02,  8.54it/s]

tensor(0.1102, grad_fn=<NllLossBackward>)
tensor(0.1102, grad_fn=<NllLossBackward>)


 80%|████████████████████████████████████████▊          | 80/100 [00:09<00:02,  8.63it/s]

tensor(0.1102, grad_fn=<NllLossBackward>)
tensor(0.1101, grad_fn=<NllLossBackward>)


 82%|█████████████████████████████████████████▊         | 82/100 [00:09<00:02,  8.51it/s]

tensor(0.1101, grad_fn=<NllLossBackward>)
tensor(0.1101, grad_fn=<NllLossBackward>)


 84%|██████████████████████████████████████████▊        | 84/100 [00:09<00:01,  8.44it/s]

tensor(0.1101, grad_fn=<NllLossBackward>)
tensor(0.1100, grad_fn=<NllLossBackward>)


 86%|███████████████████████████████████████████▊       | 86/100 [00:09<00:01,  8.46it/s]

tensor(0.1100, grad_fn=<NllLossBackward>)
tensor(0.1100, grad_fn=<NllLossBackward>)


 88%|████████████████████████████████████████████▉      | 88/100 [00:10<00:01,  8.21it/s]

tensor(0.1099, grad_fn=<NllLossBackward>)
tensor(0.1099, grad_fn=<NllLossBackward>)


 90%|█████████████████████████████████████████████▉     | 90/100 [00:10<00:01,  8.24it/s]

tensor(0.1099, grad_fn=<NllLossBackward>)
tensor(0.1099, grad_fn=<NllLossBackward>)


 92%|██████████████████████████████████████████████▉    | 92/100 [00:10<00:00,  8.58it/s]

tensor(0.1098, grad_fn=<NllLossBackward>)
tensor(0.1098, grad_fn=<NllLossBackward>)


 94%|███████████████████████████████████████████████▉   | 94/100 [00:10<00:00,  8.41it/s]

tensor(0.1098, grad_fn=<NllLossBackward>)
tensor(0.1098, grad_fn=<NllLossBackward>)


 96%|████████████████████████████████████████████████▉  | 96/100 [00:11<00:00,  8.57it/s]

tensor(0.1097, grad_fn=<NllLossBackward>)
tensor(0.1097, grad_fn=<NllLossBackward>)


 98%|█████████████████████████████████████████████████▉ | 98/100 [00:11<00:00,  8.59it/s]

tensor(0.1097, grad_fn=<NllLossBackward>)
tensor(0.1097, grad_fn=<NllLossBackward>)


100%|██████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.71it/s]

tensor(0.1096, grad_fn=<NllLossBackward>)
tensor(0.1096, grad_fn=<NllLossBackward>)
Wall time: 11.5 s





In [70]:
X_test_tensor = torch.tensor(X_test.values)

result = net(X_test_tensor)
print(torch.argmax(result))
lst = [int(torch.argmax(elem)) for elem in result]
# lst = [1 if elem[1] > -1.2 else int(torch.argmax(elem)) for elem in result]    ## -1.39; -1.89

print(set(lst))
print(torch.exp(result))

tensor(47143)
{0, 1}
tensor([[0.9912, 0.0088],
        [0.9830, 0.0170],
        [0.9933, 0.0067],
        ...,
        [0.9858, 0.0142],
        [0.9745, 0.0255],
        [0.9802, 0.0198]], grad_fn=<ExpBackward>)


### all the metrics are not bad

In [71]:
from sklearn.metrics import recall_score, accuracy_score, precision_score
accuracy_score(y_test, lst), recall_score(y_test, lst), \
            precision_score(y_test, lst)

(0.9700338123415047, 0.7060020345879959, 0.9137590520079)

### Seems good, but boosting and random forest were better

# Saving model

In [61]:
torch.save(net, "net.pt")

  "type " + obj.__name__ + ". It won't be checked "


# loading and checking the net

In [337]:
loaded_model = torch.load("net.pt")

In [338]:
lst = [1, 1, 0, 1, 0, 0, 0, 1]
lst = torch.Tensor(lst)
lst

tensor([1., 1., 0., 1., 0., 0., 0., 1.])

In [267]:
lst = [1, 0, 0, 0, 0, 0, 0, 2]


CONSTANT = -1.1
lst = torch.Tensor(lst)
result = loaded_model(lst)
print(torch.argmax(result))
lst = [1 if elem[1] > CONSTANT else int(torch.argmax(elem)) for elem in result]    ## -1.39; -1.89
print(lst)
print(torch.exp(result))

tensor(0)
[0]
tensor([[0.9015, 0.0985]], grad_fn=<ExpBackward>)


In [355]:
X_test_tensor = torch.tensor(X_test.values)

result = loaded_model(X_test_tensor)
print(torch.argmax(result))
# lst = [int(torch.argmax(elem)) for elem in result]
lst = [1 if elem[1] > -1.6 else int(torch.argmax(elem)) for elem in result]    ## -1.39; -1.89

print(set(lst))
print(torch.exp(result))

tensor(47313)
{0, 1}
tensor([[0.9847, 0.0153],
        [0.9885, 0.0115],
        [0.9482, 0.0518],
        ...,
        [0.9789, 0.0211],
        [0.9936, 0.0064],
        [0.9894, 0.0106]], grad_fn=<ExpBackward>)


In [356]:
from sklearn.metrics import recall_score, accuracy_score, precision_score
accuracy_score(y_test, lst), \
            recall_score(y_test, lst), \
            precision_score(y_test, lst)

(0.9672020287404903, 0.7862595419847328, 0.8127301420305103)