In [1]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.cuda.is_available()

True

In [2]:
import pandas as pd
import numpy as np
import json

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re
from tqdm.notebook import tqdm
from pymorphy2 import MorphAnalyzer
from functools import partial
from stop_words import get_stop_words

tqdm.pandas()

SEED = 21

In [3]:
PATH = '../data'

In [4]:
df_train = pd.read_csv(os.path.join(PATH, 'train.csv'))
df_val = pd.read_csv(os.path.join(PATH, 'val.csv'))
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [5]:
def norm_form(list_words, morph):
    return [morph.parse(word)[0].normal_form for word in list_words]

def del_stopwords(list_words, stop_words):
    return [word for word in list_words if word not in stop_words]

def transform_data(df):
    df = df.copy()
    df['level_2'] = df['icd10'].str.split('.').apply(lambda x: x[0])
    df['level_1'] = df['icd10'].apply(lambda x: x[0])
    df['symptoms_tokens'] = df['symptoms'] \
        .str.lower() \
        .str.split('[^a-zа-яё]+') \
        .progress_apply(partial(norm_form, morph=MorphAnalyzer())) \
        .progress_apply(partial(del_stopwords, stop_words=get_stop_words('russian')))
    return df

In [6]:
class TargetEncoder:
    def __init__(self):
        self.i2l = None
        self.l2i = None
    
    def fit(self, y):
        self.i2l = dict(enumerate(sorted(set(y))))
        self.l2i = {label: i for i, label in self.i2l.items()}
        
    def transform(self, y):
        default_index = max(self.l2i.values()) + 1
        label = np.array([
            self.l2i[label] if label in self.l2i else default_index 
            for label in y
        ])
        
    def inverse_transforn(self, y):
        pass

In [8]:
# i = 3


# level_2 = cats_level_2[i]
# level_1 = dict_levels[level_2]
# idx = dict_level2idx[level_1]
# idx

In [9]:
X_train = transform_data(df_train)
X_val = transform_data(df_val)
X_test = transform_data(df_test)

  0%|          | 0/5604 [00:00<?, ?it/s]

  0%|          | 0/5604 [00:00<?, ?it/s]

  0%|          | 0/1010 [00:00<?, ?it/s]

  0%|          | 0/1010 [00:00<?, ?it/s]

  0%|          | 0/1011 [00:00<?, ?it/s]

  0%|          | 0/1011 [00:00<?, ?it/s]

In [10]:
target_enc_1 = OneHotEncoder(sparse=False, handle_unknown='ignore')
target_enc_1.fit(X_train[['level_1']])

target_enc_2 = OneHotEncoder(sparse=False, handle_unknown='ignore')
target_enc_2.fit(X_train[['level_2']])

cats_level_1 = target_enc_1.categories_[0]
cats_level_2 = target_enc_2.categories_[0]

dict_levels = dict(zip(*X_train[['level_2', 'level_1']].drop_duplicates().T.values))
dict_level2idx = {cat: i for i, cat in enumerate(cats_level_1)}

In [11]:
tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf.fit(X_train['symptoms_tokens'])

X_train_new = tfidf.transform(X_train['symptoms_tokens']).toarray()
X_val_new = tfidf.transform(X_val['symptoms_tokens']).toarray()
X_test_new = tfidf.transform(X_test['symptoms_tokens']).toarray()



In [12]:
y_train_1 = target_enc_1.transform(X_train[['level_1']])
y_train_2 = target_enc_2.transform(X_train[['level_2']])

y_val_1 = target_enc_1.transform(X_val[['level_1']])
y_val_2 = target_enc_2.transform(X_val[['level_2']])

y_test_1 = target_enc_1.transform(X_test[['level_1']])
y_test_2 = target_enc_2.transform(X_test[['level_2']])

In [13]:
class TextDataset(Dataset):
    def __init__(self, X, y1, y2):
        self.X = X
        self.y1 = y1
        self.y2 = y2

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return {
            'X': torch.FloatTensor(self.X[idx]), 
            'y1': torch.FloatTensor(self.y1[idx]),
            'y2': torch.FloatTensor(self.y2[idx]),
        }

In [14]:
# class ModelClassification(nn.Module):
#     def __init__(self, num_feature, num_classes):
#         super(ModelClassification, self).__init__()
#         self.num_classes = num_classes
#         self.fc1 = nn.Linear(num_feature, 512) # 12 is the number of features
#         self.fc2 = nn.Linear(512, 256)
#         self.fc3 = nn.Linear(256, 128)
#         self.fc4 = nn.Linear(128, 64)
        
#         self.out = nn.Linear(64, 1)
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x)) 
#         x = F.relu(self.fc3(x))
#         x = F.relu(self.fc4(x))
        
#         out = torch.hstack([torch.sigmoid(self.out(x)) for i in range(self.num_classes)])
                
#         return out

In [124]:
# loss_fn = nn.CrossEntropyLoss()

In [125]:
# def binary_loss_fn(outputs, targets):
    
#     first = True
#     for i in range(targets.shape[1]):
#         o = outputs[:, i]
#         t = targets[:, i]
#         if first:
#             loss = nn.BCELoss()(o, t)
#             first = False
#         else:
#             loss += nn.BCELoss()(o, t)
            
#     loss /= len(targets)
    
#     return loss

In [212]:
def hit_at_n(y_true, y_pred, n=3):
    assert len(y_true) == len(y_pred)
    
    score = np.mean(np.any(
        np.argsort(-y_pred, axis=1)[:, :n] == y_true.reshape(-1,1), 
        axis=1
    ))
    return score

In [262]:
from IPython.display import clear_output

In [305]:
# кастомные функции для обучения моделей
import gc

def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()


class Trainer():
    def __init__(self, model, loss_func, opt, device='cpu'):
        self.model = model.to(device)
        self.loss_func = loss_func
        self.opt = opt
        self.device = device
            
    def train_epoch(self, train_iter, epoch):
        loss_value = 0.0
        
        y1 = []
        y2 = []
        y_pred1 = []
        y_pred2 = []
        self.model.train()
        pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
        pbar.set_description(f"Epoch {epoch}")
        for it, batch in pbar: 
            self.opt.zero_grad()
            
            X = batch['X'].to(self.device)
            y = batch['y1'].to(self.device), batch['y2'].to(self.device)
            outputs = self.model(X)

            loss = self.loss_func(outputs, y)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 20)
            self.opt.step()

            loss_value += loss.item()

            y1.append(batch['y1'].numpy().argmax(axis=1))
            y2.append(batch['y2'].numpy().argmax(axis=1))

            y_pred1.append(outputs[0].cpu().detach().numpy())
            y_pred2.append(outputs[1].cpu().detach().numpy())

            pbar.set_description(f"""
                Train Loss: {loss:.4}
            """)

        y1 = np.hstack(y1)
        y2 = np.hstack(y2)

        y_pred1 = np.vstack(y_pred1)
        y_pred2 = np.vstack(y_pred2)
        
        metrics = dict(
            loss = loss_value / len(train_iter),
            hit3_label1 = hit_at_n(y1, y_pred1, n=3),
            hit3_label2 = hit_at_n(y2, y_pred2, n=3),
            precision_label1 = hit_at_n(y1, y_pred1, n=1),
            precision_label2 = hit_at_n(y2, y_pred2, n=1),
        )
        
        return metrics


    def eval_epoch(self, val_iter, epoch):
        loss_value = 0.0
        
        y1 = []
        y2 = []
        y_pred1 = []
        y_pred2 = []
        
        self.model.eval()
        pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
        pbar.set_description(f"Epoch {epoch}")
        with torch.no_grad():
            for it, batch in pbar:
                X = batch['X'].to(self.device)
                y = batch['y1'].to(self.device), batch['y2'].to(self.device)
                outputs = self.model(X)
                loss = self.loss_func(outputs, y)
                loss_value += loss.item()

                y1.append(batch['y1'].numpy().argmax(axis=1))
                y2.append(batch['y2'].numpy().argmax(axis=1))

                y_pred1.append(outputs[0].cpu().detach().numpy())
                y_pred2.append(outputs[1].cpu().detach().numpy())
                
                pbar.set_description(f"""
                    Test Loss: {loss:.4}
                """)

        y1 = np.hstack(y1)
        y2 = np.hstack(y2)

        y_pred1 = np.vstack(y_pred1)
        y_pred2 = np.vstack(y_pred2)
        
        metrics = dict(
            loss = loss_value / len(val_iter),
            hit3_label1 = hit_at_n(y1, y_pred1, n=3),
            hit3_label2 = hit_at_n(y2, y_pred2, n=3),
            precision_label1 = hit_at_n(y1, y_pred1, n=1),
            precision_label2 = hit_at_n(y2, y_pred2, n=1),
        )
        
        return metrics


    def train_loop(self, train_iter, valid_iter, max_epochs, patience):

        min_loss = np.inf

        cur_patience = 0

        for epoch in range(1, max_epochs + 1):
            train_metrics = self.train_epoch(train_iter, epoch)
            clear_cache()
            
            val_metrics = self.eval_epoch(valid_iter, epoch)
            clear_cache()
            
            val_loss = val_metrics['loss']
            if val_loss < min_loss:
                min_loss = val_loss
                best_model = self.model.state_dict()
            else:
                cur_patience += 1
                if cur_patience == patience:
                    cur_patience = 0
                    break
            clear_output()
            print('%20s: %2d' % ('epoch', epoch))
            print()
            print('%20s: %7.4f %3.4f' % ('loss', train_metrics['loss'], val_metrics['loss']))
            print()
            print('%20s: %7.4f %3.4f' % ('hit3_label1', train_metrics['hit3_label1'], val_metrics['hit3_label1']))
            print('%20s: %7.4f %3.4f' % ('precision_label1', train_metrics['precision_label1'], val_metrics['precision_label1']))
            print()
            print('%20s: %7.4f %3.4f' % ('hit3_label2', train_metrics['hit3_label2'], val_metrics['hit3_label2']))
            print('%20s: %7.4f %3.4f' % ('precision_label2', train_metrics['precision_label2'], val_metrics['precision_label2']))

#             print(*[f'{k}: {v}' for k, v in train_metrics.items()])
#             print(*[f'{k}: {v}' for k, v in val_metrics.items()])

        self.model.load_state_dict(best_model)
        
        return None

In [416]:
class ModelClassification(nn.Module):
    def __init__(self, num_feature, num_classes_1=len(cats_level_1), num_classes_2=len(cats_level_2)):
        super(ModelClassification, self).__init__()
        
#         self.fc1 = nn.Linear(num_feature, 256)
#         self.fc2 = nn.Linear(256, 128)
        
        self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(p=0.2)
#         self.batchnorm1 = nn.BatchNorm1d(256)
#         self.batchnorm2 = nn.BatchNorm1d(128)
        
        self.out_2 = nn.Linear(num_feature, num_classes_2)
        self.out_1 = nn.Linear(num_feature, num_classes_1)
        self.fc = nn.Linear(num_classes_1, num_classes_2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
    
    def forward(self, x):
#         x = self.fc1(x)
#         x = self.batchnorm1(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
#         x = self.fc2(x)
#         x = self.batchnorm2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        #x = F.tanh(self.fc2(x))
        #x = F.tanh(self.fc3(x))
        
#         out_1 = F.softmax(self.out_1(x))
#         out_2 = F.softmax(self.relu(self.out_2(x)))
        out_1 = self.out_1(x)
        out_2 = self.out_2(x)
#         out_2_new = []
#         for i in range(out_2.shape[1]):
#             level_2 = cats_level_2[i]
#             level_1 = dict_levels[level_2]
#             j = dict_level2idx[level_1]
#             out_2_new.append(out_2[:, i] * out_1[:, j])
#         out_2_new = torch.vstack(out_2_new).T
        return out_1, out_2

In [417]:
# criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

In [418]:
def loss_fn(outputs, targets):
    o1, o2 = outputs
    t1, t2 = targets
    l1 = nn.CrossEntropyLoss()(o1, t1)
    l2 = nn.CrossEntropyLoss()(o2, t2)
    loss = 0.1*l1 + 0.9*l2
    return l2

In [419]:
train_dataset = TextDataset(X_train_new, y_train_1, y_train_2)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1024)

val_dataset = TextDataset(X_val_new, y_val_1, y_val_2)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=1024)

test_dataset = TextDataset(X_test_new, y_test_1, y_test_2)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1024)



model = ModelClassification(X_train_new.shape[1]) 
optimizer = optim.Adam(params=model.parameters(), lr=0.01, weight_decay=1e-5)
epochs = 100

trainer = Trainer(
    model, 
    loss_func=loss_fn, 
    opt=optimizer, device=torch.device('cuda'))
trainer.train_loop(
    train_dataloader, 
    val_dataloader, 
    max_epochs=1000, 
    patience=100
)

               epoch: 297

                loss:  0.4693 2.5291

         hit3_label1:  0.2068 0.2040
    precision_label1:  0.0814 0.0842

         hit3_label2:  0.9982 0.6317
    precision_label2:  0.9599 0.4059


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [420]:
model.eval()

batch = next(iter(val_dataloader))
X = batch['X'].to(device)
y1 = batch['y1'].numpy().argmax(axis=1)
y2 = batch['y2'].numpy().argmax(axis=1)

y_pred1 = F.softmax(model(X)[0].cpu().detach()).numpy()
y_pred2 = F.softmax(model(X)[1].cpu().detach()).numpy()

print(hit_at_n(y2, y_pred2, n=3), hit_at_n(y2, y_pred2, n=1))

for i in range(y_pred2.shape[1]):
    level_2 = cats_level_2[i]
    level_1 = dict_levels[level_2]
    j = dict_level2idx[level_1]
    y_pred2[:, i] = y_pred2[:, i] * y_pred1[:, j]

print(hit_at_n(y2, y_pred2, n=3), hit_at_n(y2, y_pred2, n=1))

0.6326732673267327 0.404950495049505
0.6326732673267327 0.404950495049505


  y_pred1 = F.softmax(model(X)[0].cpu().detach()).numpy()
  y_pred2 = F.softmax(model(X)[1].cpu().detach()).numpy()


In [421]:
model.eval()

batch = next(iter(test_dataloader))
X = batch['X'].to(device)
y1 = batch['y1'].numpy().argmax(axis=1)
y2 = batch['y2'].numpy().argmax(axis=1)

y_pred1 = F.softmax(model(X)[0].cpu().detach()).numpy()
y_pred2 = F.softmax(model(X)[1].cpu().detach()).numpy()

print(hit_at_n(y2, y_pred2, n=3), hit_at_n(y2, y_pred2, n=1))

for i in range(y_pred2.shape[1]):
    level_2 = cats_level_2[i]
    level_1 = dict_levels[level_2]
    j = dict_level2idx[level_1]
    y_pred2[:, i] = y_pred2[:, i] * y_pred1[:, j]

print(hit_at_n(y2, y_pred2, n=3), hit_at_n(y2, y_pred2, n=1))

0.629080118694362 0.3837784371909001
0.6271018793273986 0.3837784371909001


  y_pred1 = F.softmax(model(X)[0].cpu().detach()).numpy()
  y_pred2 = F.softmax(model(X)[1].cpu().detach()).numpy()
