In [2]:
import yaml
import os

In [3]:
# Read the ClearML credentials from a YAML file in the user's home directory,
# so the keys themselves are not written into the notebook.
with open(os.path.expanduser('~/develop/ClearML_ML_SD.yml'), 'r') as f:
    keys = yaml.safe_load(f)

In [4]:
# Configure the ClearML client through environment variables:
# the three service endpoints first, then the credentials loaded into `keys`.
os.environ["CLEARML_WEB_HOST"] = "https://app.clear.ml"
os.environ["CLEARML_API_HOST"] = "https://api.clear.ml"
os.environ["CLEARML_FILES_HOST"] = "https://files.clear.ml"
os.environ["CLEARML_API_ACCESS_KEY"] = keys['access_key']
os.environ["CLEARML_API_SECRET_KEY"] = keys['secret_key']

In [5]:
from clearml import Task, Logger

In [6]:
# Register this run as a ClearML task named 'bert' in the 'ML_SD' project.
task = Task.init(
    project_name='ML_SD', 
    task_name='bert', 
    tags=['bert'])

ClearML Task: overwriting (reusing) task id=a80888e22dfb4a67b33be4ff8ae23846
2022-10-25 15:02:11,989 - clearml.Task - INFO - No repository found, storing script code instead
ClearML results page: https://app.clear.ml/projects/922c69dbd48249b183708fef50f18e10/experiments/a80888e22dfb4a67b33be4ff8ae23846/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [7]:
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
# Adam with fixes (AdamW) and the learning-rate scheduler
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

import numpy as np
import os

from sklearn.model_selection import train_test_split

In [8]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [9]:
# Global configuration: random seed and the directory that holds the CSV splits.
SEED = 21
PATH = '../data'

# Apply the seed so that runs are repeatable: NumPy's global RNG and PyTorch's
# RNG (the latter drives DataLoader shuffling and the random initialisation of
# the new classification head).
np.random.seed(SEED)
torch.manual_seed(SEED)

In [10]:
# Load the train / validation / test splits from the CSV files under PATH.
df_train = pd.read_csv(os.path.join(PATH, 'train.csv'))
df_val = pd.read_csv(os.path.join(PATH, 'val.csv'))
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'))

In [11]:
def norm_form(list_words, morph):
    """Return the normal form of every word in `list_words`.

    For each word the first parse returned by `morph.parse(word)` is used and
    its `normal_form` attribute is collected, preserving the input order.
    """
    lemmas = []
    for token in list_words:
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

def del_stopwords(list_words, stop_words):
    """Return the words from `list_words` that are not contained in `stop_words`.

    The relative order of the remaining words is preserved.
    """
    kept = []
    for token in list_words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def transform_data(df):
    """Return a copy of `df` with coarser label columns derived from `icd10`.

    Added columns:
        level_2: the part of the `icd10` code before the first '.'.
        level_1: the first character of the `icd10` code.

    The vectorised `.str` accessors are used instead of `apply(lambda x: x[0])`,
    so a missing `icd10` value yields a missing label instead of raising.
    The input frame is not modified.
    """
    df = df.copy()
    df['level_2'] = df['icd10'].str.split('.').str[0]
    df['level_1'] = df['icd10'].str[0]
    # Disabled token-normalisation pipeline, kept for reference.
    # NOTE(review): `partial`, `MorphAnalyzer`, `get_stop_words` and
    # `progress_apply` are not set up in the visible cells; import them before
    # re-enabling this.
    # df['symptoms_tokens'] = df['symptoms'] \
    #     .str.lower() \
    #     .str.split('[^a-zа-яё]+') \
    #     .progress_apply(partial(norm_form, morph=MorphAnalyzer())) \
    #     .progress_apply(partial(del_stopwords, stop_words=get_stop_words('russian')))
    return df

In [12]:
# Add the level_1 / level_2 label columns to every split.
df_train = transform_data(df_train)
df_val = transform_data(df_val)
df_test = transform_data(df_test)

In [13]:
# mask = df_train.groupby('level_2').transform('size') > 10
# df_train = df_train[mask]

In [14]:
# Preview the first rows of the transformed training split.
# NOTE(review): the rendered output shows raw clinical free text — consider
# clearing it before sharing the notebook.
df_train.head()

Unnamed: 0,symptoms,anamnesis,icd10,new_patient_id,new_event_id,new_event_time,level_2,level_1
0,"Состояние улучшилось, жалоб нет.",Принимала - ничего ФЛГ от *ДАТА* - без патолог...,J00,q30c3b31,qad30faf,2027-01-16,J00,J
1,"49лет, активно жалоб не предъявляет Пришла с р...",цикл нерегулярный. Принимает ци-клим непрерывн...,D25.1,q56209d0,q9c869f7,2022-12-09,D25,D
2,появление пигиентных пятен на лице прибавка ве...,Вышеперечисленные жалобы с *ДАТА* ( с периода ...,E04.1,q599e008,q1e98632,2028-11-26,E04,E
3,"Дискомфорт в области верх трети шеи, эпизодиче...","Множественная миома матки, паратубарная киста ...",E06.3,qd92e2f1,q6ad490c,2024-01-07,E06,E
4,Состояние без изменений Жалобы при первичном о...,"Впервые появились боли после подъема тяжесьти,...",K21.0,qc286856,q22bea7e,2024-06-21,K21,K


In [15]:
# Tokenizer matching the 'cointegrated/rubert-tiny2' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')

In [16]:
def convert_comments_to_tensors(comments, max_length=100):
    """Tokenize texts with the module-level `tokenizer` and return BERT inputs.

    Every text is encoded with special tokens added, truncated to `max_length`
    tokens and padded up to `max_length`, so all rows have the same length.

    :param comments: iterable of texts to encode
    :param max_length: fixed sequence length used for truncation and padding
    :return: tuple `(input_ids, attention_mask)` of `torch.long` tensors; the
        mask is 1 where a real token is present and 0 on padding positions
    """
    features = [
        tokenizer(
            comment,
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
        )
        for comment in comments
    ]

    input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)
    attention_mask = torch.tensor([f['attention_mask'] for f in features], dtype=torch.long)
    return input_ids, attention_mask


In [17]:
# Column used as the classification target and column holding the input text.
label_col = 'level_2'
text_col = 'symptoms'

In [18]:
# Tokenize the text column of every split into (input_ids, attention_mask).
X_train = convert_comments_to_tensors(df_train[text_col].values)
X_val = convert_comments_to_tensors(df_val[text_col].values)
X_test = convert_comments_to_tensors(df_test[text_col].values)

# Class ids follow the sorted order of the labels present in the training split.
i2l = dict(enumerate(sorted(df_train[label_col].unique())))
l2i = {label: i for i, label in i2l.items()}


def _encode_labels(labels, label_to_id):
    """Map a Series of labels to an int64 array of class ids.

    Labels missing from `label_to_id` (classes never seen in training) are
    replaced by the smallest class id that does not occur among the known
    labels of this Series. Nothing is looked up when there is nothing to
    replace, so a split containing every class no longer raises IndexError.

    :raises ValueError: if unseen labels exist but every class id is in use
    """
    encoded = labels.map(label_to_id)
    unseen = encoded.isna()
    if unseen.any():
        used_ids = set(encoded[~unseen].astype(int))
        unused_ids = set(label_to_id.values()) - used_ids
        if not unused_ids:
            raise ValueError(
                'Found labels unseen in training but no unused class id to assign to them'
            )
        # min() makes the choice deterministic instead of relying on set order.
        encoded = encoded.fillna(min(unused_ids))
    return encoded.astype(np.int64).values


y_train = _encode_labels(df_train[label_col], l2i)
y_val = _encode_labels(df_val[label_col], l2i)
y_test = _encode_labels(df_test[label_col], l2i)

print("{}/{}/{} - train/val/test split".format(y_train.shape[0], y_val.shape[0], y_test.shape[0]))



5604/1010/1011 - train/val/test split


In [19]:
# Inspect the encoded training inputs: the (input_ids, attention_mask) pair.
X_train

(tensor([[    2, 46039, 75281,  ...,     0,     0,     0],
         [    2,  1562, 11273,  ...,     0,     0,     0],
         [    2, 32279, 51393,  ...,     0,     0,     0],
         ...,
         [    2, 35282,   626,  ...,     0,     0,     0],
         [    2,    17, 70991,  ...,     0,     0,     0],
         [    2,   548, 33073,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [20]:
# Inspect the encoded training labels (class ids).
y_train

array([185,  32,  52, ..., 353,  50, 196])

In [21]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Size the classification head to the number of distinct training classes.
config = BertConfig.from_pretrained(
    'cointegrated/rubert-tiny2', # bert-base-uncased
    num_labels=len(set(y_train))
)

# Load the pretrained encoder weights into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained(
    'cointegrated/rubert-tiny2',
    from_tf=False,
    config=config
)
model.to(device)

2022-10-25 15:02:30,348 - clearml.model - INFO - Selected model id: 748b4796efdf4dd5812a028ad474e0bb


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elemen

In [22]:
def evaluate(data_loader):
    """Evaluate the module-level `model` on every batch of `data_loader`.

    Each batch must provide `(input_ids, attention_mask, labels)` in that order.

    :param data_loader: iterable of batches of tensors
    :return: dict with
        'val_hit3'      - share of samples whose label is among the 3 top-scored classes,
        'val_precision' - share of samples whose label is the top-scored class,
        'val_loss'      - mean of the per-batch losses, as a plain Python float
    """
    total_loss = 0.
    y_true = []
    y_pred = []

    model.eval()  # evaluation mode (disables dropout)
    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)  # outputs[0] is the loss, outputs[1] the logits
            # .item() keeps the running sum a Python float instead of a tensor
            # living on the model's device.
            total_loss += outputs[0].item()
            y_pred.extend(outputs[1].cpu().numpy())
            y_true.extend(batch[2].cpu().numpy())

    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    results = {
        'val_hit3': hit_at_n(y_true, y_pred, n=3), 
        'val_precision': hit_at_n(y_true, y_pred, n=1), 
        'val_loss': total_loss / len(data_loader)
    }

    return results


class EarlyStopping:
    """
    Identify whether a metric has not improved for a certain number of steps.

    :param mode: 'min' if lower metric values are better, 'max' if higher are
    :param min_delta: margin by which a value must beat the best one so far
        to count as an improvement
    :param patience: number of consecutive non-improving `step` calls after
        which training should stop; 0 disables early stopping
    """

    def __init__(self,
                 mode: str = 'min',
                 min_delta: float = 0,
                 patience: int = 10):
        self.mode = mode
        self.min_delta = min_delta
        self.patience = patience

        self.is_better = None
        if patience == 0:
            # Early stopping disabled: every value counts as an improvement.
            self.is_better = lambda *_: True
        else:
            self._init_is_better(mode, min_delta)

        self.best = None
        self.num_bad_epochs = 0

    def step(self, current) -> bool:
        """
        Make decision whether to stop training

        :param current: new metric value (number or single-element tensor)
        :return: whether to stop
        """
        if isinstance(current, torch.Tensor):
            # Work with a plain Python number regardless of the tensor's device.
            current = current.item()
        if np.isnan(current):
            # A NaN metric always requests a stop.
            return True

        if self.patience == 0:
            # Disabled: previously `0 >= 0` made the very first call return True.
            self.best = current
            return False

        if self.best is None:
            self.best = current
        elif self.is_better(current, self.best):
            self.num_bad_epochs = 0
            self.best = current
        else:
            self.num_bad_epochs += 1

        return self.num_bad_epochs >= self.patience

    def _init_is_better(self, mode, min_delta):
        """Set `self.is_better` to the comparison matching `mode`.

        :raises ValueError: if `mode` is neither 'min' nor 'max'
        """
        if mode not in {'min', 'max'}:
            raise ValueError('mode ' + mode + ' is unknown!')
        if mode == 'min':
            self.is_better = lambda value, best: value < best - min_delta
        if mode == 'max':
            self.is_better = lambda value, best: value > best + min_delta
            
            
class ModelCheckpoint:
    """Save a PyTorch model to a fixed file whenever `step` decides to.

    The parent directory of `filepath` is created on construction. The file
    name is used as given; no placeholders are substituted into it.

    # Arguments
        model: PyTorch model object
        filepath: string, path to save the model file.
        mode: one of {min, max}. Direction in which the monitored quantity
            improves: `max` for accuracy-like metrics, `min` for losses.
        save_best_only: if True, the file is (over)written only when the
            value passed to `step` improves on the best value seen so far;
            if False, the file is written on every `step` call.
        save_weights_only: if True, then only the model's weights
            (`state_dict`) will be saved, else the full model is saved.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        filepath: str,
        mode: str = "min",
        save_best_only: bool = True,
        save_weights_only: bool = False,
    ):
        self.model = model
        self.filepath = filepath
        self.mode = mode
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.num_saves = 0  # number of times the file has been written

        # `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
        if mode == "min":
            self.monitor_op = np.less
            self.best = np.inf
        elif mode == "max":
            self.monitor_op = np.greater
            self.best = -np.inf
        else:
            raise ValueError("mode " + mode + " is unknown!")

        Path(self.filepath).parent.mkdir(exist_ok=True, parents=True)

    def _save_model(self):
        """Write the model (or only its state dict) to `filepath`."""
        if self.save_weights_only:
            torch.save(self.model.state_dict(), self.filepath)
        else:
            torch.save(self.model, self.filepath)
        self.num_saves += 1

    def step(self, current, epoch=None):
        """Save the model if `current` warrants it.

        :param current: new value of the monitored quantity (number or
            single-element tensor)
        :param epoch: accepted for interface compatibility; currently unused
        """
        if isinstance(current, torch.Tensor):
            # Compare and store plain Python numbers regardless of device.
            current = current.item()
        if self.save_best_only:
            if not self.monitor_op(current, self.best):
                return
            self.best = current
        self._save_model()

In [23]:
# Optimisation hyper-parameters.
lr = 0.0000125  # usually from 1e-5 until 8e-5
warmup_steps = 50  # scheduler steps of warm-up before the linear decay
num_steps = 12000  # total number of scheduler steps the schedule is built for

# Optimise only trainable parameters; weight decay is switched off.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad],
                   lr=lr, weight_decay=0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_steps)

# Both helpers monitor a metric that should grow (mode 'max'); the training loop
# feeds them val_hit3. Training stops after 8 evaluations without improvement.
early_stopping = EarlyStopping(patience=8, mode='max')
model_checkpoint = ModelCheckpoint(model, 'models/cointegrated_rubert_tiny2.pt', mode="max")



In [24]:
# Training-loop settings.
batch_size = 512
gradient_accumulation_steps = 1
logging_steps = 100  # how often (in optimizer steps) model quality is checked, so that training can be stopped in time
max_grad_norm = 1  # gradient-norm clipping threshold

# Wrap (input_ids, attention_mask, labels) of every split into a dataset.
train_dataset = TensorDataset(X_train[0], X_train[1], torch.LongTensor(y_train))
val_dataset = TensorDataset(X_val[0], X_val[1], torch.LongTensor(y_val))
test_dataset = TensorDataset(X_test[0], X_test[1], torch.LongTensor(y_test))

# Training batches are shuffled and the last incomplete batch is dropped.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Enough epochs to reach num_steps optimizer steps.
num_train_epochs = num_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1
global_step = 0  # number of optimizer steps performed so far
tr_loss, logging_loss = 0.0, 0.0  # running training loss and its value at the last logging point
print('Count of epochs: %s' % num_train_epochs)

Count of epochs: 1201


In [25]:
def hit_at_n(y_true, y_pred, n=3):
    """Share of samples whose true class is among the `n` highest-scored classes.

    :param y_true: 1-D array of true class indices
    :param y_pred: 2-D array of scores, one row per sample and one column per class
    :param n: how many top-scored classes to consider for each sample
    :return: mean hit rate over all samples
    """
    assert len(y_true) == len(y_pred)

    # Negating the scores makes the ascending argsort rank classes best-first.
    ranked_classes = np.argsort(-y_pred, axis=1)
    top_n = ranked_classes[:, :n]
    targets = y_true.reshape(-1, 1)
    hits = (top_n == targets).any(axis=1)
    return np.mean(hits)

In [None]:
# Fine-tuning loop: one optimizer step per `gradient_accumulation_steps`
# batches, validation every `logging_steps` optimizer steps, checkpointing and
# early stopping on val_hit3.
for _ in range(num_train_epochs):
    for step, batch in enumerate(train_dataloader):
        # evaluate() leaves the model in eval mode, so switch back every step.
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        outputs = model(**inputs) # model outputs are tuple: (loss, logits)
        loss = outputs[0]

        if gradient_accumulation_steps > 1:
            # Scale so the accumulated gradient is an average over the micro-batches.
            loss = loss / gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip once, on the fully accumulated gradient, right before the
            # update (clipping after every micro-batch would clip partial sums).
            nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            # Log metrics
            if global_step % logging_steps == 0:
                results = evaluate(val_dataloader)
                results.update({'train_loss': (tr_loss - logging_loss) / logging_steps})
                print('Step {:3}, {}'.format(global_step, ' '.join(['{}: {:<6.4f}'.format(k, v) for k, v in
                                                                  results.items()])))
                logging_loss = tr_loss

                # Saving model checkpoint here if we have improvement
                model_checkpoint.step(results["val_hit3"])

                if early_stopping.step(results['val_hit3']):
                    # Push global_step past num_steps so the outer loop exits too.
                    global_step = num_steps + 1
                    print('Early training stopping!')
                    break

    if global_step > num_steps:
        break

Step 100, val_hit3: 0.1762 val_precision: 0.0980 val_loss: 5.7553 train_loss: 5.9687
2022-10-25 15:03:06,053 - clearml.frameworks - INFO - Found existing registered model id=c34ee1e5e1964e1886b01ec9b6f3733d [/home/dima/files/projects/med/notebooks/models/cointegrated_rubert_tiny2.pt] reusing it.
Step 200, val_hit3: 0.1772 val_precision: 0.0980 val_loss: 5.3528 train_loss: 5.5481
Step 300, val_hit3: 0.1802 val_precision: 0.0980 val_loss: 5.0640 train_loss: 5.1962
Step 400, val_hit3: 0.2089 val_precision: 0.1158 val_loss: 4.8840 train_loss: 4.9538
Step 500, val_hit3: 0.2287 val_precision: 0.1386 val_loss: 4.7361 train_loss: 4.7917
Step 600, val_hit3: 0.2752 val_precision: 0.1416 val_loss: 4.5984 train_loss: 4.6331
Step 700, val_hit3: 0.2970 val_precision: 0.1634 val_loss: 4.4689 train_loss: 4.4822
Step 800, val_hit3: 0.3168 val_precision: 0.1733 val_loss: 4.3581 train_loss: 4.3312
Step 900, val_hit3: 0.3347 val_precision: 0.1911 val_loss: 4.2544 train_loss: 4.1859
Step 1000, val_hit3: 0.