In [1]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import wandb
import os
import random

from datasets import load_dataset, Dataset
from IPython.display import clear_output
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from torch import nn
from torchmetrics.classification import MulticlassF1Score
from pytorch_lightning.loggers import WandbLogger

In [4]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)

Random seed set as 42


In [5]:
ent_idx2labels = ['B-MET', 'B-ECO', 'B-BIN', 'B-CMP', 'B-QUA', 'B-ACT', 'B-INST', 'B-SOC',
                  'I-MET', 'I-ECO', 'I-BIN', 'I-CMP', 'I-QUA', 'I-ACT', 'I-INST', 'I-SOC', 'O']
ent_labels2idx = dict(zip(ent_idx2labels, range(len(ent_idx2labels))))
ent_labels2idx

{'B-MET': 0,
 'B-ECO': 1,
 'B-BIN': 2,
 'B-CMP': 3,
 'B-QUA': 4,
 'B-ACT': 5,
 'B-INST': 6,
 'B-SOC': 7,
 'I-MET': 8,
 'I-ECO': 9,
 'I-BIN': 10,
 'I-CMP': 11,
 'I-QUA': 12,
 'I-ACT': 13,
 'I-INST': 14,
 'I-SOC': 15,
 'O': 16}

In [6]:
train_dataset = load_dataset('MalakhovIlya/RuREBus', split='train')
clear_output()

In [7]:
train_df = pd.DataFrame(train_dataset)
train_df.drop(columns='id', inplace=True)
train_df.reset_index(drop=True, inplace=True)
train_df.head(1)

Unnamed: 0,text,entities,relations
0,"ГОСУДАРСТВЕННАЯ ПРОГРАММА \nРЕСПУБЛИКИ КОМИ "" ...","[T1\tSOC 259 280\tПравовая защищенность, T4\tI...","[R1\tTSK Arg1:T71 Arg2:T72, R2\tGOL Arg1:T74 A..."


In [8]:
def get_relation_list(df: pd.DataFrame, idx: int):
    relation_list = []
    for relation in df['relations'].iloc[idx]:
        vals = relation.replace(':', ' ').split()
        rel_id, cls_id, _, subj_id, _, obj_id = vals
        relation_list.append([
            rel_id, cls_id, subj_id, obj_id
        ])
    return relation_list

def get_sorted_entity_list(df: pd.DataFrame, idx: int):
    entity_list = []
    for entity in df['entities'].iloc[idx]:
        vals = entity.split()
        ent_id, cls_id, b_idx, e_idx = vals[0:4]
        b_idx, e_idx = int(b_idx), int(e_idx)
        words = vals[4:]
        
        entity_list.append([
            b_idx, e_idx, cls_id, words, ent_id
        ])
    
    entity_list = sorted(entity_list)
    return entity_list

def replace_entities_and_relations(df: pd.DataFrame):
    df = df.copy()
    for i in range(len(df)):
        df.at[i, 'entities'] = get_sorted_entity_list(df, i)
        df.at[i, 'relations'] = get_relation_list(df, i)
    return df

In [9]:
train_df = replace_entities_and_relations(train_df)
print('Entity ex.:', train_df.at[0, 'entities'][0])
print('Relation ex.:', train_df.at[0, 'relations'][0])
train_df.head(1)

Entity ex.: [45, 52, 'SOC', ['ЮСТИЦИЯ'], 'T82']
Relation ex.: ['R1', 'TSK', 'T71', 'T72']


Unnamed: 0,text,entities,relations
0,"ГОСУДАРСТВЕННАЯ ПРОГРАММА \nРЕСПУБЛИКИ КОМИ "" ...","[[45, 52, SOC, [ЮСТИЦИЯ], T82], [55, 66, BIN, ...","[[R1, TSK, T71, T72], [R2, GOL, T74, T73], [R3..."


In [10]:
def get_entity_labeled_text(df: pd.DataFrame, idx: int):
    margin = 0
    text = df.at[idx, 'text']
    for entity in df.at[idx, 'entities']:
        b_idx, e_idx, cls_id, words, _ = entity
        b_idx, e_idx = b_idx + margin, e_idx + margin
        
        text_len_in_beginning = len(text)
        text = text[:b_idx] + f'__B-{cls_id}__' + text[b_idx:e_idx] + f'__E-{cls_id}__' + text[e_idx:]
        margin += len(text) - text_len_in_beginning
        
    return text

def get_splited_text_and_ner_tags(df: pd.DataFrame, idx: int):
    ner_tags = []
    words = get_entity_labeled_text(df, idx).split()
    
    e_padded_label = None
    for word_idx, word in enumerate(words):
        if e_padded_label is not None:
            i_label = e_padded_label.replace('_', '').replace('E-', 'I-')
            ner_tags.append(ent_labels2idx[i_label])
            if e_padded_label in word:
                words[word_idx] = words[word_idx].replace(e_padded_label, '')
                e_padded_label = None
            continue
        
        b_padded_label = None
        for b_label in ent_idx2labels[:8]:
            b_padded_label = f'__{b_label}__'
            if b_padded_label in word:
                words[word_idx] = words[word_idx].replace(b_padded_label, '')
                ner_tags.append(ent_labels2idx[b_label])
                e_padded_label = b_padded_label.replace('B-', 'E-')

        if e_padded_label is not None and e_padded_label in word:
            words[word_idx] = words[word_idx].replace(e_padded_label, '')
            e_padded_label = None
            continue
        elif e_padded_label is None:
            ner_tags.append(ent_labels2idx['O'])
    
    return words, ner_tags

def replace_text_and_add_entity_tags(df: pd.DataFrame):
    df = df.copy()
    df['ner_tags'] = None
    for i in range(len(df)):
        words, ner_tags = get_splited_text_and_ner_tags(df, i)
        df.at[i, 'text'] = words
        df.at[i, 'ner_tags'] = ner_tags
    return df            

In [11]:
train_df = replace_text_and_add_entity_tags(train_df)
print('NER tags ex.:', train_df.at[2, 'ner_tags'])
print('Text ex.:', train_df.at[2, 'text'])
train_df.head(1)

NER tags ex.: [16, 16, 4, 16, 16, 6, 16, 16, 16, 16, 16, 6, 14, 14, 16, 16, 1, 9, 16, 4, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4, 12, 16, 1, 9, 16, 2, 5, 13, 16, 16, 16, 16, 16, 16, 16, 16, 5, 13, 13, 13, 13, 2, 16, 16, 16, 6, 16, 6, 14, 14, 14, 16]
Text ex.: ['Ответственность', 'за', 'достоверность', 'представляемых', 'в', 'Министерство', 'сведений', 'и', 'отчетов', 'возлагается', 'на', 'органы', 'местного', 'самоуправления', '.', '4.', 'Средства', 'субвенций', 'являются', 'целевыми', 'и', 'не', 'могут', 'быть', 'использованы', 'по', 'иному', 'назначению', '.', 'Не', 'целевое', 'использование', 'средств', 'субвенций', 'влечет', 'применение', 'мер', 'ответственности', 'в', 'соответствии', 'с', 'законодательством', 'Российской', 'Федерации', '.', '5.', 'Контроль', 'за', 'целевым', 'использованием', 'субвенций', 'осуществляется', 'в', 'установленном', 'порядке', 'Министерством', 'и', 'Министерством', 'финансов', 'Республики', 'Коми', '.']


Unnamed: 0,text,entities,relations,ner_tags
0,"[ГОСУДАРСТВЕННАЯ, ПРОГРАММА, РЕСПУБЛИКИ, КОМИ,...","[[45, 52, SOC, [ЮСТИЦИЯ], T82], [55, 66, BIN, ...","[[R1, TSK, T71, T72], [R2, GOL, T74, T73], [R3...","[16, 16, 16, 16, 16, 7, 16, 2, 7, 16, 16, 16, ..."


In [12]:
def tokenize_and_align_labels(example, label_all_tokens=False):
    tokenized_inputs = tokenizer(example['text'], truncation=True, is_split_into_words=True, padding='max_length', max_length=512) 
    labels = [] 
    for i, label in enumerate(example['ner_tags']): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token. 
        previous_word_idx = None 
        label_ids = []
        # Special tokens like `<s>` and `<\s>` are originally mapped to None 
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token                 
                label_ids.append(label[word_idx]) 
            else: 
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                # mask the subword representations after the first subword
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs


def split_texts(df: pd.DataFrame):
    df = df.copy()
    rows_to_drop = []
    
    for i in range(len(df)):
        text = df.at[i, 'text']
        ner_tags = df.at[i, 'ner_tags']
        
        if len(text) > 30:
            rows_to_drop.append(i)
            
            num_of_parts = len(text) // 30
            part_size = len(text) // num_of_parts
            
            past_margin = 0
            for i in range(num_of_parts - 1):
                part_end = part_size*i + part_size - 1
                cur_margin = 0
                while ner_tags[part_end - cur_margin] != ent_labels2idx['O']:
                    cur_margin += 1
                text_part = text[part_size*i - past_margin:part_size*i + part_size - cur_margin]
                ner_tags_part = ner_tags[part_size*i - past_margin:part_size*i + part_size - cur_margin]
                if len(text_part) > 0:
                    df = df.append({'text': text_part, 'ner_tags': ner_tags_part}, ignore_index=True)
                past_margin = cur_margin
            
            text_part = text[part_size*(num_of_parts - 1) - past_margin:]
            ner_tags_part = ner_tags[part_size*(num_of_parts - 1) - past_margin:]
            if len(text_part) > 0:
                df = df.append({'text': text_part, 'ner_tags': ner_tags_part}, ignore_index=True)
        
    df.drop(rows_to_drop, inplace=True)
    df.reset_index(drop=True, inplace=True)
    # return Dataset.from_pandas(df)
    return df


tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
train_dataset_formated = Dataset.from_pandas(split_texts(train_df[['text', 'ner_tags']]))
tokenized_train_dataset = train_dataset_formated.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

# Baseline Models (???)

In [13]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), pd.Series(tokenized_train_dataset['labels']).explode().astype(str))
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), pd.Series(tokenized_train_dataset['labels']).explode().astype(str))

0.9407650615794819

In [14]:
exploded_values=pd.Series(tokenized_train_dataset['labels']).explode()
exploded_values=pd.DataFrame(exploded_values,columns=['B'])

most_frequent_elem_by_doc=pd.Series(tokenized_train_dataset['labels']).apply(lambda x:  max(set(x), key=x.count))
most_frequent_elem_by_doc=pd.DataFrame(most_frequent_elem_by_doc,columns=list('A'))

df_most_freq_token=exploded_values.merge(most_frequent_elem_by_doc, how='right', left_index=True, right_index=True)

dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])
dummy_clf.score(pd.Series(tokenized_train_dataset['input_ids']).explode(), df_most_freq_token['A'])

1.0

# LaBSE NER

In [15]:
class NerDataset(torch.utils.data.Dataset):

    def __init__(self, hf_dataset, num_classes=10):
        self.dataset = hf_dataset
        self.num_classes = num_classes

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        input_ids = torch.Tensor(self.dataset['input_ids'][idx]).type(torch.LongTensor)
        attention_mask = torch.Tensor(self.dataset['attention_mask'][idx])
        labels = torch.Tensor(self.dataset['labels'][idx])
        
        model_input = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        sample = [model_input, labels]

        return sample

In [16]:
class NerModule(pl.LightningDataModule):
    def __init__(self, hf_dataset, batch_size, num_classes):
        super().__init__()
        self.dataset = hf_dataset
        self.batch_size = batch_size
        self.num_classes = num_classes
    
    def prepare_data(self):
        pass
    
    def setup(self, stage=None):
        ner_full = NerDataset(hf_dataset=self.dataset, num_classes=self.num_classes)
        train_size = int(0.8*len(ner_full))
        test_size = int(0.15*len(ner_full))
        val_size = len(ner_full) - train_size - test_size
        self.ner_train, self.ner_test, self.ner_val = torch.utils.data.random_split(ner_full, [train_size, test_size, val_size])
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.ner_train, batch_size=self.batch_size, shuffle=True)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.ner_test, batch_size=self.batch_size)
    
    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.ner_val, batch_size=self.batch_size)

In [17]:
class NERPredictionLogger(pl.callbacks.Callback):
    def __init__(self, val_samples, num_samples=32):
        super().__init__()
        self.num_samples = num_samples
        self.val_inputs, self.val_labels = val_samples
    
    def on_validation_epoch_end(self, trainer, pl_module):
        # Bring the tensors to CPU
        val_inputs = {
            'input_ids': self.val_inputs['input_ids'].to(device=pl_module.device),
            'attention_mask': self.val_inputs['attention_mask'].to(device=pl_module.device),
        }
        val_labels = self.val_labels.to(device=pl_module.device)
        # Get model prediction
        logits = pl_module(val_inputs)['logits']
        gr_truths = []
        preds = []
        texts = []
        for i in range(logits.shape[0]):
            clean_inst = val_labels[i] != -100
            preds.append(logits[i][clean_inst].argmax(dim=1))
            gr_truths.append(val_labels[i][clean_inst].type(torch.LongTensor))
            # print(val_labels[i])
            # print(val_inputs['input_ids'][i])
            # print(val_inputs['input_ids'][i][clean_inst])
            # print(tokenizer.decode(val_inputs['input_ids'][i][clean_inst]))
            texts.append(tokenizer.decode(val_inputs['input_ids'][i], skip_special_tokens=True, clean_up_tokenization_spaces=True))
        # Log the results as wandb Table
        
        table = wandb.Table(columns=['Text', 'True Entities', 'Predicted Entities'])
        for i in range(logits.shape[0]):
            text = ''.join(texts[i])
            gr_truth = ', '.join(list(map(str, gr_truths[i].tolist())))
            pred = ', '.join(list(map(str, preds[i].tolist())))
            table.add_data(text, gr_truth, pred)
        
        trainer.logger.experiment.log({"examples": table})

In [18]:
class NerNet(pl.LightningModule):
    def __init__(self, num_classes=10, learning_rate=2e-5):
        super().__init__()
        
        # log hyperparameters
        self.save_hyperparameters()
        
        self.model = AutoModelForTokenClassification.from_pretrained('surdan/LaBSE_ner_nerel')
        self.model.classifier = nn.Sequential(
            # nn.Linear(768, 2048),
            # nn.BatchNorm1d(512),
            # nn.ReLU(),
            # nn.Linear(2048, 2048),
            # nn.BatchNorm1d(512),
            # nn.ReLU(),
            # nn.Dropout(0.5),
            nn.Linear(768, num_classes),
        )
        
        for param in self.model.parameters():
            param.requires_grad = True
        
        # for child_idx, child in enumerate(self.model.children()):
        #     if child_idx == 0:
        #         pass
        #         for g_child_idx, g_child in enumerate(child.children()):
        #             if g_child_idx == 1:
        #                 for g_g_child_idx, g_g_child in enumerate(next(g_child.children())):
        #                     if g_g_child_idx >= 11:
        #                         for param in g_g_child.parameters():
        #                             param.requires_grad = True
        #     else:
        #         for param in child.parameters():
        #             param.requires_grad = True
        
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.loss = nn.BCEWithLogitsLoss()
        self.f1 = MulticlassF1Score(num_classes=num_classes)
    
    def forward(self, x):
        x = self.model(**x)
        return x
    
    def count_loss_and_metrics(self, y_hat, y):
        f1_score = 0
        loss = 0
        y_hat = y_hat['logits']
        for i in range(y.shape[0]):
            clean_inst = y[i] != -100
            y_hat_cleaned = y_hat[i][clean_inst].cuda()
            y_cleaned = y[i][clean_inst].type(torch.LongTensor).cuda()
            
            f1_score += self.f1(y_hat_cleaned.argmax(dim=1), y_cleaned)
            loss += self.loss(y_hat_cleaned.type(torch.FloatTensor), 
                              F.one_hot(y_cleaned, num_classes=self.num_classes).type(torch.FloatTensor))

        return loss, loss / y.shape[0], f1_score / y.shape[0]
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss, loss_meaned, f1_score = self.count_loss_and_metrics(y_hat, y)
        
        self.log('train_loss', loss_meaned, on_step=True, on_epoch=True, logger=True)
        self.log('train_F1', f1_score, on_step=True, on_epoch=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss, loss_meaned, f1_score = self.count_loss_and_metrics(y_hat, y)

        self.log('val_loss', loss_meaned, prog_bar=True)
        self.log('val_F1', f1_score, prog_bar=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss, loss_meaned, f1_score = self.count_loss_and_metrics(y_hat, y)

        self.log('test_loss', loss_meaned, prog_bar=True)
        self.log('test_F1', f1_score, prog_bar=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

In [19]:
dm = NerModule(hf_dataset=tokenized_train_dataset,
               batch_size=2,
               num_classes=len(ent_idx2labels))
# To access the x_dataloader we need to call prepare_data and setup.
dm.prepare_data()
dm.setup()

# Samples required by the custom ImagePredictionLogger callback to log image predictions.
val_samples = next(iter(dm.val_dataloader()))

In [20]:
model = NerNet(num_classes=dm.num_classes)

# Initialize wandb logger
wandb_logger = WandbLogger(project='rurebus-ner', job_type='train')

# Initialize Callbacks
early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_loss")
checkpoint_callback = pl.callbacks.ModelCheckpoint()

# Initialize a trainer
trainer = pl.Trainer(max_epochs=5,
                     accelerator='gpu',
                     devices=1,
                     logger=wandb_logger,
                     resume_from_checkpoint='rurebus-ner/1uxtfhyy/checkpoints/epoch=1-step=1970.ckpt',
                     callbacks=[NERPredictionLogger(val_samples),
                                checkpoint_callback],
                     log_every_n_steps=1,
                     )

# Train the model ⚡🚅⚡
trainer.fit(model, dm)

# Evaluate the model on the held-out test set ⚡⚡
trainer.test(dataloaders=dm.test_dataloader())

# Close wandb run
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mleon-1402[0m ([33mcorgi-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  ckpt_path = ckpt_path or self.resume_from_checkpoint
Restoring states from the checkpoint path at rurebus-ner/1uxtfhyy/checkpoints/epoch=1-step=1970.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | BertForTokenClassification | 127 M 
1 | loss  | BCEWithLogitsLoss          | 0     
2 | f1    | MulticlassF1Score          | 0     
-----------------------------------------------------
127 M     Trainable params
0         Non-trainable params
127 M     Total params
511.070   Total estimated model params size (MB)
Restored all states from the checkpoint file at rurebus-ner/1uxtfhyy/checkpoints/epoch=1-step=1970.ckpt


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 985it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
  rank_zero_warn(
Restoring states from the checkpoint path at .\rurebus-ner\22wbvvh8\checkpoints\epoch=2-step=5908.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at .\rurebus-ner\22wbvvh8\checkpoints\epoch=2-step=5908.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]