# Dataset Preparation

In [1]:
from pathlib import Path
import pandas as pd

DATASET_PATH = Path('public_data')

In [2]:
train = pd.read_json(DATASET_PATH / 'train.jsonl', lines=True)

In [3]:
train.head()

Unnamed: 0,ners,sentences,id
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4


In [31]:
full_labels = set(train.ners.apply(lambda x: [y[2] for y in x]).sum())

To train the model to predict continuous labels, I splitted labels into two groups. One starts with `B-` and denote a first token within an entity. Other starts with `I-` and denote a token in the middle of the entity. Using this strategy, I will be able to build a token classifier.

In [5]:
splitted_labels = {
    'B-' + x for x in full_labels
}.union({'I-' + x for x in full_labels})

label2id = {
    v: k
    for k, v in enumerate(splitted_labels)
}

id2label = {
    k: v
    for k, v in enumerate(splitted_labels)
}

# Tokenization

Tokenization is performed with a usual Treebank tokenizer from NLTK package.

In [None]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

In [9]:
def build_ner(ner, spans):
    x = 0
    while x + 1 < len(spans) and spans[x + 1][0] <= ner[0]:
        x += 1
    ls = x
    
    x = len(spans) - 1
    while x - 1 >= 0 and spans[x - 1][1] >= ner[1]:
        x -= 1

    rp = x
    
    return ls, rp

def get_labels(row):
    """Converts (TAG, BEGIN, END) to label vectors for each token"""
    ners, spans = row.ners, row.spans
    labels = [[0 for _ in splitted_labels] for _ in spans]
    for _, n in enumerate(ners):
        i, j = build_ner(n, spans)
        labels[i][label2id['B-' + n[2]]] = 1        # first token is B-
        
        for k in range(i + 1, j + 1):               # rest are I-
            labels[k][label2id['I-' + n[2]]] = 1
    
    return labels

In [8]:
train['sentences'] = train.sentences.apply(lambda x: x.replace('«', '\"').replace('»', '\"'))
train['tokens'] = train.sentences.apply(lambda s: [(y, x[0], x[1] - 1) for y, x in zip(tokenizer.tokenize(s), tokenizer.span_tokenize(s))])
train['spans'] =  train.sentences.apply(lambda s: [(x, y - 1) for x, y in tokenizer.span_tokenize(s)])
train['labels'] = train.apply(get_labels, axis=1)
train

Unnamed: 0,ners,sentences,id,tokens,spans
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0,"[(Бостон, 0, 5), (взорвали, 7, 14), (Тамерлан,...","[(0, 5), (7, 14), (16, 23), (25, 25), (27, 32)..."
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1,"[(Умер, 0, 3), (избитый, 5, 11), (до, 13, 14),...","[(0, 3), (5, 11), (13, 14), (16, 19), (21, 28)..."
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2,"[(Путин, 0, 4), (подписал, 6, 13), (распоряжен...","[(0, 4), (6, 13), (15, 26), (28, 28), (30, 35)..."
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3,"[(Бенедикт, 0, 7), (XVI, 9, 11), (носил, 13, 1...","[(0, 7), (9, 11), (13, 17), (19, 34), (36, 39)..."
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4,"[(Обама, 0, 4), (назначит, 6, 13), (в, 15, 15)...","[(0, 4), (6, 13), (15, 15), (17, 25), (27, 29)..."
...,...,...,...,...,...
514,"[[42, 46, COUNTRY], [82, 87, COUNTRY], [104, 1...",Глава Малайзии: мы не хотим противостоять Кита...,514,"[(Глава, 0, 4), (Малайзии, 6, 13), (:, 14, 14)...","[(0, 4), (6, 13), (14, 14), (16, 17), (19, 20)..."
515,"[[1, 4, PRODUCT], [31, 33, FACILITY], [35, 44,...","""Союз"" впервые пристыковался к МКС за 6 часов\...",515,"[(``, 0, 0), (Союз, 1, 4), ('', 5, 5), (впервы...","[(0, 0), (1, 4), (5, 5), (7, 13), (15, 27), (2..."
516,"[[0, 4, PERSON], [8, 12, PERSON], [45, 52, AGE...",Трамп и Путин сделали совместное заявление к 7...,516,"[(Трамп, 0, 4), (и, 6, 6), (Путин, 8, 12), (сд...","[(0, 4), (6, 6), (8, 12), (14, 20), (22, 31), ..."
517,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ...",Российский магнат устроил самую дорогую свадьб...,517,"[(Российский, 0, 9), (магнат, 11, 16), (устрои...","[(0, 9), (11, 16), (18, 24), (26, 30), (32, 38..."


In [13]:
t = train.copy()
t.tokens = t.tokens.apply(lambda x: [y[0] for y in x])
t = t[['tokens', 'labels']]

all_tokens = t.tokens.sum()
full_labels = t.labels.sum()

len(all_tokens), len(full_labels)

(135505, 135505)

In [14]:
WINDOW = 32 # A number of words per one model run

sequences, labels = [], []

for t in range(len(all_tokens) - WINDOW):
    sequences.append(all_tokens[t:t+WINDOW])
    labels.append(full_labels[t:t+WINDOW])
    
len(sequences), len(labels)

(135473, 135473)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

Pair = tuple[int, int]

class NERDataset(Dataset):
    def __init__(
        self,
        batch_tokens: list[list[tuple[int, int, str]]],
        batch_labels: list[list[int]],
    ):
        self._tokens = sum(batch_tokens, [])
        self._ners = sum(batch_labels, [])
        
    def __len__(self):
        return len(self._tokens)

    def __getitem__(self, index):
        return self._tokens[index], self._ners[index]
    

def vectorize(word: str) -> torch.Tensor:
    if word not in embeddings:
        # If not found, return sum of embeddings of all characters
        vec = torch.stack([
            torch.as_tensor(embeddings.get(x, embeddings['<unk>']))
            for x in word
        ], dim=0).mean(dim=0)
    else:
        vec = embeddings.get(word)
    return torch.as_tensor(vec)
        
class NERLoader(DataLoader):

    def __init__(self, dataset: NERDataset, *args, **kwargs):
        if not isinstance(dataset, NERDataset):
            raise ValueError('NERLoader only supports NERDataset')
        
        def collate(batch):
            tokens, ners = zip(*batch)
            inputs = torch.stack([
                vectorize(x[0])
                for x in tokens
            ])

            # (batch, tokens)
            # (batch, cls)
            return inputs, torch.tensor(ners, dtype=torch.long)
        
        super().__init__(dataset, *args, **kwargs, collate_fn=collate)

In [None]:
train_dataset = NERDataset(train.tokens.to_list(), train.labels.to_list())
train_loader = NERLoader(train_dataset, batch_size=32)

In [None]:
import torch.nn as nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.gru = nn.GRU(300, 512, 16, batch_first=True)
        self.class_head = nn.Sequential(
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 58)
        )
        
    def forward(self, input_sequence: torch.Tensor):
        output, _ = self.gru(input_sequence)
        return self.class_head(output)

In [None]:
model = SimpleModel()

In [None]:
losses = []
crit = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.train()
model = model.to(device)
for spans, labels in train_loader:
    spans = spans.to(device)
    labels = labels.to(device)
    outputs = model(spans)
    
    loss = crit(outputs, labels.to(torch.float))
    losses.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    

In [15]:
batched = pd.DataFrame([*zip(sequences, labels)], columns=['tokens', 'label'])
batched.head()

Unnamed: 0,tokens,label
0,"[Бостон, взорвали, Тамерлан, и, Джохар, Царнае...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[взорвали, Тамерлан, и, Джохар, Царнаевы, из, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[Тамерлан, и, Джохар, Царнаевы, из, Северного,...","[[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"[и, Джохар, Царнаевы, из, Северного, Кавказа, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[Джохар, Царнаевы, из, Северного, Кавказа, 19,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [16]:
train_split = batched.sample(frac=0.9)
val_split = batched[~batched.index.isin(train_split.index)]

len(train_split), len(val_split), len(train)

(121926, 13547, 519)

In [17]:
token_label = train_split.copy()


In [18]:
from datasets import DatasetDict, Dataset

dataset = DatasetDict({
    'train': Dataset.from_pandas(train_split.reset_index(drop=True)),
    'val': Dataset.from_pandas(val_split.reset_index(drop=True))
})

In [19]:
from transformers import AutoTokenizer, BertModel, BertConfig

bert_tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
import torch
import torch.nn as nn

class BertForNER(BertModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)
        self.classifier_head = nn.Sequential(
            *[nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size),
                nn.LeakyReLU(),
                nn.Dropout(.2),
            )] * 6,
            nn.Linear(config.hidden_size, 58)
        )
        self.__pos_weight = torch.full((1, 1, 58), 5)
        
    def forward(self, return_loss = True, **kwargs):
        labels = kwargs.pop('labels', None)
        kwargs.pop('output_hidden_states', None)
        
        output = super().forward(**kwargs, return_dict=True, output_hidden_states=True)
        preds = self.classifier_head(output.hidden_states[-1])
        
        output['predictions'] = preds
        if labels is not None:
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                preds,
                labels,
                pos_weight=self.__pos_weight.to(preds.device)
            )     
        else:
            loss = None       
    
        return loss, output if labels is not None and return_loss else output

In [21]:
model = BertForNER.from_pretrained('DeepPavlov/rubert-base-cased')

  return self.fget.__get__(instance, owner)()


Some weights of BertForNER were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['bert.classifier_head.0.0.bias', 'bert.classifier_head.0.0.weight', 'bert.classifier_head.1.0.bias', 'bert.classifier_head.1.0.weight', 'bert.classifier_head.2.0.bias', 'bert.classifier_head.2.0.weight', 'bert.classifier_head.3.0.bias', 'bert.classifier_head.3.0.weight', 'bert.classifier_head.4.0.bias', 'bert.classifier_head.4.0.weight', 'bert.classifier_head.5.0.bias', 'bert.classifier_head.5.0.weight', 'bert.classifier_head.6.bias', 'bert.classifier_head.6.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:

def prepare_dataset(batch):
    tokens = [bert_tokenizer.convert_tokens_to_ids(x) for x in batch['tokens']]
        
    input_ids = torch.tensor(tokens, dtype=torch.long)
    labels = torch.tensor(batch['label'], dtype=torch.float32)
    
    return {
        'input_ids': input_ids,
        'labels': labels
    }    

In [23]:
ds = dataset.map(prepare_dataset, batched=True, batch_size=256).remove_columns('tokens').save_to_disk('./dataset/tokenized')

In [None]:
# from datasets import DatasetDict

# ds = DatasetDict.load_from_disk('./dataset/tokenized')

In [29]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir='./bert',
    per_device_eval_batch_size=700,
    per_device_train_batch_size=700,
    num_train_epochs=15,
    run_name='bert-ner-class',
    evaluation_strategy='steps',
    logging_steps=25,
    save_steps=1000,
    eval_steps=150,
    remove_unused_columns=False,
    report_to=None
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
trainer.train()



Step,Training Loss,Validation Loss
150,0.1076,0.100966
300,0.0958,0.092055
450,0.0878,0.083609
600,0.0718,0.067447
750,0.0634,0.05826


TrainOutput(global_step=885, training_loss=0.10720483192616263, metrics={'train_runtime': 3170.7864, 'train_samples_per_second': 576.794, 'train_steps_per_second': 0.279, 'total_flos': 3.029757995472768e+16, 'train_loss': 0.10720483192616263, 'epoch': 15.0})

In [None]:
model = BertForNER.from_pretrained('/shared/nlp/bert/checkpoint-400')

Some weights of the model checkpoint at /shared/nlp/bert/checkpoint-400 were not used when initializing BertForNER: ['classifier_head.bias', 'classifier_head.weight']
- This IS expected if you are initializing BertForNER from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNER from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForNER were not initialized from the model checkpoint at /shared/nlp/bert/checkpoint-400 and are newly initialized: ['classifier_head.0.0.bias', 'classifier_head.0.0.weight', 'classifier_head.1.0.bias', 'classifier_head.1.0.weight', 'classifier_head.2.0.bias', 'classifier_head.2.0.weight', 'classifier_head.3.0.bias', 'classifier_head.3.0.weight', 'classifi

In [32]:
from tqdm import tqdm

device = 'cuda:1' if torch.cuda.is_available() else 'cpu'
model.to(device)
accuracy, recall, precision, n = 0, 0, 0, 0
for batch in tqdm(ds['val']):
    tokens = torch.tensor(batch['input_ids'], dtype=torch.long, device=device).unsqueeze(0)
    labels = torch.tensor(batch['labels'], dtype=torch.long, device=device).unsqueeze(0)
    outputs = model(input_ids=tokens)[1].predictions

    
    tags = (torch.nn.functional.sigmoid(outputs).squeeze() > .5).to(torch.long)
    tp = ((labels == 1) & (tags == 1)).sum()
    tn = ((labels == 0) & (tags == 0)).sum()
    fp = ((labels == 0) & (tags == 1)).sum()
    fn = ((labels == 1) & (tags == 0)).sum()
    precision += tp / max(1, (tp + fp))
    recall += tp / max(1, (tp + fn))
    accuracy += (tp + tn) / labels.numel()
    n += 1
    
accuracy / n, precision / n, recall / n

  0%|          | 2/13547 [00:00<11:41, 19.30it/s]

  1%|          | 120/13547 [00:12<23:34,  9.49it/s]


KeyboardInterrupt: 

In [45]:
import torch

def convert_to_submit(labels: list[torch.Tensor], spans: list[tuple[int, int]]):
    # labels of shape (sequence_length, num_classes), binary tensor (0 or 1)
    # spans are pairs (begin, end)
    assert len(labels) == len(spans), (len(labels), len(spans))
    
    start_label_ids = {v for k, v in label2id.items() if k.startswith('B-')}
    # segment_label_ids = {v for k, v in label2id if k.startswith('I-')}
    
    ners = []
    current_ners = []
    for i, label in enumerate(labels):
        index = torch.arange(58)
        predicted = index[label.cpu() == 1].tolist()
        expanded = [False] * len(current_ners)
        new_ners = []
        for p in predicted:
            if p not in start_label_ids:
                for j, c in enumerate(current_ners):
                    if c[0] == id2label[p][2:]:
                        # expanding
                        c[2] = spans[i][1]
                        expanded[j] = True
            else:
                new_ners.append([id2label[p][2:], *spans[i]])
        
        ners.extend([c for j, c in enumerate(current_ners) if not expanded[j]])
        current_ners = [c for j, c in enumerate(current_ners) if expanded[j]]
        current_ners += new_ners
        
    return ners
        

In [53]:
text = train.sentences.iloc[0][:256]
tokens = tokenizer.tokenize(text)
spans = [*tokenizer.span_tokenize(text)]
bert_tokens = list(map(bert_tokenizer.convert_tokens_to_ids, tokens))

text, tokens, spans, bert_tokens

('Бостон взорвали Тамерлан и Джохар Царнаевы из Северного Кавказа\n\n19 апреля 2013 года в пригороде Бостона  проходит спецоперация по поимке 19-летнего Джохара Царнаева, подозреваемого в теракте на Бостонском марафоне 15 апреля и в смертельном ранении полицей',
 ['Бостон',
  'взорвали',
  'Тамерлан',
  'и',
  'Джохар',
  'Царнаевы',
  'из',
  'Северного',
  'Кавказа',
  '19',
  'апреля',
  '2013',
  'года',
  'в',
  'пригороде',
  'Бостона',
  'проходит',
  'спецоперация',
  'по',
  'поимке',
  '19',
  '-',
  'летнего',
  'Джохара',
  'Царнаева',
  ',',
  'подозреваемого',
  'в',
  'теракте',
  'на',
  'Бостонском',
  'марафоне',
  '15',
  'апреля',
  'и',
  'в',
  'смертельном',
  'ранении',
  'полицей'],
 [(0, 6),
  (7, 15),
  (16, 24),
  (25, 26),
  (27, 33),
  (34, 42),
  (43, 45),
  (46, 55),
  (56, 63),
  (65, 67),
  (68, 74),
  (75, 79),
  (80, 84),
  (85, 86),
  (87, 96),
  (97, 104),
  (106, 114),
  (115, 127),
  (128, 130),
  (131, 137),
  (138, 140),
  (140, 141),
  (141, 1

In [54]:
outputs = model(input_ids=torch.tensor(bert_tokens, dtype=torch.long, device=device).unsqueeze(0))[1].predictions
outputs.shape

torch.Size([1, 39, 58])

In [55]:
torch.nn.functional.sigmoid(outputs)

tensor([[[4.5447e-01, 4.2160e-01, 1.0310e-02,  ..., 6.9357e-02,
          1.1827e-02, 1.3931e-01],
         [6.2095e-02, 2.4527e-02, 9.8676e-04,  ..., 3.5091e-03,
          6.2504e-04, 3.8328e-01],
         [3.1347e-02, 8.7105e-02, 6.0223e-02,  ..., 1.7785e-01,
          9.8626e-02, 1.1556e-01],
         ...,
         [2.9908e-01, 2.2549e-01, 1.5095e-02,  ..., 5.4263e-02,
          1.6378e-02, 1.6989e-01],
         [7.8707e-02, 1.6943e-01, 6.8879e-02,  ..., 1.2658e-01,
          6.1039e-02, 4.2061e-01],
         [2.6907e-04, 9.4651e-04, 2.0710e-04,  ..., 7.1986e-03,
          1.2778e-03, 2.5372e-02]]], device='cuda:1',
       grad_fn=<SigmoidBackward0>)

In [60]:
tags = (torch.nn.functional.sigmoid(outputs) > 0.5).to(torch.long)
tags.shape

torch.Size([1, 39, 58])

In [61]:
(tags == 1).sum()

tensor(9, device='cuda:1')

In [62]:
preds = convert_to_submit(tags.squeeze(), spans)

In [63]:
print([(x[0], text[x[1]:x[2]]) for x in preds])

[]


In [38]:
test = pd.read_json(DATASET_PATH / 'test.jsonl', lines=True)
test['senences'] = test.senences.apply(lambda x: x.replace('«', '\"').replace('»', '\"'))
test['tokens'] = test.senences.apply(lambda s: [(y, x[0], x[1] - 1) for y, x in zip(tokenizer.tokenize(s), tokenizer.span_tokenize(s))])
test['spans'] =  test.senences.apply(lambda s: [(x, y - 1) for x, y in tokenizer.span_tokenize(s)])
test

Unnamed: 0,senences,id,tokens,spans
0,"Владелец ""Бирмингема"" получил шесть лет тюрьмы...",584,"[(Владелец, 0, 7), (``, 9, 9), (Бирмингема, 10...","[(0, 7), (9, 9), (10, 19), (20, 20), (22, 28),..."
1,Акция протеста на Майдане Независимости объявл...,585,"[(Акция, 0, 4), (протеста, 6, 13), (на, 15, 16...","[(0, 4), (6, 13), (15, 16), (18, 24), (26, 38)..."
2,Фольксваген может перейти под контроль Порше \...,586,"[(Фольксваген, 0, 10), (может, 12, 16), (перей...","[(0, 10), (12, 16), (18, 24), (26, 28), (30, 3..."
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587,"[(В, 0, 0), (Москве, 2, 7), (покажут, 9, 15), ...","[(0, 0), (2, 7), (9, 15), (17, 22), (24, 28), ..."
4,Чулпан Хаматова сыграет главную роль в фильме ...,588,"[(Чулпан, 0, 5), (Хаматова, 7, 14), (сыграет, ...","[(0, 5), (7, 14), (16, 22), (24, 30), (32, 35)..."
...,...,...,...,...
60,ОБСЕ назвала референдум о статусе Крыма незако...,644,"[(ОБСЕ, 0, 3), (назвала, 5, 11), (референдум, ...","[(0, 3), (5, 11), (13, 22), (24, 24), (26, 32)..."
61,Египетского студента могут выслать из страны з...,645,"[(Египетского, 0, 10), (студента, 12, 19), (мо...","[(0, 10), (12, 19), (21, 25), (27, 33), (35, 3..."
62,Геннадий Онищенко отправлен в отставку\nГеннад...,646,"[(Геннадий, 0, 7), (Онищенко, 9, 16), (отправл...","[(0, 7), (9, 16), (18, 26), (28, 28), (30, 37)..."
63,Племянник Алишера Усманова разбился в ДТП\nВид...,647,"[(Племянник, 0, 8), (Алишера, 10, 16), (Усмано...","[(0, 8), (10, 16), (18, 25), (27, 34), (36, 36..."


In [41]:
test.tokens.apply(len).describe()

count     65.000000
mean     259.461538
std      124.075214
min       95.000000
25%      174.000000
50%      255.000000
75%      312.000000
max      892.000000
Name: tokens, dtype: float64

In [65]:
predicted_ners = []
for _, row in test.iterrows():
    tokens = row.tokens
    spans = row.spans
    for i in range(0, len(tokens), 32):
        batch_tokens = tokens[i:i+32]
        bert_tokens = list(map(bert_tokenizer.convert_tokens_to_ids, map(str, batch_tokens)))
        outputs = model(input_ids=torch.tensor(bert_tokens, dtype=torch.long, device=device).unsqueeze(0))[1].predictions
        tags = (torch.nn.functional.sigmoid(outputs) > 0.5).to(torch.long)
        predicted_ners.append(convert_to_submit(tags.squeeze(dim=0), spans[i:i+32]))

test['ners'] = pd.Series([[(x[1], x[2], x[0],) for x in y] for y in predicted_ners])

In [71]:
to_submit = test.rename(columns={'senences': 'sentences'})

In [72]:
to_submit = to_submit[['sentences', 'id', 'ners']]

In [76]:
to_submit.to_json('test.jsonl', orient='records', lines=True, force_ascii=False)