In [269]:
from data import NERDataset
from data import collate_batch

train_dataset = NERDataset(mode='train', tokenizer='google-bert/bert-base-multilingual-cased')
test_dataset = NERDataset(mode='test', tokenizer='google-bert/bert-base-multilingual-cased')
dev_dataset = NERDataset(mode='dev', tokenizer='google-bert/bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [270]:
from tqdm import tqdm
import torch.nn as nn
import torch

# train the model for only one epoch
def train_epoch(model, train_dataloader, optimizer, epoch=None, clip=None, device='cpu'):
    loop = tqdm(
        enumerate(train_dataloader),
        total=len(train_dataloader),
        desc=f'Training {epoch if epoch else ""}',
    )
    
    model.train()
    train_loss = 0
    for i, batch in loop:
        input_ids, token_type_ids, tags_ids = batch[:3]
        input_ids, token_type_ids, tags_ids = input_ids.to(device), token_type_ids.to(device), tags_ids.to(device)
        
        optimizer.zero_grad()

        # make the prediction
        loss = model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=(input_ids != 0).long().to(device),
            labels=tags_ids,
        )[0]

        loss.backward()
        if clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            
        optimizer.step()
        
        train_loss += loss.item()
        loop.set_postfix(**{"loss": train_loss / (i + 1)})
    return train_loss / len(train_dataloader)

# evaluate the model for only one epoch
def eval_epoch(model, eval_dataloader, epoch=None, device='cpu'):
    loop = tqdm(
        enumerate(eval_dataloader),
        total=len(eval_dataloader),
        desc=f'Evaluating {epoch if epoch else ""}',
    )
    
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for i, batch in loop:
            input_ids, token_type_ids, tags_ids = batch[:3]
            input_ids, token_type_ids, tags_ids = input_ids.to(device), token_type_ids.to(device), tags_ids.to(device)
            
            # make the prediction
            loss = model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=(input_ids != 0).long().to(device),
                labels=tags_ids,
            )[0]

            eval_loss += loss.item()
            loop.set_postfix(**{"loss": eval_loss / (i + 1)})
    return eval_loss / len(eval_dataloader)


def train(
    model=None,
    loaders=None,
    optimizer=None,
    epochs=10,
    device=None,
    clip_grad=None,
    ckpt_path='best.pt',
    best_loss=float('inf'),
    cur_epoch=1,
    return_model=False,
):      
    epoch_cnt = 0
    for epoch in range(cur_epoch, epochs + cur_epoch):
        train_loss = train_epoch(model, loaders[0], optimizer, epoch, clip_grad, device)
        if len(loaders) > 1:
            val_loss = eval_epoch(model, loaders[1], epoch, device)
        else:
            val_loss = train_loss
        
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model, ckpt_path)
    
    if return_model:
        return best_loss, model
    return best_loss

In [271]:
# get the dataloaders
from torch.utils.data import DataLoader

BATCH_SIZE = 16
NUM_WORKERS = 0
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device: {device}")

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=collate_batch)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, collate_fn=collate_batch)

device: cuda


In [275]:
# Fine-tuning the model

In [272]:
# get the model
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained("google-bert/bert-base-multilingual-cased", num_labels = 29 * 4 + 1, return_dict = False)
model.to(device)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model ch

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [273]:
LEARNING_RATE = 3e-4

parameters = [p for n, p in model.named_parameters() if n.startswith('classifier')]
optimizer = torch.optim.Adam(parameters, lr=LEARNING_RATE)

In [276]:
train(model, loaders=(train_dataloader, test_dataloader), optimizer=optimizer, epochs=3, device=device)

Training 1: 100%|██████████| 29/29 [02:36<00:00,  5.40s/it, loss=2.13]
Evaluating 1: 100%|██████████| 6/6 [00:04<00:00,  1.33it/s, loss=2.07]
Training 2: 100%|██████████| 29/29 [02:41<00:00,  5.57s/it, loss=1.9] 
Evaluating 2: 100%|██████████| 6/6 [00:03<00:00,  1.75it/s, loss=1.9] 
Training 3: 100%|██████████| 29/29 [02:40<00:00,  5.53s/it, loss=1.74]
Evaluating 3: 100%|██████████| 6/6 [00:04<00:00,  1.29it/s, loss=1.76]


1.7619936267534893

In [277]:
def val(text, model, dataset, addit=0):
    t = dataset.tokenizer(text)

    input_ids = torch.LongTensor(t['input_ids']).reshape(1, -1)
    token_type_ids = torch.LongTensor(t['token_type_ids']).reshape(1, -1)
    attention_mask = (input_ids != 0).long()

    input_ids = input_ids.to(device)
    token_type_ids = token_type_ids.to(device)
    attention_mask = attention_mask.to(device)

    model.eval()
    output = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    tag2id = dataset.tags2id
    id2tag = {v: k for k, v in tag2id.items()}
    typs = output[0].argmax(-1).squeeze()

    res = []
    start = 0
    ltag = []
    i = 0
    while i < len(typs):
        if typs[i] == 0:
            i += 1
            continue
        
        j = i + 1
        while j < len(typs):
            if id2tag[typs[j].item()][2:] != id2tag[typs[i].item()][2:]:
                break 
            j += 1
        
        dataset.tokenizer.decode(t['input_ids'][i:j])
        target = dataset.tokenizer.decode(t['input_ids'][i:j])

        res.append([text.find(target, start) + addit, text.find(target, start) + len(target) + addit - 1, id2tag[typs[i].item()][2:]])
        i = j
    
    return res

In [278]:
import json

write = open("test.jsonl", "w")

with open("public_dat/test.jsonl", "r") as f:
    for line in f.readlines():
        start = 0
        proverka = json.loads(line)
        sentences = proverka['senences']
        proverka["ners"] = []
        for sentence in sentences.split('\n'):
            proverka['ners'].extend(val(sentence, model, train_dataset, addit=start))
            start += len(sentence) + 1
        write.write(json.dumps(proverka))
        write.write('\n')
write.close()