In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -q datasets

In [61]:
# get the dataset
from dataset import NERDataset
from dataset import collate_batch

# All three splits are tokenized with the same checkpoint; name it once
# instead of repeating the literal in every call.
TOKENIZER_NAME = "google-bert/bert-base-multilingual-cased"

train_dataset = NERDataset(mode='train', tokenizer=TOKENIZER_NAME)
test_dataset = NERDataset(mode='test', tokenizer=TOKENIZER_NAME)
dev_dataset = NERDataset(mode='dev', tokenizer=TOKENIZER_NAME)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [62]:
from tqdm import tqdm
import torch.nn as nn
import torch

def train_epoch(model, train_dataloader, optimizer, epoch=None, clip=None, device='cpu', pad_token_id=0):
    """
    Trains the model for one epoch using the provided training dataloader and optimizer.

    Args:
        model (torch.nn.Module): The model to be trained. It is called as
            ``model(input_ids, token_type_ids=..., attention_mask=..., labels=...)``
            and the first element of its output is used as the loss.
        train_dataloader (torch.utils.data.DataLoader): The dataloader containing the training data.
            The first three items of every batch are taken as input ids, token type ids and tag ids.
        optimizer (torch.optim.Optimizer): The optimizer used for training.
        epoch (int, optional): The current epoch number, shown in the progress bar (default: None).
        clip (float, optional): The maximum gradient norm for gradient clipping. Falsy values
            (None or 0) disable clipping (default: None).
        device (str, optional): The device to be used for training (default: 'cpu').
        pad_token_id (int, optional): Token id treated as padding when the attention mask is
            built from ``input_ids`` (default: 0).

    Returns:
        float: The average training loss for the epoch.

    Raises:
        ZeroDivisionError: If ``len(train_dataloader)`` is 0.
    """
    # Compare against None instead of relying on truthiness so that epoch 0 is still displayed.
    epoch_label = "" if epoch is None else epoch
    loop = tqdm(
        enumerate(train_dataloader),
        total=len(train_dataloader),
        desc=f'Training {epoch_label}',
    )

    model.train()
    train_loss = 0.0
    for i, batch in loop:
        input_ids, token_type_ids, tags_ids = (tensor.to(device) for tensor in batch[:3])
        # Derived from input_ids, so the mask already lives on the target device.
        attention_mask = (input_ids != pad_token_id).long()

        optimizer.zero_grad()

        # make the prediction; the loss is the first element of the model output
        loss = model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            labels=tags_ids,
        )[0]

        loss.backward()
        if clip:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        train_loss += loss.item()
        # running mean over the batches seen so far
        loop.set_postfix(loss=train_loss / (i + 1))
    return train_loss / len(train_dataloader)


def eval_epoch(model, eval_dataloader, epoch=None, device='cpu', pad_token_id=0):
    """
    Evaluates the model for one epoch using the provided evaluation dataloader.

    Args:
        model (torch.nn.Module): The model to be evaluated. It is called as
            ``model(input_ids, token_type_ids=..., attention_mask=..., labels=...)``
            and the first element of its output is used as the loss.
        eval_dataloader (torch.utils.data.DataLoader): The dataloader containing the evaluation data.
            The first three items of every batch are taken as input ids, token type ids and tag ids.
        epoch (int, optional): The current epoch number, shown in the progress bar (default: None).
        device (str, optional): The device to be used for evaluation (default: 'cpu').
        pad_token_id (int, optional): Token id treated as padding when the attention mask is
            built from ``input_ids`` (default: 0).

    Returns:
        float: The average evaluation loss for the epoch.

    Raises:
        ZeroDivisionError: If ``len(eval_dataloader)`` is 0.
    """
    # Compare against None instead of relying on truthiness so that epoch 0 is still displayed.
    epoch_label = "" if epoch is None else epoch
    loop = tqdm(
        enumerate(eval_dataloader),
        total=len(eval_dataloader),
        desc=f'Evaluating {epoch_label}',
    )

    model.eval()
    eval_loss = 0.0
    with torch.no_grad():
        for i, batch in loop:
            input_ids, token_type_ids, tags_ids = (tensor.to(device) for tensor in batch[:3])
            # Derived from input_ids, so the mask already lives on the target device.
            attention_mask = (input_ids != pad_token_id).long()

            # make the prediction; the loss is the first element of the model output
            loss = model(
                input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=tags_ids,
            )[0]

            eval_loss += loss.item()
            # running mean over the batches seen so far
            loop.set_postfix(loss=eval_loss / (i + 1))
    return eval_loss / len(eval_dataloader)


def train(
    model=None,
    loaders=None,
    optimizer=None,
    epochs=10,
    device=None,
    clip_grad=None,
    ckpt_path='best.pt',
    best_loss=float('inf'),
    cur_epoch=1,
    return_model=False,
):
    """
    Trains the model for the specified number of epochs using the provided loaders and optimizer.

    After every epoch the selection loss (the evaluation loss when a second loader is
    given, otherwise the training loss) is compared with ``best_loss``; on improvement
    the whole model object is written to ``ckpt_path`` with ``torch.save``.

    Args:
        model (torch.nn.Module, optional): The model to be trained (default: None).
        loaders (list, optional): The dataloaders; ``loaders[0]`` is used for training and,
            if present, ``loaders[1]`` for evaluation (default: None).
        optimizer (torch.optim.Optimizer, optional): The optimizer used for training (default: None).
        epochs (int, optional): The number of epochs to train the model (default: 10).
        device (str, optional): The device to be used for training (default: None).
        clip_grad (float, optional): The maximum gradient norm for gradient clipping (default: None).
        ckpt_path (str, optional): The path to save the best model checkpoint (default: 'best.pt').
        best_loss (float, optional): The loss a new epoch has to beat to count as an improvement
            (default: float('inf')).
        cur_epoch (int, optional): The number of the first epoch to run (default: 1).
        return_model (bool, optional): Whether to return the best model (default: False).

    Returns:
        float or tuple: The best loss achieved during training, and optionally the model.
        With ``return_model=True`` the weights of the best epoch of this call are loaded
        back into ``model`` (in place) before it is returned; if no epoch improved on
        ``best_loss`` the model is returned as it is after the last epoch.
    """
    # CPU snapshot of the best weights, kept only when the caller asked for the best model back.
    best_state = None
    for epoch in range(cur_epoch, epochs + cur_epoch):
        train_loss = train_epoch(model, loaders[0], optimizer, epoch, clip_grad, device)
        if len(loaders) > 1:
            val_loss = eval_epoch(model, loaders[1], epoch, device)
        else:
            # no evaluation loader: fall back to the training loss for model selection
            val_loss = train_loss

        if val_loss < best_loss:
            best_loss = val_loss
            # Pickles the entire model object (not just a state_dict), so loading the
            # checkpoint needs the model class to be importable.
            torch.save(model, ckpt_path)
            if return_model:
                best_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }

    if return_model:
        # Without this the caller would receive the last-epoch weights, not the best ones.
        if best_state is not None:
            model.load_state_dict(best_state)
        return best_loss, model
    return best_loss

In [63]:
# get the dataloaders
from torch.utils.data import DataLoader

BATCH_SIZE = 16
NUM_WORKERS = 0

# Settings shared by every split; only `shuffle` differs (enabled for training data only).
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=collate_batch)

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, **loader_kwargs)

In [64]:
# Sanity check: pull a single batch and show the shape of each of its three tensors.
batch = next(iter(train_dataloader), None)
if batch is not None:
    input_ids, token_type_ids, tags_ids = batch
    shape_report = (
        ('input_ids.shape:', input_ids),
        ('token_type_ids.shape:', token_type_ids),
        ('tags_ids.shape:', tags_ids),
    )
    for caption, tensor in shape_report:
        print(caption, tensor.shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors


input_ids.shape: torch.Size([16, 128])
token_type_ids.shape: torch.Size([16, 128])
tags_ids.shape: torch.Size([16, 128])


In [65]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

# Fine-tuning the model

In [66]:
# get the model
from transformers import BertForTokenClassification

# Size of the label space: 29 * 4 + 1 = 117 classes.
# NOTE(review): presumably 29 entity types x 4 position prefixes plus one "outside" tag —
# confirm against the dataset's id2tags mapping.
NUM_LABELS = 29 * 4 + 1

model = BertForTokenClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased",
    num_labels=NUM_LABELS,
    return_dict=False,  # outputs come back as a plain tuple, hence the [0] indexing elsewhere
)
model = model.to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# parameters
LEARNING_RATE = 1e-3
EPOCHS = 10

# Only the classification head is handed to the optimizer. Freeze every other
# parameter so that backward() does not compute and store gradients for weights
# that are never updated.
parameters = []
for name, param in model.named_parameters():
    if name.startswith('classifier'):
        parameters.append(param)
    else:
        param.requires_grad_(False)

optimizer = torch.optim.Adam(parameters, lr=LEARNING_RATE)

In [68]:
# NOTE(review): the second loader drives best-checkpoint selection inside train().
# Here that is the *test* split, while dev_dataloader is not used in the cells shown —
# consider selecting on dev instead (confirm that the dev split carries labels).
train(
    model,
    loaders=(train_dataloader, test_dataloader),
    optimizer=optimizer,
    epochs=EPOCHS,
    device=device,
)

Training 1: 100%|██████████| 29/29 [00:27<00:00,  1.07it/s, loss=2.88]
Evaluating 1:   0%|          | 0/6 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors
Evaluating 1: 100%|██████████| 6/6 [00:01<00:00,  3.05it/s, loss=2.09]
Training 2: 100%|██████████| 29/29 [00:26<00:00,  1.08it/s, loss=1.7]
Evaluating 2: 100%|██████████| 6/6 [00:01<00:00,  3.75it/s, loss=1.6]
Training 3: 100%|██████████| 29/29 [00:25<00:00,  1.16it/s, loss=1.34]
Evaluating 3: 100%|██████████| 6/6 [00:01<00:00,  3.67it/s, loss=1.36]
Training 4: 100%|██████████| 29/29 [00:25<00:00,  1.14it/s, loss=1.13]
Evaluating 4: 100%|██████████| 6/6 [00:01<00:00,  3.73it/s, loss=1.21]
Training 5: 100%|██████████| 29/29 [00:26<00:00,  1.10it/s, loss=0.996]
Evaluating 5: 100%|██████████| 6/6 [00:01<00:00,  3.77it/s, loss=1.1]
Training 6: 100%|██████████| 29/29 [00:25<00:00,  1.15it/s,

0.8818587561448415

In [69]:
# best.pt holds the whole pickled model object (train() calls torch.save(model, ...)),
# so it cannot be read in weights-only mode, which is the torch.load default from
# PyTorch 2.6 on. Full unpickling executes arbitrary code: only load checkpoints you
# produced yourself. map_location places the tensors on `device` regardless of where
# they were saved from.
model = torch.load('best.pt', map_location=device, weights_only=False).to(device)

In [70]:

def _locate_span(text, target, search_from):
    """Finds `target` in `text` at or after `search_from`; returns (begin, end_exclusive) or None."""
    import re

    if not target.strip():
        return None

    begin = text.find(target, search_from)
    if begin != -1:
        return begin, begin + len(target)

    # The decoded string may not reproduce the spacing of the original text
    # (e.g. around punctuation), so retry with flexible whitespace between its pieces.
    pattern = r'\s*'.join(re.escape(piece) for piece in target.split())
    match = re.compile(pattern).search(text, search_from)
    if match is None:
        return None
    return match.start(), match.end()


def validate(text, model, dataset, addit=0):
    """
    Runs a fine-tuned token-classification model on the text and extracts entity spans.

    Consecutive tokens whose tags share the same label (the tag name without its first
    two characters) are merged into one span. The span's tokens are decoded back to a
    string, which is then located in ``text`` to obtain character offsets.

    Args:
        text (str): The input text to be validated.
        model (torch.nn.Module): The fine-tuned model; the first element of its output is
            treated as the per-token scores.
        dataset: Object providing ``tokenizer`` (callable, with ``decode``) and ``id2tags``.
        addit (int, optional): Value added to the start and end indices of every span.
                               Defaults to 0.

    Returns:
        list: A list of ``[start, end, label]`` lists, where ``start`` and ``end`` are
              character indices into ``text`` (``end`` inclusive), both shifted by ``addit``.
              Spans whose decoded string cannot be located in ``text`` are omitted.

    """
    # Tokenize the input text
    # NOTE(review): no truncation is applied here; inputs longer than the model's
    # maximum sequence length are presumably not supported — confirm.
    encoding = dataset.tokenizer(text)
    token_ids = encoding['input_ids']

    # Run on the device the model lives on instead of depending on a notebook-global.
    model_device = next(model.parameters()).device
    input_ids = torch.LongTensor(token_ids).reshape(1, -1).to(model_device)
    token_type_ids = torch.LongTensor(encoding['token_type_ids']).reshape(1, -1).to(model_device)
    attention_mask = (input_ids != 0).long()

    # Inference only: evaluation mode and no autograd bookkeeping.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

    # Predicted tag id for every token of the (single) sequence, as plain ints.
    id2tag = dataset.id2tags
    tag_ids = output[0].argmax(-1)[0].tolist()

    res = []
    search_from = 0  # spans are produced left to right, so never search behind the previous one
    i = 0
    while i < len(tag_ids):
        # tag id 0 is treated as "no entity"
        if tag_ids[i] == 0:
            i += 1
            continue

        # Extend the span while the following tokens carry the same label.
        label = id2tag[tag_ids[i]][2:]
        j = i + 1
        while j < len(tag_ids) and id2tag[tag_ids[j]][2:] == label:
            j += 1

        # Decode the target from the input ids and map it back to character offsets.
        target = dataset.tokenizer.decode(token_ids[i:j])
        located = _locate_span(text, target, search_from)
        if located is not None:
            begin, end = located
            res.append([begin + addit, end + addit - 1, label])
            search_from = end

        i = j

    return res

In [71]:
# Quick manual check of the span extraction on one short sentence.
sample_text = "Привет Максим, я в Париже"
validate(sample_text, model, train_dataset)

[[7, 12, 'PERSON']]

In [72]:
import json

# Annotate every record of target_test.jsonl with predicted entities and write the
# result to test.jsonl, one JSON object per line.
# Both files are managed by `with`, so they are closed even if a record fails.
with open("target_test.jsonl", "r", encoding="utf-8") as source, \
        open("test.jsonl", "w", encoding="utf-8") as sink:
    for line in source:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue

        record = json.loads(line)
        # NOTE(review): the key is spelled 'senences' exactly as the records are read
        # here; presumably that is how the data file spells it — verify before renaming.
        sentences = record['senences']
        record['ners'] = []

        # Sentences are separated by '\n'; `offset` is the character position of the
        # current sentence inside the full field, so spans are reported relative to it.
        offset = 0
        for sentence in sentences.split('\n'):
            record['ners'].extend(validate(sentence, model, train_dataset, addit=offset))
            offset += len(sentence) + 1  # +1 for the newline removed by split

        sink.write(json.dumps(record))
        sink.write('\n')