In [2]:
import torch
import evaluate
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer, XLMRobertaModel, XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import torch.nn as nn
import pandas as pd
import numpy as np
#from sklearn.metrics import classification_report
from seqeval.metrics import f1_score, classification_report
import random
from tqdm import tqdm
import ast
import os

In [3]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [1]:
!pip install torch
!pip install transformers
!pip install seqeval
!pip install evaluate

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [5]:
SEED = 42

In [6]:
set_random_seed(SEED) # seed LOCKED !!!!!!!!!!!!

In [11]:
df = pd.read_csv('FULL_COMBO_IOBtags.csv')
df['translated_text'] = df['translated_text'].apply(ast.literal_eval)
df['iob_tags'] = df['iob_tags'].apply(ast.literal_eval)

In [12]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
# datasets
train_dataset = Dataset.from_pandas(train_df[['translated_text', 'iob_tags']])
val_dataset = Dataset.from_pandas(val_df[['translated_text', 'iob_tags']])

In [16]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [17]:
label_list = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

In [18]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example['translated_text'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128
    )

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label_to_id[example['iob_tags'][word_idx]])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [13]:
mismatch = train_df[train_df['translated_text'].str.len() != train_df['iob_tags'].str.len()]
print(f'Mismatches found: {len(mismatch)}')

Mismatches found: 1369


In [14]:
train_df = train_df[train_df['translated_text'].str.len() == train_df['iob_tags'].str.len()]
val_df = val_df[val_df['translated_text'].str.len() == val_df['iob_tags'].str.len()]

In [19]:
train_dataset = train_dataset.map(tokenize_and_align_labels)
val_dataset = val_dataset.map(tokenize_and_align_labels)

Map:   0%|          | 0/15435 [00:00<?, ? examples/s]

Map:   0%|          | 0/3857 [00:00<?, ? examples/s]

In [20]:
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)

    true_labels = []
    true_preds = []

    for pred, label in zip(preds, labels):
        temp_pred = []
        temp_label = []
        for p, l in zip(pred, label):
            if l != -100:
                temp_label.append(id_to_label[l])
                temp_pred.append(id_to_label[p])
        true_labels.append(temp_label)
        true_preds.append(temp_pred)

    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        'f1': f1_score(true_labels, true_preds),
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall']
    }

In [22]:
model = XLMRobertaForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

training_args = TrainingArguments(
    output_dir='./ner-xlm-roberta',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [23]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msol-arina[0m ([33msol-arina-russian-state-university-for-the-humanities[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.1018,0.022629,0.885332,0.841626,0.87116
2,0.0186,0.019129,0.91481,0.879789,0.905403
3,0.0093,0.01741,0.923604,0.894466,0.914748


TrainOutput(global_step=2895, training_loss=0.03118752222604702, metrics={'train_runtime': 1771.6236, 'train_samples_per_second': 26.137, 'train_steps_per_second': 1.634, 'total_flos': 3024973316977920.0, 'train_loss': 0.03118752222604702, 'epoch': 3.0})

In [61]:
label_list = ['O', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG']
id_to_label = {i: label for i, label in enumerate(label_list)}

In [64]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []

    for pred, label in zip(predictions, labels):
        sentence_preds = []
        sentence_labels = []

        for p_id, l_id in zip(pred, label):
            if l_id == -100:
                continue
            sentence_preds.append(id_to_label[p_id])
            sentence_labels.append(id_to_label[l_id])

        true_predictions.append(sentence_preds)
        true_labels.append(sentence_labels)

    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)

    results = {}
    for label, scores in report.items():
        if label == 'O' or label in ['accuracy', 'macro avg', 'weighted avg']:
            continue
        results[f'{label}_f1'] = scores['f1-score']
        results[f'{label}_precision'] = scores['precision']
        results[f'{label}_recall'] = scores['recall']

    # без 'O'
    f1_scores = [v for k, v in results.items() if k.endswith('_f1')]
    results['macro_f1_excl_O'] = np.mean(f1_scores)

    return results

In [65]:
trainer.compute_metrics = compute_metrics
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.017410319298505783, 'eval_LOC_f1': 0.9087003222341568, 'eval_LOC_precision': 0.8980891719745223, 'eval_LOC_recall': 0.9195652173913044, 'eval_ORG_f1': 0.8499999999999999, 'eval_ORG_precision': 0.8406593406593407, 'eval_ORG_recall': 0.8595505617977528, 'eval_PER_f1': 0.9547785547785548, 'eval_PER_precision': 0.9446494464944649, 'eval_PER_recall': 0.9651272384542884, 'eval_micro avg_f1': 0.9236037934668073, 'eval_micro avg_precision': 0.9134966128191766, 'eval_micro avg_recall': 0.9339371337240278, 'eval_macro_f1_excl_O': 0.9092706676198797, 'eval_runtime': 27.2045, 'eval_samples_per_second': 141.778, 'eval_steps_per_second': 8.896, 'epoch': 3.0}


## all predictions

In [66]:
predictions_output = trainer.predict(val_dataset)
raw_preds = predictions_output.predictions
true_labels = predictions_output.label_ids

predicted_ids = np.argmax(raw_preds, axis=2)

In [67]:
predictions_text = []
labels_text = []

for preds, labels in zip(predicted_ids, true_labels):
    pred_seq = []
    label_seq = []
    for p, l in zip(preds, labels):
        if l != -100:
            pred_seq.append(id_to_label[p])
            label_seq.append(id_to_label[l])
    predictions_text.append(pred_seq)
    labels_text.append(label_seq)

In [68]:
results_df = pd.DataFrame({
    'true_labels': labels_text,
    'predicted_labels': predictions_text
})

results_df.to_csv('ner_predictions.csv', index=False) # можно на них посмотреть

In [24]:
model.save_pretrained('ner-xlm-besksl-baseline')
tokenizer.save_pretrained('ner-xlm-besksl-baseline')

('ner-xlm-besksl-baseline/tokenizer_config.json',
 'ner-xlm-besksl-baseline/special_tokens_map.json',
 'ner-xlm-besksl-baseline/sentencepiece.bpe.model',
 'ner-xlm-besksl-baseline/added_tokens.json',
 'ner-xlm-besksl-baseline/tokenizer.json')

### проверка предсказаний

In [31]:
def predict(sentence, tokenizer, model):
    model.eval()
    tokens = sentence.split()

    encoding = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors='pt',
        return_offsets_mapping=True,
        truncation=True,
        padding=True
    )

    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)
    word_ids = encoding.word_ids(batch_index=0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()
    id2label = model.config.id2label

    token_labels = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        label_id = predictions[idx]
        label = id2label[label_id]
        token_labels.append((tokens[word_idx], label))
        previous_word_idx = word_idx

    return token_labels

In [34]:
sentence = 'Сёння Вася пайшоў гуляць'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Сёння: O
Вася: B-PER
пайшоў: O
гуляць: O


In [35]:
sentence = 'Dnes sa Vasja išiel prejsť'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Dnes: O
sa: O
Vasja: B-LOC
išiel: O
prejsť: O


In [36]:
sentence = 'Danes je šel Vasja na sprehod'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Danes: O
je: O
šel: O
Vasja: B-LOC
na: O
sprehod: O


In [37]:
sentence = 'Аляксандр Сяргеевіч Пушкін паехаў на канферэнцыю ў ААН'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Аляксандр: B-PER
Сяргеевіч: I-PER
Пушкін: I-PER
паехаў: O
на: O
канферэнцыю: O
ў: O
ААН: O


In [38]:
sentence = 'Alexander Sergejevič Puškin sa zúčastnil konferencie OSN'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Alexander: B-PER
Sergejevič: I-PER
Puškin: I-PER
sa: O
zúčastnil: O
konferencie: O
OSN: O


In [39]:
sentence = 'Aleksander Sergejevič Puškin se je udeležil konference ZN'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Aleksander: B-PER
Sergejevič: I-PER
Puškin: I-PER
se: O
je: O
udeležil: O
konference: O
ZN: O


In [41]:
sentence = 'Мы паехалі на канферэнцыю ў КГБ'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Мы: O
паехалі: O
на: O
канферэнцыю: O
ў: O
КГБ: B-ORG


In [42]:
sentence = 'Мы паехалі на канферэнцыю ў ААН'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Мы: O
паехалі: O
на: O
канферэнцыю: O
ў: O
ААН: O


In [56]:
sentence = "Арганізацыя аб'яднаных нацый гэта мая любімая арганізацыя"
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Арганізацыя: B-ORG
аб'яднаных: I-ORG
нацый: I-ORG
гэта: O
мая: O
любімая: O
арганізацыя: O


In [57]:
sentence = 'ААН гэта мая любімая арганізацыя'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

ААН: O
гэта: O
мая: O
любімая: O
арганізацыя: O


In [48]:
sentence = 'Išli sme na konferenciu v organizácii „Romashka“'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Išli: O
sme: O
na: O
konferenciu: O
v: O
organizácii: O
„Romashka“: O


In [47]:
sentence = 'Šli smo na konferenco v organizaciji "Romashka"'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Šli: O
smo: O
na: O
konferenco: O
v: O
organizaciji: O
"Romashka": O


In [49]:
sentence = 'Універсітэт МДУ стаў найлепшым універсітэтам 2025 года ў Егіпце.'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Універсітэт: B-ORG
МДУ: I-ORG
стаў: O
найлепшым: O
універсітэтам: O
2025: O
года: O
ў: O
Егіпце.: O


In [50]:
sentence = 'MSU bola v Egypte vyhlásená za najlepšiu univerzitu roku 2025'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

MSU: O
bola: O
v: O
Egypte: O
vyhlásená: O
za: O
najlepšiu: O
univerzitu: O
roku: O
2025: O


In [51]:
sentence = 'Univerza MSU imenovana za najboljšo univerzo leta 2025 v Egiptu'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Univerza: O
MSU: O
imenovana: O
za: O
najboljšo: O
univerzo: O
leta: O
2025: O
v: O
Egiptu: B-LOC


In [52]:
sentence = 'Лондан гэта сталіца'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Лондан: B-LOC
гэта: O
сталіца: O


In [53]:
sentence = 'Londýn je hlavné mesto'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

Londýn: B-LOC
je: O
hlavné: O
mesto: O


In [54]:
sentence = 'London je glavno mesto'
for token, label in predict(sentence, tokenizer, model):
    print(f'{token}: {label}')

London: B-LOC
je: O
glavno: O
mesto: O
