In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate
!pip install scikit-learn


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from typing import List, Tuple
from datasets import load_dataset, Dataset
import evaluate
import numpy as np
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline
)


In [3]:
MODEL_NAME = 'ai-forever/ruRoberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

In [4]:
DATASET_NAME = 'Davlan/sib200'
DATASET_LANGUAGE = 'rus_Cyrl'

train_set = load_dataset(DATASET_NAME, DATASET_LANGUAGE, split='train')
validation_set = load_dataset(DATASET_NAME, DATASET_LANGUAGE, split='validation')
test_set = load_dataset(DATASET_NAME, DATASET_LANGUAGE, split='test')

print(train_set)


README.md: 0.00B [00:00, ?B/s]

train.tsv: 0.00B [00:00, ?B/s]

dev.tsv: 0.00B [00:00, ?B/s]

test.tsv: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['index_id', 'category', 'text'],
    num_rows: 701
})


In [None]:
MINIBATCH_SIZE = 32   # reduced for grad accumulation


In [6]:
tokenized_train_set = train_set.map(
    lambda it: tokenizer(it['text'], truncation=True),
    batched=True,
    batch_size=MINIBATCH_SIZE
)

tokenized_validation_set = validation_set.map(
    lambda it: tokenizer(it['text'], truncation=True),
    batched=True,
    batch_size=MINIBATCH_SIZE
)


Map:   0%|          | 0/701 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

In [7]:
cls_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return cls_metric.compute(
        predictions=predictions,
        references=labels,
        average='macro'
    )


Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [9]:
list_of_categories = sorted(list(
    set(train_set['category']) |
    set(validation_set['category']) |
    set(test_set['category'])
))

indices_of_categories = list(range(len(list_of_categories)))
n_categories = len(list_of_categories)

id2label = dict(zip(indices_of_categories, list_of_categories))
label2id = dict(zip(list_of_categories, indices_of_categories))

print(f'Categories: {list_of_categories}')


Categories: ['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']


In [10]:
labeled_train_set = tokenized_train_set.add_column(
    'label',
    [label2id[val] for val in tokenized_train_set['category']]
)

labeled_validation_set = tokenized_validation_set.add_column(
    'label',
    [label2id[val] for val in tokenized_validation_set['category']]
)


In [11]:
# ===== Class Weights (IMPORTANT) =====
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(labeled_train_set['label']),
    y=labeled_train_set['label']
)

class_weights = torch.tensor(class_weights, dtype=torch.float).cuda()


In [12]:
classifier = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=n_categories,
    id2label=id2label,
    label2id=label2id
).cuda()


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Freeze first 2 layers (stability)
for name, param in classifier.named_parameters():
    if name.startswith("roberta.encoder.layer.0") or \
       name.startswith("roberta.encoder.layer.1"):
        param.requires_grad = False


In [26]:
class WeightedTrainer(Trainer):
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None   # <-- FIX
    ):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fn = torch.nn.CrossEntropyLoss(
            weight=class_weights,
            label_smoothing=0.1
        )

        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss



In [27]:
training_args = TrainingArguments(
    output_dir='rubert_sib200',

    learning_rate=1e-5,
    warmup_ratio=0.1,
    weight_decay=1e-3,

    per_device_train_batch_size=MINIBATCH_SIZE,
    per_device_eval_batch_size=MINIBATCH_SIZE,
    gradient_accumulation_steps=2,  # effective batch = 64

    num_train_epochs=16,
    fp16=True,

    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
    greater_is_better=True,

    report_to='none'
)


In [28]:
trainer = WeightedTrainer(
    model=classifier,
    args=training_args,
    train_dataset=labeled_train_set,
    eval_dataset=labeled_validation_set,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = WeightedTrainer(


In [29]:
trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,No log,1.967421,0.101648
2,No log,1.785022,0.386139
3,No log,1.508789,0.641378
4,No log,1.239281,0.730772
5,No log,1.025208,0.763733
6,No log,0.904646,0.75267
7,No log,0.863742,0.791909
8,No log,0.858777,0.809885
9,No log,0.861851,0.838125
10,No log,0.864793,0.846541


Epoch,Training Loss,Validation Loss,F1
1,No log,1.967421,0.101648
2,No log,1.785022,0.386139
3,No log,1.508789,0.641378
4,No log,1.239281,0.730772
5,No log,1.025208,0.763733
6,No log,0.904646,0.75267
7,No log,0.863742,0.791909
8,No log,0.858777,0.809885
9,No log,0.861851,0.838125
10,No log,0.864793,0.846541


TrainOutput(global_step=176, training_loss=0.9492710286920721, metrics={'train_runtime': 672.7669, 'train_samples_per_second': 16.671, 'train_steps_per_second': 0.262, 'total_flos': 1262992193859168.0, 'train_loss': 0.9492710286920721, 'epoch': 16.0})

In [30]:
classification_pipeline = pipeline(
    "text-classification",
    model=classifier,
    tokenizer=tokenizer,
    device=0
)


Device set to use cuda:0


In [31]:
# ===== Validation =====
texts = list(validation_set["text"])

y_pred = [x["label"] for x in classification_pipeline(texts, batch_size=32)]
y_true = validation_set["category"]

print(classification_report(y_true=y_true, y_pred=y_pred))


                    precision    recall  f1-score   support

     entertainment       0.70      0.78      0.74         9
         geography       0.73      1.00      0.84         8
            health       0.89      0.73      0.80        11
          politics       0.93      0.93      0.93        14
science/technology       0.92      0.92      0.92        25
            sports       1.00      0.92      0.96        12
            travel       0.79      0.75      0.77        20

          accuracy                           0.86        99
         macro avg       0.85      0.86      0.85        99
      weighted avg       0.87      0.86      0.86        99



In [32]:
# ===== Test =====
y_pred = [x["label"] for x in classification_pipeline(
    list(test_set["text"]), batch_size=32
)]
y_true = test_set["category"]

print(classification_report(y_true=y_true, y_pred=y_pred))


                    precision    recall  f1-score   support

     entertainment       0.79      0.79      0.79        19
         geography       0.88      0.88      0.88        17
            health       0.96      1.00      0.98        22
          politics       0.97      0.93      0.95        30
science/technology       0.93      0.98      0.95        51
            sports       0.88      0.88      0.88        25
            travel       0.92      0.85      0.88        40

          accuracy                           0.91       204
         macro avg       0.90      0.90      0.90       204
      weighted avg       0.91      0.91      0.91       204

