In [1]:
!pip install transformers datasets seqeval accelerate evaluate -q

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install numpy==1.26.4 --force-reinstall


Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, TrainerCallback)
import evaluate
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import torch

In [3]:
# ========== 2. ЗАГРУЗКА ДАННЫХ ==========
df = pd.read_csv("/content/drive/MyDrive/BCC/Методы оптимизации /kaz-similarity-dataset.csv")
df = df.dropna(subset=['label']).reset_index(drop=True)

# Кодируем метки
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])

# Делим на train/test
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label_id'], random_state=42)
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)


In [11]:
# ========== 3. TOKENIZER ==========
model_name = "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 256

def tokenize_function(batch):
    text1 = ["query: " + str(t) for t in batch["text"]]
    text2 = ["passage: " + str(t) for t in batch["text_pair"]]
    return tokenizer(text1, text2,
                     padding="max_length", truncation=True, max_length=MAX_LEN)

train_ds = train_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

if "label_id" in train_ds.column_names:
    train_ds = train_ds.rename_column("label_id", "labels")
    test_ds = test_ds.rename_column("label_id", "labels")

train_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])
test_ds.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Map:   0%|          | 0/18465 [00:00<?, ? examples/s]

Map:   0%|          | 0/4617 [00:00<?, ? examples/s]

In [12]:
# ========== 4. МОДЕЛЬ ==========
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(le.classes_)
)

# ========== 5. МЕТРИКИ ==========
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# ========== 6. CALLBACK ДЛЯ ЛОГИРОВАНИЯ ==========
class MetricsLoggerCallback(TrainerCallback):
    def __init__(self):
        self.epoch_logs = []

    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        if logs:
            entry = {
                "epoch": int(state.epoch),
                "train_loss": logs.get("loss", float("nan")),
                "eval_loss": logs.get("eval_loss", float("nan")),
                "f1": logs.get("eval_f1_macro", float("nan")),
                "accuracy": logs.get("eval_accuracy", float("nan")),
            }
            self.epoch_logs.append(entry)
            print(f"{entry['epoch']}\t{entry['train_loss']:.6f}\t{entry['eval_loss']:.6f}\t"
                  f"{entry['f1']:.6f}\t{entry['accuracy']:.6f}")

metrics_logger = MetricsLoggerCallback()

In [14]:
# ========== 7. TRAINING ARGS ==========
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/BCC/Методы оптимизации ",
    run_name="xlm-roberta-base",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    save_total_limit=1,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    report_to="none"
)

# ========== 8. TRAINER ==========
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[metrics_logger]
)


In [8]:
# ========== 9. ОБУЧЕНИЕ ========== xlmr-base 3e-5 128
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7345,0.741861,0.726229,0.727172
2,0.6246,0.653689,0.747888,0.748741
3,0.5489,0.602038,0.778428,0.77853
4,0.4603,0.557422,0.801819,0.803308
5,0.4288,0.570203,0.793806,0.794571
6,0.4236,0.537311,0.81005,0.81287
7,0.3673,0.560795,0.815465,0.817223
8,0.3587,0.570328,0.813515,0.815551


TrainOutput(global_step=9240, training_loss=0.5294272483685316, metrics={'train_runtime': 2382.2984, 'train_samples_per_second': 62.007, 'train_steps_per_second': 3.879, 'total_flos': 9716953001195520.0, 'train_loss': 0.5294272483685316, 'epoch': 8.0})

In [15]:
# ========== 9. ОБУЧЕНИЕ ========== e5 - base  2e-5
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.6455,0.612879,0.763699,0.766883
2,0.5475,0.571992,0.783409,0.785839
3,0.4811,0.543848,0.794455,0.796338
4,0.429,0.539449,0.809183,0.811479
5,0.3917,0.562926,0.802036,0.803992
6,0.3779,0.556401,0.813299,0.815426
7,0.3526,0.58467,0.815248,0.817294
8,0.3318,0.602675,0.813948,0.816049


TrainOutput(global_step=9240, training_loss=0.46088567234220956, metrics={'train_runtime': 3215.9165, 'train_samples_per_second': 45.934, 'train_steps_per_second': 2.873, 'total_flos': 1.943390600239104e+16, 'train_loss': 0.46088567234220956, 'epoch': 8.0})

In [16]:
# ========== 11. ПОЛНЫЙ ОТЧЕТ НА ТЕСТЕ ========== e5-base 2e-5
predictions = trainer.predict(test_ds)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\nClassification Report (Test):")
print(classification_report(true_labels, pred_labels, target_names=le.classes_))


Classification Report (Test):
              precision    recall  f1-score   support

  contextual       0.83      0.86      0.84       912
       exact       0.81      0.81      0.81       850
  paraphrase       0.76      0.75      0.76      1026
     partial       0.73      0.72      0.72       930
   unrelated       0.96      0.95      0.95       899

    accuracy                           0.82      4617
   macro avg       0.82      0.82      0.82      4617
weighted avg       0.82      0.82      0.82      4617



In [9]:
# ========== 11. ПОЛНЫЙ ОТЧЕТ НА ТЕСТЕ ==========
predictions = trainer.predict(test_ds)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

print("\nClassification Report (Test):")
print(classification_report(true_labels, pred_labels, target_names=le.classes_))



Classification Report (Test):
              precision    recall  f1-score   support

  contextual       0.83      0.85      0.84       912
       exact       0.81      0.83      0.82       850
  paraphrase       0.75      0.75      0.75      1026
     partial       0.75      0.70      0.73       930
   unrelated       0.95      0.95      0.95       899

    accuracy                           0.82      4617
   macro avg       0.82      0.82      0.82      4617
weighted avg       0.81      0.82      0.81      4617



In [10]:
# Сохранение модели
model_save_path = "/content/drive/MyDrive/BCC/Методы оптимизации /xlmr_base(kazclass 3e-5)-teacher"
trainer.save_model(model_save_path)

# Сохранение токенайзера
tokenizer.save_pretrained(model_save_path)

print(f"Модель и токенайзер сохранены в {model_save_path}")


Модель и токенайзер сохранены в /content/drive/MyDrive/BCC/Методы оптимизации /xlmr_base(kazclass 3e-5)-teacher
