In [18]:
import ast
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
import torch
from statistics import mean
from pprint import pprint

In [2]:
# === 1. Читаем Excel ===
df = pd.read_csv("train.csv", sep=';')
df_t = pd.read_csv("submission.csv", sep=';')

# Список меток
labels = ['O', 'B-BRAND', 'I-BRAND', 'B-TYPE', 'I-TYPE',
          'B-PERCENT', 'I-PERCENT', 'B-VOLUME', 'I-VOLUME']
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [3]:
# === 2. Преобразуем данные в токен+метки ===
data = []
for _, row in df.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    token_labels = [l if l != "0" else "O" for l in token_labels]
    data.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset = Dataset.from_list(data)

In [4]:
data_t = []
for _, row in df_t.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    #token_labels = [l if l != "0" else "O" for l in token_labels]
    data_t.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset_t = Dataset.from_list(data_t)

In [5]:
# === 3. Загружаем токенайзер и модель ===
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"],
                          is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=128)

    word_ids = tokenized.word_ids(batch_index=0)  # соответствие токен -> слово
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)  # игнорируем паддинг
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
        else:
            # Для сабтокенов ставим I-* если это не "O"
            label_ids.append(
                example["labels"][word_idx]
                if labels[example["labels"][word_idx]] != "O"
                else -100
            )
        previous_word_idx = word_idx

    tokenized["labels"] = label_ids
    return tokenized

In [7]:
dataset = dataset.train_test_split(test_size=0.1, seed=42)

encoded_dataset = dataset.map(tokenize_and_align_labels)
#encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

#combined_dataset = DatasetDict({
#    "train": encoded_dataset,
#    "test": encoded_dataset_t
#})

Map: 100%|███████████████████████████████████████████████████████████████| 24525/24525 [00:06<00:00, 3739.89 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 2726/2726 [00:00<00:00, 3759.88 examples/s]


In [21]:
# === 4. Готовим модель ===
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    output_hidden_states=True
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# === 5. Тренировка ===
training_args = TrainingArguments(
    output_dir="./ner_rubert",
    #evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [23]:
# === 6. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [24]:
trainer.train()

Step,Training Loss
500,0.3823
1000,0.2268
1500,0.1997
2000,0.1441
2500,0.1313
3000,0.1351
3500,0.0868
4000,0.0889
4500,0.0862
5000,0.06


TrainOutput(global_step=7665, training_loss=0.11605108495016776, metrics={'train_runtime': 278.4359, 'train_samples_per_second': 440.407, 'train_steps_per_second': 27.529, 'total_flos': 8010885662507008.0, 'train_loss': 0.11605108495016776, 'epoch': 5.0})

In [13]:
# Не забываем увеличивать каждый раз номер, чтобы сохранить все вариации
model.save_pretrained("./saved_model_4")
tokenizer.save_pretrained("./saved_model_4")

('./saved_model_4/tokenizer_config.json',
 './saved_model_4/special_tokens_map.json',
 './saved_model_4/vocab.txt',
 './saved_model_4/added_tokens.json',
 './saved_model_4/tokenizer.json')

In [32]:
model_path = "./saved_model_4"  # путь к сохранённой модели

tokenizer1 = AutoTokenizer.from_pretrained(model_path)
model1 = AutoModelForTokenClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device)
model1.eval()  # переводим в режим инференса
print()




In [25]:
def predict_ner(text, model, tokenizer):
    # Токенизация
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Предсказание
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

    # Сопоставление токенов с метками
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    labels = [model.config.id2label[i] for i in pred_ids]

    # Игнорируем спец-токены [CLS] и [SEP]
    tokens_labels = [(t, l) for t, l in zip(tokens, labels) if t not in ("[CLS]", "[SEP]")]

    # Склеиваем подслова
    merged_tokens, merged_labels = [], []
    for t, l in tokens_labels:
        if t.startswith("##"):
            merged_tokens[-1] += t[2:]
        else:
            merged_tokens.append(t)
            merged_labels.append(l)

    return list(zip(merged_tokens, merged_labels))


In [39]:
text1 = "b-brand с клубникой"
text = "alpin gold he"
result = predict_ner(text, model, tokenizer)

for token, label in result:
    print(f"{token:15} -> {label}")

alpin           -> B-BRAND
gold            -> I-BRAND
he              -> O


In [33]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    # Восстанавливаем word-level метки, игнорируя паддинги (-100)
    for pred, lab in zip(predictions, labels):
        cur_preds = []
        cur_labels = []
        for p_i, l_i in zip(pred, lab):
            if l_i != -100:
                cur_preds.append(id2label[p_i])
                cur_labels.append(id2label[l_i])
        true_predictions.append(cur_preds)
        true_labels.append(cur_labels)

    # Вычисляем метрики seqeval
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Собираем micro метрики (overall_*)
    metrics = {
        "precision_micro": results.get("overall_precision", 0.0),
        "recall_micro": results.get("overall_recall", 0.0),
        "f1_micro": results.get("overall_f1", 0.0),
        "accuracy": results.get("overall_accuracy", 0.0),
    }

    # Macro-F1 = среднее F1 по всем существующим классам
    f1_per_class = [
        results[label].get("f1", 0.0)   # <- используем ключ 'f1', не 'f1-score'
        for label in results
        if label not in ["overall_precision","overall_recall","overall_f1","overall_accuracy"]
    ]
    metrics["f1_macro"] = mean(f1_per_class) if f1_per_class else 0.0

    # F1/precision/recall по каждому классу
    for label in results:
        if label not in ["overall_precision","overall_recall","overall_f1","overall_accuracy"]:
            metrics[f"{label}_precision"] = results[label].get("precision", 0.0)
            metrics[f"{label}_recall"] = results[label].get("recall", 0.0)
            metrics[f"{label}_f1"] = results[label].get("f1", 0.0)

    return metrics

In [34]:
metric = evaluate.load("seqeval")

In [19]:
encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

Map: 100%|█████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4327.97 examples/s]


In [35]:
trainer_t = Trainer(
    model=model1,
    tokenizer=tokenizer1,
    compute_metrics=compute_metrics
)

# Прогоняем тестовый датасет
results = trainer_t.evaluate(encoded_dataset["test"])
pprint(results)

  trainer_t = Trainer(


{'eval_BRAND_f1': 0.9333333333333333,
 'eval_BRAND_precision': 0.9321824907521579,
 'eval_BRAND_recall': 0.934487021013597,
 'eval_PERCENT_f1': 1.0,
 'eval_PERCENT_precision': 1.0,
 'eval_PERCENT_recall': 1.0,
 'eval_TYPE_f1': 0.9756581319870177,
 'eval_TYPE_precision': 0.9725017972681524,
 'eval_TYPE_recall': 0.97883502170767,
 'eval_VOLUME_f1': 0.9,
 'eval_VOLUME_precision': 0.8181818181818182,
 'eval_VOLUME_recall': 1.0,
 'eval_accuracy': 0.96415770609319,
 'eval_f1_macro': 0.9522478663300877,
 'eval_f1_micro': 0.966016713091922,
 'eval_loss': 0.20582997798919678,
 'eval_model_preparation_time': 0.0028,
 'eval_precision_micro': 0.9631995556172753,
 'eval_recall_micro': 0.9688503981002934,
 'eval_runtime': 3.6982,
 'eval_samples_per_second': 737.118,
 'eval_steps_per_second': 92.207}
