In [1]:
import ast
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === 1. Читаем Excel ===
df = pd.read_csv("train.csv", sep=';')
df_t = pd.read_csv("submission.csv", sep=';')

# Список меток
labels = ['O', 'B-BRAND', 'I-BRAND', 'B-TYPE', 'I-TYPE',
          'B-PERCENT', 'I-PERCENT', 'B-VOLUME', 'I-VOLUME']
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [3]:
# === 2. Преобразуем данные в токен+метки ===
data = []
for _, row in df.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    token_labels = [l if l != "0" else "O" for l in token_labels]
    data.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset = Dataset.from_list(data)

  text = row[0]
  spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]


In [4]:
data_t = []
for _, row in df_t.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    #token_labels = [l if l != "0" else "O" for l in token_labels]
    data_t.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset_t = Dataset.from_list(data_t)

  text = row[0]
  spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]


In [5]:
# === 3. Загружаем токенайзер и модель ===
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"],
                          is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=128)

    word_ids = tokenized.word_ids(batch_index=0)  # соответствие токен -> слово
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)  # игнорируем паддинг
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
        else:
            # Для сабтокенов ставим I-* если это не "O"
            label_ids.append(
                example["labels"][word_idx]
                if labels[example["labels"][word_idx]] != "O"
                else -100
            )
        previous_word_idx = word_idx

    tokenized["labels"] = label_ids
    return tokenized

In [7]:
#dataset = dataset.train_test_split(test_size=0.1, seed=42)

encoded_dataset = dataset.map(tokenize_and_align_labels)
#encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

#combined_dataset = DatasetDict({
#    "train": encoded_dataset,
#    "test": encoded_dataset_t
#})

Map: 100%|███████████████████████████████████████████████████████████████| 27251/27251 [00:06<00:00, 4431.51 examples/s]


In [8]:
# === 4. Готовим модель ===
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# === 5. Тренировка ===
training_args = TrainingArguments(
    output_dir="./ner_rubert",
    #evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [10]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [11]:
# === 6. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    #eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [12]:
trainer.train()

Step,Training Loss
500,0.4053
1000,0.2421
1500,0.1917
2000,0.1495
2500,0.1284
3000,0.1272
3500,0.1175
4000,0.0834
4500,0.0862
5000,0.0849


TrainOutput(global_step=8520, training_loss=0.11345251320288495, metrics={'train_runtime': 301.0263, 'train_samples_per_second': 452.635, 'train_steps_per_second': 28.303, 'total_flos': 8901310678040576.0, 'train_loss': 0.11345251320288495, 'epoch': 5.0})

In [13]:
# Не забываем увеличивать каждый раз номер, чтобы сохранить все вариации
model.save_pretrained("./saved_model_2")
tokenizer.save_pretrained("./saved_model_2")

('./saved_model_2/tokenizer_config.json',
 './saved_model_2/special_tokens_map.json',
 './saved_model_2/vocab.txt',
 './saved_model_2/added_tokens.json',
 './saved_model_2/tokenizer.json')

In [14]:
model_path = "./saved_model_2"  # путь к сохранённой модели

tokenizer1 = AutoTokenizer.from_pretrained(model_path)
model1 = AutoModelForTokenClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device)
model1.eval()  # переводим в режим инференса
print()




In [15]:
def predict_ner(text, model, tokenizer):
    # Токенизация
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Предсказание
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

    # Сопоставление токенов с метками
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    labels = [model.config.id2label[i] for i in pred_ids]

    # Игнорируем спец-токены [CLS] и [SEP]
    tokens_labels = [(t, l) for t, l in zip(tokens, labels) if t not in ("[CLS]", "[SEP]")]

    # Склеиваем подслова
    merged_tokens, merged_labels = [], []
    for t, l in tokens_labels:
        if t.startswith("##"):
            merged_tokens[-1] += t[2:]
        else:
            merged_tokens.append(t)
            merged_labels.append(l)

    return list(zip(merged_tokens, merged_labels))


In [16]:
text = "абрикосы 500г global village"
result = predict_ner(text, model1, tokenizer1)

for token, label in result:
    print(f"{token:15} -> {label}")

абрикосы        -> B-TYPE
500г            -> B-VOLUME
global          -> B-BRAND
village         -> I-BRAND


In [17]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, lab in zip(predictions, labels):
        cur_preds = []
        cur_labels = []
        for p_i, l_i in zip(pred, lab):
            if l_i != -100:  # маскируем паддинги
                cur_preds.append(id2label[p_i])
                cur_labels.append(id2label[l_i])
        true_predictions.append(cur_preds)
        true_labels.append(cur_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # берём усреднённый F1 по всем классам
    return {"precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]}

In [18]:
metric = evaluate.load("seqeval")

In [19]:
encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

Map: 100%|█████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4327.97 examples/s]


In [20]:
trainer_t = Trainer(
    model=model1,
    tokenizer=tokenizer1,
    compute_metrics=compute_metrics
)

# Прогоняем тестовый датасет
results = trainer_t.evaluate(encoded_dataset_t)
print(results)

  trainer_t = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.409075140953064, 'eval_model_preparation_time': 0.0026, 'eval_precision': 0.796537753222836, 'eval_recall': 0.8700514966205343, 'eval_f1': 0.8316732684690228, 'eval_accuracy': 0.8134724510342303, 'eval_runtime': 8.692, 'eval_samples_per_second': 575.241, 'eval_steps_per_second': 71.905}
