In [25]:
import ast
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import RobertaTokenizerFast
import evaluate
import torch
from statistics import mean
from pprint import pprint
import time

In [26]:
# === 1. Читаем Excel ===
df = pd.read_csv("train.csv", sep=';')
df_t = pd.read_csv("submission.csv", sep=';')

# Список меток
labels = ['O', 'B-BRAND', 'I-BRAND', 'B-TYPE', 'I-TYPE',
          'B-PERCENT', 'I-PERCENT', 'B-VOLUME', 'I-VOLUME']
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}

In [27]:
# === 2. Преобразуем данные в токен+метки ===
data = []
for _, row in df.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    token_labels = [l if l != "0" else "O" for l in token_labels]
    data.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset = Dataset.from_list(data)

In [28]:
data_t = []
for _, row in df_t.iterrows():
    text = row[0]
    spans = ast.literal_eval(row[1])  # список [(start, end, label), ...]

    char_labels = ["O"] * len(text)
    for start, end, tag in spans:
        end = min(end, len(text))  # не выходим за границу
        for i in range(start, end):
            char_labels[i] = tag


    tokens = text.split()
    token_labels = []
    offset = 0
    for token in tokens:
        token_len = len(token)
        token_span_labels = char_labels[offset:offset + token_len]
        tag = "O"
        for l in token_span_labels:
            if l != "O":
                tag = l
                break
        token_labels.append(tag)
        offset += token_len + 1

    #token_labels = [l if l != "0" else "O" for l in token_labels]
    data_t.append({"tokens": tokens, "labels": [label2id[l] for l in token_labels]})

dataset_t = Dataset.from_list(data_t)

In [29]:
# === 3. Загружаем токенайзер и модель ===
model_name = "DeepPavlov/rubert-base-cased"
#model_name = "FacebookAI/roberta-base"
#tokenizer = RobertaTokenizerFast.from_pretrained(
#    "roberta-base",
#    add_prefix_space=True  # ← ключевой параметр
#)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [30]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"],
                          is_split_into_words=True,
                          truncation=True,
                          padding="max_length",
                          max_length=128)

    word_ids = tokenized.word_ids(batch_index=0)  # соответствие токен -> слово
    label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)  # игнорируем паддинг
        elif word_idx != previous_word_idx:
            label_ids.append(example["labels"][word_idx])
        else:
            # Для сабтокенов ставим I-* если это не "O"
            label_ids.append(
                example["labels"][word_idx]
                if labels[example["labels"][word_idx]] != "O"
                else -100
            )
        previous_word_idx = word_idx

    tokenized["labels"] = label_ids
    return tokenized

In [31]:
def tokenize_and_align_labels1(example, max_length=128):
    # токенизация только по пробелам
    tokens = example["tokens"].split(" ")  # <-- важное изменение
    
    # обрезаем/дополняем до max_length
    tokens = tokens[:max_length]
    labels = example["labels"][:max_length]

    # padding, если нужно
    padding_length = max_length - len(tokens)
    if padding_length > 0:
        tokens += ["[PAD]"] * padding_length
        labels += [-100] * padding_length  # игнорируем паддинг при обучении

    return {
        "input_ids": tokens,  # здесь уже не айдишки из словаря токенайзера, а сами токены
        "labels": labels,
    }


In [32]:
#dataset = dataset.train_test_split(test_size=0.1, seed=42)

encoded_dataset = dataset.map(tokenize_and_align_labels)
#encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

#combined_dataset = DatasetDict({
#    "train": encoded_dataset,
#    "test": encoded_dataset_t
#})

Map: 100%|███████████████████████████████████████████████████████████████| 27256/27256 [00:06<00:00, 4186.45 examples/s]


In [33]:
# === 4. Готовим модель ===
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# === 5. Тренировка ===
training_args = TrainingArguments(
    output_dir="./ner_rubert",
    #evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
)

In [35]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [36]:
# === 6. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    #eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [37]:
trainer.train()

Step,Training Loss
500,0.2967
1000,0.1247
1500,0.0844
2000,0.056
2500,0.0404


Exception: Input/output error (os error 5)

In [33]:
# Не забываем увеличивать каждый раз номер, чтобы сохранить все вариации
model.save_pretrained("./saved_model_purge")
tokenizer.save_pretrained("./saved_model_purge")

('./saved_model_6/tokenizer_config.json',
 './saved_model_6/special_tokens_map.json',
 './saved_model_6/vocab.txt',
 './saved_model_6/added_tokens.json',
 './saved_model_6/tokenizer.json')

In [71]:
model_path = "./saved_model_3"  # путь к сохранённой модели

tokenizer1 = AutoTokenizer.from_pretrained(model_path)
model1 = AutoModelForTokenClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device)
model1.eval()  # переводим в режим инференса
print()




In [35]:
def predict_ner(text, model, tokenizer):
    # Токенизация
    inputs = tokenizer(text, return_tensors="pt")
    #tokens = text.split(" ")
    #inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Предсказание
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

    # Сопоставление токенов с метками
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    labels = [model.config.id2label[i] for i in pred_ids]

    # Игнорируем спец-токены [CLS] и [SEP]
    tokens_labels = [(t, l) for t, l in zip(tokens, labels) if t not in ("[CLS]", "[SEP]")]

    # Склеиваем подслова
    merged_tokens, merged_labels = [], []
    for t, l in tokens_labels:
        if t.startswith("##") or (merged_tokens and t in [".", ",", "!", "?", ";", ":", "-", "—", "–", "..."]):
            merged_tokens[-1] += t[2:]
        else:
            merged_tokens.append(t)
            merged_labels.append(l)

    return list(zip(merged_tokens, merged_labels))


In [36]:
text1 = "b-brand с клубникой"
text = "вино 3"
result = predict_ner(text, model1, tokenizer1)

for token, label in result:
    print(f"{token:15} -> {label}")

вино            -> B-TYPE
3               -> B-BRAND


In [24]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    # Восстанавливаем word-level метки, игнорируя паддинги (-100)
    for pred, lab in zip(predictions, labels):
        cur_preds = []
        cur_labels = []
        for p_i, l_i in zip(pred, lab):
            if l_i != -100:
                cur_preds.append(id2label[p_i])
                cur_labels.append(id2label[l_i])
        true_predictions.append(cur_preds)
        true_labels.append(cur_labels)

    # Вычисляем метрики seqeval
    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Собираем micro метрики (overall_*)
    metrics = {
        "precision_micro": results.get("overall_precision", 0.0),
        "recall_micro": results.get("overall_recall", 0.0),
        "f1_micro": results.get("overall_f1", 0.0),
        "accuracy": results.get("overall_accuracy", 0.0),
    }

    # Macro-F1 = среднее F1 по всем существующим классам
    f1_per_class = [
        results[label].get("f1", 0.0)   # <- используем ключ 'f1', не 'f1-score'
        for label in results
        if label not in ["overall_precision","overall_recall","overall_f1","overall_accuracy"]
    ]
    metrics["f1_macro"] = mean(f1_per_class) if f1_per_class else 0.0

    # F1/precision/recall по каждому классу
    for label in results:
        if label not in ["overall_precision","overall_recall","overall_f1","overall_accuracy"]:
            metrics[f"{label}_precision"] = results[label].get("precision", 0.0)
            metrics[f"{label}_recall"] = results[label].get("recall", 0.0)
            metrics[f"{label}_f1"] = results[label].get("f1", 0.0)

    return metrics

In [25]:
metric = evaluate.load("seqeval")

In [26]:
encoded_dataset_t = dataset_t.map(tokenize_and_align_labels)

Map: 100%|█████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 4135.24 examples/s]


In [29]:
trainer_t = Trainer(
    model=model1,
    tokenizer=tokenizer1,
    compute_metrics=compute_metrics
)

# Прогоняем тестовый датасет
results = trainer_t.evaluate(encoded_dataset["test"])
pprint(results)

  trainer_t = Trainer(


{'eval_BRAND_f1': 0.9382334774552191,
 'eval_BRAND_precision': 0.9376543209876543,
 'eval_BRAND_recall': 0.9388133498145859,
 'eval_PERCENT_f1': 1.0,
 'eval_PERCENT_precision': 1.0,
 'eval_PERCENT_recall': 1.0,
 'eval_TYPE_f1': 0.9763793725207357,
 'eval_TYPE_precision': 0.9732207045291158,
 'eval_TYPE_recall': 0.9795586107091172,
 'eval_VOLUME_f1': 0.9473684210526316,
 'eval_VOLUME_precision': 0.9,
 'eval_VOLUME_recall': 1.0,
 'eval_accuracy': 0.9651982888195167,
 'eval_f1_macro': 0.9654953177571466,
 'eval_f1_micro': 0.9677509228947552,
 'eval_loss': 0.27997273206710815,
 'eval_model_preparation_time': 0.0027,
 'eval_precision_micro': 0.9651292025562657,
 'eval_recall_micro': 0.9703869255482609,
 'eval_runtime': 3.5799,
 'eval_samples_per_second': 761.481,
 'eval_steps_per_second': 95.255}


In [27]:
def predict_ner_spans(text, model, tokenizer, device="cuda"):
    # Токенизация с оффсетами
    inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
    offset_mapping = inputs.pop("offset_mapping")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Предсказание
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    labels = [model.config.id2label[i] for i in pred_ids]
    offsets = offset_mapping.squeeze().tolist()

    # Игнорируем спец-токены [CLS] и [SEP]
    tokens_data = [
        (tok, lab, (s, e))
        for tok, lab, (s, e) in zip(tokens, labels, offsets)
        if tok not in ("[CLS]", "[SEP]")
    ]

    # Склеиваем только сабтокены внутри одного слова
    merged_tokens = []
    for tok, lab, (s, e) in tokens_data:
        if tok.startswith("##") or (merged_tokens and tok in [".", ",", "!", "?", ";", ":", "-", "—", "–", "..."]):
            prev_tok, prev_lab, (ps, pe) = merged_tokens[-1]
            merged_tokens[-1] = (prev_tok + tok[2:], prev_lab, (ps, e))
        else:
            merged_tokens.append((tok, lab, (s, e)))
            
    # Преобразуем в спаны, соблюдая BIO
    spans = []
    current_span = None
    for tok, lab, (s, e) in merged_tokens:
        if (current_span):
            if current_span[2] == lab:
                if "B-" in lab:
                    current_span[1] = e
                    continue
        if (current_span):
            spans.append(tuple(current_span))
        current_span = [s, e, lab]
        
    if (current_span):
        spans.append(tuple(current_span))
            
    return spans


In [28]:
def run_inference_csv(input_csv, output_csv, model, tokenizer, device="cuda"):
    df = pd.read_csv(input_csv, sep=';')

    results = []
    for _, row in df.iterrows():
        text = row[0]  # первый столбец — текст
        spans = predict_ner_spans(text, model, tokenizer, device=device)    
        results.append({"sample": text, "annotation": spans})

    pd.DataFrame(results).to_csv(output_csv, index=False, encoding="utf-8", sep=';')
    print(f"✅ Результаты сохранены в {output_csv}")


In [74]:
def run_inference_csv(input_csv, output_csv, model, tokenizer, device="cuda", batch_size=512):
    df = pd.read_csv(input_csv, sep=';')
    texts = df.iloc[:, 0].tolist()  # все тексты в список
    
    results = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        batch_spans = predict_ner_spans_batch(batch_texts, model, tokenizer, device=device)
        
        for text, spans in zip(batch_texts, batch_spans):
            results.append({"sample": text, "annotation": spans})
    
    pd.DataFrame(results).to_csv(output_csv, index=False, encoding="utf-8", sep=';')
    print(f"✅ Результаты сохранены в {output_csv}")

def predict_ner_spans_batch(texts, model, tokenizer, device="cuda"):
    # Токенизация батча
    inputs = tokenizer(
        texts, 
        return_tensors="pt", 
        return_offsets_mapping=True, 
        padding=True, 
        truncation=True
    )
    
    offset_mappings = inputs.pop("offset_mapping")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Предсказание для всего батча
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    pred_ids = torch.argmax(logits, dim=-1).cpu().numpy()
    
    all_batch_spans = []
    
    # Обрабатываем каждый пример в батче по отдельности
    for i in range(len(texts)):
        # Берем данные для одного примера - ВСЕ на CPU
        input_ids = inputs["input_ids"][i].cpu()
        attention_mask = inputs["attention_mask"][i].cpu()
        offset_mapping = offset_mappings[i]  # уже на CPU
        preds = pred_ids[i]
        
        # Убираем паддинг
        valid_indices = attention_mask == 1
        input_ids = input_ids[valid_indices]
        offset_mapping = offset_mapping[valid_indices]
        preds = preds[:len(input_ids)]  # обрезаем по длине без паддинга
        
        # Конвертируем в списки как в оригинальном коде
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        labels = [model.config.id2label[pred] for pred in preds]
        offsets = offset_mapping.tolist()
        
        # Игнорируем спец-токены [CLS] и [SEP] - ВАША ЛОГИКА
        tokens_data = []
        for tok, lab, (s, e) in zip(tokens, labels, offsets):
            if tok not in ("[CLS]", "[SEP]"):
                tokens_data.append((tok, lab, (s, e)))
        
        # Склеиваем сабтокены - ВАША ЛОГИКА
        merged_tokens = []
        for tok, lab, (s, e) in tokens_data:
            if tok.startswith("##") or (merged_tokens and tok in [".", ",", "!", "?", ";", ":", "-", "—", "–", "..."]):
                prev_tok, prev_lab, (ps, pe) = merged_tokens[-1]
                merged_tokens[-1] = (prev_tok + tok[2:], prev_lab, (ps, e))
            else:
                merged_tokens.append((tok, lab, (s, e)))
        
        # Преобразуем в спаны - ВАША ЛОГИКА
        spans = []
        current_span = None
        for tok, lab, (s, e) in merged_tokens:
            if current_span:
                if current_span[2] == lab:
                    if "B-" in lab:
                        current_span[1] = e
                        continue
            if current_span:
                spans.append(tuple(current_span))
            current_span = [s, e, lab]
            
        if current_span:
            spans.append(tuple(current_span))
        
        all_batch_spans.append(spans)
    
    return all_batch_spans

# Сохраняем оригинальную функцию для обратной совместимости
def predict_ner_spans(text, model, tokenizer, device="cuda"):
    return predict_ner_spans_batch([text], model, tokenizer, device=device)[0]

In [75]:
%%time
run_inference_csv("submission.csv", "submission_new.csv", model1, tokenizer1, device=device)


✅ Результаты сохранены в submission_new.csv
CPU times: user 949 ms, sys: 4.04 ms, total: 953 ms
Wall time: 956 ms
