In [None]:
import pandas as pd
import ast
import torch

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pick the compute device: GPU when CUDA is available, otherwise CPU.
# The fallback literal has to be the plain ASCII string "cpu"; a look-alike
# Cyrillic first letter produces a string that torch rejects as a device name.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

**Загрузим и посмотрим на датасет**

In [None]:
# Read the ';'-separated training file into a DataFrame.
# NOTE(review): header=1 makes pandas use the second line of the file as the
# column names and discard the line before it — confirm the file really starts
# with a line that should be skipped.
train_csv_path = '/content/train_new.csv'
df = pd.read_csv(train_csv_path, sep=';', header=1)

In [None]:
# Keep only the query text and its annotation. The explicit .copy() yields an
# independent frame, so assigning to its columns in the following cells
# unambiguously modifies this object rather than a view-like selection.
df = df[['sample', 'annotation']].copy()

In [None]:
df

Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"
...,...,...
27246,№1 кофейни,"[(0, 2, 'B-BRAND'), (3, 10, 'B-TYPE')]"
27247,№1 кофейник,"[(0, 2, 'B-BRAND'), (3, 11, 'B-TYPE')]"
27248,№1 пиво,"[(0, 2, 'B-BRAND'), (3, 7, 'B-TYPE')]"
27249,№1 са,"[(0, 2, 'B-BRAND'), (3, 5, 'B-TYPE')]"


In [None]:
def _parse_annotation(value):
    """Convert a serialized annotation into Python objects.

    Strings are parsed with ast.literal_eval. Any other value (for example a
    list left over from a previous run of this cell) is returned unchanged,
    which keeps the cell safe to re-run.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df["annotation"] = df["annotation"].apply(_parse_annotation)

In [None]:
df.sample(10)

Unnamed: 0,sample,annotation
24846,халва на фрукто,"[(0, 5, B-TYPE), (6, 8, O), (9, 15, O)]"
12658,мацо,"[(0, 4, B-TYPE)]"
19245,салфетки влажные хозяйственные,"[(0, 8, B-TYPE), (9, 16, I-TYPE), (17, 30, I-T..."
7149,зажигалк,"[(0, 8, B-TYPE)]"
7476,золотой стандарт,"[(0, 7, B-BRAND), (8, 16, I-BRAND)]"
14245,напиток с иван,"[(0, 7, B-TYPE), (8, 9, O), (10, 14, O)]"
13847,мусс zot,"[(0, 4, B-TYPE), (5, 8, B-BRAND)]"
21052,солены,"[(0, 6, B-TYPE)]"
13265,молотый перец,"[(0, 7, B-TYPE), (8, 13, I-TYPE)]"
22446,сыр спагетти,"[(0, 3, B-TYPE), (4, 12, I-TYPE)]"


**Перейдём от BIO-разметки к обычным именам сущностей**

In [None]:
def _to_plain_label(label):
    """Remove the "B-" / "I-" markers from a tag and turn the digit "0" into the letter "O"."""
    without_begin = label.replace("B-", "")
    without_inside = without_begin.replace("I-", "")
    return without_inside.replace("0", "O")


def _strip_bio(ann):
    """Return the annotation with every tag reduced to its plain entity name."""
    return [(start, end, _to_plain_label(label)) for start, end, label in ann]


df['annotation'] = df['annotation'].apply(_strip_bio)


In [None]:
df.sample(10)

Unnamed: 0,sample,annotation
24414,фейхоа,"[(0, 6, TYPE)]"
5974,грейпфрутт,"[(0, 10, TYPE)]"
6942,жевпчка,"[(0, 7, TYPE)]"
15899,пелнки,"[(0, 6, TYPE)]"
24777,"фужеры, бокалы","[(0, 7, TYPE), (8, 14, TYPE)]"
14365,недалеко козинаки,"[(0, 8, O), (9, 17, TYPE)]"
12563,масло сливочно,"[(0, 5, TYPE), (6, 14, TYPE)]"
16047,переноска,"[(0, 9, TYPE)]"
19656,святой источник 5л,"[(0, 6, BRAND), (7, 15, BRAND), (16, 18, VOLUME)]"
11092,курассан,"[(0, 8, TYPE)]"


In [None]:
# Pair every text with its spans in the form (text, {"entities": [(start, end, label), ...]}).
# NOTE(review): spans labelled "O" are kept here and therefore passed on like any
# other entity label — confirm that this is intended.
train_data = [
    (text, {"entities": annotations})
    for text, annotations in zip(df['sample'], df['annotation'])
]

print("Пример train_data:", train_data[:3])

Пример train_data: [('aa', {'entities': [(0, 2, 'O')]}), ('aala', {'entities': [(0, 4, 'O')]}), ('aarcca', {'entities': [(0, 6, 'O')]})]


In [None]:
!pip install spacy



In [None]:
!pip install spacy-transformers



In [None]:
import spacy

In [None]:
from spacy.tokens import DocBin
from spacy.training.example import Example
from spacy.util import minibatch
from sklearn.metrics import classification_report
import random

**DeepPavlov/rubert**

In [None]:
spacy.require_gpu()

# Blank Russian pipeline without any pre-installed components
nlp = spacy.blank("ru")

# Transformer component settings, assembled piece by piece:
# strided windows of 128 with a stride of 96 feed the DeepPavlov RuBERT checkpoint.
rubert_span_getter = {
    "@span_getters": "spacy-transformers.strided_spans.v1",
    "window": 128,
    "stride": 96,
}
rubert_model = {
    "@architectures": "spacy-transformers.TransformerModel.v1",
    "name": "DeepPavlov/rubert-base-cased",
    "tokenizer_config": {"use_fast": True},
    "get_spans": rubert_span_getter,
}

transformer = nlp.add_pipe("transformer", config={"model": rubert_model})

**xlm-roberta**

In [None]:
spacy.require_gpu()

# Start again from a blank Russian pipeline (no extra components)
nlp = spacy.blank("ru")

# Full config of the transformer component backed by the xlm-roberta-base checkpoint
xlmr_config = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v1",
        "name": "xlm-roberta-base",
        "tokenizer_config": {"use_fast": True},
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96,
        },
    },
}

transformer = nlp.add_pipe(factory_name="transformer", config=xlmr_config)

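**Bi-LSTM** (примечание: в конфигурации ниже энкодер — `spacy.MaxoutWindowEncoder.v2`, то есть свёрточный, а не Bi-LSTM)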

In [None]:
spacy.require_gpu()

# Create a fresh blank Russian pipeline
nlp = spacy.blank("ru")

# Config of the shared embedding component.
# NOTE: although this section is titled "Bi-LSTM", the encoder configured here is
# spacy.MaxoutWindowEncoder.v2, which the spaCy docs describe as a convolutional
# (windowed) encoder rather than a BiLSTM.
TOK2VEC_WIDTH = 96
config = {
    "model": {
        "@architectures": "spacy.Tok2Vec.v2",
        "embed": {
            "@architectures": "spacy.MultiHashEmbed.v2",
            "width": TOK2VEC_WIDTH,
            "rows": [2000, 1000],  # one table size per attribute listed in "attrs"
            "attrs": ["NORM", "SHAPE"],  # two attributes (kept simple)
            "include_static_vectors": False
        },
        "encode": {
            "@architectures": "spacy.MaxoutWindowEncoder.v2",
            "width": TOK2VEC_WIDTH,
            "window_size": 1,
            "maxout_pieces": 3,
            "depth": 4
        }
    }
}

# Add the components
tok2vec = nlp.add_pipe("tok2vec", config=config)

# Connect NER to the shared tok2vec through a listener. With the default "ner"
# config the component builds its own embedding layer, so the tok2vec component
# above receives no gradient (its loss stays at 0.0 in the training log) and only
# adds inference cost. The listener width has to match the encoder width.
ner = nlp.add_pipe("ner", config={
    "model": {
        "tok2vec": {
            "@architectures": "spacy.Tok2VecListener.v1",
            "width": TOK2VEC_WIDTH,
            "upstream": "*",
        }
    }
})

# Register the entity labels
labels = ["BRAND", "TYPE", "VOLUME", "PERCENT"]
for label in labels:
    ner.add_label(label)

In [None]:
# Reuse the "ner" component when an earlier cell already added it to this
# pipeline: adding a second component under the same name makes add_pipe raise,
# which would break a top-to-bottom run of the notebook.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    # NOTE(review): with the default config the component presumably builds its own
    # embedding layer and does not listen to an upstream "transformer"/"tok2vec"
    # component — confirm, and configure a listener if that output should be used.
    ner = nlp.add_pipe("ner")

# Register the entity labels (labels that are already present are left as they are)
labels = ["BRAND", "TYPE", "VOLUME", "PERCENT"]
for label in labels:
    ner.add_label(label)

**Посмотрим, правильно ли мы внесли лейблы**

In [None]:
# Show the labels currently registered on the "ner" component
ner_component = nlp.get_pipe("ner")
print(ner_component.labels)

('BRAND', 'PERCENT', 'TYPE', 'VOLUME')


**Разделим выборку**

In [None]:
# Turn every (text, annotation) pair into a spaCy Example built on a tokenized Doc
examples = [
    Example.from_dict(nlp.make_doc(text), ann)
    for text, ann in train_data
]

# Seeded shuffle followed by an 80/20 train/dev split
random.seed(52)
random.shuffle(examples)
split = int(len(examples) * 0.8)
train_examples, dev_examples = examples[:split], examples[split:]

**Цикл обучения модели**

In [None]:
def train_epochs(nlp, examples, n_epochs, batch_size=32, drop=0.1, sgd=None):
    """Run ``n_epochs`` training passes over ``examples`` and print the losses.

    Args:
        nlp: the spaCy pipeline to update.
        examples: list of Example objects; it is shuffled in place before each pass.
        n_epochs: number of passes over the data.
        batch_size: number of examples per nlp.update call.
        drop: dropout rate forwarded to nlp.update.
        sgd: optimizer forwarded to nlp.update; None lets the pipeline use its own.

    Returns:
        A list with one losses dict per epoch, in training order.
    """
    history = []
    for epoch in range(n_epochs):
        random.shuffle(examples)
        losses = {}
        for batch in minibatch(examples, size=batch_size):
            nlp.update(batch, drop=drop, losses=losses, sgd=sgd)
        print(f"Epoch {epoch+1}, Losses: {losses}")
        history.append(losses)
    return history


# initialize() returns the optimizer; keep it so every update uses it explicitly
optimizer = nlp.initialize(get_examples=lambda: train_examples)

epochs = 14
loss_history = train_epochs(nlp, train_examples, epochs, sgd=optimizer)

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(7556.636)}
Epoch 2, Losses: {'tok2vec': 0.0, 'ner': np.float32(4221.635)}
Epoch 3, Losses: {'tok2vec': 0.0, 'ner': np.float32(3037.9856)}
Epoch 4, Losses: {'tok2vec': 0.0, 'ner': np.float32(2277.818)}
Epoch 5, Losses: {'tok2vec': 0.0, 'ner': np.float32(1834.9102)}
Epoch 6, Losses: {'tok2vec': 0.0, 'ner': np.float32(1514.4653)}
Epoch 7, Losses: {'tok2vec': 0.0, 'ner': np.float32(1298.4945)}
Epoch 8, Losses: {'tok2vec': 0.0, 'ner': np.float32(1204.5375)}
Epoch 9, Losses: {'tok2vec': 0.0, 'ner': np.float32(1049.2402)}
Epoch 10, Losses: {'tok2vec': 0.0, 'ner': np.float32(978.3221)}
Epoch 11, Losses: {'tok2vec': 0.0, 'ner': np.float32(915.08997)}
Epoch 12, Losses: {'tok2vec': 0.0, 'ner': np.float32(778.9149)}
Epoch 13, Losses: {'tok2vec': 0.0, 'ner': np.float32(794.8838)}
Epoch 14, Losses: {'tok2vec': 0.0, 'ner': np.float32(658.05237)}


In [None]:
# One more pass over the training data (the pipeline is not re-initialized here)
epochs = 1
for epoch_no in range(1, epochs + 1):
    random.shuffle(train_examples)
    losses = {}
    for chunk in minibatch(train_examples, size=32):
        nlp.update(chunk, drop=0.1, losses=losses)
    print(f"Epoch {epoch_no}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(887.7502)}


In [None]:
# Additional training pass; losses are accumulated per component in one dict
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = dict()
    update_options = {"drop": 0.1, "losses": losses}
    for batch in minibatch(train_examples, size=32):
        nlp.update(batch, **update_options)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(840.5082)}


In [None]:
# Another single pass over the shuffled training examples
epochs = 1
for display_no, _ in enumerate(range(epochs), start=1):
    random.shuffle(train_examples)
    losses = {}
    batch_stream = minibatch(train_examples, size=32)
    for batch in batch_stream:
        nlp.update(batch, losses=losses, drop=0.1)
    print(f"Epoch {display_no}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(763.52026)}


In [None]:
# Final extra pass: shuffle, update batch by batch, then report the summed losses
epochs = 1
finished = 0
while finished < epochs:
    random.shuffle(train_examples)
    losses = {}
    for batch in minibatch(train_examples, size=32):
        nlp.update(batch, drop=0.1, losses=losses)
    finished += 1
    print(f"Epoch {finished}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(756.38776)}


**Модуль для оценки работы модели**

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=3b26430e42a1b6784de0b27e41d51da1f61e50847071904596e907b1e312794d
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# NOTE: this import replaces the sklearn classification_report imported earlier in
# the notebook; from here on the name refers to the seqeval (entity-level) version.
from seqeval.metrics import classification_report


def _bio_tags(doc):
    """Return one BIO tag per token of ``doc``, derived from ``doc.ents``.

    Entities whose label is empty or "O" stay plain "O" tags. Otherwise they
    would become "B-O"/"I-O" and the report would list the outside class as if
    it were an entity type.
    """
    tags = ["O"] * len(doc)
    for ent in doc.ents:
        if not ent.label_ or ent.label_ == "O":
            continue
        tags[ent.start] = f"B-{ent.label_}"
        for i in range(ent.start + 1, ent.end):
            tags[i] = f"I-{ent.label_}"
    return tags


# Gold tags come from the reference docs of the held-out examples
y_true = [_bio_tags(ex.reference) for ex in dev_examples]

# Predicted tags: run the pipeline over the raw texts in batches (order is preserved)
y_pred = [_bio_tags(doc) for doc in nlp.pipe(ex.text for ex in dev_examples)]

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8397    0.8436    0.8417      1528
           O     0.8512    0.8140    0.8322      1054
     PERCENT     0.6000    0.6000    0.6000         5
        TYPE     0.9423    0.9493    0.9458      5782
      VOLUME     0.7273    0.8889    0.8000         9

   micro avg     0.9121    0.9127    0.9124      8378
   macro avg     0.7921    0.8192    0.8039      8378
weighted avg     0.9117    0.9127    0.9122      8378



In [None]:
import time

In [None]:
# This file was originally read as cp1251, but later cells overwrite the same path
# in UTF-8. Try UTF-8 first and fall back to cp1251, so that re-running this cell
# after a save does not decode the text with the wrong codec.
submission_path = "/content/submission_new.csv"
try:
    df_sub = pd.read_csv(submission_path, sep=';', encoding="utf-8")
except UnicodeDecodeError:
    df_sub = pd.read_csv(submission_path, sep=';', encoding="cp1251")

In [None]:
df_sub

Unnamed: 0,sample,annotation
0,форма для выпечки,"[(0, 5, 'B-TYPE'), (6, 9, 'I-TYPE'), (10, 17, ..."
1,фарш свиной,"[(0, 4, 'B-TYPE'), (5, 11, 'I-TYPE')]"
2,сок ананасовый без сахара,"[(0, 3, 'B-TYPE'), (4, 14, 'I-TYPE'), (15, 18,..."
3,еринги,"[(0, 6, 'B-TYPE')]"
4,молооко,"[(0, 7, 'B-TYPE')]"
...,...,...
4995,milkywa,"[(0, 7, 'B-BRAND')]"
4996,очиститель для унитаза,"[(0, 10, 'B-TYPE'), (11, 14, 'I-TYPE'), (15, 2..."
4997,арбузные,"[(0, 8, 'B-TYPE')]"
4998,кашы,"[(0, 4, 'B-TYPE')]"


**Блок с более детальной и качественной постобработкой**

In [None]:
%%time
import csv

def build_bio_predictions(nlp, texts, batch_size=256):
    """
    Build BIO tags for a list of texts.

    The texts are processed with ``nlp.pipe`` in batches instead of one call per
    text; the produced tags are the same, only the throughput differs.

    Args:
        nlp: pipeline object providing ``pipe(texts, batch_size=...)`` that yields
            docs with ``ents`` and tokens exposing ``idx`` and ``text``.
        texts: iterable of input strings.
        batch_size: number of texts handed to the pipeline at once.

    Returns:
        A list with one entry per text; each entry is a list of
        ``(start_char, end_char, tag)`` tuples, one per token.
    """
    all_preds = []

    for doc in nlp.pipe(texts, batch_size=batch_size):
        tags = ["O"] * len(doc)

        # 1) Assign B-/I- only for real entities (empty or "O" labels are skipped)
        for ent in doc.ents:
            if not ent.label_ or ent.label_ == "O":
                continue
            for i in range(ent.start, ent.end):
                tags[i] = ("B-" if i == ent.start else "I-") + ent.label_

        # 2) Post-processing: a B- tag that directly follows a tag of the same class
        #    becomes I-, so consecutive same-class entities turn into B + I + I...
        for i in range(1, len(tags)):
            current, previous = tags[i], tags[i - 1]
            if (
                current.startswith("B-")
                and previous.startswith(("B-", "I-"))
                and previous[2:] == current[2:]
            ):
                tags[i] = "I-" + current[2:]

        # 3) Convert to (start_char, end_char, tag) using the token character offsets
        all_preds.append(
            [(token.idx, token.idx + len(token.text), tag) for token, tag in zip(doc, tags)]
        )

    return all_preds

# ---------------------------
# Usage
# ---------------------------
# Predict BIO annotations for every submission text and store them in the frame
new_annotations = build_bio_predictions(nlp, df_sub["sample"].astype(str).tolist())
df_sub["annotation"] = new_annotations

# Sanity check: print the tagged tokens of the first rows
preview_count = min(5, len(df_sub))
preview_rows = zip(
    df_sub["sample"].iloc[:preview_count],
    df_sub["annotation"].iloc[:preview_count],
)
for text, spans in preview_rows:
    print("\nТекст:", text)
    for start, end, tag in spans:
        print(f"  '{text[start:end]}' ({start}:{end}) -> {tag}")

# Save the result
out_path = "/content/submission_new.csv"
df_sub.to_csv(
    out_path,
    index=False,
    sep=";",
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
)
print("Сохранено:", out_path)



Текст: форма для выпечки
  'форма' (0:5) -> B-TYPE
  'для' (6:9) -> O
  'выпечки' (10:17) -> O

Текст: фарш свиной
  'фарш' (0:4) -> B-TYPE
  'свиной' (5:11) -> I-TYPE

Текст: сок ананасовый без сахара
  'сок' (0:3) -> B-TYPE
  'ананасовый' (4:14) -> I-TYPE
  'без' (15:18) -> O
  'сахара' (19:25) -> O

Текст: еринги
  'еринги' (0:6) -> B-TYPE

Текст: молооко
  'молооко' (0:7) -> B-TYPE
Сохранено: /content/submission_new.csv


🧪 **Последняя отправка:**

└─ **F1: 0.8632**

**Данный результат показал RuBERT от DeepPavlov**

**Блок ниже — более быстрая обработка**

In [None]:
import csv

def build_bio_predictions_fast(nlp, texts, batch_size=512, merge_adjacent=False):
    """
    Generate BIO tags for a list of texts using batched processing.

    Args:
        nlp: pipeline object providing ``pipe(texts, batch_size=...)`` that yields
            docs with ``ents`` and tokens exposing ``idx`` and ``text``.
        texts: iterable of input strings.
        batch_size: number of texts handed to ``nlp.pipe`` at once.
        merge_adjacent: when True, a B- tag that directly follows a tag of the
            same class is rewritten to I- (the same post-processing that
            build_bio_predictions applies). The default False keeps the raw
            per-entity tags.

    Returns:
        A list with one entry per text; each entry is a list of
        ``(start_char, end_char, tag)`` tuples, one per token.
    """
    all_preds = []

    # nlp.pipe processes the texts in batches, which is faster than one call per text
    for doc in nlp.pipe(texts, batch_size=batch_size):
        tags = ["O"] * len(doc)

        for ent in doc.ents:
            if not ent.label_ or ent.label_ == "O":
                continue
            # First token of the entity gets B-, the remaining ones get I-
            tags[ent.start] = "B-" + ent.label_
            for i in range(ent.start + 1, ent.end):
                tags[i] = "I-" + ent.label_

        if merge_adjacent:
            # Optional post-processing: consecutive same-class entities -> B + I + I...
            for i in range(1, len(tags)):
                current, previous = tags[i], tags[i - 1]
                if (
                    current.startswith("B-")
                    and previous.startswith(("B-", "I-"))
                    and previous[2:] == current[2:]
                ):
                    tags[i] = "I-" + current[2:]

        ents = [(token.idx, token.idx + len(token.text), tag) for token, tag in zip(doc, tags)]
        all_preds.append(ents)

    return all_preds

In [None]:
%%time
# Usage
# ---------------------------
# Annotate the submission texts with the batched predictor
texts = df_sub["sample"].astype(str).tolist()
fast_annotations = build_bio_predictions_fast(nlp, texts)
df_sub["annotation"] = fast_annotations

# Write the frame to the submission file
out_path = "/content/submission_new.csv"
df_sub.to_csv(
    out_path,
    index=False,
    sep=";",
    encoding="utf-8",
    quoting=csv.QUOTE_ALL,
)
print("Сохранено:", out_path)


Сохранено: /content/submission_new.csv
CPU times: user 1.54 s, sys: 2 ms, total: 1.54 s
Wall time: 1.63 s


🧪 **Последняя отправка:**



└─ **F1: 0.7783**

**Играла роль постобработка, но она занимала много времени.**

**Это результат Bi-LSTM.**

In [None]:
df_sub["annotation"]

Unnamed: 0,annotation
0,"[(0, 5, B-TYPE), (6, 9, O), (10, 17, O)]"
1,"[(0, 4, B-TYPE), (5, 11, B-TYPE)]"
2,"[(0, 3, B-TYPE), (4, 14, B-TYPE), (15, 18, O),..."
3,"[(0, 6, B-TYPE)]"
4,"[(0, 7, B-TYPE)]"
...,...
4995,"[(0, 7, B-BRAND)]"
4996,"[(0, 10, B-TYPE), (11, 14, O), (15, 22, O)]"
4997,"[(0, 8, B-TYPE)]"
4998,"[(0, 4, B-TYPE)]"


In [None]:
# Write the current df_sub to the submission file (';'-separated, every field quoted)
out_path = "/content/submission_new.csv"
csv_options = {"index": False, "sep": ";", "encoding": "utf-8", "quoting": csv.QUOTE_ALL}
df_sub.to_csv(out_path, **csv_options)
print("Сохранено:", out_path)

Сохранено: /content/submission_new.csv


**Аугментация**