In [1]:
import pandas as pd
import ast
import torch

In [18]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'сpu'
device

'cuda'

**Загрузим и посмотрим на датасет**

In [19]:
df = pd.read_csv('/content/train_new.csv', sep=';', header=0)

In [20]:
df = df[['sample', 'annotation']]

In [5]:
df

Unnamed: 0,sample,annotation
0,aa,"[(0, 2, 'O')]"
1,aala,"[(0, 4, 'O')]"
2,aarcca,"[(0, 6, 'O')]"
3,abon,"[(0, 4, 'O')]"
4,abso,"[(0, 4, 'B-BRAND')]"
...,...,...
27246,№1 кофейни,"[(0, 2, 'B-BRAND'), (3, 10, 'B-TYPE')]"
27247,№1 кофейник,"[(0, 2, 'B-BRAND'), (3, 11, 'B-TYPE')]"
27248,№1 пиво,"[(0, 2, 'B-BRAND'), (3, 7, 'B-TYPE')]"
27249,№1 са,"[(0, 2, 'B-BRAND'), (3, 5, 'B-TYPE')]"


In [6]:
df["annotation"] = df["annotation"].apply(ast.literal_eval)

In [7]:
df.sample(10)

Unnamed: 0,sample,annotation
20214,сковородка,"[(0, 10, B-TYPE)]"
18871,ручки ind,"[(0, 5, B-TYPE), (6, 9, B-BRAND)]"
13487,мороже,"[(0, 6, B-TYPE)]"
25630,чашкк,"[(0, 5, B-TYPE)]"
10813,круглый горох,"[(0, 7, B-TYPE), (8, 13, I-TYPE)]"
8904,кефир для детей,"[(0, 5, B-TYPE), (6, 9, O), (10, 15, O)]"
16804,пловн,"[(0, 5, B-TYPE)]"
23978,туалетная влажная бумага,"[(0, 9, B-TYPE), (10, 17, I-TYPE), (18, 24, I-..."
8527,картофель,"[(0, 9, B-TYPE)]"
12322,мариновянная,"[(0, 12, B-TYPE)]"


**Перейдём от BIO-разметки к обычным именам сущностей**

In [23]:
import ast

def clean_annotations(ann):
    if isinstance(ann, str):
        try:
            ann = ast.literal_eval(ann)
        except Exception:
            return []

    fixed = []
    for item in ann:
        if not isinstance(item, (list, tuple)) or len(item) != 3:
            continue
        start, end, label = item
        # убираем BIO-префиксы
        label = label.replace("B-", "").replace("I-", "")
        if label == "O" or label == "0":
            continue
        fixed.append((start, end, label))
    return fixed

df["annotation"] = df["annotation"].apply(clean_annotations)


**Первоначальная обработка BIO**

In [8]:
df['annotation'] = df['annotation'].apply(
    lambda ann: [(start, end, label.replace("B-", "").replace("I-", "").replace("0", "O")) for start, end, label in ann]
)


In [24]:
df.sample(10)

Unnamed: 0,sample,annotation
12559,масло сливосное,"[(0, 5, TYPE), (6, 15, TYPE)]"
24773,фсташки,"[(0, 7, TYPE)]"
14649,облепихи,"[(0, 8, TYPE)]"
17553,приправа доя картофеля,"[(0, 8, TYPE)]"
6631,дойки,"[(0, 5, TYPE)]"
18945,рыбоовощные,"[(0, 11, TYPE)]"
4269,в томате килька,"[(9, 15, TYPE)]"
9481,колибри,"[(0, 7, BRAND)]"
20278,скф,[]
6158,гророх,"[(0, 6, TYPE)]"


In [25]:
train_data = []
for index, row in df.iterrows():
    text = row['sample']
    annotations = row['annotation']
    train_data.append((text, {"entities": annotations}))

print("Пример train_data:", train_data[:3])

Пример train_data: [('aa', {'entities': []}), ('aala', {'entities': []}), ('aarcca', {'entities': []})]


In [11]:
!pip install spacy



In [12]:
!pip install spacy-transformers

Collecting spacy-transformers
  Downloading spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy-transformers)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy-transformers)
  Downloading spacy_alignments-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<4.50.0,>=3.4.0->spacy-transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading spacy_transformers-1.3.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (795 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m795.8/795.8 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[

In [13]:
import spacy

In [14]:
from spacy.tokens import DocBin
from spacy.training.example import Example
from spacy.util import minibatch
from sklearn.metrics import classification_report
import random

**DeepPavlov/rubert**

In [68]:
spacy.require_gpu()

nlp = spacy.blank("ru")

transformer = nlp.add_pipe("transformer", config={
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v1",
        "name": "DeepPavlov/rubert-base-cased",
        "tokenizer_config": {"use_fast": True},
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96
        }
    }
})

**xlm-roberta**

In [None]:
spacy.require_gpu()

nlp = spacy.blank("ru")

transformer = nlp.add_pipe("transformer", config={
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v1",
        "name": "xlm-roberta-base",
        "tokenizer_config": {"use_fast": True},
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96
        }
    }
})

**MaxoutWindowEncoder**

In [79]:
spacy.require_gpu()

# Создаем модель
nlp = spacy.blank("ru")

config = {
    "model": {
        "@architectures": "spacy.Tok2Vec.v2",
        "embed": {
            "@architectures": "spacy.MultiHashEmbed.v2",
            "width": 128,
            "rows": [2000, 1000],
            "attrs": ["NORM", "SHAPE"],
            "include_static_vectors": False
        },
        "encode": {
            "@architectures": "spacy.MaxoutWindowEncoder.v2",
            "width": 128,
            "window_size": 1,
            "maxout_pieces": 3,
            "depth": 4
        }
    }
}

# Добавляем компоненты
tok2vec = nlp.add_pipe("tok2vec", config=config)
ner = nlp.add_pipe("ner")

# Добавляем labels
labels = ["BRAND", "TYPE", "VOLUME", "PERCENT"]
for label in labels:
    ner.add_label(label)

**BiLSTM**

In [61]:
spacy.require_gpu()

# Создаем модель
nlp = spacy.blank("ru")

# Конфиг BiLSTM
config = {
    "model": {
        "@architectures": "spacy.Tok2Vec.v2",
        "embed": {
            "@architectures": "spacy.MultiHashEmbed.v2",
            "width": 128,
            "rows": [2000, 1000],
            "attrs": ["NORM", "SHAPE"],
            "include_static_vectors": False
        },
        "encode": {
            "@architectures": "spacy.TorchBiLSTMEncoder.v1",
            "width": 128,      # размер скрытого слоя
            "depth": 2,        # количество слоёв
            "dropout": 0.25,
}
    }
}

# Добавляем компоненты
tok2vec = nlp.add_pipe("tok2vec", config=config)
ner = nlp.add_pipe("ner")

# Добавляем labels
labels = ["BRAND", "TYPE", "VOLUME", "PERCENT"]
for label in labels:
    ner.add_label(label)


In [70]:
ner = nlp.add_pipe("ner")

# Добавляем labels
labels = ["BRAND", "TYPE", "VOLUME", "PERCENT"]
for label in labels:
    ner.add_label(label)

**Посмотрим правильно ли мы внесли лейблы**

In [80]:
print(nlp.get_pipe("ner").labels)

('BRAND', 'PERCENT', 'TYPE', 'VOLUME')


**Разделим выборку**

In [81]:
examples = []
for text, ann in train_data:
    doc = nlp.make_doc(text)
    examples.append(Example.from_dict(doc, ann))

# train/dev split
random.seed(52)
random.shuffle(examples)
split = int(len(examples) * 0.8)
train_examples = examples[:split]
dev_examples = examples[split:]

**Цикл обучения модели**

**Для RuBERT**

In [73]:
nlp.initialize(get_examples=lambda: train_examples)

epochs = 12
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(7460.639)}
Epoch 2, Losses: {'transformer': 0.0, 'ner': np.float32(4045.8328)}
Epoch 3, Losses: {'transformer': 0.0, 'ner': np.float32(2943.0933)}
Epoch 4, Losses: {'transformer': 0.0, 'ner': np.float32(2199.8137)}
Epoch 5, Losses: {'transformer': 0.0, 'ner': np.float32(1748.7516)}
Epoch 6, Losses: {'transformer': 0.0, 'ner': np.float32(1496.3965)}
Epoch 7, Losses: {'transformer': 0.0, 'ner': np.float32(1300.0714)}
Epoch 8, Losses: {'transformer': 0.0, 'ner': np.float32(1142.2488)}
Epoch 9, Losses: {'transformer': 0.0, 'ner': np.float32(1016.3555)}
Epoch 10, Losses: {'transformer': 0.0, 'ner': np.float32(928.3505)}
Epoch 11, Losses: {'transformer': 0.0, 'ner': np.float32(886.657)}
Epoch 12, Losses: {'transformer': 0.0, 'ner': np.float32(804.3834)}


In [74]:
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(780.0777)}


In [75]:
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(743.5196)}


In [76]:
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(632.922)}


In [77]:
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'transformer': 0.0, 'ner': np.float32(566.02136)}


**Для XLM-RoBERTA**

In [None]:
nlp.initialize(get_examples=lambda: train_examples)

epochs = 12
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(874.8544)}


**Для MaxoutWindowEncoder**

In [82]:
nlp.initialize(get_examples=lambda: train_examples)

epochs = 14
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(7338.4272)}
Epoch 2, Losses: {'tok2vec': 0.0, 'ner': np.float32(4085.1404)}
Epoch 3, Losses: {'tok2vec': 0.0, 'ner': np.float32(2914.4578)}
Epoch 4, Losses: {'tok2vec': 0.0, 'ner': np.float32(2176.7366)}
Epoch 5, Losses: {'tok2vec': 0.0, 'ner': np.float32(1767.0839)}
Epoch 6, Losses: {'tok2vec': 0.0, 'ner': np.float32(1437.3646)}
Epoch 7, Losses: {'tok2vec': 0.0, 'ner': np.float32(1269.9851)}
Epoch 8, Losses: {'tok2vec': 0.0, 'ner': np.float32(1120.222)}
Epoch 9, Losses: {'tok2vec': 0.0, 'ner': np.float32(1005.084)}
Epoch 10, Losses: {'tok2vec': 0.0, 'ner': np.float32(926.8863)}
Epoch 11, Losses: {'tok2vec': 0.0, 'ner': np.float32(869.45044)}
Epoch 12, Losses: {'tok2vec': 0.0, 'ner': np.float32(795.9928)}
Epoch 13, Losses: {'tok2vec': 0.0, 'ner': np.float32(730.5112)}
Epoch 14, Losses: {'tok2vec': 0.0, 'ner': np.float32(697.27747)}


**Для BiLSTM**

In [64]:
nlp.initialize(get_examples=lambda: train_examples)

epochs = 13
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(7519.8706)}
Epoch 2, Losses: {'tok2vec': 0.0, 'ner': np.float32(4122.8594)}
Epoch 3, Losses: {'tok2vec': 0.0, 'ner': np.float32(2978.804)}
Epoch 4, Losses: {'tok2vec': 0.0, 'ner': np.float32(2220.4705)}
Epoch 5, Losses: {'tok2vec': 0.0, 'ner': np.float32(1661.4628)}
Epoch 6, Losses: {'tok2vec': 0.0, 'ner': np.float32(1467.1515)}
Epoch 7, Losses: {'tok2vec': 0.0, 'ner': np.float32(1301.8916)}
Epoch 8, Losses: {'tok2vec': 0.0, 'ner': np.float32(1185.0573)}
Epoch 9, Losses: {'tok2vec': 0.0, 'ner': np.float32(976.25494)}
Epoch 10, Losses: {'tok2vec': 0.0, 'ner': np.float32(929.0169)}
Epoch 11, Losses: {'tok2vec': 0.0, 'ner': np.float32(869.6383)}
Epoch 12, Losses: {'tok2vec': 0.0, 'ner': np.float32(786.7149)}
Epoch 13, Losses: {'tok2vec': 0.0, 'ner': np.float32(757.98236)}


In [65]:
epochs = 1
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}
    batches = minibatch(train_examples, size=32)
    for batch in batches:
        nlp.update(batch, drop=0.1, losses=losses)
    print(f"Epoch {epoch+1}, Losses: {losses}")

Epoch 1, Losses: {'tok2vec': 0.0, 'ner': np.float32(737.34503)}


**Модуль для оценки работы модели**

In [30]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6c36486b110f95f6df845105671b1e86e2bdc1c51acebc342c9f87d72a56ea91
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


**Для RuBERT**

In [None]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

**Для XLM-RoBERTA**

In [None]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

**Для MaxoutWindowEncoder**

In [84]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8514    0.8246    0.8378      1528
     PERCENT     0.6667    0.4000    0.5000         5
        TYPE     0.9427    0.9645    0.9535      5782
      VOLUME     1.0000    0.6667    0.8000         9

   micro avg     0.9244    0.9346    0.9295      7324
   macro avg     0.8652    0.7140    0.7728      7324
weighted avg     0.9235    0.9346    0.9289      7324



**Оценка для BiLSTM**

*В экспериментах ниже менялись параметры*

**Экспермент 1**

In [31]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8530    0.8390    0.8459      1528
     PERCENT     1.0000    0.6000    0.7500         5
        TYPE     0.9425    0.9609    0.9516      5782
      VOLUME     0.7778    0.7778    0.7778         9

   micro avg     0.9242    0.9350    0.9296      7324
   macro avg     0.8933    0.7944    0.8313      7324
weighted avg     0.9237    0.9350    0.9292      7324



**Эксперимент 2**

In [45]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8545    0.8416    0.8480      1528
     PERCENT     0.2000    0.2000    0.2000         5
        TYPE     0.9409    0.9609    0.9508      5782
      VOLUME     0.7778    0.7778    0.7778         9

   micro avg     0.9227    0.9353    0.9289      7324
   macro avg     0.6933    0.6951    0.6941      7324
weighted avg     0.9222    0.9353    0.9286      7324



In [51]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8752    0.8351    0.8547      1528
     PERCENT     0.6000    0.6000    0.6000         5
        TYPE     0.9439    0.9637    0.9537      5782
      VOLUME     0.8750    0.7778    0.8235         9

   micro avg     0.9300    0.9364    0.9332      7324
   macro avg     0.8235    0.7941    0.8080      7324
weighted avg     0.9293    0.9364    0.9326      7324



In [66]:
from seqeval.metrics import classification_report

y_true, y_pred = [], []

for ex in dev_examples:
    true_tags = ["O"] * len(ex.reference)
    pred_tags = ["O"] * len(ex.reference)

    # Реальные сущности с BIO-префиксами
    for ent in ex.reference.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            true_tags[i] = f"{prefix}{ent.label_}"

    # Предсказанные сущности с BIO-префиксами
    pred_doc = nlp(ex.text)
    for ent in pred_doc.ents:
        for i in range(ent.start, ent.end):
            prefix = "B-" if i == ent.start else "I-"
            pred_tags[i] = f"{prefix}{ent.label_}"

    y_true.append(true_tags)
    y_pred.append(pred_tags)

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

       BRAND     0.8591    0.8462    0.8526      1528
     PERCENT     0.7500    0.6000    0.6667         5
        TYPE     0.9410    0.9623    0.9515      5782
      VOLUME     0.8333    0.5556    0.6667         9

   micro avg     0.9242    0.9373    0.9307      7324
   macro avg     0.8459    0.7410    0.7844      7324
weighted avg     0.9236    0.9373    0.9303      7324



**Загрузка результатов**

In [32]:
import time

In [52]:
df_sub = pd.read_csv("/content/submission_new.csv", sep = ';', encoding="cp1251")

In [53]:
df_sub

Unnamed: 0,sample,annotation
0,форма для выпечки,"[(0, 5, 'B-TYPE'), (6, 9, 'I-TYPE'), (10, 17, ..."
1,фарш свиной,"[(0, 4, 'B-TYPE'), (5, 11, 'I-TYPE')]"
2,сок ананасовый без сахара,"[(0, 3, 'B-TYPE'), (4, 14, 'I-TYPE'), (15, 18,..."
3,еринги,"[(0, 6, 'B-TYPE')]"
4,молооко,"[(0, 7, 'B-TYPE')]"
...,...,...
4995,milkywa,"[(0, 7, 'B-BRAND')]"
4996,очиститель для унитаза,"[(0, 10, 'B-TYPE'), (11, 14, 'I-TYPE'), (15, 2..."
4997,арбузные,"[(0, 8, 'B-TYPE')]"
4998,кашы,"[(0, 4, 'B-TYPE')]"


**Блок с более детальной и качественной постобработкой**

In [54]:
import csv

def build_bio_predictions_batched(nlp, texts, batch_size=1024):
    """
    Формирует BIO-теги для списка текстов (с постобработкой).
    Использует nlp.pipe для ускорения.
    """
    all_preds = []

    # обрабатываем тексты батчами
    for doc in nlp.pipe(texts, batch_size=batch_size):
        tags = ["O"] * len(doc)  # по умолчанию все O

        # 1) Присваиваем B-/I- только для реальных сущностей (не O)
        for ent in doc.ents:
            if not ent.label_ or ent.label_ == "O":
                continue
            for j, token in enumerate(doc[ent.start:ent.end]):
                tags[token.i] = ("B-" if j == 0 else "I-") + ent.label_

        # 2) Пост-обработка: несколько B одного класса подряд → B + I + I...
        for i in range(1, len(tags)):
            if tags[i].startswith("B-"):
                if tags[i-1].startswith(("B-", "I-")):
                    prev_label = tags[i-1][2:]
                    curr_label = tags[i][2:]
                    if prev_label == curr_label:
                        tags[i] = "I-" + curr_label

        # 3) Конвертируем в список (start, end, tag)
        ents = [
            (token.idx, token.idx + len(token.text), tag)
            for token, tag in zip(doc, tags)
        ]

        all_preds.append(ents)

    return all_preds


In [55]:
%%time
texts = df_sub["sample"].astype(str).tolist()
df_sub["annotation"] = build_bio_predictions_batched(nlp, texts, batch_size=1024)
out_path = "/content/submission_new.csv"
df_sub.to_csv(out_path, index=False, sep=";", encoding="utf-8", quoting=csv.QUOTE_ALL)
print("Сохранено:", out_path)

Сохранено: /content/submission_new.csv
CPU times: user 2.08 s, sys: 7.99 ms, total: 2.09 s
Wall time: 2.1 s


🧪 **Последняя отправка:**

└─ **F1: 0.8769**

**Данный результат показал Bi-LSTM**

**Блок ниже более быстрая обработка**

In [37]:
import csv

def build_bio_predictions_fast(nlp, texts):
    """
    Быстро генерирует BIO-теги для списка текстов.
    Результат: список списков (start_char, end_char, tag)
    """
    all_preds = []

    # Используем nlp.pipe для ускорения обработки батчами
    for doc in nlp.pipe(texts, batch_size=512):
        tags = ["O"] * len(doc)

        # Заполняем BIO сразу
        for ent in doc.ents:
            if not ent.label_ or ent.label_ == "O":
                continue
            # Первый токен сущности — B-, остальные I-
            tags[ent.start] = "B-" + ent.label_
            for token in doc[ent.start + 1 : ent.end]:
                tags[token.i] = "I-" + ent.label_


        ents = [(token.idx, token.idx + len(token.text), tag) for token, tag in zip(doc, tags)]
        all_preds.append(ents)

    return all_preds

In [38]:
%%time
# Использование
# ---------------------------
texts = df_sub["sample"].astype(str).tolist()
df_sub["annotation"] = build_bio_predictions_fast(nlp, texts)

out_path = "/content/submission_new.csv"
df_sub.to_csv(out_path, index=False, sep=";", encoding="utf-8", quoting=csv.QUOTE_ALL)
print("Сохранено:", out_path)


Сохранено: /content/submission_new.csv
CPU times: user 1.75 s, sys: 9.99 ms, total: 1.76 s
Wall time: 1.79 s


🧪 **Последняя отправка:**



└─ **F1: 0.7752**

**Другя пост-обработка, которая изначально была быстрее.**

**Это результат Bi-LSTM.**

In [None]:
df_sub["annotation"]

Unnamed: 0,annotation
0,"[(0, 5, B-TYPE), (6, 9, O), (10, 17, O)]"
1,"[(0, 4, B-TYPE), (5, 11, B-TYPE)]"
2,"[(0, 3, B-TYPE), (4, 14, B-TYPE), (15, 18, O),..."
3,"[(0, 6, B-TYPE)]"
4,"[(0, 7, B-TYPE)]"
...,...
4995,"[(0, 7, B-BRAND)]"
4996,"[(0, 10, B-TYPE), (11, 14, O), (15, 22, O)]"
4997,"[(0, 8, B-TYPE)]"
4998,"[(0, 4, B-TYPE)]"


In [None]:
out_path = "/content/submission_new.csv"
df_sub.to_csv(out_path, index=False, sep=";", encoding="utf-8", quoting=csv.QUOTE_ALL)
print("Сохранено:", out_path)

Сохранено: /content/submission_new.csv


**Аугментация**