In [1]:
from deeppavlov import train_model, build_model 
from deeppavlov.core.commands.utils import parse_config

PROJECT_DIR = '..'
MODEL_NAME = 'model'


# dataset that the model was trained on
model_config = parse_config('ner_collection3_bert')

# dataset that the model was trained on
print(model_config['dataset_reader']['data_path'])

model_config['dataset_reader']['data_path'] = PROJECT_DIR + '/datasets/conll/'

del model_config['metadata']['download']


model_config['dataset_reader']['iobes'] = False
model_config['metadata']['variables']['MODEL_PATH'] = PROJECT_DIR + '/models/' + MODEL_NAME

model_config['chainer']['pipe'][1]['save_path'] = PROJECT_DIR + '/models/tag.dict'
model_config['chainer']['pipe'][1]['load_path'] = PROJECT_DIR + '/models/tag.dict'

model_config['chainer']['pipe'][2]['save_path'] = PROJECT_DIR + '/models/' + MODEL_NAME
model_config['chainer']['pipe'][2]['load_path'] = PROJECT_DIR + '/models/' + MODEL_NAME


model_config['train']['batch_size'] = 400

model_config['train']['log_every_n_batches'] = 10
model_config['train']['val_every_n_batches'] = 10


model_config['chainer']['pipe'][0]['in'] = ['x_tokens']
model_config['chainer']['pipe'].insert(0, {"id": "ws_tok", "class_name": "split_tokenizer", "in": ["x"], "out": ["x_tokens"]})

ner_model = build_model(model_config, download=False)

~/.deeppavlov/downloads/collection3/


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

In [11]:
ner_model2 = build_model(model_config, download=False)


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

In [8]:
import time


In [12]:
import time
import statistics as stats

def benchmark_model(model, sample, warmup=3, runs=20):
    # прогрев — избавляемся от накладных расходов первого вызова
    for _ in range(warmup):
        model(sample)

    timings = []
    for _ in range(runs):
        t0 = time.perf_counter()
        model(sample)
        timings.append(time.perf_counter() - t0)

    print(f"runs: {runs}, warmup: {warmup}")
    print(f"mean: {stats.mean(timings):.6f}s")
    print(f"median: {stats.median(timings):.6f}s")
    print(f"stdev: {stats.pstdev(timings):.6f}s")
    print(f"min: {min(timings):.6f}s, max: {max(timings):.6f}s")

# пример вызова
benchmark_model(ner_model, ['форма для выпечки'], 5, 1000)


runs: 1000, warmup: 5
mean: 0.004872s
median: 0.004767s
stdev: 0.000781s
min: 0.004451s, max: 0.025998s


In [13]:
benchmark_model(ner_model, ['форма для выпечки'], 0, 10000)

runs: 10000, warmup: 0
mean: 0.004721s
median: 0.004675s
stdev: 0.000292s
min: 0.004414s, max: 0.012190s


In [23]:
import time
import statistics as stats
from concurrent.futures import ThreadPoolExecutor, as_completed

def _call_once(model, sample):
    start = time.perf_counter()
    model(sample)
    end = time.perf_counter()
    return start, end

def benchmark_concurrent(model, sample, qps=100, warmup_batches=2,
                         measurement_batches=20, batch_gap=0.0):
    """Имитация нагрузки ~qps запросов/сек; sample — список токенов для модели."""
    # прогрев (батчи не учитываются в статистике)
    with ThreadPoolExecutor(max_workers=qps) as executor:
        for _ in range(warmup_batches):
            futures = [executor.submit(_call_once, model, sample) for _ in range(qps)]
            for future in as_completed(futures):
                future.result()

    timings = []
    with ThreadPoolExecutor(max_workers=qps) as executor:
        for _ in range(measurement_batches):
            batch_start = time.perf_counter()
            futures = [executor.submit(_call_once, model, sample) for _ in range(qps)]
            batch_durations = []
            for future in as_completed(futures):
                start, end = future.result()
                batch_durations.append(end - start)
                timings.append(end - start)

            batch_elapsed = time.perf_counter() - batch_start
            if batch_elapsed < batch_gap:
                time.sleep(batch_gap - batch_elapsed)

    print(f"batches: {measurement_batches}, concurrency: {qps}")
    print(f"mean: {stats.mean(timings):.6f}s")
    print(f"median: {stats.median(timings):.6f}s")
    print(f"stdev: {stats.pstdev(timings):.6f}s")
    print(f"min: {min(timings):.6f}s, max: {max(timings):.6f}s")
    timings_sorted = sorted(timings)
    p95 = timings_sorted[int(len(timings_sorted) * 0.95)]
    print(f"p95: {p95:.6f}s")

# пример вызова
benchmark_concurrent(ner_model, ['форма для выпечки'])


batches: 20, concurrency: 100
mean: 0.648246s
median: 0.637933s
stdev: 0.059266s
min: 0.005023s, max: 0.805383s
p95: 0.750717s


In [2]:
import re
from pathlib import Path

import pandas as pd

SUBMISSION_PATH = Path(PROJECT_DIR) / 'datasets' / 'submission_raw.csv'
OUTPUT_PATH = Path(PROJECT_DIR) / 'datasets' / 'submission.csv'

TOKEN_PATTERN = re.compile(r'\S+')

def normalize_tag(tag: str) -> str:
    if not isinstance(tag, str):
        return 'O'
    tag = tag.strip()
    if not tag:
        return 'O'
    upper_tag = tag.upper()
    # keep O as-is; no S/E mapping anymore
    if upper_tag == 'O':
        return 'O'
    # pass B-/I- through, preserving the type suffix
    if upper_tag.startswith('B-') or upper_tag.startswith('I-'):
        return tag
    # anything else is unexpected; return as-is
    return tag

def is_tag(value) -> bool:
    if not isinstance(value, str):
        return False
    candidate = value.strip().upper()
    if not candidate:
        return False
    if candidate == 'O':
        return True
    return (candidate == 'O') or (len(candidate) >= 3 and candidate[1] == '-' and candidate[0] in {'B','I'})

def looks_like_sequence(seq, predicate) -> bool:
    if not isinstance(seq, (list, tuple)) or not seq:
        return False
    return all(predicate(item) for item in seq)

def looks_like_tag_sequence(seq) -> bool:
    return looks_like_sequence(seq, is_tag)

def looks_like_token_sequence(seq) -> bool:
    return looks_like_sequence(seq, lambda item: isinstance(item, str) and not is_tag(item))

def extract_tokens_and_tags(prediction) -> tuple[list[str], list[str]]:
    if isinstance(prediction, tuple):
        prediction = list(prediction)
    if not isinstance(prediction, list):
        raise ValueError(f'Unexpected model output type: {type(prediction)}')
    tokens: list[str] = []
    tags: list[str] = []

    def traverse(node):
        nonlocal tokens, tags
        if isinstance(node, tuple):
            node = list(node)
        if isinstance(node, list):
            if not tokens and looks_like_token_sequence(node):
                tokens = [str(item) for item in node]
            if not tags and looks_like_tag_sequence(node):
                tags = [normalize_tag(str(item)) for item in node]
            if tokens and tags:
                return
            for child in node:
                traverse(child)

    traverse(prediction)
    if not tags:
        raise ValueError(f'Unable to extract tag sequence from model output: {prediction}')
    return tokens, tags

def compute_annotation(text: str, tokens: list[str], tags: list[str]) -> list[tuple[int, int, str]]:
    if not tags:
        return []
    if tokens:
        effective_len = min(len(tokens), len(tags))
        tokens = tokens[:effective_len]
        tags = tags[:effective_len]
    annotation: list[tuple[int, int, str]] = []
    if tokens:
        cursor = 0
        fallback = False
        for token, tag in zip(tokens, tags):
            token = token or ''
            if not tag:
                cursor += len(token)
                continue
            start = text.find(token, cursor)
            if start == -1:
                fallback = True
                break
            end = start + len(token)
            annotation.append((start, end, tag))
            cursor = end
        if fallback:
            annotation = []
    if not annotation:
        matches = list(TOKEN_PATTERN.finditer(text))
        effective_len = min(len(matches), len(tags))
        for match, tag in zip(matches[:effective_len], tags[:effective_len]):
            if not tag:
                continue
            annotation.append((match.start(), match.end(), tag))
    return annotation

submission_df = pd.read_csv(SUBMISSION_PATH, sep=';', encoding='utf-8')
annotations = []
for row_idx, sample in enumerate(submission_df['sample'], start=1):
    text = '' if pd.isna(sample) else str(sample)
    model_output = ner_model([text])
    tokens, tags = extract_tokens_and_tags(model_output)
    annotation = compute_annotation(text, tokens, tags)
    word_count = len(tokens) if tokens else len(TOKEN_PATTERN.findall(text))
    entity_count = len(annotation)
    if word_count != entity_count:
        print(f'Row {row_idx}: word count {word_count} != annotation entities {entity_count}')
    annotations.append(annotation)

submission_df['annotation'] = [str(ann) for ann in annotations]
submission_df[['sample', 'annotation']].to_csv(OUTPUT_PATH, sep=';', encoding='utf-8', index=False)
print(f'Saved predictions to {OUTPUT_PATH.resolve()}')


Saved predictions to C:\Users\lexan\OneDrive\Documents\hackaton_lct\datasets\submission.csv


In [12]:
import pandas as pd

submission = pd.read_csv('../datasets/submission_raw.csv', sep=';')
submission.loc[:, 'annotation'] = ""
submission.to_csv('../datasets/submission_empty.csv', sep=';')

In [None]:
'''
РќР°РїРёС€Рё РєРѕРґ, РєРѕС‚РѕСЂС‹Р№ РїСЂРѕР№РґРµС‚СЃСЏ РїРѕ РІСЃРµРј СЃС‚РѕР»Р±С†Р° 'sample' РІ submission, РїСЂРёРјРµРЅРёС‚ ner_model Рє РєР°Р¶РґРѕРјСѓ РёР· РЅРёС….
РќР° РІС‹С…РѕРґРµ РёР· ner_model РїРѕР»СѓС‡Р°РµС‚СЃСЏ СЃРїРёСЃРѕРє СЃС‚СЂРѕРє СЃ С‚РёРїР°РјРё СЃСѓС‰РЅРѕСЃС‚РµР№. РќСѓР¶РЅРѕ РїСЂРµРѕР±СЂР°Р·РѕРІР°С‚СЊ СЌС‚Рѕ РІ РЅРѕРІС‹Р№ С„РѕСЂРјР°С‚.
Р’Рѕ-РїРµСЂРІС‹С…, РЅСѓР¶РЅРѕ Р·Р°РјРµРЅРёС‚СЊ РІСЃРµ S- РЅР° B-, Р° РІСЃРµ E- РЅР° I-.
Р’Рѕ-РІС‚РѕСЂС‹С…, РЅСѓР¶РЅРѕ СЃРґРµР»Р°С‚СЊ Р°РЅРЅРѕС‚Р°С†РёСЋ С„РѕСЂРјР°С‚Р° [(РёРЅРґРµРєСЃ РЅР°С‡Р°Р»Р° СЃСѓС‰РЅРѕСЃС‚Рё, РёРЅРґРµРєСЃ РєРѕРЅС†Р° СЃСѓС‰РЅРѕСЃС‚Рё, СЃС‚СЂРѕРєР° С‚РёРїР°),] 
РќР°РїСЂРёРјРµСЂ, РґР»СЏ СЃС‚СЂРѕРєРё "Р№РѕРіСѓСЂС‚С‹ РїРёС‚СЊРµРІС‹" Р°РЅРЅРѕС‚Р°С†РёСЏ Р±СѓРґРµС‚ [(0, 7, 'B-TYPE'), (8, 15, 'I-TYPE')], РёРЅРґРµРєСЃС‹ РЅР°С‡Р°Р»Р° Рё РєРѕРЅС†Р° СЃСѓС‰РЅРѕСЃС‚Рё СЂР°Р±РѕС‚Р°СЋС‚ РєР°Рє СЃСЂРµР·С‹ РІ РїРёС‚РѕРЅРµ - РІРєР»СЋС‡РёС‚РµР»СЊРЅРѕ-РёСЃРєР»СЋС‡РёС‚РµР»СЊРЅРѕ.
РўРѕ РµСЃС‚СЊ, 0 РёРЅРґРµРєСЃ СЌС‚Рѕ Р±СѓРєРІР° "Р№", 7 РёРЅРґРµРєСЃ - СЌС‚Рѕ РїСЂРѕР±РµР» РїРѕСЃР»Рµ СЃР»РѕРІР° "Р№РѕРіСѓСЂС‚С‹", РЅРѕ РїСЂРѕР±РµР» РІ СЃР°РјСѓ СЃСѓС‰РЅРѕСЃС‚СЊ РЅРµ РІС…РѕРґРёС‚.
Р’ РЅРѕРІС‹Р№ С„Р°Р№Р» submission_final.csv РЅСѓР¶РЅРѕ РІ С‚РѕРј Р¶Рµ РїРѕСЂСЏРґРєРµ Р·Р°РїРёСЃС‹РІР°С‚СЊ СЃС‚РѕР»Р±С†С‹ sample Рё annotation (Р°РЅРЅРѕС‚Р°С†РёРё СѓР¶Рµ РІ РЅРѕРІРѕРј С„РѕСЂРјР°С‚Рµ)
'''

"\nРќР°РїРёС€Рё РєРѕРґ, РєРѕС‚РѕСЂС‹Р№ РїСЂРѕР№РґРµС‚СЃСЏ РїРѕ РІСЃРµРј СЃС‚РѕР»Р±С†Р° 'sample' РІ submission, РїСЂРёРјРµРЅРёС‚ ner_model Рє РєР°Р¶РґРѕРјСѓ РёР· РЅРёС….\nРќР° РІС‹С…РѕРґРµ РёР· ner_model РїРѕР»СѓС‡Р°РµС‚СЃСЏ СЃРїРёСЃРѕРє СЃС‚СЂРѕРє СЃ С‚РёРїР°РјРё СЃСѓС‰РЅРѕСЃС‚РµР№. РќСѓР¶РЅРѕ РїСЂРµРѕР±СЂР°Р·РѕРІР°С‚СЊ СЌС‚Рѕ РІ РЅРѕРІС‹Р№ С„РѕСЂРјР°С‚.\nР’Рѕ-РїРµСЂРІС‹С…, РЅСѓР¶РЅРѕ Р·Р°РјРµРЅРёС‚СЊ РІСЃРµ S- РЅР° B-, Р° РІСЃРµ E- РЅР° I-.\nР’Рѕ-РІС‚РѕСЂС‹С…, РЅСѓР¶РЅРѕ СЃРґРµР»Р°С‚СЊ Р°РЅРЅРѕС‚Р°С†РёСЋ С„РѕСЂРјР°С‚Р° [(РёРЅРґРµРєСЃ РЅР°С‡Р°Р»Р° СЃСѓС‰РЅРѕСЃС‚Рё, РёРЅРґРµРєСЃ РєРѕРЅС†Р° СЃСѓС‰РЅРѕСЃС‚Рё, СЃС‚СЂРѕРєР° С‚РёРїР°),] Р№РѕРіСѓСЂС‚С‹ РїРёС‚СЊРµРІС‹\t[(0, 7, 'B-TYPE'), (8, 15, 'I-TYPE')]\n"