Импорт библиотек

In [26]:
import pandas as pd
import numpy as np
import re
from transliterate import translit
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from pathlib import Path

Чтение файла (запрос | выделенный из запроса товар | выделенный из запроса бренд)

In [27]:
df = pd.read_csv('../datasets/ofd.csv')
df = df[['name', 'good', 'brand']]

mask = df['good'].fillna('').str.strip().str.lower() != 'товара нет'
df = df[mask].reset_index(drop=True)
df

Unnamed: 0,name,good,brand
0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax
1,87191 Бусы для елки шарики_87191,бусы,
2,Футболка Piazza Italia WR011446881,футболка,piazza italia
3,7) YI572-03X-ONE ЗАКОЛКА ДЛЯ ВОЛОС ДЛЯ ДЕВОЧКИ,заколка,
4,Одежда (вес) 1500,одежда,
...,...,...,...
34908,"500 считалок, загадок, скороговорок для детей ...",,
34909,"Макароны ""МакСтори "" перо рефленное 400г",макароны,макстори
34910,2496950 [М] Обувь зима TENDANCE 1067-3M,обувь,tendance
34911,ИНГАВИРИН КАПС 1.000000 X 754.00,капсулы,ингавирин


Выделение из запроса параметров volume и percent

In [28]:
volume_pattern = r'(\d+[,.]?\d*\s?(?:г|кг|мл|л|шт))'
percent_pattern = r'(\d+[,.]?\d*\s?(?:%|проц\.?|percent))'
df['volume'] = df['name'].str.extract(volume_pattern, flags=re.IGNORECASE)[0]
df['percent'] = df['name'].str.extract(percent_pattern, flags=re.IGNORECASE)[0]
df = df[(~df['volume'].isna()) | (~df['percent'].isna())].reset_index(drop = True)
df = df.fillna('')
df

Unnamed: 0,name,good,brand,volume,percent
0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax,2шт,
1,РУСАЛОЧКА Губка ТРИО трехслой.3 шт ЛЮКС,губка,русалочка,3 шт,
2,Квас Очаковский Пряная зима 2л,квас,очаковский,2л,
3,Наконечник вилочный изолированный НВИ 2-6 (50ш...,наконечник,ekf,50шт,
4,ЧокоПай Дарк какао 6шт.Орион,чокопай,орион,6шт,
...,...,...,...,...,...
16271,Контейнер д/прод 1.2л GR1855,контейнер,,1.2л,
16272,Фасоль Нота спаржевая 10шт Гавриш,фасоль,гавриш,10шт,
16273,Глорикс 1л. сред. д/пола - Свежесть Атлантики ...,средство для пола,glorix,1л,
16274,"Макароны ""МакСтори "" перо рефленное 400г",макароны,макстори,400г,


Создание нового запроса

In [29]:
import random

def shuffle_row(row):
    first = random.choice(['brand', 'good'])
    second = 'good' if first == 'brand' else 'brand'
    if row[first] != '':
        values = [str(row[first])]
        rest = [str(row[second]), str(row['percent']), str(row['volume'])]
        random.shuffle(rest)
        values += rest
    else:
        values = [str(row[second])]
        rest = [str(row['percent']), str(row['volume'])]
        random.shuffle(rest)
        values += rest
    return re.sub(r'\s+', ' ', ' '.join(values)).strip()

df['query'] = df.apply(shuffle_row, axis=1)
df


Unnamed: 0,name,good,brand,volume,percent,query
0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax,2шт,,бантик petmax 2шт
1,РУСАЛОЧКА Губка ТРИО трехслой.3 шт ЛЮКС,губка,русалочка,3 шт,,губка русалочка 3 шт
2,Квас Очаковский Пряная зима 2л,квас,очаковский,2л,,квас очаковский 2л
3,Наконечник вилочный изолированный НВИ 2-6 (50ш...,наконечник,ekf,50шт,,ekf наконечник 50шт
4,ЧокоПай Дарк какао 6шт.Орион,чокопай,орион,6шт,,чокопай орион 6шт
...,...,...,...,...,...,...
16271,Контейнер д/прод 1.2л GR1855,контейнер,,1.2л,,контейнер 1.2л
16272,Фасоль Нота спаржевая 10шт Гавриш,фасоль,гавриш,10шт,,фасоль гавриш 10шт
16273,Глорикс 1л. сред. д/пола - Свежесть Атлантики ...,средство для пола,glorix,1л,,glorix 1л средство для пола
16274,"Макароны ""МакСтори "" перо рефленное 400г",макароны,макстори,400г,,макстори макароны 400г


In [30]:
df.iloc[14]

name       Вода минеральная Шалбуз 5л
good                             вода
brand                          шалбуз
volume                             5л
percent                              
query                  вода 5л шалбуз
Name: 14, dtype: object

Функция для разметки BIO

In [31]:
def get_word_spans(text):
    # Возвращает список кортежей (start, end, word) по всем словам в тексте
    spans = []
    for m in re.finditer(r'\S+', text):
        spans.append((m.start(), m.end(), m.group()))
    return spans

def bio_words(text, good=None, percent=None, volume=None, brand=None):
    entities = [
        ('TYPE', good),
        ('PERCENT', percent),
        ('VOLUME', volume),
        ('BRAND', brand)
    ]
    lower_text = text.lower()
    entity_spans = []
    # Для каждой сущности ищем все вхождения в тексте (если значение задано)
    for entity, value in entities:
        if value:
            value = value.strip().lower()
            for m in re.finditer(re.escape(value), lower_text):
                entity_spans.append((m.start(), m.end(), entity))
    # Если сущности перекрывают друг друга, оставляем ту, что раньше (или по длине)
    entity_spans.sort(key=lambda x: (x[0], -(x[1]-x[0])))  # сначала по позиции, потом по длине обратной
    chosen = []
    used = [False] * len(lower_text)
    for st, en, entity in entity_spans:
        if not any(used[st:en]):  # если этот кусок ещё не размечен
            chosen.append((st, en, entity))
            for i in range(st, en):
                used[i] = True
    # BIO разметка по словам
    word_spans = get_word_spans(text)
    word_tags = ['O'] * len(word_spans)
    for st, en, entity in chosen:
        words_in_entity = []
        for idx, (w_st, w_en, _) in enumerate(word_spans):
            if w_en <= st:
                continue
            if w_st >= en:
                break
            words_in_entity.append(idx)
        if words_in_entity:
            word_tags[words_in_entity[0]] = f'B-{entity}'
            for idx in words_in_entity[1:]:
                word_tags[idx] = f'I-{entity}'
    result = []
    for idx, tag in enumerate(word_tags):
        if tag != 'O':
            w_st, w_en, _ = word_spans[idx]
            result.append((w_st, w_en, tag))
    return result

In [32]:
df['markup'] = df.apply(lambda row: bio_words(row['query'], row['good'], row['percent'], row['volume'], row['brand']), axis=1)
df = df[df['query'].notna() & df['query'].astype(str).str.strip().ne('')]

df

Unnamed: 0,name,good,brand,volume,percent,query,markup
0,Petmax Бантик леопард с красн розой 2шт,бантик,petmax,2шт,,бантик petmax 2шт,"[(0, 6, B-TYPE), (7, 13, B-BRAND), (14, 17, B-..."
1,РУСАЛОЧКА Губка ТРИО трехслой.3 шт ЛЮКС,губка,русалочка,3 шт,,губка русалочка 3 шт,"[(0, 5, B-TYPE), (6, 15, B-BRAND), (16, 17, B-..."
2,Квас Очаковский Пряная зима 2л,квас,очаковский,2л,,квас очаковский 2л,"[(0, 4, B-TYPE), (5, 15, B-BRAND), (16, 18, B-..."
3,Наконечник вилочный изолированный НВИ 2-6 (50ш...,наконечник,ekf,50шт,,ekf наконечник 50шт,"[(0, 3, B-BRAND), (4, 14, B-TYPE), (15, 19, B-..."
4,ЧокоПай Дарк какао 6шт.Орион,чокопай,орион,6шт,,чокопай орион 6шт,"[(0, 7, B-TYPE), (8, 13, B-BRAND), (14, 17, B-..."
...,...,...,...,...,...,...,...
16271,Контейнер д/прод 1.2л GR1855,контейнер,,1.2л,,контейнер 1.2л,"[(0, 9, B-TYPE), (10, 14, B-VOLUME)]"
16272,Фасоль Нота спаржевая 10шт Гавриш,фасоль,гавриш,10шт,,фасоль гавриш 10шт,"[(0, 6, B-TYPE), (7, 13, B-BRAND), (14, 18, B-..."
16273,Глорикс 1л. сред. д/пола - Свежесть Атлантики ...,средство для пола,glorix,1л,,glorix 1л средство для пола,"[(0, 6, B-BRAND), (7, 9, B-VOLUME), (10, 18, B..."
16274,"Макароны ""МакСтори "" перо рефленное 400г",макароны,макстори,400г,,макстори макароны 400г,"[(0, 8, B-BRAND), (9, 17, B-TYPE), (18, 22, B-..."


Экспорт в csv/xlsx

In [None]:
# df[['query', 'markup']].to_excel('../datasets/ofd.xlsx', index=False)
# df[['good', 'brand', 'percent', 'volume', 'query', 'markup']].to_excel('../datasets/ofd_adv.xlsx', index=False)
df[['query', 'markup']].to_csv('../datasets/ofd_marked.csv', encoding="utf-8-sig", index=False)
# df[['good', 'brand', 'percent', 'volume', 'query', 'markup']].to_csv('../datasets/ofd_adv.csv', encoding="utf-8-sig", index=False)

In [34]:
df_aug = df[['query', 'markup']].copy()

SEPARABLE_CHARS = {'%'}

def _insert_spaces(text: str):
    new_chars = []
    index_map = [-1] * len(text)
    for idx, ch in enumerate(text):
        if ch in SEPARABLE_CHARS:
            if new_chars and not new_chars[-1].isspace():
                new_chars.append(' ')
            index_map[idx] = len(new_chars)
            new_chars.append(ch)
            if idx + 1 < len(text) and not text[idx + 1].isspace():
                new_chars.append(' ')
        else:
            index_map[idx] = len(new_chars)
            new_chars.append(ch)
    return ''.join(new_chars), index_map

def _spans_to_char_labels(text: str, spans):
    labels = ['O'] * len(text)
    for start, end, label in spans or []:
        if not isinstance(start, int) or not isinstance(end, int):
            continue
        if start < 0 or end > len(text) or start >= end:
            continue
        if not label:
            continue
        if '-' in label:
            prefix, entity = label.split('-', 1)
        else:
            prefix, entity = 'B', label
        for pos in range(start, end):
            if pos == start:
                if prefix == 'I':
                    labels[pos] = f'I-{entity}'
                else:
                    labels[pos] = f'B-{entity}'
            else:
                labels[pos] = f'I-{entity}'
    return labels

def _char_labels_to_spans(text: str, labels):
    spans = []
    for start, end, _ in get_word_spans(text):
        tag = next((labels[pos] for pos in range(start, end) if labels[pos] != 'O'), 'O')
        if tag != 'O':
            spans.append((start, end, tag))
    return spans

def normalize_query_and_markup(text, spans):
    if not isinstance(text, str) or not text:
        return text, spans
    new_text, index_map = _insert_spaces(text)
    if new_text == text:
        return text, spans
    if not isinstance(spans, (list, tuple)):
        return new_text, spans
    spans_list = [tuple(item) for item in spans]
    char_labels_old = _spans_to_char_labels(text, spans_list)
    char_labels_new = ['O'] * len(new_text)
    for old_idx, new_idx in enumerate(index_map):
        if new_idx >= 0:
            char_labels_new[new_idx] = char_labels_old[old_idx]
    new_spans = _char_labels_to_spans(new_text, char_labels_new)
    return new_text, new_spans

def _ensure_tuple_markup(markup):
    if isinstance(markup, list):
        return [tuple(item) for item in markup]
    if isinstance(markup, tuple):
        return [markup]
    return markup

augmented_rows = []
for _, row in df_aug.iterrows():
    original_text = row['query']
    original_markup = _ensure_tuple_markup(row['markup'])
    augmented_rows.append((original_text, original_markup))
    normalized_text, normalized_markup = normalize_query_and_markup(original_text, row['markup'])
    if normalized_text != original_text or normalized_markup != row['markup']:
        normalized_markup = _ensure_tuple_markup(normalized_markup)
        augmented_rows.append((normalized_text, normalized_markup))

df_aug = pd.DataFrame(augmented_rows, columns=['query', 'markup'])



In [35]:
df_aug

Unnamed: 0,query,markup
0,бантик petmax 2шт,"[(0, 6, B-TYPE), (7, 13, B-BRAND), (14, 17, B-..."
1,губка русалочка 3 шт,"[(0, 5, B-TYPE), (6, 15, B-BRAND), (16, 17, B-..."
2,квас очаковский 2л,"[(0, 4, B-TYPE), (5, 15, B-BRAND), (16, 18, B-..."
3,ekf наконечник 50шт,"[(0, 3, B-BRAND), (4, 14, B-TYPE), (15, 19, B-..."
4,чокопай орион 6шт,"[(0, 7, B-TYPE), (8, 13, B-BRAND), (14, 17, B-..."
...,...,...
18271,контейнер 1.2л,"[(0, 9, B-TYPE), (10, 14, B-VOLUME)]"
18272,фасоль гавриш 10шт,"[(0, 6, B-TYPE), (7, 13, B-BRAND), (14, 18, B-..."
18273,glorix 1л средство для пола,"[(0, 6, B-BRAND), (7, 9, B-VOLUME), (10, 18, B..."
18274,макстори макароны 400г,"[(0, 8, B-BRAND), (9, 17, B-TYPE), (18, 22, B-..."


In [None]:
df_aug.to_csv('../datasets/ofd_marked_aug.csv', encoding="utf-8-sig", index=False)

Аугментация от Степы

In [13]:
# Russian ЙЦУКЕН keyboard layout
RU_LAYOUT = "йцукенгшщзхъфывапролджэячсмитьбю.ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬБЮ,"

# Ukrainian keyboard layout (similar but different)
UA_LAYOUT = "йцукенгшщзхїфівапролджєячсмитьбю.ЙЦУКЕНГШЩЗХЇФІВАПРОЛДЖЄЯЧСМИТЬБЮ,"

In [14]:
BRAND_TAGS = ["B-BRAND", "I-BRAND"]
TYPE_TAGS = ["B-TYPE", "I-TYPE"]
VOLUME_TAGS = ["B-VOLUME", "I-VOLUME"]
PERSENT_TAGS = ["B-PERCENT", "I-PERCENT"]

In [15]:
def has_special(text):
    pattern = r'[!@№#$%^&*()_+\-=\[\]{};\':"\\|,.<>/?]'

    if re.search(pattern, text):
        return True
    return False

In [16]:
def map_by_keyboard_position(text, from_layout, to_layout):
    """Map characters based on keyboard position"""
    translation_table = str.maketrans(from_layout, to_layout)
    return text.translate(translation_table)

In [17]:
def ru_to_ua(text):
    return map_by_keyboard_position(text, RU_LAYOUT, UA_LAYOUT)

def ua_to_ru(text):
    return map_by_keyboard_position(text, UA_LAYOUT, RU_LAYOUT)

In [18]:
def split_by_tag(text, annotations):
    """Split text into substrings according to tags"""
    segments = []

    for start, end, label in annotations:
        segments.append(text[start:end])
        last_end = end

    return segments

In [19]:
def crop(text, tags, crop_p):
    spl = split_by_tag(text, tags)
    probs = np.random.uniform(0, 1, len(spl))

    common_shift = 0
    new_tags = []

    for i, s in enumerate(spl):

        begin = tags[i][0] - common_shift

        label = tags[i][2]
        forbidden = (label in VOLUME_TAGS) or (label in PERSENT_TAGS)

        if probs[i] < crop_p and not forbidden:
            to_del = int(np.random.uniform(.5, 1)*len(s))
            concat = s[:to_del]

            common_shift += len(s) - to_del
        else:
            concat = s

        if i > 0:
            croped = f"{croped} {concat}"

        else:
            croped = concat

        end = tags[i][1] - common_shift
        new_tags.append((begin, end, tags[i][2]))


    return croped, new_tags

In [20]:
def shoot(text, tags, shoot_p):
    spl = split_by_tag(text, tags)
    probs = np.random.uniform(0, 1, len(spl))

    common_shift = 0
    new_tags = []

    for i, s in enumerate(spl):
        
        begin = tags[i][0] - common_shift

        label = tags[i][2]
        forbidden = (label in VOLUME_TAGS) or (label in PERSENT_TAGS)

        if probs[i] < shoot_p and not forbidden:
            to_del = int(np.random.uniform(0, 1)*len(s))
            concat = f"{s[:to_del]}{s[to_del+1:]}"

            common_shift += 1
        else:
            concat = s

        if i > 0:
            croped = f"{croped} {concat}"
        else:
            croped = concat

        end = tags[i][1] - common_shift
        new_tags.append((begin, end, tags[i][2]))

    return croped, new_tags

In [21]:
def char_augment(sample, n = 1, 
                 ru_typo_p = .1,
                 en_typo_p = .1,
                 swap_p = .1, 
                 crop_p = .1, 
                 shoot_p = .1):

    which = np.random.choice(range(4), size = n)

    text, tags = sample
    augmented_tags = []
    augmented_text = []

    for i in range(n):
        # Keyboard typo
        if which[i] == 0:

            augmented_text.append(f"{text} 1")
            augmented_tags.append(tags)

            while len(augmented_text[i]) > len(text) or \
                    len(augmented_text[i].split("!")) > len(text.split("!")):

                ua_text = ru_to_ua(text)

                aug = nac.keyboard.KeyboardAug(
                                               lang = "uk",
                                               aug_char_min = 0, 
                                               aug_char_p = ru_typo_p,
                                               aug_char_max = None, 
                                               include_upper_case = False,
                                               include_special_char = True,
                                                tokenizer = str.split
                                               )
                augmented_text[i] = aug.augment(ua_text, n = 1)[0]


                augmented_text[i] = ua_to_ru(augmented_text[i])



                aug = nac.keyboard.KeyboardAug(
                                               lang = "en",
                                               aug_char_min = 0, 
                                               aug_char_p = en_typo_p,
                                               aug_char_max = None, 
                                               include_upper_case = False,
                                               include_special_char = False,
                                                tokenizer = str.split
                                               )
                augmented_text[i] = aug.augment(augmented_text[i], n = 1)[0]


                # preserve volume and persents from the original query
                for begin, end, label in augmented_tags[i]:
                    if label in PERSENT_TAGS or label in VOLUME_TAGS:
                        #augmented_text[i][begin:end] = text[begin:end]
                        augmented_text[i] = f"{augmented_text[i][:begin]}{text[begin:end]}{augmented_text[i][end+1:]}"

            #print(which[i], augmented_text[i])



                



        # Letters swap typo
        
        if which[i] == 1:
            aug = nac.random.RandomCharAug(action="swap", 
                                           aug_char_min = 0,
                                           aug_char_p = swap_p,
                                           aug_char_max = None,
                                           swap_mode = "adjacent",
                                           include_upper_case = False)

            augmented_text.append(aug.augment(text, n = 1)[0])
            augmented_tags.append(tags)

        # Skipping the end of a word typo
        if which[i] == 2:
            atext, atags = crop(text, 
                                tags,
                                crop_p)                                                 
            augmented_text.append(atext)
            augmented_tags.append(atags)


        # Missing single letter from a word typo
        if which[i] == 3:

            atext, atags = shoot(text, 
                                 tags,
                                 shoot_p)

            augmented_text.append(atext)
            augmented_tags.append(atags)




    return augmented_text, augmented_tags

In [22]:
def mix_augment(sample):
    text, tags = sample

    if len(tags) < 2:
        return [], []

    s_type = ""
    s_brand = ""
    s_volume = ""

    t_type = []
    t_brand = []
    t_volume = []

    aug_text = []
    aug_tags = []

    for begin, end, label in tags:

        if label in TYPE_TAGS or label == "O":
            s_type = f"{s_type} {text[begin:end]}"
            t_type.append((end-begin, label))

        elif label in BRAND_TAGS:
            s_brand = f"{s_brand} {text[begin:end]}"
            t_brand.append((end-begin, label))

        elif label in VOLUME_TAGS or label in PERSENT_TAGS:
            s_volume = f"{s_volume} {text[begin:end]}"
            t_volume.append((end-begin, label))

    if s_type != "":
        s_type = s_type[1:]
    if s_brand != "":
        s_brand = s_brand[1:]
    if s_volume != "":
        s_volume = s_volume[1:]


    a = (s_type == "")
    b = (s_brand == "")
    c = (s_volume == "")

    # if any two striggs empty at the same time
    # return nothing
    cond = (~a & b & c) | (a & ~b & c) | (a & b & ~c) 
    if cond:
        return [], []


    def tag_string(tags):
        tag = []
        pos = 0
        for t in tags:
            tag.append((pos, pos + t[0], t[1]))
            pos = pos + t[0] + 1

        return tag

    aug_text.append(f"{s_type} {s_brand}")
    aug_tags.append(tag_string(t_type + t_brand))

    aug_text.append(f"{s_brand} {s_type}")
    aug_tags.append(tag_string(t_brand + t_type))


    if not (s_volume == ""):
        aug_text.append(f"{s_type} {s_brand} {s_volume}")
        aug_tags.append(tag_string(t_type + t_brand + t_volume))

        aug_text.append(f"{s_type} {s_volume} {s_brand}")
        aug_tags.append(tag_string(t_type + t_volume + t_brand))

        aug_text.append(f"{s_brand} {s_type} {s_volume}")
        aug_tags.append(tag_string(t_brand + t_type + t_volume))

        aug_text.append(f"{s_type} {s_volume}")
        aug_tags.append(tag_string(t_type + t_volume))

    # find and delete original string from the list
    for i in range(len(aug_text)):
        if aug_text[i] == text:
            break

    aug_text.pop(i)
    aug_tags.pop(i)

    return aug_text, aug_tags

Опечатки

In [23]:
df.iloc[11]

name                Торт Абрикосовая мечта 600г Наш Кондитер
good                                                    торт
brand                                           наш кондитер
volume                                                  600г
percent                                                     
query                                 торт наш кондитер 600г
markup     [(0, 4, B-TYPE), (5, 8, B-BRAND), (9, 17, I-BR...
Name: 11, dtype: object

In [36]:

df_test = df[['query', 'markup']]

for _, row in df_test.iterrows():
    # print(row['query'])
    orig_row = pd.DataFrame({'query': [row['query']], 'markup': [row['markup']]})
    df_aug = pd.concat([df_aug, orig_row], ignore_index=True) 
    augmented_texts, augmented_tags = char_augment(row) #если нужно несколько строк с каждого оргинала, ввести n=...
    df_temp = pd.DataFrame({'query': augmented_texts, 'markup': augmented_tags})
    df_aug = pd.concat([df_aug, df_temp], ignore_index=True)
    if _ % 50 == 0:
        print(f"Processed {_} rows")

Processed 0 rows
Processed 50 rows
Processed 100 rows
Processed 150 rows
Processed 200 rows
Processed 250 rows
Processed 300 rows
Processed 350 rows
Processed 400 rows
Processed 450 rows
Processed 500 rows
Processed 550 rows
Processed 600 rows
Processed 650 rows
Processed 700 rows
Processed 750 rows
Processed 800 rows
Processed 850 rows
Processed 900 rows
Processed 950 rows
Processed 1000 rows
Processed 1050 rows
Processed 1100 rows
Processed 1150 rows
Processed 1200 rows
Processed 1250 rows
Processed 1300 rows
Processed 1350 rows
Processed 1400 rows
Processed 1450 rows
Processed 1500 rows
Processed 1550 rows
Processed 1600 rows
Processed 1650 rows
Processed 1700 rows
Processed 1750 rows
Processed 1800 rows
Processed 1850 rows
Processed 1900 rows
Processed 1950 rows
Processed 2000 rows
Processed 2050 rows
Processed 2100 rows
Processed 2150 rows
Processed 2200 rows
Processed 2250 rows
Processed 2300 rows
Processed 2350 rows
Processed 2400 rows
Processed 2450 rows
Processed 2500 rows
Pro

In [45]:
print(df_aug.iloc[-2]['markup'])
print(df_aug.iloc[-2]['query'])

df_aug.tail(20)

[(0, 7, 'B-BRAND'), (8, 12, 'B-VOLUME'), (13, 20, 'B-TYPE')]
crafers 180г печенье


Unnamed: 0,query,markup
50808,wolf 20л,"[(0, 4, B-BRAND), (5, 8, B-VOLUME)]"
50809,wolf 20л,"[(0, 4, B-BRAND), (5, 8, B-VOLUME)]"
50810,россиянка 1Л масло,"[(0, 9, B-BRAND), (10, 12, B-VOLUME), (13, 18,..."
50811,россиянка 1Л масло,"[(0, 9, B-BRAND), (10, 12, B-VOLUME), (13, 18,..."
50812,сосиски 0.8КГ,"[(0, 7, B-TYPE), (8, 13, B-VOLUME)]"
50813,сосикси 0. 8КГ,"[(0, 7, B-TYPE), (8, 13, B-VOLUME)]"
50814,йогурт активиа 260г,"[(0, 6, B-TYPE), (7, 14, B-BRAND), (15, 19, B-..."
50815,йогурт активиа 260г,"[(0, 6, B-TYPE), (7, 14, B-BRAND), (15, 19, B-..."
50816,пакет 80шт,"[(0, 5, B-TYPE), (6, 10, B-VOLUME)]"
50817,пкает 80шт,"[(0, 5, B-TYPE), (6, 10, B-VOLUME)]"


Перемешка слов

In [25]:
# df_mix = pd.DataFrame()
# df_test = df[['query', 'markup']]
# for _, row in df_test.iterrows():
#     orig_row = pd.DataFrame({'query': [row['query']], 'markup': [row['markup']]})
#     df_mix = pd.concat([df_mix, orig_row], ignore_index=True) 
#     augmented_texts, augmented_tags = mix_augment(row)
#     df_temp = pd.DataFrame({'query': augmented_texts, 'markup': augmented_tags})
#     df_mix = pd.concat([df_mix, df_temp], ignore_index=True)
#     if _ % 50 == 0:
#         print(f"Processed {_} rows")


In [27]:
# df_mix

Перемешка слов (к df с опечатками)

In [28]:
# df_aug_mix = pd.DataFrame()
# for _, row in df_aug.iterrows():
#     orig_row = pd.DataFrame({'query': [row['query']], 'markup': [row['markup']]})
#     df_aug_mix = pd.concat([df_aug_mix, orig_row], ignore_index=True) 
#     augmented_texts, augmented_tags = mix_augment(row)
#     df_temp = pd.DataFrame({'query': augmented_texts, 'markup': augmented_tags})
#     df_aug_mix = pd.concat([df_aug_mix, df_temp], ignore_index=True)
#     if _ % 1000 == 0:
#         print(f"Processed {_} rows")


In [30]:
# df_aug_mix