# Преобразование текста

In [None]:
import pandas as pd
import os
import numpy as np
import json
import zipfile
import glob
import os
from tqdm import tqdm
import re
import torch
from transformers import (
    AutoTokenizer,
)
from collections import defaultdict

In [None]:
# Switch into the nested dataset directory. The path is assembled with
# os.path.join so it resolves on POSIX systems as well as on Windows
# (a hard-coded backslash is only a separator on Windows).
os.chdir(os.path.join("avitotech_data", "avitotech_data"))

In [None]:
# Load every parquet shard of the training split, in shard order.
df_train_1, df_train_2, df_train_3, df_train_4 = map(
    pd.read_parquet,
    (
        "train_part_0001.snappy.parquet",
        "train_part_0002.snappy.parquet",
        "train_part_0003.snappy.parquet",
        "train_part_0004.snappy.parquet",
    ),
)

# Load both parquet shards of the test split.
df_test_1, df_test_2 = map(
    pd.read_parquet,
    (
        "test_part_0001.snappy.parquet",
        "test_part_0002.snappy.parquet",
    ),
)

In [None]:
# Stack the shards of each split into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every shard contributes its own
# index labels and the combined frame ends up with duplicated labels.
df_train = pd.concat(
    [df_train_1, df_train_2, df_train_3, df_train_4],
    ignore_index=True,
)

df_test = pd.concat([df_test_1, df_test_2], ignore_index=True)

In [None]:
# Preview the first three rows of the combined training frame.
df_train.head(3)

In [None]:
# Show the column names available in the training frame.
df_train.columns

In [None]:
def clean_text(text: str) -> str:
    """Normalise a raw text field for tokenisation.

    The text is lower-cased, HTML tags and punctuation are replaced by spaces,
    and runs of whitespace are collapsed to a single space.

    Args:
        text: The raw value. Anything that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Replace tags with a space rather than nothing, so that words separated
    # only by a tag ("a<br>b") are not glued together into one token.
    text = re.sub(r'<[^>]+>', ' ', text)       # strip HTML tags
    text = re.sub(r'[^\w\s]', ' ', text)       # strip punctuation
    text = re.sub(r'\s+', ' ', text)           # collapse repeated whitespace
    return text.strip()

In [None]:
def clean_product_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose known text columns went through ``clean_text``.

    Only the fixed list of base/cand text columns below is processed; a listed
    column that is absent from ``df`` is skipped. The input frame itself is
    not modified.
    """
    text_columns = [
        'base_title', 'cand_title',
        'base_description', 'cand_description',
        'base_category_name', 'cand_category_name',
        'base_subcategory_name', 'cand_subcategory_name',
        'base_param1', 'cand_param1',
        'base_param2', 'cand_param2',
    ]

    cleaned = df.copy()
    # The progress bar runs over the whole list, including absent columns.
    for name in tqdm(text_columns):
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].apply(clean_text)

    return cleaned

In [None]:
# Normalise the text columns of both splits. Each name is rebound to the
# cleaned copy returned by clean_product_data, one split at a time.
df_train = clean_product_data(df_train)
df_test = clean_product_data(df_test)

In [None]:
def trim_to_tokens(text, max_tokens):
    """Cut ``text`` down to at most ``max_tokens`` tokens.

    The text is split with the module-level ``tokenizer``, the first
    ``max_tokens`` tokens are kept and converted back to a string, and
    surrounding whitespace is stripped from the result.
    """
    kept_tokens = tokenizer.tokenize(text)[:max_tokens]
    trimmed = tokenizer.convert_tokens_to_string(kept_tokens)
    return trimmed.strip()

def _present_parts(values):
    """Return the string form of every value that is neither null nor blank."""
    parts = []
    for val in values:
        if pd.isnull(val):
            continue
        text = str(val)
        # Cleaned text columns carry '' for missing values, so an empty or
        # whitespace-only string must count as "missing" as well.
        if text.strip():
            parts.append(text)
    return parts


def concat_row(row):
    """Build the one-line text "card" for a single item.

    ``row`` is read by position: 0 is the title, 1 the description, 2 and 3
    the category and subcategory, 4 and 5 the two parameter fields. Any
    further columns are ignored.

    Each section is truncated with ``trim_to_tokens`` (64 tokens for the
    title, 128 for every other section). When both category fields, or both
    parameter fields, are missing or blank, that section reads "нет".

    Returns:
        A string of the form
        "товар: <title>. описание: <description>. категория: <...>. параметры: <...>."
    """
    # Title
    title = trim_to_tokens(str(row.iloc[0]) if pd.notnull(row.iloc[0]) else "", 64)

    # Description
    description = trim_to_tokens(str(row.iloc[1]) if pd.notnull(row.iloc[1]) else "", 128)

    # Category (3rd and 4th columns); blank fields are dropped so they do not
    # leave a dangling ", " and so the "нет" fallback can actually trigger.
    cat_parts = _present_parts([row.iloc[2], row.iloc[3]])
    cat_text = ", ".join(cat_parts) if cat_parts else "нет"
    cat_text = trim_to_tokens(cat_text, 128)

    # Parameters (5th and 6th columns), handled the same way as the category.
    param_parts = _present_parts([row.iloc[4], row.iloc[5]])
    param_text = ", ".join(param_parts) if param_parts else "нет"
    param_text = trim_to_tokens(param_text, 128)

    # Assemble the final string
    result = (
        f"товар: {title}."
        f" описание: {description}."
        f" категория: {cat_text}."
        f" параметры: {param_text}."
    )
    return result


In [None]:
# Keep only the two item ids and the text fields of each pair.
train_columns = [
    'base_item_id', 'cand_item_id',
    'base_title', 'cand_title',
    'base_description', 'cand_description',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]
df_train = df_train[train_columns]

In [None]:
# Split each training pair into one frame per side. The text columns come
# first and in this exact order because concat_row reads them by position;
# the item id is appended afterwards as the last column.
base_text_columns = [
    'base_title', 'base_description',
    'base_category_name', 'base_subcategory_name',
    'base_param1', 'base_param2',
]
cand_text_columns = [
    'cand_title', 'cand_description',
    'cand_category_name', 'cand_subcategory_name',
    'cand_param1', 'cand_param2',
]

df_train_text_base = df_train[base_text_columns].copy()
df_train_text_base['base_item_id'] = df_train['base_item_id'].copy()

df_train_text_cand = df_train[cand_text_columns].copy()
df_train_text_cand['cand_item_id'] = df_train['cand_item_id'].copy()

In [None]:
# Keep only the two item ids and the text fields of each test pair.
test_columns = [
    'base_item_id', 'cand_item_id',
    'base_title', 'cand_title',
    'base_description', 'cand_description',
    'base_category_name', 'cand_category_name',
    'base_subcategory_name', 'cand_subcategory_name',
    'base_param1', 'cand_param1',
    'base_param2', 'cand_param2',
]
df_test = df_test[test_columns]

In [None]:
# Split each test pair into one frame per side. The text columns come first
# and in this exact order because concat_row reads them by position; the
# item id is appended afterwards as the last column.
base_text_columns = [
    'base_title', 'base_description',
    'base_category_name', 'base_subcategory_name',
    'base_param1', 'base_param2',
]
cand_text_columns = [
    'cand_title', 'cand_description',
    'cand_category_name', 'cand_subcategory_name',
    'cand_param1', 'cand_param2',
]

df_test_text_base = df_test[base_text_columns].copy()
df_test_text_base['base_item_id'] = df_test['base_item_id'].copy()

df_test_text_cand = df_test[cand_text_columns].copy()
df_test_text_cand['cand_item_id'] = df_test['cand_item_id'].copy()

In [None]:
# Tokenizer used by trim_to_tokens to measure and cut text by token count.
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Register progress_apply on pandas objects, then build the card text for
# both sides of every training pair. The candidate frame stores its text
# under "base_all_together" too; the later stacking step selects that name.
tqdm.pandas()
for frame in (df_train_text_base, df_train_text_cand):
    frame["base_all_together"] = frame.progress_apply(concat_row, axis=1)

In [None]:
# Register progress_apply on pandas objects, then build the card text for
# both sides of every test pair. The candidate frame stores its text under
# "base_all_together" too; the later stacking step selects that name.
tqdm.pandas()
for frame in (df_test_text_base, df_test_text_cand):
    frame["base_all_together"] = frame.progress_apply(concat_row, axis=1)

In [None]:
# Stack base and candidate cards of the training split into one long frame
# with a shared "item_id" column and a fresh 0..n-1 index.
train_base_cards = df_train_text_base[['base_item_id', 'base_all_together']].rename(
    columns={'base_item_id': 'item_id'}
)
train_cand_cards = df_train_text_cand[['cand_item_id', 'base_all_together']].rename(
    columns={'cand_item_id': 'item_id'}
)
df_cards_train = pd.concat([train_base_cards, train_cand_cards], ignore_index=True)

# Same for the test split.
test_base_cards = df_test_text_base[['base_item_id', 'base_all_together']].rename(
    columns={'base_item_id': 'item_id'}
)
test_cand_cards = df_test_text_cand[['cand_item_id', 'base_all_together']].rename(
    columns={'cand_item_id': 'item_id'}
)
df_cards_test = pd.concat([test_base_cards, test_cand_cards], ignore_index=True)

In [None]:
# Give the card text column its final name in both splits.
text_column_rename = {'base_all_together': 'text'}
df_cards_train = df_cards_train.rename(columns=text_column_rename)
df_cards_test = df_cards_test.rename(columns=text_column_rename)

In [None]:
# Map every item_id to the text of its first occurrence in the stacked frame.
# The two columns are walked in lockstep with zip instead of looking each text
# up by label (series[idx]), which is slow per row and is only correct while
# the index is exactly 0..n-1.
cards_train = defaultdict(str)
for item_id, text in zip(df_cards_train['item_id'], df_cards_train['text']):
    # setdefault stores the text only for an unseen id, so later duplicates
    # of the same item do not overwrite the first one.
    cards_train.setdefault(item_id, text)

cards_test = defaultdict(str)
for item_id, text in zip(df_cards_test['item_id'], df_cards_test['text']):
    cards_test.setdefault(item_id, text)

In [None]:
# Persist both id -> text mappings as JSON. The encoding is stated explicitly
# so the files do not depend on the platform's default locale encoding.
# json.dump keeps its default ensure_ascii=True, so non-ASCII text is written
# as \uXXXX escapes and the files stay pure ASCII.
with open("cards_train.json", "w", encoding="utf-8") as file:
    json.dump(cards_train, file)

with open("cards_test.json", "w", encoding="utf-8") as file:
    json.dump(cards_test, file)