In [1]:
import pandas as pd

# Load the verified FAQ training dataset from the Excel workbook; later cells
# read the `text`, `right_category` and `right_product` columns from it.
df = pd.read_excel('faq_channels_et_products_train_dataset_verified.xlsx')

In [2]:
# Spelling variants of the Smart QR category, all folded into one canonical label.
manual_map = {variant: 'SmartQR' for variant in ('Smart QR', 'smart qr')}

category_fixed = df['right_category'].replace(manual_map)
df['right_category'] = category_fixed

In [3]:
# Keep only the model input text and the two target label columns.
df = df.loc[:, ['text', 'right_category', 'right_product']]

In [4]:
# Keep the first row for every distinct text and discard later repeats.
df = df[~df.duplicated(subset='text', keep='first')]

In [5]:
# Exclude every row that belongs to the "Investitsii_v_bcc_kz" category.
# NOTE(review): this is an exact, case-sensitive match done before the labels
# are normalised further down — confirm no differently-cased variants exist.
df = df.loc[df["right_category"].ne("Investitsii_v_bcc_kz")]

In [6]:
def normalize_label(label):
    """Return a canonical form of ``label``.

    The value is converted to ``str``, surrounding whitespace is trimmed, the
    text is lower-cased and every remaining space becomes an underscore.
    """
    lowered = str(label).strip().lower()
    return "_".join(lowered.split(" "))

# Canonicalise both label columns with normalize_label.
# `df` comes from boolean filtering in an earlier cell, so plain item
# assignment (df[col] = ...) can raise SettingWithCopyWarning; `assign`
# builds a fresh frame instead. The former `.astype(str)` is dropped because
# normalize_label already applies str() to every value.
# Note: missing values end up as the literal label "nan".
df = df.assign(
    right_product=df["right_product"].map(normalize_label),
    right_category=df["right_category"].map(normalize_label),
)

In [7]:
# Make sure every text value is a string (non-string cells are cast with str).
# `assign` returns a new frame, which avoids the SettingWithCopyWarning that
# item assignment can raise on a frame obtained by boolean filtering.
df = df.assign(text=df["text"].astype(str))

# *Модели*

In [8]:
pip install onnx

Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m112.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.18.0


In [9]:
pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

In [None]:
import math
import os

import joblib
import pandas as pd
import torch
import torch.nn as nn
from onnxruntime.quantization import quantize_dynamic, QuantType
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [12]:
# Checkpoint to fine-tune and the root folder for everything this notebook saves.
MODEL_NAME = "intfloat/multilingual-e5-small"
BASE_SAVE_DIR = "saved_models"

# Run on the GPU when torch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Make sure the output folder exists
os.makedirs(BASE_SAVE_DIR, exist_ok=True)

In [13]:
# === Group rows by category ===
# One independent copy of the matching rows per distinct category value,
# keyed by that value in order of first appearance.
category_column = df["right_category"]
category_names = category_column.unique()
category_groups = {
    name: df.loc[category_column == name].copy()
    for name in category_names
}

In [16]:
# Load the tokenizer for MODEL_NAME once; the same instance is handed to every
# ProductDataset and is reused to build the dummy input for the ONNX export.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class ProductDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its class id.

    Args:
        texts: pandas Series of input texts; it is re-indexed to 0..n-1 and
            cast to ``str``.
        labels: pandas Series of class ids aligned with ``texts``; it is
            re-indexed the same way.
        tokenizer: callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")`` whose result
            exposes ``"input_ids"`` and ``"attention_mask"`` tensors with a
            leading batch axis of size 1.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts.reset_index(drop=True).astype(str)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for item ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each with the
            batch axis removed) and ``labels`` (0-d ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Remove only the leading batch axis. A dimension-less squeeze() would
        # also drop the sequence axis when max_len == 1 and return a 0-d tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [17]:
class E5ProductClassifier(nn.Module):
    """Pretrained transformer encoder followed by a single linear classification layer.

    Args:
        model_name: checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_classes: number of logits the linear head produces.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden_dim = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first token's final hidden state."""
        encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is used as the sentence representation.
        first_token_state = encoder_output.last_hidden_state[:, 0]
        logits = self.classifier(first_token_state)
        return logits

In [None]:
EPOCHS = 3
BATCH_SIZE = 16
MIN_CLASS_SIZE = 2    # a stratified split needs at least two rows per class
VAL_FRACTION = 0.15   # share of each category held out for validation
SEED = 42             # used for the train/val split and for torch's RNG


def _train_one_epoch(model, loader, optimizer, criterion, desc):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    total_loss = 0
    for batch in tqdm(loader, desc=desc):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def _predict(model, loader):
    """Return ``(predicted_ids, true_ids)`` collected over ``loader`` without gradients."""
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch["input_ids"].to(DEVICE), batch["attention_mask"].to(DEVICE))
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    return all_preds, all_labels


def _export_quantized_onnx(model, category_dir):
    """Export ``model`` to ONNX, quantize it dynamically and return the quantized file path.

    The model is moved to the CPU (in place) and put in eval mode for the
    export. The intermediate float ``model.onnx`` is deleted after quantization.
    """
    model_cpu = model.to("cpu").eval()
    dummy_inputs = tokenizer("Пример текста", return_tensors="pt", padding="max_length", truncation=True, max_length=256)
    onnx_path = os.path.join(category_dir, "model.onnx")

    torch.onnx.export(
        model_cpu,
        (dummy_inputs["input_ids"], dummy_inputs["attention_mask"]),
        onnx_path,
        input_names=["input_ids", "attention_mask"],
        output_names=["logits"],
        dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence_length"},
                      "attention_mask": {0: "batch_size", 1: "sequence_length"},
                      "logits": {0: "batch_size"}},
        opset_version=17
    )

    # Quantize the exported ONNX model
    quantized_onnx_path = os.path.join(category_dir, "model_quantized.onnx")
    quantize_dynamic(
        model_input=onnx_path,
        model_output=quantized_onnx_path,
        weight_type=QuantType.QInt8
    )

    # Remove the intermediate float model, only the quantized one is kept
    os.remove(onnx_path)
    return quantized_onnx_path


# Root directory for saved artefacts
os.makedirs(BASE_SAVE_DIR, exist_ok=True)

# Save the tokenizer once
tokenizer_save_dir = os.path.join(BASE_SAVE_DIR, "tokenizer")
if not os.path.exists(tokenizer_save_dir):
    tokenizer.save_pretrained(tokenizer_save_dir)
    print(f"Токенизатор сохранён в {tokenizer_save_dir}")

for category, sub_df in category_groups.items():
    category_dir = os.path.join(BASE_SAVE_DIR, category.replace(" ", "_"))
    # A finished quantized model marks the category as done, so re-runs resume.
    if os.path.exists(os.path.join(category_dir, "model_quantized.onnx")):
        print(f"Пропущена категория {category}, квантизированная модель уже существует.")
        continue

    print(f"\nОбучение модели для категории: {category} ({len(sub_df)} записей)")

    # Drop rare classes BEFORE fitting the encoder, so the encoder (and hence
    # the classifier head) only contains classes that have training rows.
    class_counts = sub_df["right_product"].value_counts()
    valid_products = class_counts[class_counts >= MIN_CLASS_SIZE].index
    sub_df = sub_df[sub_df["right_product"].isin(valid_products)].reset_index(drop=True)

    if sub_df["right_product"].nunique() < 2:
        print(f"Пропущена категория {category} — слишком мало уникальных классов после фильтрации.")
        continue

    # The directory and the encoder are written only for categories that are
    # actually trained, so skipped categories leave no orphan artefacts.
    os.makedirs(category_dir, exist_ok=True)

    # === LabelEncoder ===
    le = LabelEncoder()
    sub_df["product_label"] = le.fit_transform(sub_df["right_product"])
    joblib.dump(le, os.path.join(category_dir, "label_encoder.pkl"))
    n_classes = len(le.classes_)

    # Data preparation.
    # A stratified split needs at least one validation row per class. For
    # small categories ceil(VAL_FRACTION * n) can be smaller than that and
    # train_test_split would raise, so the validation size is raised to
    # n_classes when necessary. Whenever the fractional size is already large
    # enough the split is the same as with test_size=VAL_FRACTION.
    X = sub_df["text"]
    y = sub_df["product_label"]
    val_size = max(math.ceil(VAL_FRACTION * len(sub_df)), n_classes)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, stratify=y, random_state=SEED)
    train_ds = ProductDataset(X_train.reset_index(drop=True), pd.Series(y_train).reset_index(drop=True), tokenizer)
    val_ds = ProductDataset(X_val.reset_index(drop=True), pd.Series(y_val).reset_index(drop=True), tokenizer)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

    # Model. Seeding torch makes the head initialisation and the batch
    # shuffling repeatable across runs.
    torch.manual_seed(SEED)
    model = E5ProductClassifier(MODEL_NAME, n_classes).to(DEVICE)
    optimizer = Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        mean_loss = _train_one_epoch(model, train_loader, optimizer, criterion, f"{category} | Epoch {epoch+1}")
        print(f"Epoch {epoch+1} loss: {mean_loss:.4f}")

    # Evaluation: report only the classes that occur in the validation split.
    all_preds, all_labels = _predict(model, val_loader)
    real_labels = sorted(set(all_labels))
    target_names = [le.classes_[i] for i in real_labels]

    print("\n=== Classification Report (обычная модель) ===")
    print(classification_report(all_labels, all_preds, labels=real_labels, target_names=target_names, zero_division=0))

    # === ONNX export and quantization ===
    print(f"Квантизация модели для категории: {category}")
    quantized_onnx_path = _export_quantized_onnx(model, category_dir)
    print(f"ONNX-квантизированная модель сохранена в {quantized_onnx_path}")