In [5]:
import pandas as pd

new_df = pd.read_excel('edited_faq_dataset.xlsx')

In [6]:
def resolve_category(row):
    if str(row['verification']) == '1':
        return row['right_category']
    elif pd.isna(row['verification']) or str(row['verification']) == '0':
        return row['predicted_category']
    else:
        return row['verification']

In [7]:
new_df['right_category'] = new_df.apply(resolve_category, axis=1)

In [8]:
# 3. Удалим дубликаты, оставив первый из группы
new_df = new_df.drop_duplicates(subset=['text', 'right_category'], keep='first')

In [9]:
new_df = new_df[new_df["right_category"] != "Investitsii_v_bcc_kz"]

# *Модель*

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [12]:
# Удалим категории, у которых менее 2 примеров
category_counts = new_df['right_category'].value_counts()
valid_categories = category_counts[category_counts >= 2].index

new_df = new_df[new_df['right_category'].isin(valid_categories)].copy()

In [13]:
cat_encoder = LabelEncoder()
new_df["category_id"] = cat_encoder.fit_transform(new_df["right_category"])

texts = new_df["text"].tolist()
cat_labels = new_df["category_id"].tolist()

X_train, X_test, y_cat_train, y_cat_test = train_test_split(
    texts, cat_labels, test_size=0.2, random_state=42, stratify=cat_labels
)


In [14]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

MODEL_NAME = "intfloat/multilingual-e5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class CategoryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = CategoryDataset(X_train, y_cat_train, tokenizer, max_len=128)
test_dataset = CategoryDataset(X_test, y_cat_test, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

In [15]:
from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F

class E5CategoryClassifier(nn.Module):
    def __init__(self, model_name, num_classes):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        cls_emb = out.last_hidden_state[:, 0, :]
        return self.classifier(cls_emb)

model = E5CategoryClassifier(MODEL_NAME, len(cat_encoder.classes_)).to("cuda")


config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

In [18]:
from torch.optim import Adam
from tqdm import tqdm

optimizer = Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

EPOCHS = 6
device = "cuda"

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()


        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")


Epoch 1: 100%|██████████| 1059/1059 [04:00<00:00,  4.41it/s]


Epoch 1 Loss: 1.0516


Epoch 2: 100%|██████████| 1059/1059 [03:57<00:00,  4.46it/s]


Epoch 2 Loss: 0.4857


Epoch 3: 100%|██████████| 1059/1059 [03:57<00:00,  4.46it/s]


Epoch 3 Loss: 0.3123


Epoch 4: 100%|██████████| 1059/1059 [03:57<00:00,  4.46it/s]


Epoch 4 Loss: 0.2230


Epoch 5: 100%|██████████| 1059/1059 [03:57<00:00,  4.46it/s]


Epoch 5 Loss: 0.1650


Epoch 6: 100%|██████████| 1059/1059 [03:57<00:00,  4.45it/s]

Epoch 6 Loss: 0.1318





In [19]:
from sklearn.metrics import classification_report, f1_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Классический отчёт
print(classification_report(all_labels, all_preds, target_names=cat_encoder.classes_))

# F1-метрика (для однометочной классификации — average="macro" или "weighted")
f1_macro = f1_score(all_labels, all_preds, average="macro", zero_division=0)
f1_weighted = f1_score(all_labels, all_preds, average="weighted", zero_division=0)

              precision    recall  f1-score   support

  Avtokredit       0.99      0.96      0.97       357
      BCC_KZ       0.92      0.84      0.88       618
       Cards       0.96      0.98      0.97      2257
     Credits       0.87      0.84      0.85       415
    Deposits       0.99      0.99      0.99       991
     Ipoteka       0.98      0.99      0.98       994
       Other       0.97      0.92      0.94       581
         PKO       0.79      0.87      0.83       231
    Payments       0.76      0.83      0.80       138
     Savings       0.97      0.97      0.97       717
    Smart QR       0.57      0.96      0.72        24
   Transfers       0.98      0.98      0.98       982
   Zalogovoe       0.95      0.96      0.96       160

    accuracy                           0.95      8465
   macro avg       0.90      0.93      0.91      8465
weighted avg       0.95      0.95      0.95      8465



In [21]:
for_test_df = pd.read_csv('data-1749109894878.csv')

In [23]:
for_test_df = for_test_df[['subproduct_id', 'subproduct_name', 'product_name', 'question']]

In [24]:
for_test_df

Unnamed: 0,subproduct_id,subproduct_name,product_name,question
0,7,JuniorBank,Карты,#juniorcard балалар картасын толтыру лимиті қа...
1,56,Smart_QR,Smart QR,"Какие условия, чтобы получить Smart QR для физ..."
2,7,JuniorBank,Карты,Какие преимущества у карты #juniorcard?
3,56,Smart_QR,Smart QR,Условия займа Smart QR
4,0,BlackEdition,Карты,Какие типы валют поддерживает карта #blackedit...
...,...,...,...,...
3828,54,bccpay_2_0,Карты,Қандай бөлімшелерде цифрлык bccpay 2.0 (monoca...
3829,148,Payments_Common,Платежи,если я оплачиваю с Трэвел карты кэшбеки поступ...
3830,7,JuniorBank,Карты,#Juniorcard картасында сақтандыру бар ма
3831,148,Payments_Common,Платежи,Трэвел картасымен төлем жүргізгенде кэшбек түс...


In [25]:
for_test_df["product_name"].unique()

array(['Карты', 'Smart QR', 'Сбережения', 'Переводы', 'Кредиты',
       'Депозиты', 'Платежи', 'BСС.KZ', 'ПКО ФЛ'], dtype=object)

In [26]:
new_df['right_category'].unique()

array(['Cards', 'BCC_KZ', 'Credits', 'Payments', 'Ipoteka', 'Deposits',
       'Transfers', 'Other', 'Savings', 'Avtokredit', 'PKO', 'Zalogovoe',
       'Smart QR'], dtype=object)

In [27]:
category_mapping = {
    'Карты': 'Cards',
    'Smart QR': 'Smart QR',
    'Сбережения': 'Savings',
    'Переводы': 'Transfers',
    'Кредиты': 'Credits',
    'Депозиты': 'Deposits',
    'Платежи': 'Payments',
    'BСС.KZ': 'BCC_KZ',
    'ПКО ФЛ': 'PKO'
}


In [28]:
import pandas as pd
import torch

# Загрузи for_test_df перед этим
# for_test_df = pd.read_csv("path_to_test_data.csv")

def predict_category(texts, model, tokenizer, label_encoder, device="cuda"):
    model.eval()
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(inputs["input_ids"], inputs["attention_mask"])
        preds = torch.argmax(logits, dim=1).cpu().numpy()
    return label_encoder.inverse_transform(preds)

In [29]:

# Прогон по всем вопросам
questions = for_test_df["question"].tolist()
predicted_eng = predict_category(questions, model, tokenizer, cat_encoder)  # ← model и cat_encoder уже загружены

# Конвертация реального значения из `product_name` в английские названия
for_test_df["mapped_product"] = for_test_df["product_name"].map(category_mapping)

# Добавляем предсказание
for_test_df["predict_product"] = predicted_eng

# Сравниваем и ставим 1/0
for_test_df["verification"] = (for_test_df["mapped_product"] == for_test_df["predict_product"]).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_test_df["mapped_product"] = for_test_df["product_name"].map(category_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_test_df["predict_product"] = predicted_eng
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_test_df["verification"] = (for_test_df["mapped_product"] == for_test_df[

In [30]:
for_test_df.to_csv("model_test_with_predictions.csv", index=False)

In [31]:
for_test_df

Unnamed: 0,subproduct_id,subproduct_name,product_name,question,mapped_product,predict_product,verification
0,7,JuniorBank,Карты,#juniorcard балалар картасын толтыру лимиті қа...,Cards,Cards,1
1,56,Smart_QR,Smart QR,"Какие условия, чтобы получить Smart QR для физ...",Smart QR,Smart QR,1
2,7,JuniorBank,Карты,Какие преимущества у карты #juniorcard?,Cards,Cards,1
3,56,Smart_QR,Smart QR,Условия займа Smart QR,Smart QR,Avtokredit,0
4,0,BlackEdition,Карты,Какие типы валют поддерживает карта #blackedit...,Cards,Cards,1
...,...,...,...,...,...,...,...
3828,54,bccpay_2_0,Карты,Қандай бөлімшелерде цифрлык bccpay 2.0 (monoca...,Cards,Cards,1
3829,148,Payments_Common,Платежи,если я оплачиваю с Трэвел карты кэшбеки поступ...,Payments,Cards,0
3830,7,JuniorBank,Карты,#Juniorcard картасында сақтандыру бар ма,Cards,Cards,1
3831,148,Payments_Common,Платежи,Трэвел картасымен төлем жүргізгенде кэшбек түс...,Payments,Cards,0


In [36]:
from sklearn.metrics import classification_report

# Убедимся, что нет NaN и сравнение валидно
df_eval = for_test_df.dropna(subset=["mapped_product", "predict_product"])

# Формируем отчёт
report = classification_report(
    df_eval["mapped_product"],
    df_eval["predict_product"],
    digits=3
)

print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

  Avtokredit      0.000     0.000     0.000         0
      BCC_KZ      0.608     0.819     0.698        72
       Cards      0.977     0.970     0.974      1989
     Credits      0.769     0.216     0.337       768
    Deposits      0.926     1.000     0.962       176
     Ipoteka      0.000     0.000     0.000         0
       Other      0.000     0.000     0.000         0
         PKO      0.948     0.647     0.769       252
    Payments      0.476     0.196     0.278        51
     Savings      0.921     0.886     0.903       236
    Smart QR      1.000     0.692     0.818        13
   Transfers      0.866     0.935     0.899       276
   Zalogovoe      0.000     0.000     0.000         0

    accuracy                          0.777      3833
   macro avg      0.576     0.489     0.511      3833
weighted avg      0.906     0.777     0.807      3833



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
