## Eval

In [77]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import pandas as pd

model_name = "deepvk/USER-bge-m3"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [78]:
qa_df = pd.read_csv('qa_df_ready_splits.csv')

In [79]:
train_df = qa_df[qa_df.split == 'train']
val_df = qa_df[qa_df.split == 'val']

In [80]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [81]:
train_df.shape, val_df.shape

((1423, 7), (352, 7))

In [82]:
# 1. заполнить бд - есть
# 2. поднять эмбеддер - в процессе 
# 3. поднять пайплайн поиска - в процессе
# 4. попробовать реранк - потом
# 5. попробовать потретить retrieval модель - пробуем

In [83]:
def generate_embeddings_in_batches(texts, batch_size=16):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():
            outputs = model(**inputs.to(model.device))
        batch_embeddings = outputs[0][:, 0, :].detach().cpu()
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

In [84]:
def count_recall_topn(model, tokenizer, train, val, 
                      q_column='question_clear', a_column='content_clear', 
                      batch_size=128, topn=10):    
    train = train.reset_index(drop=True)
    val = val.reset_index(drop=True)
    
    # topn = 10
    # q_column = 'question_clear'
    # a_column = 'content_clear'
    
    train_embeddings = generate_embeddings_in_batches(train[q_column].tolist(), batch_size)
    test_embeddings = generate_embeddings_in_batches(val[q_column].tolist(), batch_size)
    
    top_n_matches = []
    for test_embedding in tqdm(test_embeddings):
        cosine_scores = cosine_similarity(test_embedding.unsqueeze(0), train_embeddings)[0]
        top_results = torch.topk(torch.tensor(cosine_scores), topn)
        top_n_matches.append(top_results.indices.numpy())
    
    accuracy_count = 0
    unique_answers = []
    position_found = []
    
    for i, indices in enumerate(top_n_matches):
        truth_content = val.iloc[i][a_column]
        pred_content = train[train.index.isin(indices)][a_column]
    
        if truth_content in pred_content.values:
            accuracy_count += 1
            position_found.append((pred_content.values == truth_content).argmax())
            
        else:
            position_found.append(-1)
            
        unique_answers.append(pred_content.nunique())
        
    recall_topn = accuracy_count / len(top_n_matches)
    
    return recall_topn, unique_answers, position_found
    

In [9]:
model = model.to('cuda')

# for topn in range(1, 11):
#     recall_topn, unique_answers, position_found = count_recall_topn(model, tokenizer, train_df, val_df, 
#                                                                     q_column='question_clear', a_column='content_clear', 
#                                                                     batch_size=128, topn=topn)
#     print(f"recall@{topn} = {recall_topn}")

## Train

In [10]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import random

In [11]:
class TripletDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.groups = df.groupby('content_clear').groups
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        anchor = self.df.iloc[idx]
        content = anchor['content_clear']
        
        # Positive sample: same content_clear but different query_clear
        group = self.groups[content]

        try:
            positive_idx = random.choice(group[group != idx])
        except IndexError:
            positive_idx = idx

        # while positive_idx == idx:
            # positive_idx = random.choice(self.groups[content])
        
        positive = self.df.iloc[positive_idx]
        
        # Negative sample: different content_clear
        negative_content = random.choice(list(set(self.groups.keys()) - {content}))
        negative_idx = random.choice(self.groups[negative_content])
        negative = self.df.iloc[negative_idx]
        
        anchor_input = self.tokenizer(anchor['question_clear'], padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        positive_input = self.tokenizer(positive['question_clear'], padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        negative_input = self.tokenizer(negative['question_clear'], padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        
        return {
            'anchor_input_ids': anchor_input['input_ids'].squeeze(0),
            'anchor_attention_mask': anchor_input['attention_mask'].squeeze(0),
            'positive_input_ids': positive_input['input_ids'].squeeze(0),
            'positive_attention_mask': positive_input['attention_mask'].squeeze(0),
            'negative_input_ids': negative_input['input_ids'].squeeze(0),
            'negative_attention_mask': negative_input['attention_mask'].squeeze(0),
        }

class TripletLossModel(nn.Module):
    def __init__(self, model):
        super(TripletLossModel, self).__init__()
        self.model = model
        self.loss = nn.TripletMarginLoss(margin=0.5)

    def forward(self, anchor_input, positive_input, negative_input):
        anchor_emb = self._get_embedding(anchor_input)
        positive_emb = self._get_embedding(positive_input)
        negative_emb = self._get_embedding(negative_input)

        loss = self.loss(anchor_emb, positive_emb, negative_emb)
        return loss
    
    def _get_embedding(self, inputs):
        outputs = self.model(**inputs)
        cls_embedding = outputs[0][:, 0, :]
        return cls_embedding

def train_and_validate(model, train_loader, train, val, optimizer, device, topn=10, epochs=5):
    model.train()
    model.to(device)

    max_recall = 0.90625
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            anchor_input = {
                'input_ids': batch['anchor_input_ids'].to(device),
                'attention_mask': batch['anchor_attention_mask'].to(device)
            }
            positive_input = {
                'input_ids': batch['positive_input_ids'].to(device),
                'attention_mask': batch['positive_attention_mask'].to(device)
            }
            negative_input = {
                'input_ids': batch['negative_input_ids'].to(device),
                'attention_mask': batch['negative_attention_mask'].to(device)
            }

            loss = model(anchor_input, positive_input, negative_input)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        test_recall_topn, _, _ = count_recall_topn(model, tokenizer, train, val)
        avg_loss = total_loss / len(train_loader)

        if test_recall_topn > max_recall:
            max_recall = test_recall_topn
            torch.save(model.state_dict(), f"triplet_model_{max_recall}.pth")
        
        print(f"epoch {epoch + 1}/{epochs}: test recall@10 = {test_recall_topn}")
        print(f"epoch {epoch + 1}/{epochs}: train loss = {avg_loss}")


In [12]:
train_dataset = TripletDataset(train_df, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [13]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cuda:0'

triplet_model = TripletLossModel(model)
optimizer = torch.optim.AdamW(triplet_model.parameters(), lr=1e-5)

In [14]:
train_and_validate(triplet_model, train_dataloader, train_df, val_df, optimizer, device, epochs=5)

100%|██████████| 178/178 [05:00<00:00,  1.69s/it]
100%|██████████| 12/12 [00:04<00:00,  2.65it/s]
100%|██████████| 3/3 [00:00<00:00,  3.38it/s]
100%|██████████| 352/352 [00:01<00:00, 184.03it/s]


epoch 1/5: test recall@10 = 0.90625
epoch 1/5: train loss = 0.10918871137533295


100%|██████████| 178/178 [05:00<00:00,  1.69s/it]
100%|██████████| 12/12 [00:04<00:00,  2.69it/s]
100%|██████████| 3/3 [00:00<00:00,  3.44it/s]
100%|██████████| 352/352 [00:01<00:00, 219.51it/s]


epoch 2/5: test recall@10 = 0.9147727272727273
epoch 2/5: train loss = 0.06251984796999546


100%|██████████| 178/178 [04:59<00:00,  1.68s/it]
100%|██████████| 12/12 [00:04<00:00,  2.67it/s]
100%|██████████| 3/3 [00:00<00:00,  3.45it/s]
100%|██████████| 352/352 [00:01<00:00, 214.81it/s]


epoch 3/5: test recall@10 = 0.9204545454545454
epoch 3/5: train loss = 0.05067158749933993


100%|██████████| 178/178 [04:59<00:00,  1.68s/it]
100%|██████████| 12/12 [00:04<00:00,  2.69it/s]
100%|██████████| 3/3 [00:00<00:00,  3.46it/s]
100%|██████████| 352/352 [00:01<00:00, 183.73it/s]


epoch 4/5: test recall@10 = 0.8977272727272727
epoch 4/5: train loss = 0.036057067553648785


100%|██████████| 178/178 [05:00<00:00,  1.69s/it]
100%|██████████| 12/12 [00:04<00:00,  2.67it/s]
100%|██████████| 3/3 [00:00<00:00,  3.47it/s]
100%|██████████| 352/352 [00:01<00:00, 180.23it/s]


epoch 5/5: test recall@10 = 0.8863636363636364
epoch 5/5: train loss = 0.02622495407468817


## Push

In [93]:
from transformers import AutoModel, AutoTokenizer
import torch
from huggingface_hub import login

model_name = "deepvk/USER-bge-m3"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [82]:
state_dict = torch.load("triplet_model_0.9204545454545454.pth", map_location="cpu")
model.load_state_dict(state_dict, strict=False)

In [83]:
# # Save model and tokenizer
# save_path = "USER-bge-m3-x5"
# model.save_pretrained(save_path)
# tokenizer_bge.save_pretrained(save_path)

In [95]:
from sentence_transformers import SentenceTransformer, models

transformer = models.Transformer(model_name)
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="cls")
sentence_model = SentenceTransformer(modules=[transformer, pooling])

In [96]:
sentence_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [None]:
login(token="")
sentence_model.push_to_hub("elderberry17/USER-bge-m3-x5-sentence")

model.safetensors: 100%|██████████| 1.44G/1.44G [00:49<00:00, 29.0MB/s]   


'https://huggingface.co/elderberry17/USER-bge-m3-x5-sentence/commit/86d65c0a1866d4790011f4613e54cb7ad5959151'

## Data Processing

In [50]:
import pandas as pd

In [51]:
df = pd.read_csv('qa_for_test.csv')
df.shape

(110, 5)

In [52]:
# categories_last12

In [53]:
categories_last12 = df.tail(12).answer.values
df.loc[110-12:, 'category'] = categories_last12
df = df.drop('answer', axis=1)

In [54]:
df.isna().sum()

old_question    0
question        0
old_answer      0
category        0
dtype: int64

In [57]:
df.columns

Index(['old_question', 'question', 'old_answer', 'category'], dtype='object')

In [58]:
df_list = []

for cat in df.category.unique():
    df_cat = df[df.category == cat]
    df_cat = df_cat.drop_duplicates(subset=['old_answer'])
    df_list.append(df_cat)

In [59]:
df_new = pd.concat(df_list)

In [60]:
df_new.shape

(88, 4)

In [61]:
df_new.to_csv('qa_for_test_short.csv', index=False)

In [62]:
df_new.head(1)

Unnamed: 0,old_question,question,old_answer,category
0,не отображается автомобиль в личном кабинете.,Почему автомобиль не показывается в личном каб...,для внесения данных по личному автомобилю обра...,автомобиль


In [68]:
df_new[df_new.old_question == 'question']

Unnamed: 0,old_question,question,old_answer,category


In [69]:
df_new = pd.read_csv('qa_for_test_short.csv')

In [70]:
df_new.columns

Index(['old_question', 'question', 'old_answer', 'category'], dtype='object')