In [None]:
import pandas as pd
import numpy as np
import torch
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load preprocessed corpus
df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/public_test.csv')

# Initialize the model
model = SentenceTransformer('/kaggle/input/bkaibiencoderfinetuned/transformers/default/1/final')
model = model.to(device)

def encode(lst = [], convert_to_tensor=True, batch_size=1024):
    vectors = []
    # Create progress bar
    with tqdm(total=len(lst), desc="Encoding texts") as pbar:
        # Process in batches
        for i in range(0, len(lst), batch_size):
            batch = lst[i:i + batch_size]
            encoded_batch = model.encode(batch, convert_to_tensor=True, device=device)
            # Move to CPU before converting to numpy
            if torch.cuda.is_available():
                encoded_batch = encoded_batch.cpu()
            vectors.extend([np.array(arr) for arr in encoded_batch.numpy()])
            pbar.update(len(batch))
    return vectors

# Encode the text column
df['vector'] = encode(lst=list(df['question']))

# Save the encoded corpus
df.to_json('encoded_public_test_finetuned.json')


In [5]:
import pandas as pd
df = pd.read_json('/kaggle/input/finetunedpublictest/encoded_public_test_finetuned.json')
df

Unnamed: 0,question,qid,vector
0,Hiệp hội Công nghiệp ghi âm Việt Nam hoạt động...,98440,"[-0.0347496048, 0.25700578090000004, 0.3575423..."
1,Báo cáo nghiên cứu khả thi đầu tư xây dựng là ...,105737,"[0.1764316112, 0.16144044700000001, 0.07395143..."
2,Lịch khai giảng năm học 2022 - 2023 đối với họ...,106239,"[0.0155817755, -0.11466823520000001, -0.470231..."
3,Số định danh cá nhân có được dùng thay thế các...,79491,"[-0.048475273000000006, 0.3610271513, -0.23998..."
4,Trợ cấp đối với Chủ tịch Hội cựu chiến binh cấ...,130557,"[-0.0451854542, -0.06843171270000001, -0.28169..."
...,...,...,...
9995,Đón trả hành khách trên đường cao tốc có bị gi...,42798,"[-0.1179260835, -0.0827077851, -0.108169354500..."
9996,"Các đơn vị được giao là đầu mối trao đổi, cung...",10533,"[0.1132282391, -0.0720719025, -0.1837236732000..."
9997,Ban Thường vụ Hội Hỗ trợ khắc phục hậu quả bom...,46794,"[-0.1014185771, -0.0322549231, -0.1336619556, ..."
9998,"Tài liệu thông tin, giáo dục, truyền thông về ...",112007,"[0.0728569478, -0.5358711481, 0.2039459348, -0..."


In [None]:
import pandas as pd
import numpy as np
import torch
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/train.csv')

# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the model
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
model = model.to(device)

def encode(lst = [], convert_to_tensor=True, batch_size=512):
    vectors = []
    with tqdm(total=len(lst), desc="Encoding questions") as pbar:
        for i in range(0, len(lst), batch_size):
            batch = lst[i:i + batch_size]
            encoded_batch = model.encode(batch, convert_to_tensor=True, device=device)
            if torch.cuda.is_available():
                encoded_batch = encoded_batch.cpu()
            vectors.extend([np.array(arr) for arr in encoded_batch.numpy()])
            pbar.update(len(batch))
    return vectors

# Encode questions for both train and test sets
train_df['question_vector'] = encode(lst=list(train_df['question']))
test_df['question_vector'] = encode(lst=list(test_df['question']))

# Save encoded datasets
train_df.to_json('encoded_train.json')
test_df.to_json('encoded_test.json')

print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")



In [2]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

download_file('/kaggle/working', 'encoded_public_test_finetuned')
# download_file('/kaggle/working/encoded_test.json', 'out')

In [None]:
import pandas as pd
import numpy as np
import torch
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Load data with both question and qid columns
df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/corpus.csv')

# Determine the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the model with the specified device 
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder', device=device)

def encode(lst = [], convert_to_tensor=True, batch_size=128):
    vectors = []
    with tqdm(total=len(lst), desc="Encoding questions") as pbar:
        for i in range(0, len(lst), batch_size):
            batch = lst[i:i + batch_size]
            encoded_batch = model.encode(batch, convert_to_tensor=True)
            if device == 'cuda':
                encoded_batch = encoded_batch.cpu()
            vectors.extend([np.array(arr) for arr in encoded_batch.numpy()])
            pbar.update(len(batch))
    return vectors

# Encode questions while preserving original columns
df['question_vector'] = encode(lst=list(df['text']))

# Select only the required columns
output_df = df[['text', 'qid', 'question_vector']]

# Save to JSON
output_df.to_json('encoded_public_tes.json')

In [6]:
import json
import torch

# Load encoded vectors from JSON files
with open('/kaggle/input/finetunedpublictest/encoded_public_test_finetuned.json', 'r') as f:
    train_data = json.load(f)

with open('/kaggle/input/finetunedcorpus/encoded_corpus.json', 'r') as f:
    corpus_data = json.load(f)

# Extract vectors and IDs
train_vectors = [item['vector'] for item in train_data]
train_ids = [item['qid'] for item in train_data]

corpus_vectors = [item['vector'] for item in corpus_data]
corpus_ids = [item['cid'] for item in corpus_data]

# Convert to PyTorch tensors and move to CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_tensor = torch.tensor(train_vectors, device=device)
corpus_tensor = torch.tensor(corpus_vectors, device=device)

# Normalize vectors
train_tensor = torch.nn.functional.normalize(train_tensor, p=2, dim=1)
corpus_tensor = torch.nn.functional.normalize(corpus_tensor, p=2, dim=1)

# Compute cosine similarity
similarity_matrix = torch.matmul(train_tensor, corpus_tensor.T)

# Get top K similar documents for each query
top_k = 50  # Adjust as needed
top_k_values, top_k_indices = torch.topk(similarity_matrix, k=top_k, dim=1)

# Write results directly to text file in required format
with open('predict_top50.txt', 'w') as f:
    for i, (values, indices) in enumerate(zip(top_k_values, top_k_indices)):
        qid = train_ids[i]
        # Convert indices to cids and create space-separated string
        top_cids = [str(corpus_ids[idx]) for idx in indices.cpu().numpy()]
        line = f"{qid} {' '.join(top_cids)}\n"
        f.write(line)


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7e5d48793e20>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7e5d48793e20>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


TypeError: string indices must be integers

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
import zipfile

# ...existing code...

# Load JSON files using Pandas
train_df = pd.read_json('/kaggle/input/encoded-public-test/encoded_public_test.json')
# train_df = train_df.head(1000)
corpus_df = pd.read_json('/kaggle/input/encoded/encoded_corpus.json')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


# Extract IDs and vectors
train_ids = train_df['qid'].tolist()
train_vectors = torch.tensor(train_df['question_vector'].tolist(), dtype=torch.float32).to(device)
corpus_ids = corpus_df['cid'].tolist()
corpus_vectors = torch.tensor(corpus_df['vector'].tolist(), dtype=torch.float32).to(device)

# Normalize vectors
train_vectors = F.normalize(train_vectors, p=2, dim=1)
corpus_vectors = F.normalize(corpus_vectors, p=2, dim=1)

# Compute cosine similarity using PyTorch on CUDA
similarity_matrix = torch.matmul(train_vectors, corpus_vectors.T)

# Get top K similar documents
top_k = 50  # Adjust as needed
top_k_values, top_k_indices = torch.topk(similarity_matrix, k=top_k, dim=1)

# Write results to predict.txt in required format
with open('predict_top50.txt', 'w') as f:
    for i, indices in enumerate(top_k_indices):
        qid = train_ids[i]
        top_cids = [str(corpus_ids[idx.item()]) for idx in indices]
        line = f"{qid} {' '.join(top_cids)}\n"
        f.write(line)

# Zip the predict.txt file
with zipfile.ZipFile('predict_top50.zip', 'w') as zipf:
    zipf.write('predict_top50.txt')

# ...existing code...


In [None]:
import pandas as pd
import numpy as np
import torch
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load preprocessed corpus
df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/train.csv')

# Initialize the model
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
model = model.to(device)

def encode(lst = [], convert_to_tensor=True, batch_size=1024):
    vectors = []
    # Create progress bar
    with tqdm(total=len(lst), desc="Encoding texts") as pbar:
        # Process in batches
        for i in range(0, len(lst), batch_size):
            batch = lst[i:i + batch_size]
            encoded_batch = model.encode(batch, convert_to_tensor=True, device=device)
            # Move to CPU before converting to numpy
            if torch.cuda.is_available():
                encoded_batch = encoded_batch.cpu()
            vectors.extend([np.array(arr) for arr in encoded_batch.numpy()])
            pbar.update(len(batch))
    return vectors

# Encode the text column
df['question_vector'] = encode(lst=list(df['question']))

# Save the encoded corpus
df.to_json('encoded_train_full.json')


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    get_scheduler,
    default_data_collator
)
from accelerate import Accelerator
from torch.optim import AdamW
from tqdm.auto import tqdm
import math
import os
import json

class VietnameseDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_length=256):
        self.df = pd.read_csv(csv_path)
        self.texts = self.df['text'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Pre-tokenize all texts
        self.encodings = self.tokenizer(
            self.texts,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors=None  # Changed this to return lists instead of tensors
        )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Return dictionary of items for the specified index
        return {
            key: torch.tensor(val[idx]) 
            for key, val in self.encodings.items()
        }

class BERTMaskingTrainer:
    def __init__(self, model_name="bkai-foundation-models/vietnamese-bi-encoder", 
                 output_dir="model_checkpoints", max_length=256):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.output_dir = output_dir
        self.max_length = max_length
        self.mask_probability = 0.20  # 20% masking as specified
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize accelerator
        self.accelerator = Accelerator()
        
        # Initialize data collator for masked language modeling
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm_probability=self.mask_probability
        )
        
        # Add special tokens if needed
        special_tokens = {'pad_token': '[PAD]', 'mask_token': '[MASK]'}
        self.tokenizer.add_special_tokens(special_tokens)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def prepare_data(self, csv_path, batch_size=16, test_size=0.1):  # Reduced batch size
        dataset = VietnameseDataset(csv_path, self.tokenizer, self.max_length)
        
        # Split into train/test
        train_size = int((1 - test_size) * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            dataset, [train_size, test_size]
        )

        self.train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=self.data_collator
        )
        
        self.eval_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            collate_fn=default_data_collator
        )

    def save_checkpoint(self, epoch, loss):
        checkpoint_dir = os.path.join(self.output_dir, f"checkpoint-{epoch}")
        os.makedirs(checkpoint_dir, exist_ok=True)
        
        unwrapped_model = self.accelerator.unwrap_model(self.model)
        unwrapped_model.save_pretrained(checkpoint_dir)
        self.tokenizer.save_pretrained(checkpoint_dir)
        
        # Save training info
        with open(os.path.join(checkpoint_dir, "training_info.json"), "w") as f:
            json.dump({"epoch": epoch, "loss": loss}, f)

    def train(self, num_epochs=30, learning_rate=5e-5):
        # Prepare for training
        self.model, self.optimizer, self.train_loader, self.eval_loader = \
            self.accelerator.prepare(
                self.model, 
                AdamW(self.model.parameters(), lr=learning_rate),
                self.train_loader, 
                self.eval_loader
            )

        num_training_steps = num_epochs * len(self.train_loader)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        for epoch in range(num_epochs):
            # Training phase
            self.model.train()
            for batch in self.train_loader:
                outputs = self.model(**batch)
                loss = outputs.loss
                self.accelerator.backward(loss)
                
                self.optimizer.step()
                lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar.update(1)

            # Evaluation phase
            self.model.eval()
            eval_losses = []
            for batch in self.eval_loader:
                with torch.no_grad():
                    outputs = self.model(**batch)
                eval_losses.append(self.accelerator.gather(outputs.loss))

            eval_loss = torch.mean(torch.cat(eval_losses))
            try:
                perplexity = math.exp(eval_loss)
            except OverflowError:
                perplexity = float("inf")
            
            print(f"Epoch {epoch+1}: Perplexity: {perplexity}")
            
            # Save checkpoint
            self.save_checkpoint(epoch+1, eval_loss.item())

if __name__ == "__main__":
    trainer = BERTMaskingTrainer()
    trainer.prepare_data("/kaggle/input/preprocessed-corpus/preprocessed_corpus.csv")
    trainer.train()


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling,
                          get_scheduler, AdamW)
from accelerate import Accelerator
from tqdm.auto import tqdm
import math

class CustomTextDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size=128):
        self.tokenizer = tokenizer
        # Tokenize the texts
        tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_masks = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx]
        }

class MaskedLanguageModelTrainer:
    def __init__(self, csv_file, model_name, masking_prob=0.2, batch_size=32, num_epochs=30):
        self.csv_file = csv_file
        self.model_name = model_name
        self.masking_prob = masking_prob
        self.batch_size = batch_size
        self.num_epochs = num_epochs

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(self.model_name)

        self.accelerator = Accelerator()
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=True, mlm_probability=self.masking_prob
        )

    def load_data(self):
        df = pd.read_csv(self.csv_file)
        texts = df['text'].tolist()
        self.dataset = CustomTextDataset(texts, self.tokenizer)

    def prepare_dataloader(self):
        self.dataloader = DataLoader(
            self.dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.data_collator
        )

    def train(self):
        optimizer = AdamW(self.model.parameters(), lr=5e-5)
        self.model, optimizer, self.dataloader = self.accelerator.prepare(
            self.model, optimizer, self.dataloader
        )

        num_update_steps_per_epoch = len(self.dataloader)
        num_training_steps = self.num_epochs * num_update_steps_per_epoch
        lr_scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        progress_bar = tqdm(range(num_training_steps))

        for epoch in range(self.num_epochs):
            self.model.train()
            for batch in self.dataloader:
                outputs = self.model(**batch)
                loss = outputs.loss
                self.accelerator.backward(loss)

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)

            # Optional: Evaluate the model at the end of each epoch
            perplexity = self.evaluate()
            print(f">>> Epoch {epoch + 1}: Perplexity: {perplexity}")

    def evaluate(self):
        self.model.eval()
        losses = []
        for batch in self.dataloader:
            with torch.no_grad():
                outputs = self.model(**batch)
            loss = outputs.loss
            losses.append(self.accelerator.gather(loss.repeat(self.batch_size)))

        losses = torch.cat(losses)
        losses = losses[: len(self.dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            perplexity = float("inf")
        return perplexity

    def save_model(self, output_dir):
        self.accelerator.wait_for_everyone()
        unwrapped_model = self.accelerator.unwrap_model(self.model)
        unwrapped_model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

if __name__ == "__main__":
    trainer = MaskedLanguageModelTrainer(
        csv_file="/kaggle/input/preprocessed-corpus/preprocessed_corpus.csv",
        model_name="bkai-foundation-models/vietnamese-bi-encoder"
    )
    trainer.load_data()
    trainer.prepare_dataloader()
    trainer.train()
    trainer.save_model(output_dir="trained_model")


In [None]:
# !pip3 install 
!ls -la /usr/lib/jvm/

import py_vncorenlp
py_vncorenlp.download_model(save_dir='/kaggle/working')
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='/kaggle/working')

query = "Trường UIT là gì?"
sentences = [
    "Trường Đại học Công nghệ Thông tin có tên tiếng Anh là University of Information Technology (viết tắt là UIT) là thành viên của Đại học Quốc Gia TP.HCM.",
    "Trường Đại học Kinh tế – Luật (tiếng Anh: University of Economics and Law – UEL) là trường đại học đào tạo và nghiên cứu khối ngành kinh tế, kinh doanh và luật hàng đầu Việt Nam.",
    "Quĩ uỷ thác đầu tư (tiếng Anh: Unit Investment Trusts; viết tắt: UIT) là một công ty đầu tư mua hoặc nắm giữ một danh mục đầu tư cố định"
]

tokenized_query = rdrsegmenter.word_segment(query)
tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]

tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]

MODEL_ID = 'itdainb/PhoRanker'
MAX_LENGTH = 256


This is the code for cosine similarity and json output for topk

In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
import zipfile
import json

# ...existing code...

# Load JSON files using Pandas
train_df = pd.read_json('/kaggle/input/finetunedpublictest/encoded_public_test_finetuned.json')
# train_df = train_df.head(1000)
corpus_df = pd.read_json('/kaggle/input/finetunedcorpus/encoded_corpus.json')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


# Extract IDs and vectors
train_ids = train_df['qid'].tolist()
train_vectors = torch.tensor(train_df['vector'].tolist(), dtype=torch.float32).to(device)
corpus_ids = corpus_df['cid'].tolist()
corpus_vectors = torch.tensor(corpus_df['vector'].tolist(), dtype=torch.float32).to(device)

# Normalize vectors
train_vectors = F.normalize(train_vectors, p=2, dim=1)
corpus_vectors = F.normalize(corpus_vectors, p=2, dim=1)

# Compute cosine similarity using PyTorch on CUDA
similarity_matrix = torch.matmul(train_vectors, corpus_vectors.T)

# Get top K similar documents
top_k = 50  # Adjust as needed
top_k_values, top_k_indices = torch.topk(similarity_matrix, k=top_k, dim=1)

# Create results for both TXT and JSON formats
json_results = []

with open('predict_top50.txt', 'w') as f:
    for i, (indices, scores) in enumerate(zip(top_k_indices, top_k_values)):
        qid = train_ids[i]
        
        # Convert indices and scores to Python lists
        top_cids = [str(corpus_ids[idx.item()]) for idx in indices]
        similarity_scores = [score.item() for score in scores]
        
        # Write to TXT format
        line = f"{qid} {' '.join(top_cids)}\n"
        f.write(line)
        
        # Prepare JSON format
        # Format suitable for re-ranker: including query_id, candidate_ids, and their scores
        json_entry = {
            "query_id": qid,
            "candidates": {
                "doc_ids": top_cids,
                "scores": similarity_scores,
                # Additional fields that might be useful for re-ranker:
            }
        }
        json_results.append(json_entry)

# Save JSON results
with open('predict_top50.json', 'w', encoding='utf-8') as f:
    json.dump(json_results, f, ensure_ascii=False, indent=2)

# Zip both files
with zipfile.ZipFile('predict_top50.zip', 'w') as zipf:
    zipf.write('predict_top50.txt')
    zipf.write('predict_top50.json')

# ...existing code...


Using device: cuda


In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json

# Load all required data
predictions = pd.read_json('/kaggle/input/top50cosinejson/predict_top50.json')
test_df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/public_test.csv')
corpus_df = pd.read_csv('/kaggle/input/preprocessed-corpus/preprocessed_corpus.csv')

# Fix: Create corpus dictionary with correct column mapping
corpus_dict = dict(zip(corpus_df['cid'], corpus_df['text']))

# Setup model and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('namdp-ptit/ViRanker')
model = AutoModelForSequenceClassification.from_pretrained('namdp-ptit/ViRanker').to(device)
model.eval()

reranked_results = []

# Process each query
for _, row in predictions.iterrows():
    # qid = row['query_id']
    # # Fix: Use 'question' column instead of 'query'
    # query_text = test_df[test_df['qid'] == qid]['question'].iloc[0]
    # doc_ids = row['candidates']['doc_ids']
    
    # # Fix: Use corpus_dict correctly - get text for each doc_id
    # pairs = [[query_text, corpus_dict[doc_id]] for doc_id in doc_ids]
    
    # Re-rank in batches
    batch_size = 8  # Smaller batch size due to longer texts
    all_scores = []
    
    with torch.no_grad():
        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            inputs = tokenizer(batch_pairs, padding=True, truncation=True, 
                             return_tensors='pt', max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            scores = model(**inputs, return_dict=True).logits.view(-1,).float()
            all_scores.extend(scores.cpu().numpy().tolist())
    
    # Create result entry
    json_entry = {
        "query_id": qid,
        "candidates": {
            "doc_ids": doc_ids,
            "scores": all_scores
        }
    }
    reranked_results.append(json_entry)

# Save results
with open('predict_top50_reranked.json', 'w') as f:
    json.dump(reranked_results, f, indent=2)


In [None]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import json

# Load all required data
predictions = pd.read_json('/kaggle/input/top50cosinejson/predict_top50.json')
test_df = pd.read_csv('/kaggle/input/bkai-ai-track2-legal-document-retrieval/Legal Document Retrieval/public_test.csv')
corpus_df = pd.read_csv('/kaggle/input/preprocessed-corpus/preprocessed_corpus.csv')

# # Debugging: Print first few rows to verify data format
# print("First few corpus rows:", corpus_df.head())
# print("First few predictions:", predictions.head())

# Fix: Create corpus dictionary with correct column mapping
# corpus_dict = dict(zip(corpus_df['text'], corpus_df['cid']))
# print("First few corpus_dict items:", list(corpus_dict.items())[:3])

# Setup model and device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('namdp-ptit/ViRanker')
model = AutoModelForSequenceClassification.from_pretrained('namdp-ptit/ViRanker').to(device)
model.eval()

reranked_results = []

# Process each query
for _, row in predictions.iterrows():
    # Fix: Use correct key from predictions json
    qid = row['query_id']  # or 'query_id' depending on actual json format
    query_text = test_df['question'][test_df['qid'] == qid].iloc[0]
    doc_ids = row['candidates']['doc_ids']
    
    # Debug print
    print(f"Processing qid: {qid}")
    print(f"First few doc_ids: {doc_ids[:3]}")
    
    # Create pairs with proper error handling
    pairs = []
    for doc_id in doc_ids:
            # Convert doc_id to string if it's numeric
            # doc_id_str = str(doc_id)
            texts = corpus_df.loc[corpus_df['cid'] == int(doc_id), 'text'].iloc[0]
            pairs.append([query_text, texts])
    
    # Re-rank in batches
    batch_size = 8  # Smaller batch size due to longer texts
    all_scores = []
    
    with torch.no_grad():
        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i + batch_size]
            inputs = tokenizer(batch_pairs, padding=True, truncation=True, 
                             return_tensors='pt', max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            
            scores = model(**inputs, return_dict=True).logits.view(-1,).float()
            all_scores.extend(scores.cpu().numpy().tolist())
    
    # Create result entry
    json_entry = {
        "query_id": qid,
        "candidates": {
            "doc_ids": doc_ids,
            "scores": all_scores
        }
    }
    reranked_results.append(json_entry)

# Save results
with open('predict_top50_reranked.json', 'w') as f:
    json.dump(reranked_results, f, indent=2)


In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
import zipfile
import json

# ...existing code...

# Load JSON files using Pandas
train_df = pd.read_json('/kaggle/input/encodedtrainfull/encoded_train_full.json')
corpus_df = pd.read_json('/kaggle/input/encoded/encoded_corpus.json')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Extract IDs and vectors
train_ids = train_df['qid'].tolist()
train_vectors = torch.tensor(train_df['question_vector'].tolist(), dtype=torch.float32).to(device)
corpus_ids = corpus_df['cid'].tolist()
corpus_vectors = torch.tensor(corpus_df['vector'].tolist(), dtype=torch.float32).to(device)

# Normalize vectors
train_vectors = F.normalize(train_vectors, p=2, dim=1)
corpus_vectors = F.normalize(corpus_vectors, p=2, dim=1)

# Get top K similar documents
top_k = 50  # Adjust as needed

# Create results for both TXT and JSON formats
json_results = []
batch_size = 128 # Process 8 QIDs at a time

with open('predict_top50_training.txt', 'w') as f:
    for i in range(0, len(train_ids), batch_size): 
        # Process in batches
        batch_train_vectors = train_vectors[i:i+batch_size]
        batch_train_ids = train_ids[i:i+batch_size]
        
        # Compute cosine similarity using PyTorch on CUDA for the batch
        similarity_matrix = torch.matmul(batch_train_vectors, corpus_vectors.T)
        
        # Get top K similar documents for the batch
        top_k_values, top_k_indices = torch.topk(similarity_matrix, k=top_k, dim=1)
        
        for j, (indices, scores) in enumerate(zip(top_k_indices, top_k_values)):
            qid = batch_train_ids[j] 

            # Convert indices and scores to Python lists
            top_cids = [str(corpus_ids[idx.item()]) for idx in indices]
            similarity_scores = [score.item() for score in scores]

            # Write to TXT format
            line = f"{qid} {' '.join(top_cids)}\n"
            f.write(line)

            # Prepare JSON format
            json_entry = {
                "query_id": qid,
                "candidates": {
                    "doc_ids": top_cids,
                    "scores": similarity_scores,
                }
            }
            json_results.append(json_entry)

# Save JSON results
with open('predict_top50_training.json', 'w', encoding='utf-8') as f:
    json.dump(json_results, f, ensure_ascii=False, indent=2)

# Zip both files
with zipfile.ZipFile('predict_top50_training.zip', 'w') as zipf:
    zipf.write('predict_top50_training.txt')
    zipf.write('predict_top50_training.json')

# ...existing code...

Negative pairing for fine-tuning the cross-encoder

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
import zipfile
import json

# ...existing code...

# Load JSON files using Pandas
train_df = pd.read_json('/kaggle/input/encodedtrainfull/encoded_train_full.json')
corpus_df = pd.read_json('/kaggle/input/encoded/encoded_corpus.json')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Extract IDs and vectors
train_ids = train_df['qid'].tolist()
train_vectors = torch.tensor(train_df['question_vector'].tolist(), dtype=torch.float32).to(device)
corpus_ids = corpus_df['cid'].tolist()
corpus_vectors = torch.tensor(corpus_df['vector'].tolist(), dtype=torch.float32).to(device)

# Normalize vectors
train_vectors = F.normalize(train_vectors, p=2, dim=1)
corpus_vectors = F.normalize(corpus_vectors, p=2, dim=1)

# Get most dissimilar document
json_results = []
batch_size = 128 # Process 8 QIDs at a time

with open('predict_most_dissimilar.txt', 'w') as f:
    for i in range(0, len(train_ids), batch_size): 
        # Process in batches
        batch_train_vectors = train_vectors[i:i+batch_size]
        batch_train_ids = train_ids[i:i+batch_size]
        
        # Compute cosine similarity using PyTorch on CUDA for the batch
        similarity_matrix = torch.matmul(batch_train_vectors, corpus_vectors.T)
        
        # Get most dissimilar document (smallest similarity score)
        min_values, min_indices = torch.min(similarity_matrix, dim=1)
        
        for j, (idx, score) in enumerate(zip(min_indices, min_values)):
            qid = batch_train_ids[j]
            cid = str(corpus_ids[idx.item()])
            
            # Write to TXT format
            f.write(f"{qid} {cid}\n")

            # Prepare JSON format
            json_entry = {
                "query_id": qid,
                "candidates": {
                    "doc_ids": [cid],
                    "scores": [score.item()],
                }
            }
            json_results.append(json_entry)

# Save JSON results
with open('predict_most_dissimilar.json', 'w', encoding='utf-8') as f:
    json.dump(json_results, f, ensure_ascii=False, indent=2)

# Zip both files
with zipfile.ZipFile('predict_most_dissimilar.zip', 'w') as zipf:
    zipf.write('predict_most_dissimilar.txt')
    zipf.write('predict_most_dissimilar.json')

# ...existing code...

Fine tune with negative pairings

In [4]:
from datasets import Dataset, load_dataset, concatenate_datasets
import os
print(os.listdir("../input"))  # Check input directory contents (Kaggle specific)
!pip install sentence_transformers
import json
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
import math

# 1. Load a model to finetune
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')

# 2. Load and format the dataset
dataset = load_dataset("json", data_files="/kaggle/input/negativepairing/fine_tune_training.json")
dataset = dataset["train"]  # Access the main split (likely named 'train' by default)


def format_dataset(example):
    return {'query': example['query'], 'positive': example['pos'], 'negative': example['neg']}

dataset = dataset.map(format_dataset)

# Split into train and evaluation sets (adjust split ratio as needed)
train_test_valid = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset_formatted = train_test_valid["train"]
test_dataset = train_test_valid["test"]
train_eval = test_dataset.train_test_split(test_size = 0.5, seed = 42)
eval_dataset_formatted = train_eval["train"]
predict_dataset_formatted = train_eval["test"]



# 3. Define the loss function
loss = MultipleNegativesRankingLoss(model)

# 4. Specify training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/vietnamese-bi-encoder-finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Adjust based on your GPU
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # IMPORTANT for MultipleNegativesRankingLoss
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=2500,
    save_total_limit=2,
    logging_steps=100,
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="cos_sim",  # Or other suitable metric
    greater_is_better=True,  # For cosine similarity, higher is better
    report_to="none" 
)


# 5. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset_formatted,
    eval_dataset=eval_dataset_formatted,
    loss=loss
)
trainer.train()



# 6. Save the trained model (best model is already saved during training)
model.save_pretrained("vietnamese-bi-encoder-finetuned/final") # Final model (may be the same as the best, depending on settings)

# Now you can evaluate on a separate test set. This should use different data than the eval set used during training.
trainer.evaluate(predict_dataset_formatted) # Now Evaluate on the separate test set.

['top50-bkai-cosine', 'encodedtrainfull', 'bkai-ai-track2-legal-document-retrieval', 'preprocessed-corpus', 'top50cosinejson', 'negativepairing', 'encoded', 'encoded-public-test']


  pid, fd = os.forkpty()




  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


TypeError: unhashable type: 'list'