# Relevant examples with embeddings
## Precomputing train data

In [1]:
import pandas as pd

train_df = pd.read_json('./data/train_v2.jsonl', lines=True)
test_df = pd.read_json('./data/test_v4.jsonl', lines=True)

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

torch.cuda.empty_cache()

embedder = SentenceTransformer("intfloat/multilingual-e5-base", device="cpu")

train_embs = embedder.encode(
    train_df['text'].tolist(),
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

test_embs = embedder.encode(
    test_df['text'].tolist(),
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

del embedder

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]
Batches: 100%|██████████| 17/17 [00:12<00:00,  1.36it/s]


Precompute top-k indices for each test row

In [3]:
k = 8
topk_indices = []

for q_idx, row in test_df.iterrows():
    acronym = row['acronym']
    # Filter same acronym in train_df
    subdf = train_df[train_df['acronym'] == acronym]
    if len(subdf) == 0:
    # fallback: random examples
        topk_indices.append(train_df.sample(k).index.to_list())
        continue

    subset_indices = subdf.index.to_list()
    subset_embs = train_embs[subset_indices]
    
    # Cosine similarity (dot product of normalized embeddings)
    sims = np.dot(subset_embs, test_embs[q_idx].reshape(-1,1)).squeeze()
    topk_idx = np.argsort(-sims)[:k]
    topk_indices.append([subset_indices[i] for i in topk_idx])

test_df['topk_example_indices'] = topk_indices

del test_embs, train_embs

## Generation

Loading LLM Model

In [4]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.empty_cache()

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import os

torch.cuda.empty_cache()

model_name = "Qwen/Qwen2.5-32B-Instruct" #"mistralai/Mistral-Small-24B-Instruct-2501" #"mistralai/Mistral-7B-Instruct-v0.3"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    offload_folder="./offload",
    torch_dtype=torch.float16,
)

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

pipe.tokenizer.pad_token_id = model.config.eos_token_id
pipe.model.config.use_cache = False

print("model loaded")


`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 17 files:   6%|▌         | 1/17 [05:10<1:22:51, 310.70s/it]'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /Qwen/Qwen2.5-32B-Instruct/resolve/5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd/model-00014-of-00017.safetensors (Caused by ProxyError(\'Unable to connect to proxy\', NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f4b05f86780>: Failed to resolve \'proxy.univ-lyon1.fr\' ([Errno -3] Temporary failure in name resolution)")))'), '(Request ID: d0509f0f-4c76-46b0-b725-0c639c3381f4)')' thrown while requesting HEAD https://huggingface.co/Qwen/Qwen2.5-32B-Instruct/resolve/5ede1c97bbab6ce5cda5812749b4c0bdf79b18dd/model-00014-of-00017.safetensors
Retrying in 1s [Retry 1/5].
Fetching 17 files: 100%|██████████| 17/17 [08:29<00:00, 29.95s/it]  
Loading checkpoint shards: 100%|██████████| 17/17 [13:41<00:00, 48.33s/it]
`torch_dtype` is deprecated! Use `dtype` inst

model loaded


Building prompt

In [6]:
def create_prompt_with_examples(acronym, text, options, k=5):
    system = """Tu es un modèle expert en expansion d'acronymes ferroviaires.
Ton rôle est d'identifier la ou les définitions correctes d'un acronyme dans un texte.
Réponds uniquement avec une liste Python d'indices, ex. [0] ou [1, 2] ou []. \nExemples:"""
    for idx in row['topk_example_indices']: 
        example = train_df.iloc[idx]
        system += f'\nTexte exemple : "{example["text"]}\nAcronyme: {example["acronym"]}"\nOptions: '
        for j, opt in enumerate(example['options'].keys()):
            system += f'\n{j}. : {opt}'
        system += f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    user = f'Texte : "{text}"\nAcronyme : {acronym}\n'
    for i, opt in enumerate(options):
        user += f"Option {i} : {opt}\n"
    user += "Réponds avec la liste des numéros corrects :"
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": user}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

Inference

In [7]:
inputs = []
for _, row in test_df.iterrows():
    prompt = create_prompt_with_examples(row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [23]:
import re

def extract_predicted_ids(outputs):
    predicted_ids = []
    for output in outputs:
        # Get the text after [/INST]
        text = output[0]["generated_text"].split("<|im_start|>assistant")[1] #"[/INST]"
        
        # Find all numeric patterns, including decimals
        numbers = re.findall(r'\d+(?:\.\d+)?', text)
        
        # Convert to ints safely, remove duplicates, and filter < 15
        ids = list(set(int(float(i)) for i in numbers if float(i) < 15))
        
        predicted_ids.append(ids)
    return predicted_ids

In [25]:
predicted_ids = extract_predicted_ids(outputs)
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/rag2.csv", index=False)

previous = pd.read_csv("./predictions/rag-mistral-small.csv")

merged = previous.merge(submission, on="id", suffixes=("_prev", "_new"))
merged['prediction_prev'] = merged['prediction_prev'].astype(str).str.strip()
merged['prediction_new'] = merged['prediction_new'].astype(str).str.strip()

similarity = (merged['prediction_prev'] == merged['prediction_new']).mean()
print(f"similarity: {similarity:.4f}")

similarity: 0.6513
