In [1]:
import pandas as pd

train_df = pd.read_json('./data/train_v2.jsonl', lines=True)
test_df = pd.read_json('./data/test_v4.jsonl', lines=True)

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

torch.cuda.empty_cache()

embedder = SentenceTransformer("intfloat/multilingual-e5-base", device="cpu")

train_embs = embedder.encode(
    train_df['text'].tolist(),
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

test_embs = embedder.encode(
    test_df['text'].tolist(),
    batch_size=32,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

  from .autonotebook import tqdm as notebook_tqdm
Batches:   0%|          | 0/16 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [3]:
k = 4
topk_indices = []

for q_idx, row in test_df.iterrows():
    acronym = row['acronym']
    # Filter same acronym in train_df
    subdf = train_df[train_df['acronym'] == acronym]
    if len(subdf) == 0:
    # fallback: random examples
        topk_indices.append(train_df.sample(k).index.to_list())
        continue

    subset_indices = subdf.index.to_list()
    subset_embs = train_embs[subset_indices]
    
    # Cosine similarity (dot product of normalized embeddings)
    sims = np.dot(subset_embs, test_embs[q_idx].reshape(-1,1)).squeeze()
    topk_idx = np.argsort(-sims)[:k]
    topk_indices.append([subset_indices[i] for i in topk_idx])

test_df['topk_example_indices'] = topk_indices

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import os

torch.cuda.empty_cache()

model_name = "Qwen/Qwen2.5-32B-Instruct" #"mistralai/Mistral-Small-24B-Instruct-2501" #"mistralai/Mistral-7B-Instruct-v0.3"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    offload_folder="./offload",
    torch_dtype=torch.float16,
)

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

pipe.tokenizer.pad_token_id = model.config.eos_token_id
pipe.model.config.use_cache = False

print("model loaded")


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 17/17 [13:34<00:00, 47.94s/it]
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


model loaded


In [5]:
def create_prompt_with_tot(acronym, text, options, examples):
    system = """Tu es un modèle expert en expansion d'acronymes ferroviaires.
Ton rôle est d'identifier la ou les définitions correctes d'un acronyme dans un texte. Analyse synthètiquement le text et les options. 
Après termine en ecrivant les indeces des accronymes corrects sous la forme d'une liste de python. \nExemples:"""

    for idx in examples: 
        example = train_df.iloc[idx]
        system += f'\nTexte exemple : "{example["text"]}\nAcronyme: {example["acronym"]}"\nOptions: '
        for j, opt in enumerate(example['options'].keys()):
            system += f'\n{j}. : {opt}'
        system += f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    user = f'Texte : "{text}"\nAcronyme : {acronym}\n'
    for i, opt in enumerate(options):
        user += f"Option {i} : {opt}\n"
    user += "Analyse chaqu'une des options, termine avec une réponse pour chaque option sous le format indiqué"
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": user}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

#create_prompt_with_tot(test.iloc[0]['acronym'], test.iloc[0]['text'], test.iloc[0]['options'].keys(), top_k_examples(5)[0])

In [6]:
import re
def extract_predicted_ids_tot(outputs):
    predicted_ids = []
    for output in outputs:
        # Get the text after [/INST]
        text = output[0]["generated_text"].split("<|im_start|>assistant")[1] #"[/INST]"
        
        ids_for_this_output = []
 
        bracket_contents = re.findall(r'\[(.*?)\]', text)
        
        for content in bracket_contents:
            # Find all numbers within each bracket content
            numbers = re.findall(r'\d+', content)
        
        # Convert to ints safely, remove duplicates, and filter < 15
        ids = list(set(int(float(i)) for i in numbers if float(i) < 15))
        
        predicted_ids.append(ids)
    return predicted_ids

In [7]:
def predict_with_tot(k=4):
    inputs = []
    for indexx, row in test_df.iterrows():
        prompt = create_prompt_with_tot(row['acronym'], row['text'], row['options'], row['topk_example_indices'])
        inputs.append(prompt)
    outputs = pipe(inputs, temperature=0, max_new_tokens=768, do_sample=False, batch_size=4)
    return outputs

In [8]:
outputs = predict_with_tot(4)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [9]:
predicted_ids = extract_predicted_ids_tot(outputs)

submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/rag-tot-qwen.csv", index=False)

previous = pd.read_csv("./predictions/rag-tot.csv")

merged = previous.merge(submission, on="id", suffixes=("_prev", "_new"))
merged['prediction_prev'] = merged['prediction_prev'].astype(str).str.strip()
merged['prediction_new'] = merged['prediction_new'].astype(str).str.strip()

similarity = (merged['prediction_prev'] == merged['prediction_new']).mean()
print(f"similarity: {similarity:.4f}")

similarity: 0.6609
