# Naive solution

The naive solution consists of simply querying a LLM for the results without using the training data.

For this we will use pytorch and try some diffrent models

In [1]:
import torch

print("GPUs available:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

GPUs available: 1
GPU 0: Tesla V100-PCIE-32GB


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             return_dict=True,
                                             quantization_config=quantization_config,
                                             torch_dtype=torch.float16,
                                             device_map="auto",
                                             offload_folder="./offload")
pipe = pipeline(tokenizer=tokenizer, 
                model=model, 
                task='text-generation', 
                torch_dtype=torch.float16, 
                device_map="auto")
pipe.tokenizer.pad_token_id = model.config.eos_token_id
pipe.model.config.use_cache = False


  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 3/3 [04:16<00:00, 85.62s/it]
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


In [13]:
prompt = [
    {"role": "system", "content": """You do as told no more."""},
    {"role": "user", "content": """Say hello world!"""}]
chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)

output = pipe(chat, do_sample=False, batch_size=4)
print(output)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': "<s>[INST] You do as told no more.\n\nSay hello world![/INST] Hello, world! I'm here to help you with your questions and tasks. How can I assist you today?"}]


In [3]:
 def create_prompt(system, acronym, text, options):
    question = f"Parmi les définitions ci-dessous, laquelle ou lesquelles correspondent à l'acronyme {acronym} dans le texte suivant : \"{text}\"\n"
    for i, option in enumerate(options):
        question += f'\nDéfinition {i} : {option}'
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": question}]

    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat


In [14]:
import pandas as pd

train_df = pd.read_json('./data/train_v2.jsonl', lines=True)
test_df = pd.read_json('./data/test_v4.jsonl', lines=True)
train_df.head()

Unnamed: 0,text,acronym,options
0,LRA limite de résistance des attelages PAR po...,PAR,"{'Plan d'action régularité': False, 'Poste d'a..."
1,Désigna -tion des PN,PN,"{'Passages à niveau : fichier des pn, recensem..."
2,prédéterminées de trains : _x0001_ les masses ...,EM,"{'EMERAINVILLE PONTAULT COMBAULT': False, 'Eng..."
3,/Commentaires N° AC B81500 thermique: compati...,AC,"{'ACcès': False, 'Agent d'aCcompagnement ': Fa..."
4,"kilomètres/heure (ex : 12 pour 120 km/h), _x00...",TIV,"{'THIVIERS': False, 'Trafic international voya..."


In [5]:
system = "Tu es un assistant spécialisé dans les QCM et expert du domaine ferroviaire. Lorsqu'on te pose une question, tu réponds strictement en listant le numéro ou les numéros des définitions correctes, sans explications, juste une liste de numéros."
# print(create_prompt(system, train_df.iloc[0]['acronym'], train_df.iloc[0]['text'], train_df.iloc[0]['options'].keys()))

## Naive solution

In [6]:
inputs = []
for index, row in train_df.iterrows():
    prompt = create_prompt(system, row['acronym'], row['text'], row['options'].keys())
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for ope

In [24]:
print(len(outputs))
print(len(train_df))

492
492


In [7]:
import re

def extract_predicted_ids(outputs):
    predicted_ids = []
    for output in outputs:
        # Get the text after [/INST]
        text = output[0]["generated_text"].split("[/INST]")[1]
        
        # Find all numeric patterns, including decimals
        numbers = re.findall(r'\d+(?:\.\d+)?', text)
        
        # Convert to ints safely, remove duplicates, and filter < 15
        ids = list(set(int(float(i)) for i in numbers if float(i) < 15))
        
        predicted_ids.append(ids)
    return predicted_ids

In [13]:
predicted_ids = extract_predicted_ids(outputs)
train_df['predictions'] = predicted_ids

train_df['answer'] = [[] for _ in range(len(train_df))]
for i in range(len(outputs)):
    train_df.at[i, 'answer'] = [j for j, val in enumerate(train_df.at[i, 'options'].values()) if val]

train_df['correct'] = train_df.apply(lambda row: set(row['predictions']) == set(row['answer']), axis=1)
accuracy = train_df['correct'].mean()
right = train_df['correct'].sum()
total = len(train_df['correct'])

print(f"Expected F1: {right*2/ (right + total)}")
print(f"Expected accuracy: {accuracy}")

Expected F1: 0.6483516483516484
Expected F1: 0.4796747967479675


In [15]:
inputs = []
for index, row in test_df.iterrows():
    prompt = create_prompt(system, row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs_test = pipe(inputs, do_sample=False, batch_size=4)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [17]:
predicted_ids = extract_predicted_ids(outputs_test)
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/naive.csv", index=False)

## Example fetching

In [34]:
def find_examples(acronym):
    examples = train_df[train_df['acronym'] == acronym].copy()
    return examples

In [50]:
# Sending examples to model in prompt
def create_prompt_with_examples(system, acronym, text, options):
    question = f"Parmi les définitions ci-dessous, laquelle ou lesquelles correspondent à l'acronyme {acronym} dans le texte suivant : \"{text}\"\n"
    for i, option in enumerate(options):
        question += f'\nDéfinition {i} : {option}'
    question += "\n\nVoici quelques exemples d'utilisation de cet acronyme:\n"
    examples = find_examples(acronym).head(5).copy()
    for idx, example in examples.iterrows():
        question += f'\nTexte exemple : "{example["text"]}"\nDéfinitions : '
        for j, opt in enumerate(example['options'].keys()):
            question += f'\nDéfinition {j} : {opt}'
        question += f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": question}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

print(create_prompt_with_examples(system, train_df.iloc[0]['acronym'], train_df.iloc[0]['text'], train_df.iloc[0]['options'].keys()))

<s>[INST] Tu es un assistant spécialisé dans les QCM et expert du domaine ferroviaire. Lorsqu'on te pose une question, tu réponds strictement en listant le numéro ou les numéros des définitions correctes, sans explications, juste une liste de numéros.

Parmi les définitions ci-dessous, laquelle ou lesquelles correspondent à l'acronyme PAR dans le texte suivant : "LRA  limite de résistance des attelages PAR poste d’aiguillage et de régulation PL pleine ligne PN passage à niveau RFN réseau ferré national "

Définition 0 : Plan d'action régularité
Définition 1 : Poste d'aiguillage et de régulation : assure les commandes des installations de signalisation et la gestion de la circulation des huit lignes à grande vitesse
Définition 2 : PONT DE L'ARCHE
Définition 3 : Plan d'action régional

Voici quelques exemples d'utilisation de cet acronyme:

Texte exemple : "LRA  limite de résistance des attelages PAR poste d’aiguillage et de régulation PL pleine ligne PN passage à niveau RFN réseau ferré

In [36]:
test_df = pd.read_json('./data/test_v4.jsonl', lines=True)
test_df.head()

Unnamed: 0,id,text,acronym,options
0,0,o V3 RCI o V5 RCI A101.2 Caractéristiques de ...,RCI,"[Régulateur de Circulation Interconnecté, Rele..."
1,1,Les présentes valeurs sont données à titre ind...,RC,"[Résistance de continuité, Recommandation publ..."
2,2,EAS Equipement agent seul. EF Entreprise ferro...,EAS,"[Équipement Agent Seul. Concrètement, pour qu'..."
3,3,pour l’application des prescriptions (exemples...,TGV,"[Technologie de Guidage à Vitesse élevée, Trai..."
4,4,"164,900165,2500 .8 3 3 5 0 166,6505 .3 9 1 4...",IR,"[mauvais découpage des mots en majuscule, Inci..."


In [53]:
inputs = []
for index, row in test_df.iterrows():
    prompt = create_prompt_with_examples(system, row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

print("done")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

done


In [56]:
predicted_ids = extract_predicted_ids(outputs)
print(predicted_ids)

[[1], [1], [0], [1, 2], [1], [2], [1], [1, 2], [1], [1], [1, 2], [3], [1, 3], [3], [3], [0, 1, 4], [1], [2], [3], [2], [1], [1], [1], [1, 10], [2], [1, 2, 3], [1], [0], [1], [3], [1, 3], [4], [0], [0, 1, 2, 3], [1], [2], [1], [1], [4], [1, 3], [1], [1], [2], [3], [3], [0], [1], [1], [0], [1], [1, 2], [1], [1, 2], [2], [1], [1, 3], [0, 1], [0, 2, 3], [1, 2], [0, 1, 3], [1], [2, 3], [0], [1, 3], [1], [0], [1, 3], [0, 1], [1, 3], [1], [3], [2, 3], [1], [1], [1], [1], [3, 5], [1], [1, 2], [1, 3, 4], [0], [1, 2], [1, 2], [1, 2, 3], [1, 2], [1], [2], [1, 2, 3], [0], [1, 2], [3], [2], [2, 6], [2], [2, 3], [1], [0], [1, 3], [2], [2, 5], [1, 10], [7], [1, 3], [1, 2], [3], [1, 2], [1], [0, 2], [2], [2, 3], [1], [3], [1, 2], [1, 3], [1, 3], [1, 3], [2], [0], [1, 2], [1], [2, 3], [2], [1], [1, 2], [3], [1], [2], [2, 3], [1], [3, 4], [1, 2], [0], [3], [0], [1, 2, 3], [0, 1], [1, 4], [1], [3], [1], [1], [0], [0, 2], [1, 2], [1], [1], [3], [1, 3], [1], [2], [3], [0, 3], [1, 3], [1], [2], [3], [1], [1

In [58]:
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/with_examples.csv", index=False)

In [63]:
# Sending examples to model in prompt
def create_prompt_with_examples_2(system, acronym, text, options):
    examples = find_examples(acronym).head(5).copy()
    question = ""
    for idx, example in examples.iterrows():
        question += f'\nTexte exemple : "{example["text"]}"\nDéfinitions : '
        for j, opt in enumerate(example['options'].keys()):
            question += f'\nDéfinition {j} : {opt}'
        question += f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    question += f"Parmi les définitions ci-dessous et suivant les examples au-dessus, laquelle ou lesquelles correspondent à l'acronyme {acronym} dans le texte suivant : \"{text}\"\n"
    for i, option in enumerate(options):
        question += f'\nDéfinition {i} : {option}'
    question += "\n\nVoici quelques exemples d'utilisation de cet acronyme:\n"
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": question}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

print(create_prompt_with_examples(system, train_df.iloc[0]['acronym'], train_df.iloc[0]['text'], train_df.iloc[0]['options'].keys()))

<s>[INST] Tu es un assistant spécialisé dans les QCM et expert du domaine ferroviaire. Lorsqu'on te pose une question, tu réponds strictement en listant le numéro ou les numéros des définitions correctes, sans explications, juste une liste de numéros.


Texte exemple : "LRA  limite de résistance des attelages PAR poste d’aiguillage et de régulation PL pleine ligne PN passage à niveau RFN réseau ferré national "
Définitions : 
Définition 0 : Plan d'action régularité
Définition 1 : Poste d'aiguillage et de régulation : assure les commandes des installations de signalisation et la gestion de la circulation des huit lignes à grande vitesse
Définition 2 : PONT DE L'ARCHE
Définition 3 : Plan d'action régional
Reponse correcte : [1]

Texte exemple : "LGV ligne à grande vitesse LRA  limite de résistance des attelages PAR poste d’aiguillage et de régulation PL pleine ligne PN passage à niveau "
Définitions : 
Définition 0 : Plan d'action régional
Définition 1 : Poste d'aiguillage et de régulati

In [64]:
inputs = []
for index, row in test_df.iterrows():
    prompt = create_prompt_with_examples_2(system, row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

print("done")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

done


In [65]:
predicted_ids = extract_predicted_ids(outputs)
print(predicted_ids)
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/with_examples2.csv", index=False)

In [69]:
# Improve system and user prompt. Examples in system, and better instructions

def create_prompt_with_examples_3(acronym, text, options):
    system = """Tu es un modèle expert en expansion d'acronymes ferroviaires.
Ton rôle est d'identifier la ou les définitions correctes d'un acronyme dans un texte.
Réponds uniquement avec une liste Python d'indices, ex. [0] ou [1, 2] ou []. \nExemples:"""
    
    examples = find_examples(acronym).head(5).copy()
    for idx, example in examples.iterrows():
        system += f'\nTexte exemple : "{example["text"]}"\nOptions: '
        for j, opt in enumerate(example['options'].keys()):
            system += f'\n{j}. : {opt}'
        system = f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    user = f'Texte : "{text}"\nAcronyme : {acronym}\n'
    for i, opt in enumerate(options):
        user += f"Option {i} : {opt}\n"
    user += "Réponds avec la liste des numéros corrects :"
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": user}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

In [70]:
inputs = []
for index, row in test_df.iterrows():
    prompt = create_prompt_with_examples_3(row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

print("done")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

done


In [84]:
predicted_ids = extract_predicted_ids(outputs)
print(predicted_ids)
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/with_examples3.csv", index=False)

[[1], [1, 3], [0], [1], [3], [2], [1], [1], [0, 1, 2, 5, 10], [1, 2], [1], [0], [1, 3], [3], [3], [1, 3], [1], [2], [3], [2], [3], [1], [1], [1], [0, 3], [2], [1, 2], [1], [1], [3], [1, 3], [9], [9, 3, 7], [0, 2], [1, 3], [0], [1], [1], [], [1, 3], [1], [0], [2], [3], [3], [1], [1], [1], [3], [1], [1], [3], [1], [1], [0, 2], [3], [0, 1], [3], [1, 2], [1, 2], [2], [2], [0], [1, 3], [1], [1, 2], [0, 1, 2, 6, 7, 8, 9, 13], [1], [1, 3], [0], [9], [3], [1], [2], [1, 2, 3], [1], [2, 5], [1], [1, 3], [0, 1, 4], [1, 2], [1], [2], [2], [1], [1], [2], [1, 2], [0], [1], [2], [2], [2, 6], [2], [2], [1], [0], [1, 2, 3], [2], [5], [0, 1], [9], [0], [1], [1], [2], [1], [1, 2], [2], [3], [1], [2], [1], [1, 3], [1], [1, 3], [2], [3], [1], [0], [2, 3], [2], [2], [1], [3], [1], [2], [2, 3], [9], [0, 3], [1], [1, 3, 4, 5, 8], [1, 2], [3], [2, 3], [0], [1, 4], [1], [0], [1], [1], [1], [0, 1], [1, 2], [1, 2], [0, 1], [3], [1, 3], [0], [2], [2], [3], [1], [9, 1, 7], [2], [1, 2], [0], [1, 2], [3], [1], [1], [

In [85]:
previous = pd.read_csv("./predictions/with_examples2.csv")
current = pd.read_csv("./predictions/with_examples3.csv")

count = 0
for index, row in previous.iterrows():
    if row['prediction'] == current['prediction'].iloc[index]:
        count += 1
print(f"similarity: {count/len(previous)}")

similarity: 0.4026974951830443


In [122]:
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer("intfloat/multilingual-e5-base")

def find_examples_2(acronym, text, k=5):
    # Filter same acronym
    subdf = train_df[train_df["acronym"] == acronym].copy()
    if len(subdf) == 0:
        return train_df.sample(k)  # fallback

    # Encode candidate contexts and test context
    text_emb = embedder.encode(text, convert_to_tensor=True)
    corpus_embs = embedder.encode(subdf["text"].tolist(), convert_to_tensor=True)

    # Compute cosine similarity
    sims = util.cos_sim(text_emb, corpus_embs)[0]
    subdf.loc[:, "similarity"] = sims.cpu().numpy()
    return subdf.sort_values("similarity", ascending=False).head(k)

from IPython.display import display

print(find_examples(test_df.iloc[2]["acronym"])[["acronym", "text"]])
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    print(find_examples_2(test_df.iloc[2]["acronym"], test_df.iloc[2]["text"], 3)[["acronym", "text", "similarity"]])

    acronym                                               text
36      EAS  COGC centre opérationnel de gestion des circul...
82      EAS  DBC détecteur boite chaude EAS équipement agen...
116     EAS  CLE consigne locale d'exploitation COGC centre...
229     EAS  AUTOR autorail BAPR block automatique à permis...
249     EAS  COGC centre opérationnel de gestion des circul...
261     EAS  TIV de type C. COGC centre opérationnel de ges...
307     EAS   DISPOSITIONS PARTICULIERES RELATIVES A L’AUT...
375     EAS  Dispositions particulières concernant les trai...
379     EAS  AGC automoteur grande capacité AUTOR autorail ...
392     EAS  CLE consigne locale d'exploitation COGC centre...
401     EAS  AUTOR autorail CLE consigne locale d'exploitat...
407     EAS  figurant pas dans la CLE propre à l'établissem...
410     EAS  Dans les établissements où il est fait mention...
419     EAS  _x0001_ Sartrouville : - C 370 pour les engins...
434     EAS  CCLR Chef circulation local régulateur cen

In [123]:
# Improve system and user prompt. Examples in system, and better instructions

def create_prompt_with_embedder_examples(acronym, text, options):
    system = """Tu es un modèle expert en expansion d'acronymes ferroviaires.
Ton rôle est d'identifier la ou les définitions correctes d'un acronyme dans un texte.
Réponds uniquement avec une liste Python d'indices, ex. [0] ou [1, 2] ou []. \nExemples:"""
    
    examples = find_examples2(acronym, text)
    for idx, example in examples.iterrows():
        system += f'\nTexte exemple : "{example["text"]}\nAcronyme: {example["acronym"]}"\nOptions: '
        for j, opt in enumerate(example['options'].keys()):
            system += f'\n{j}. : {opt}'
        system += f'\nReponse correcte : {[i for i, value in enumerate(example["options"].values()) if value]}\n'
    user = f'Texte : "{text}"\nAcronyme : {acronym}\n'
    for i, opt in enumerate(options):
        user += f"Option {i} : {opt}\n"
    user += "Réponds avec la liste des numéros corrects :"
    prompt = [
    {"role": "system", "content": system},
    {"role": "user", "content": user}]
    chat = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    return chat

create_prompt_with_embedder_examples(test_df.iloc[2]["acronym"], test_df.iloc[2]["text"], test_df.iloc[2]["options"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf["similarity"] = sims.cpu().numpy()


'<s>[INST] Tu es un modèle expert en expansion d\'acronymes ferroviaires.\nTon rôle est d\'identifier la ou les définitions correctes d\'un acronyme dans un texte.\nRéponds uniquement avec une liste Python d\'indices, ex. [0] ou [1, 2] ou []. \nExemples:\nTexte exemple : "DBC détecteur boite chaude EAS équipement agent seul EF entreprise ferroviaire EIC établissement infrastructure circulation du service chargé de la gestion du trafic et des circulations sur le RFN \nAcronyme: EAS"\nOptions: \n0. : ETALANS\n1. : Équipement Agent Seul. Concrètement, pour qu\'un train soit EAS, il faut que le matériel soit équipé : de moyens de surveillance lors d\'arrêts en gare, de la commande de fermeture des portes, de la veille automatique (VA), de la radio sol-train (RST), de la sonorisation (annonces voyageurs), de la communication par interphonie en cas de signal d\'alarme (SAI). Si toutes ces conditions sont réunies, le conducteur peut être le seul agent présent à bord du train.\nReponse correct

In [124]:
inputs = []
for index, row in test_df.iterrows():
    prompt = create_prompt_with_embedder_examples(row['acronym'], row['text'], row['options'])
    inputs.append(prompt)

outputs = pipe(inputs, do_sample=False, batch_size=4)

predicted_ids = extract_predicted_ids(outputs)
submission = pd.DataFrame({"id": test_df.index, "prediction":predicted_ids})
submission.head()
submission.to_csv("./predictions/with_embedder_examples.csv", index=False)

previous = pd.read_csv("./predictions/with_examples3.csv")

count = 0
for index, row in previous.iterrows():
    if row['prediction'] == submission['prediction'].iloc[index]:
        count += 1
print(f"similarity: {count/len(previous)}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf["similarity"] = sims.cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf["similarity"] = sims.cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf["similarity"] = sims.cpu().numpy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 34.19 MiB is free. Process 2686602 has 28.45 GiB memory in use. Process 4186628 has 3.24 GiB memory in use. Of the allocated memory 27.65 GiB is allocated by PyTorch, and 443.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)