# Import 

!pip install pandas openai transformers

In [2]:
import pandas as pd
from openai import OpenAI

# DATA

In [17]:
# Load the raw queries and show how many rows come from each source.
df = pd.read_csv("raw_queries.csv")
# NOTE(review): only the last expression of a cell is displayed, so this head()
# result is computed and discarded.
df.head()
df["source"].value_counts()

source
ms_marco             20000
natural_questions    19999
rag_12000             9590
Name: count, dtype: int64

# Models

In [16]:
import os

from openai import OpenAI

# Initialise the Groq client once; it is reached through the OpenAI-compatible
# endpoint given in `base_url`.
# The API key is read from the GROQ_API_KEY environment variable so that no secret is
# stored in the notebook. The key that used to be hard-coded here must be considered
# leaked and should be revoked.
client = OpenAI(
    api_key=os.environ["GROQ_API_KEY"],
    base_url="https://api.groq.com/openai/v1"
)

# Prompt template; {query} is the placeholder filled in with str.format
PROMPT_TEMPLATE = """
Your task is to take a raw search query and rewrite it to improve retrieval performance in a search engine or RAG system.

Follow these guidelines:
- Correct any grammar or spelling errors.
- Rephrase the query naturally and clearly.
-  Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it concise and relevant.

DO NOT SAY ANYTHING ELSE THAN THE QUERY

### Example
Input: danger screen sleep 
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: {query}
"""

def rewrite_query_llm(
    query: str,
    temperature: float = 0.5,
    model: str = "llama3-70b-8192",
    max_tokens: int = 128,
) -> str:
    """Rewrite a raw search query with the chat model reachable through ``client``.

    Args:
        query: Raw query; it is substituted into ``PROMPT_TEMPLATE``.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model identifier (defaults to the previously hard-coded model).
        max_tokens: Upper bound on generated tokens (defaults to the previous 128).

    Returns:
        The stripped completion text, ``""`` when the API returns no text, or a
        string of the form ``"[ERROR] <exception>"`` when the call fails.
    """
    prompt = PROMPT_TEMPLATE.format(query=query)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        # `content` can be None; treat that as an empty rewrite rather than letting
        # `.strip()` fail and be reported as an API error.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        # Best-effort behaviour kept on purpose: a failing call must not abort a
        # batch loop. Callers should filter out results starting with "[ERROR]".
        return f"[ERROR] {e}"

In [7]:
from transformers import pipeline
import nltk
import spacy
from nltk.corpus import wordnet
# NLTK resources.
nltk.download('wordnet')
nltk.download('punkt')

# Load the spaCy model, downloading it only when it is not installed yet, so that
# re-running the cell does not reinstall the package every time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:  # raised by spacy.load when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 1. Correction — properly configured model
corrector = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")

# 2. Rephrasing
paraphraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")

# 3. WordNet-based expansion of the key terms
def synonymize_key_terms(text, max_synonyms=2):
    """Append WordNet synonyms, in parentheses, after each key term of ``text``.

    A token counts as a key term when spaCy tags it NOUN, VERB or ADJ, it is purely
    alphabetic and it is longer than three characters. Every other token is copied
    unchanged.

    Synonyms are taken in the order WordNet returns them, de-duplicated, so the
    result is the same on every run. (Slicing a ``set`` made the choice depend on
    hash ordering.)

    Args:
        text: Text to enrich.
        max_synonyms: Maximum number of synonyms appended per key term. With 0 the
            token is left as is.

    Returns:
        The tokens, enriched where applicable, joined by single spaces.
    """
    doc = nlp(text)
    final_tokens = []
    for token in doc:
        is_key_term = (
            token.pos_ in {"NOUN", "VERB", "ADJ"}
            and token.is_alpha
            and len(token.text) > 3
        )
        if not is_key_term:
            final_tokens.append(token.text)
            continue

        word = token.text.lower()
        # A dict keeps insertion order, giving an ordered, duplicate-free collection.
        candidates = {}
        for synset in wordnet.synsets(token.text):
            for lemma in synset.lemmas():
                if lemma.name().lower() != word:
                    candidates.setdefault(lemma.name().replace('_', ' '), None)

        synonyms = list(candidates)[:max_synonyms]
        if synonyms:
            final_tokens.append(f"{token.text} ({', '.join(synonyms)})")
        else:
            final_tokens.append(token.text)
    return " ".join(final_tokens)

# Full pipeline
def rewrite_pipeline(query, verbose=False):
    """Run correction, paraphrasing and synonym expansion on ``query``.

    The query goes through ``corrector``, then ``paraphraser`` (both called with
    ``max_length=64`` and ``do_sample=False``), and the paraphrase is finally passed
    to ``synonymize_key_terms``.

    Args:
        query: Raw query string.
        verbose: When true, print every intermediate stage.

    Returns:
        The enriched query produced by the last stage.
    """
    generation_kwargs = {"max_length": 64, "do_sample": False}

    corrected = corrector(query, **generation_kwargs)[0]["generated_text"]
    explicit = paraphraser(corrected, **generation_kwargs)[0]["generated_text"]
    enriched = synonymize_key_terms(explicit)

    if not verbose:
        return enriched

    print(f"\nRaw        : {query}")
    print(f"Corrected  : {corrected}")
    print(f"Explicit   : {explicit}")
    print(f"Enriched   : {enriched}")
    return enriched


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Device set to use cpu
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the Flan-T5 model and its tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Instruction prompt for Flan-T5; {query} is filled in with str.format.
FLAN_PROMPT_TEMPLATE = """
Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: {query}
"""

# Build the text2text generation pipeline from the model and tokenizer loaded above
flan_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Rewriting function
def rewrite_with_flan(query: str, temperature=0.5, max_tokens=100) -> str:
    """Rewrite ``query`` with the Flan-T5 pipeline ``flan_pipe``.

    Args:
        query: Raw query, substituted into ``FLAN_PROMPT_TEMPLATE``.
        temperature: Sampling temperature. A value that is ``None``, zero or
            negative switches to greedy decoding, because sampling requires a
            strictly positive temperature.
        max_tokens: Forwarded to the pipeline as ``max_length``.

    Returns:
        The generated text with surrounding whitespace removed.
    """
    prompt = FLAN_PROMPT_TEMPLATE.format(query=query)

    generation_kwargs = {"max_length": max_tokens}
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        # Deterministic fallback instead of an error from the generation config.
        generation_kwargs["do_sample"] = False

    result = flan_pipe(prompt, **generation_kwargs)[0]["generated_text"]
    return result.strip()

Device set to use cpu


# Tests comparatifs

In [17]:
# Draw 30 random rows; the fixed random_state makes the draw repeatable.
sample = df.sample(30, random_state=123)

In [19]:
# Query strings of the sampled rows. `.values` yields a NumPy array, despite the
# `_list` suffix in the name.
sample_queries_list = sample['query'].values
print(sample_queries_list)

['what do chromosomes look like in bacteria'
 'what are chromosomes kid definition'
 'who coined the term aerobics and in what year'
 "What is the proposed change to the parental leave payments scheme by the Coalition's small business spokesman, Mr Billson?"
 "What is Sean Kane's educational background?"
 "What is the concern of the Savannah Tree Foundation regarding the new law school's parking lot?"
 'name the animal from which we get wool' 'what is keg coupler'
 'What are the functions of the KitchenAid Mixer attachment pack?'
 'where is uc santa barbara located'
 'what is mitochondrial disease symptoms' 'liable definition law'
 'what language do people from iceland speak'
 'What is the author struggling with?'
 'wii fit u vs wii fit plus difference'
 'who sang if you like pina coladas lyrics'
 'who are the freedom riders and what are they trying to accomplish'
 'best method to cook a sirloin tip steak'
 'What is the legal status of prostitution in Hamilton, Canada?'
 'What is the p

In [11]:
%%time
# For every sampled query, print the query, the full prompt built for Flan-T5 and the
# rewrite returned by the model, followed by a separator line.
for q in sample_queries_list:
    print(q)
    print(FLAN_PROMPT_TEMPLATE.format(query=q))
    print(rewrite_with_flan(q))
    print('-'*50)

yahya name meaning

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: yahya name meaning



Output: The meaning of the name yahya is:
--------------------------------------------------
where does the electron transport chain occur in prokaryotes and eukaryotes

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: where does the electron transport chain occur in prokaryotes and eukaryotes

Where does the electron transport chain occur in prokaryotes and eukaryotes?
--------------------------------------------------
what happened to the house in amityville horror

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearl

In [20]:
%%time
for q in sample_queries_list[18:20]:
    print("-"*50)
    print(rewrite_query_llm(q,))

--------------------------------------------------
What is the legal framework (regulations, laws) surrounding prostitution in Hamilton, Ontario, Canada?
--------------------------------------------------
What is the objective (goal, aim) of the 2015 report compiled by the Pakistan Federal Union of Journalists (PFUJ)?
CPU times: user 28.2 ms, sys: 12.4 ms, total: 40.6 ms
Wall time: 703 ms


In [13]:
# Display the (rows, columns) shape of df.
df.shape

(52823, 2)

# Tests on refined_queries

In [69]:
# Read the refined queries and the raw queries from disk.
# NOTE(review): this rebinds `df`, replacing whatever frame earlier cells left in it.
refined_queries_df = pd.read_csv("refined_queries.csv")
df = pd.read_csv("raw_queries.csv")

In [70]:
# Strip leading/trailing whitespace from the `query` column of both frames.
# The columns are overwritten in place, so re-running this cell is harmless.
df["query"] = df["query"].str.strip()
refined_queries_df['query'] = refined_queries_df['query'].str.strip()

In [72]:
# Left join on `query`: every row of refined_queries_df is kept and the matching
# columns of df are attached.
# NOTE(review): duplicate `query` values in df would multiply rows — confirm the key
# is unique before relying on the row count.
paired_queries_df = pd.merge(refined_queries_df, df, on="query", how="left")

# Tests on paired_queries

In [None]:
# Load the paired queries from disk.
# NOTE(review): this rebinds `paired_queries_df`, discarding any frame previously
# held under that name (e.g. the result of the merge cell).
paired_queries_df = pd.read_csv("paired_queries.csv")


In [79]:
# Display the (rows, columns) shape of the paired queries.
paired_queries_df.shape

(9973, 4)

In [80]:
# Positional train/test split: the first N_TRAIN rows go to train, the rest to test.
# `.iloc` excludes the stop position, so the two parts are disjoint. The previous
# `.loc[:9800]` / `.loc[9800:]` slices are label-based and include both endpoints,
# which put row 9800 in BOTH sets (train/test leakage).
N_TRAIN = 9800
paired_queries_df_train = paired_queries_df.iloc[:N_TRAIN]
paired_queries_df_test = paired_queries_df.iloc[N_TRAIN:]

In [85]:
# Write the training split to CSV without the index column.
paired_queries_df_train.to_csv("paired_queries_train.csv", index=False)

In [84]:
# Write the test split to CSV without the index column.
paired_queries_df_test.to_csv("paired_queries_test.csv", index=False)

# Training

In [1]:
import subprocess

# Install the training dependencies. check=True raises CalledProcessError when pip
# exits with a non-zero status, instead of letting the notebook carry on with a
# broken environment.
# NOTE(review): "pip" is resolved through PATH and may belong to a different
# environment than the kernel — confirm.
subprocess.run(
    ["pip", "install", "transformers", "scikit-learn", "openai", "accelerate", "pandas", "wandb", "datasets", "peft"],
    check=True,
)

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting openai
  Downloading openai-1.75.0-py3-none-any.whl.metadata (25 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting wandb
  Downloading wandb-0.19.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp31

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2025.3.2 requires fsspec==2025.3.2.*, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0m

Successfully installed accelerate-1.6.0 annotated-types-0.7.0 anyio-4.9.0 click-8.1.8 datasets-3.5.0 dill-0.3.8 docker-pycreds-0.4.0 fsspec-2024.12.0 gitdb-4.0.12 gitpython-3.1.44 h11-0.14.0 httpcore-1.0.8 httpx-0.28.1 huggingface-hub-0.30.2 jiter-0.9.0 joblib-1.4.2 multiprocess-0.70.16 openai-1.75.0 pandas-2.2.3 peft-0.15.2 protobuf-5.29.4 pydantic-2.11.3 pydantic-core-2.33.1 pytz-2025.2 pyyaml-6.0.2 regex-2024.11.6 safetensors-0.5.3 scikit-learn-1.6.1 scipy-1.15.2 sentry-sdk-2.26.1 setproctitle-1.3.5 smmap-5.0.2 sniffio-1.3.1 threadpoolctl-3.6.0 tokenizers-0.21.1 transformers-4.51.3 typing-inspection-0.4.0 tzdata-2025.2 wandb-0.19.9 xxhash-3.5.0


CompletedProcess(args=['pip', 'install', 'transformers', 'scikit-learn', 'openai', 'accelerate', 'pandas', 'wandb', 'datasets', 'peft'], returncode=0)

In [2]:
import os
import subprocess

# Read the W&B API key from the WANDB_API_KEY environment variable instead of
# hard-coding it. The key that used to be written here must be treated as compromised
# and revoked.
wandb_api_key = os.environ["WANDB_API_KEY"]

# Bind the result rather than echoing it: the CompletedProcess repr contains the full
# argument list, so displaying it would print the key into the cell output.
login_result = subprocess.run(["wandb", "login", wandb_api_key], check=True)
login_result.returncode

wandb: No netrc file found, creating one.
wandb: Appending key for api.wandb.ai to your netrc file: /home/onyxia/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


CompletedProcess(args=['wandb', 'login', '<REDACTED>'], returncode=0)

In [None]:
import subprocess

# Launch the fine-tuning script as a child process. check=True raises
# CalledProcessError when the script exits with a non-zero status, so a failed
# training run is not mistaken for a successful one.
subprocess.run(["python", "finetunning_T5.py"], check=True)

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: vincent-gimenes to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.19.9
wandb: Run data is saved locally in /home/onyxia/work/NLP_ENSAE/wandb/run-20250420_143022-4udepec2
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run flan-t5-small-3epochs-gpu-2
wandb: ⭐️ View project at https://wandb.ai/NLP_ENSAE/finetunning%20T5
wandb: 🚀 View run at https://wandb.ai/NLP_ENSAE/finetunning%20T5/runs/4udepec2


# Tests post training

In [2]:
import pandas as pd
# Load the test split of the paired queries.
# NOTE(review): the split is written to "paired_queries_test.csv" in the working
# directory but read from "data_folder/" here — confirm the file was moved there.
df_test = pd.read_csv('data_folder/paired_queries_test.csv')

In [3]:
# First 20 entries of the `noisy_query` column, as a NumPy array; these are the inputs
# fed to each fine-tuned model in the cells that follow.
queries = df_test["noisy_query"].values[:20]

## T5 Small

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./flan-t5-small-rewriting"  # or base/large, depending on which one was trained
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Prefer the GPU but fall back to the CPU, so the cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Rewrite each test query with the currently loaded model and print input and output.
for query in queries:
    input_text = "Rewrite, add synonyms (in parentheses), but avoid repeating any word.: " + query

    # Tokenise and move the tensors to whichever device the model lives on
    # (instead of assuming "cuda").
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=2.5)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("🔎 Input query :", query)
    print("✍️ Rewritten   :", rewritten)

🔎 Input query : Who directed the movie "The Haunting" in 1999?
✍️ Rewritten   : Who directed the movie "The Haunting" in 1999?
🔎 Input query : who has the wost rcord in baseball history
✍️ Rewritten   : Who has the longest (lowest, highest) record in baseball history (history, history)?
🔎 Input query : when did the us become allies with britain
✍️ Rewritten   : When did the United States become allies (allegiances, alliances) with Britain (Britain, British Empire)?
🔎 Input query : are marigolds annuals or perennials
✍️ Rewritten   : Are marigolds annual (annual, perennial) or perennial (seasonal, seasonal)?
🔎 Input query : when sid wonder woman staet usng ashielg
✍️ Rewritten   : When the Wonder Woman (woman, woman) is portrayed as an actress (singer, character) in her novel "The Stranger"?
🔎 Input query : where did the mourning dove get its name
✍️ Rewritten   : Where did the mourning (serial, apparition) originate (get its name, establish)?
🔎 Input query : wht is the city mseu in sr 

## T5 base

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./flan-t5-base-rewriting"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Prefer the GPU but fall back to the CPU, so the cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [33]:
# Rewrite each test query with the currently loaded model and print input and output.
# NOTE(review): sampling is enabled and no seed is set, so outputs differ between runs.
for query in queries:
    input_text = "Rewrite the query using synonyms (in parentheses). Do not repeat any word more than once.:" + query

    # Tokenise and move the tensors to whichever device the model lives on
    # (instead of assuming "cuda").
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=2.5, temperature=0.8, do_sample=True)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("🔎 Input query :", query)
    print("✍️ Rewritten   :", rewritten)

🔎 Input query : Who directed the movie "The Haunting" in 1999?
✍️ Rewritten   : Who directed (co-directed, penned) the 1999 film "The Haunting"?
🔎 Input query : who has the wost rcord in baseball history
✍️ Rewritten   : Who currently holds the record for the most wins (records, wins) in baseball history?
🔎 Input query : when did the us become allies with britain
✍️ Rewritten   : When did the United States become allies (allies, alliances) with Britain?
🔎 Input query : are marigolds annuals or perennials
✍️ Rewritten   : Are marigolds perennials or annuals?
🔎 Input query : when sid wonder woman staet usng ashielg
✍️ Rewritten   : When did Wonder Woman (wonder, character) awaken from a dream (imagination, vision)?
🔎 Input query : where did the mourning dove get its name
✍️ Rewritten   : What is the origin (source, genesis) of the mourning dove's name (name, ancestry)?
🔎 Input query : wht is the city mseu in sr lus
✍️ Rewritten   : What is the city (town, town) in South Luise, Spain (Spa

## T5 Large (lora 1)

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./flan-t5-large-rewriting-lora"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Prefer the GPU but fall back to the CPU, so the cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Rewrite each test query with the currently loaded model and print input and output.
for query in queries:
    input_text = "Improve the query: " + query

    # Tokenise and move the tensors to whichever device the model lives on
    # (instead of assuming "cuda").
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.2, no_repeat_ngram_size=2)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("🔎 Input query :", query)
    print("✍️ Rewritten   :", rewritten)

🔎 Input query : ho vouces dale on kinf of the hill
✍️ Rewritten   : What is the name of the valley (dale, valley) on the summit of Mount Kilimanjaro?
🔎 Input query : ha features ewre availalt on B-R for th e0211 ppeing Dauz
✍️ Rewritten   : What are the features (features, amenities) available on B-R for the E0211 eing Dauz?
🔎 Input query : wha is the color of horseshoe crab blood
✍️ Rewritten   : What is the color (tone, hue) of horseshoe crab blood (blood, blood)?
🔎 Input query : did king david ever existed
✍️ Rewritten   : Did King David (the King of Israel) ever exist (exist, exist)?
🔎 Input query : What is the primary purpose of any corporation according to economist Roger Martin?
✍️ Rewritten   : What is the primary objective (purpose, goal) of any corporation (organization, business) according to economist Roger Martin?
🔎 Input query : what is teacjing english as a second language
✍️ Rewritten   : What is the definition (meaning, meaning) of teaching English as a second language

## T5 Large (lora 2)

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned model
model_path = "./flan-t5-large-rewriting-lora-2"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Prefer the GPU but fall back to the CPU, so the cell also runs without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [None]:
# Rewrite each test query with the currently loaded model and print input and output.
for query in queries:
    input_text = "Improve the query: " + query

    # Tokenise and move the tensors to whichever device the model lives on
    # (instead of assuming "cuda").
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate
    outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.2, no_repeat_ngram_size=2)
    rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("🔎 Input query :", query)
    print("✍️ Rewritten   :", rewritten)

🔎 Input query : Who directed the movie "The Haunting" in 1999?
✍️ Rewritten   : Who was the director (writer, producer) of the 1999 film (movie, picture) "The Haunting"?
🔎 Input query : who has the wost rcord in baseball history
✍️ Rewritten   : Who holds the world record (most wins, victories) in baseball history (history, history)?
🔎 Input query : when did the us become allies with britain
✍️ Rewritten   : When did the United States become a ally (partner, partner) with Britain (UK, Great Britain)?
🔎 Input query : are marigolds annuals or perennials
✍️ Rewritten   : Are marigolds annuals or perennials (symbiotic, perennial)?
🔎 Input query : when sid wonder woman staet usng ashielg
✍️ Rewritten   : When did Wonder Woman first appear (appear, appear) in the Marvel Cinematic Universe (MCA)?
🔎 Input query : where did the mourning dove get its name
✍️ Rewritten   : What is the origin (source, derivation) of the name of the mourning dove (bird, bird)?
🔎 Input query : wht is the city mseu i