# Imports

!pip install pandas openai transformers

In [2]:
import pandas as pd
from openai import OpenAI

# DATA

In [17]:
# Load the raw queries from disk.
df = pd.read_csv("raw_queries.csv")
# Only the last expression of a cell is rendered automatically, so the preview must be
# displayed explicitly; a bare `df.head()` in the middle of the cell is silently discarded.
display(df.head())
# Number of rows per value of the "source" column.
df["source"].value_counts()

source
ms_marco             20000
natural_questions    19999
rag_12000             9590
Name: count, dtype: int64

# Models

In [16]:
from openai import OpenAI

# Initialise the Groq client (only needs to be done once); it is reached through the
# OpenAI client pointed at Groq's base URL.
# The API key is read from the GROQ_API_KEY environment variable instead of being
# hardcoded, so the secret never ends up in the notebook or in version control.
# Any key that was previously written in this cell must be treated as leaked and revoked.
import os

client = OpenAI(
    api_key=os.environ["GROQ_API_KEY"],  # raises KeyError if the variable is not set
    base_url="https://api.groq.com/openai/v1"
)

# Prompt template for the LLM rewriter; the {query} placeholder is filled in with str.format().
PROMPT_TEMPLATE = """
Your task is to take a raw search query and rewrite it to improve retrieval performance in a search engine or RAG system.

Follow these guidelines:
- Correct any grammar or spelling errors.
- Rephrase the query naturally and clearly.
-  Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it concise and relevant.

DO NOT SAY ANYTHING ELSE THAN THE QUERY

### Example
Input: danger screen sleep 
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: {query}
"""

def rewrite_query_llm(
    query: str,
    temperature: float = 0.5,
    model: str = "llama3-70b-8192",
    max_tokens: int = 128,
) -> str:
    """Rewrite a raw search query with a chat-completion model.

    The query is inserted into PROMPT_TEMPLATE and sent as a single user
    message through the module-level `client`.

    Args:
        query: Raw search query to rewrite.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the chat model to call. Defaults to the model that was
            previously hard-coded, so existing calls behave the same.
        max_tokens: Upper bound on generated tokens forwarded to the API.

    Returns:
        The stripped text of the first completion choice. If anything fails
        (API error, missing content, ...), a string of the form
        "[ERROR] <message>" is returned instead of raising, so callers that
        collect results in bulk must filter on that prefix.
    """
    prompt = PROMPT_TEMPLATE.format(query=query)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberate best-effort: one failing query must not abort a batch run.
        return f"[ERROR] {e}"

In [7]:
from transformers import pipeline
import nltk
import spacy
from nltk.corpus import wordnet
# NLTK resources fetched up front (wordnet is used by the synonym expansion below).
nltk.download('wordnet')
nltk.download('punkt')

# Load the small English spaCy model, downloading it only when it is not installed yet.
# An unconditional download hits the network on every run and asks for a kernel restart.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 1. Correction — properly configured model
corrector = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction")

# 2. Paraphrasing
paraphraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")

# 3. Expansion of the key terms with WordNet synonyms
def synonymize_key_terms(text, max_synonyms=2):
    """Append WordNet synonyms in parentheses after the key terms of `text`.

    A token is treated as a key term when spaCy tags it as NOUN, VERB or ADJ,
    it is purely alphabetic, and it is longer than 3 characters. Each key term
    with at least one synonym is rendered as ``term (syn1, syn2)``; every
    other token is copied unchanged.

    Args:
        text: Text to enrich.
        max_synonyms: Maximum number of synonyms appended per key term.
            Values <= 0 disable the expansion.

    Returns:
        The enriched text, with the original spacing between tokens preserved.
    """
    doc = nlp(text)
    pieces = []
    for token in doc:
        rendered = token.text
        is_key_term = (
            token.pos_ in {"NOUN", "VERB", "ADJ"}
            and token.is_alpha
            and len(token.text) > 3
        )
        if is_key_term and max_synonyms > 0:
            word = token.text.lower()
            # A dict is used as an ordered set: duplicates are dropped while the order
            # in which WordNet returns the lemmas is kept. Slicing a plain set made the
            # selected synonyms vary from one interpreter session to the next.
            candidates = {}
            for synset in wordnet.synsets(token.text):
                for lemma in synset.lemmas():
                    name = lemma.name()
                    if name.lower() != word:
                        candidates.setdefault(name.replace('_', ' '), None)
            chosen = list(candidates)[:max_synonyms]
            if chosen:
                rendered = f"{token.text} ({', '.join(chosen)})"
        # Re-attach the whitespace that followed the token in the input, so punctuation
        # and contractions are not split off by an artificial space.
        pieces.append(rendered + token.whitespace_)
    return "".join(pieces)

# Full pipeline
def rewrite_pipeline(query, verbose=False):
    """Chain correction, paraphrasing and synonym expansion on `query`.

    Args:
        query: Raw query text.
        verbose: When true, print the query and the output of each stage.

    Returns:
        The output of the last stage (synonym-enriched text).
    """
    generation_args = {"max_length": 64, "do_sample": False}

    fixed = corrector(query, **generation_args)[0]["generated_text"]
    paraphrased = paraphraser(fixed, **generation_args)[0]["generated_text"]
    expanded = synonymize_key_terms(paraphrased)

    if not verbose:
        return expanded

    # One line per stage, emitted in pipeline order.
    trace = (
        f"\nRaw        : {query}",
        f"Corrected  : {fixed}",
        f"Explicit   : {paraphrased}",
        f"Enriched   : {expanded}",
    )
    print(*trace, sep="\n")
    return expanded


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Device set to use cpu
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Load the Flan-T5 model and its tokenizer from the same checkpoint name
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# Prompt template for the Flan-T5 rewriter; the {query} placeholder is filled in with str.format().
FLAN_PROMPT_TEMPLATE = """
Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: {query}
"""

# Build the text2text-generation pipeline from the `model` and `tokenizer` objects
flan_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Rewriting function
def rewrite_with_flan(query: str, temperature=0.5, max_tokens=100) -> str:
    """Rewrite a raw search query with the Flan-T5 pipeline.

    Args:
        query: Raw search query, inserted into FLAN_PROMPT_TEMPLATE.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding instead of sampling.
        max_tokens: Forwarded to the pipeline as `max_length`.

    Returns:
        The generated text of the first result, stripped of surrounding whitespace.
    """
    prompt = FLAN_PROMPT_TEMPLATE.format(query=query)

    # Sampling needs a strictly positive temperature; a non-positive value is rejected
    # by the generation code, so it is mapped to deterministic (greedy) decoding.
    use_greedy = temperature is not None and temperature <= 0
    if use_greedy:
        generation_args = {"do_sample": False}
    else:
        generation_args = {"do_sample": True, "temperature": temperature}

    result = flan_pipe(
        prompt,
        max_length=max_tokens,
        **generation_args
    )[0]["generated_text"]
    return result.strip()

Device set to use cpu


# Comparative tests

In [17]:
# Draw 30 rows at random; the fixed random_state makes the draw repeatable.
sample = df.sample(30, random_state=123)

In [19]:
# Extract the sampled query texts. Despite the "_list" suffix, `.values` yields the
# column's underlying array (typically a NumPy ndarray), not a Python list.
sample_queries_list = sample['query'].values
print(sample_queries_list)

['what do chromosomes look like in bacteria'
 'what are chromosomes kid definition'
 'who coined the term aerobics and in what year'
 "What is the proposed change to the parental leave payments scheme by the Coalition's small business spokesman, Mr Billson?"
 "What is Sean Kane's educational background?"
 "What is the concern of the Savannah Tree Foundation regarding the new law school's parking lot?"
 'name the animal from which we get wool' 'what is keg coupler'
 'What are the functions of the KitchenAid Mixer attachment pack?'
 'where is uc santa barbara located'
 'what is mitochondrial disease symptoms' 'liable definition law'
 'what language do people from iceland speak'
 'What is the author struggling with?'
 'wii fit u vs wii fit plus difference'
 'who sang if you like pina coladas lyrics'
 'who are the freedom riders and what are they trying to accomplish'
 'best method to cook a sirloin tip steak'
 'What is the legal status of prostitution in Hamilton, Canada?'
 'What is the p

In [11]:
%%time
# For every sampled query, print the query, the exact prompt sent to Flan-T5 and the rewrite.
separator = '-' * 50
for raw_query in sample_queries_list:
    filled_prompt = FLAN_PROMPT_TEMPLATE.format(query=raw_query)
    print(raw_query)
    print(filled_prompt)
    print(rewrite_with_flan(raw_query))
    print(separator)

yahya name meaning

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: yahya name meaning



Output: The meaning of the name yahya is:
--------------------------------------------------
where does the electron transport chain occur in prokaryotes and eukaryotes

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearly
- Add synonyms or related terms only to the most informative words (rare, domain-specific) in parentheses
- Keep it short and useful

### Example
Input: danger screen sleep
Output: What are the risks (hazards, dangers) of using screens (monitors, displays) before sleep (slumber, bedtime)?

Now rewrite this query:
Input: where does the electron transport chain occur in prokaryotes and eukaryotes

Where does the electron transport chain occur in prokaryotes and eukaryotes?
--------------------------------------------------
what happened to the house in amityville horror

Rewrite the following query to improve information retrieval.
- Correct grammar or spelling
- Rephrase it naturally and clearl

In [20]:
%%time
# Rewrite two of the sampled queries (positions 18 and 19) with the hosted LLM.
divider = "-" * 50
for raw_query in sample_queries_list[18:20]:
    print(divider)
    print(rewrite_query_llm(raw_query))

--------------------------------------------------
What is the legal framework (regulations, laws) surrounding prostitution in Hamilton, Ontario, Canada?
--------------------------------------------------
What is the objective (goal, aim) of the 2015 report compiled by the Pakistan Federal Union of Journalists (PFUJ)?
CPU times: user 28.2 ms, sys: 12.4 ms, total: 40.6 ms
Wall time: 703 ms


In [13]:
# (rows, columns) of the frame currently bound to `df`.
df.shape

(52823, 2)

# Tests on refined_queries

In [69]:
# Load the refined queries and reload the raw queries from disk.
# Note that this rebinds `df`, which earlier cells also assign.
refined_queries_df = pd.read_csv("refined_queries.csv")
df = pd.read_csv("raw_queries.csv")

In [70]:
# Trim leading/trailing whitespace from the "query" column of both frames.
for frame in (df, refined_queries_df):
    frame["query"] = frame["query"].str.strip()

In [72]:
# Left join: keep every row of refined_queries_df and attach the columns of df whose
# "query" text matches exactly.
# NOTE(review): if "query" is not unique in df, the join repeats refined rows — confirm uniqueness.
paired_queries_df = pd.merge(refined_queries_df, df, on="query", how="left")

# Tests on paired_queries

In [None]:
# Load the paired queries from disk; this rebinds paired_queries_df.
# NOTE(review): no visible cell writes paired_queries.csv — confirm where this file comes from.
paired_queries_df = pd.read_csv("paired_queries.csv")


In [79]:
# (rows, columns) of the paired-queries frame.
paired_queries_df.shape

(9973, 4)

In [80]:
# Positional split: the first N_TRAIN rows go to train, the remaining rows to test.
# .iloc is used because .loc slices by label and includes BOTH endpoints, which put
# row 9800 in the train set and the test set at the same time (train/test leakage).
N_TRAIN = 9800
paired_queries_df_train = paired_queries_df.iloc[:N_TRAIN]
paired_queries_df_test = paired_queries_df.iloc[N_TRAIN:]

In [85]:
# Write the training split to CSV without the index column.
paired_queries_df_train.to_csv("paired_queries_train.csv", index=False)

In [84]:
# Write the test split to CSV without the index column.
paired_queries_df_test.to_csv("paired_queries_test.csv", index=False)