# 0. Import

In [1]:
# 0.1 — Téléchargement NLTK (une seule fois)
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\regis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import os, json, itertools
from pathlib import Path
import fitz
from langdetect import detect

import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
import mlflow

from sentence_transformers import SentenceTransformer
from langchain_ollama import ChatOllama
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
# 0.3 — Vérification GPU PyTorch
assert torch.cuda.is_available(), "CUDA non disponible !"
print("✅ PyTorch CUDA OK :", torch.cuda.get_device_name(0))

✅ PyTorch CUDA OK : NVIDIA GeForce RTX 4060 Laptop GPU


# 1. configuration

In [4]:
BASE_DIR    = Path("data");     BASE_DIR.mkdir(exist_ok=True)
RAW_PDF_DIR = BASE_DIR / "raw_pdf"; RAW_PDF_DIR.mkdir(exist_ok=True)
LLM_MODEL   = "mistral"

- https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
- https://chatgpt.com/c/6877401e-7b98-8000-abd1-c8a22773f439

In [5]:
general_cfg = {
  "embedding_models": ["all-mpnet-base-v2",],
  "chunk_sizes":      [512,768,1024],
  "chunk_overlaps":   [128],
  "top_k_list":       [10,15,20],
  "similarities":     ["cosine"]
}

In [6]:
with open("config.json","w") as f: json.dump(general_cfg,f,indent=2)
print("✅ Global config written to config.json")

✅ Global config written to config.json


In [7]:
embedders = {m: SentenceTransformer(m,device="cuda")
             for m in general_cfg["embedding_models"]}



# 2. functions

In [8]:
def get_next_exp_dir(base:Path)->Path:
    idxs=[int(d.name.split("_")[1])for d in base.iterdir()
          if d.is_dir() and d.name.startswith("exp_")]
    nxt= max(idxs)+1 if idxs else 0
    p=base/f"exp_{nxt}"; p.mkdir(exist_ok=True)
    return p

In [9]:
def extract_text(p:Path)->str:
    doc=fitz.open(str(p))
    return "\n".join(pg.get_text() for pg in doc)

In [10]:
def chunk_text(text,cs,ov)->list[dict]:
    sents=sent_tokenize(text)
    chunks=[]; curr=[]; cnt=0
    for s in sents:
        curr.append(s); cnt+=len(s)
        if cnt>=cs:
            txt=" ".join(curr)
            chunks.append({"text":txt,"lang":detect(txt)})
            curr=curr[-ov:]; cnt=sum(len(x) for x in curr)
    if curr:
        txt=" ".join(curr)
        chunks.append({"text":txt,"lang":detect(txt)})
    return chunks

In [11]:
def build_torch_index(embs:np.ndarray,use_cosine:bool)->torch.Tensor:
    t=torch.from_numpy(embs).to("cuda")
    return F.normalize(t,2,1) if use_cosine else t

In [12]:
def eval_torch_lang(tensor_index:torch.Tensor,
                    q_embs:torch.Tensor,
                    langs:list[str],
                    top_k:int)->(float,float):
    recalls=[]; mrrs=[]
    for i,qv in enumerate(q_embs):
        lang=langs[i]
        mask = torch.tensor([l==lang for l in langs], device="cuda")
        inds = retrieve_topk_lang(tensor_index, qv, mask, top_k)
        rec = float(i in inds)
        if rec:
            rank=int((inds==i).nonzero()[0])+1; mrr=1.0/rank
        else:
            mrr=0.0
        recalls.append(rec); mrrs.append(mrr)
    return float(np.mean(recalls)), float(np.mean(mrrs))

In [13]:
def eval_torch_lang(tensor_index:torch.Tensor,
                    q_embs:torch.Tensor,
                    langs:list[str],
                    top_k:int)->(float,float):
    recalls=[]; mrrs=[]
    # q_embs: Tensor (n_q, dim) already on GPU
    for i,qv in enumerate(q_embs):
        lang=langs[i]
        mask = torch.tensor([l==lang for l in langs], device="cuda")
        inds = retrieve_topk_lang(tensor_index, qv, mask, top_k)
        rec = float(i in inds)
        if rec:
            rank=int((inds==i).nonzero()[0])+1; mrr=1.0/rank
        else: mrr=0.0
        recalls.append(rec); mrrs.append(mrr)
    return float(np.mean(recalls)), float(np.mean(mrrs))

# 3. experimentations

In [14]:
# %%
cfg = general_cfg
pdfs = list(RAW_PDF_DIR.glob("*.pdf"))
if not pdfs:
    raise FileNotFoundError("Aucun PDF trouvé dans data/raw_pdf")

In [None]:
class QAResponse(BaseModel):
    question: str = Field(description="A single clear question")
    answer: str = Field(description="The answer to that question")

# Setup the output parser
json_parser = PydanticOutputParser(pydantic_object=QAResponse)

# Create a cleaner prompt template
prompt_template = PromptTemplate(
    template=(
        "Based on the text below ({lang}), generate EXACTLY ONE question-and-answer pair.\n\n"
        "Text:\n{doc}\n\n"
        "Respond ONLY with a JSON object containing a question and answer extracted from the text.\n"
        "{format_instructions}\n"
    ),
    input_variables=["lang", "doc"],
    partial_variables={"format_instructions": json_parser.get_format_instructions()}
)

# Initialize the chain
llm = ChatOllama(model=LLM_MODEL)
qag = LLMChain(llm=llm, prompt=prompt_template, verbose=False)

In [None]:
for emb_model, cs, ov, tk, sim in itertools.product(
    cfg["embedding_models"],
    cfg["chunk_sizes"],
    cfg["chunk_overlaps"],
    cfg["top_k_list"],
    cfg["similarities"],
):
    exp_dir = get_next_exp_dir(BASE_DIR)
    print(f"\n▶ Exp={exp_dir.name} | model={emb_model} | chunk={cs}/{ov} | topk={tk} | sim={sim}")
    mlflow.start_run(run_name=exp_dir.name)
    mlflow.log_params({"model": emb_model, "cs": cs, "ov": ov, "top_k": tk, "sim": sim})

    # 3.1 Chunking
    flat = []
    for pdf in pdfs:
        text = extract_text(pdf)
        try:
            doc_lang = detect(text)
        except:
            doc_lang = "en"
        chunks = chunk_text(text, cs, ov)
        for idx, chunk in enumerate(chunks):
            flat.append({
                "doc": pdf.stem,
                "chunk_id": idx,
                "text": chunk["text"],
                "lang": doc_lang,
            })
    mlflow.log_metric("num_chunks", len(flat))

    # Sauvegarde des chunks
    (exp_dir / "chunks").mkdir(exist_ok=True)
    for item in flat:
        fname = f"{item['doc']}_chunk_{item['chunk_id']:04d}.txt"
        (exp_dir / "chunks" / fname).write_text(item["text"], encoding="utf-8")

    # 3.2 Embeddings & Index
    embedder = embedders[emb_model]
    texts = [item["text"] for item in flat]
    emb_t = embedder.encode(texts, batch_size=64, convert_to_tensor=True, device="cuda", show_progress_bar=True)
    arr = emb_t.cpu().numpy().astype("float32")
    tensor_index = build_torch_index(arr, sim == "cosine")

    # 3.3 Q/A Generation (JSON via Pydantic)
    df = pd.DataFrame(flat)

    questions, answers = [], []
    for i, row in df.iterrows():
        lang = "FR" if row["lang"].startswith("fr") else "EN"
        result = qag({"lang": lang, "doc": row["text"]})
        # Parse the output text to extract the JSON
        try:
            parsed_output = json_parser.parse(result["text"])
            questions.append(parsed_output.question)
            answers.append(parsed_output.answer)
        except Exception as e:
            print(f"Error parsing output for row {i}: {e}")
            questions.append("")
            answers.append("")
    df["question"] = questions
    df["reference_answer"] = answers

    csv_path = exp_dir / "questions.csv"
    df.to_csv(csv_path, index=False, encoding="utf-8")
    mlflow.log_artifact(str(csv_path))

    # 3.4 Retrieval evaluation
    mean_recall, mean_mrr = eval_torch_lang(tensor_index, emb_t, df['lang'].tolist(), tk)
    mlflow.log_metric(f"mean_recall@{tk}", mean_recall)
    mlflow.log_metric(f"mean_mrr@{tk}", mean_mrr)

    # 3.5 Fin
    config = {"model": emb_model, "cs": cs, "ov": ov, "top_k": tk, "sim": sim}
    with open(exp_dir / "config.json", "w") as cfg_file:
        json.dump(config, cfg_file, indent=2)
    mlflow.log_artifact(str(exp_dir / "config.json"))
    mlflow.end_run()  # termine l'exp
    print("✔ Expérience terminée avec succès.")


▶ Exp=exp_0 | model=all-mpnet-base-v2 | chunk=512/128 | topk=10 | sim=cosine


Batches:   0%|          | 0/166 [00:00<?, ?it/s]

  qag = LLMChain(llm=llm, prompt=prompt_template, output_parser=json_parser, verbose=False)
  qa_resp: QAResponse = qag({"lang": lang, "doc": row["text"]})


ValueError: Missing some input keys: {'\n  "question"'}

mlflow.end_run()