In [1]:
!pip install pymupdf
!pip install --upgrade transformers datasets accelerate bitsandbytes peft trl sentence-transformers
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7
Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.18.1-py3-none-any.whl.metadata (14 kB)
Collecting trl
  Downloading trl-0.27.1-py3-none-any.whl.metadata (11 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.2.2-py3-n

In [2]:
import fitz
import os
pdf_folder = '/kaggle/input/forensic/'
pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text("text") + "\n\n"
    doc.close()
    return full_text.strip()
all_text = ""
for pdf in pdf_files:
    text = extract_text_from_pdf(pdf)
    all_text += f"\n\n-- Document: {os.path.basename(pdf)} -- \n\n" + text

with open("corpus_forensic.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

print(f"Texte total extrait {len(all_text)}")

Texte total extrait 1151740


In [3]:
def chunk_text(text, max_length=1200):
    chunks = []
    current = ""
    for line in text.split("\n"):
        if len(current) + len(line) > max_length:
            chunks.append(current.strip())
            current = line
        else:
            current += " " + line
    if current:
        chunks.append(current.strip())
    return chunks

chunks = chunk_text(all_text)
print(f"{len(chunks)} chunks créés")

981 chunks créés


In [None]:
def extract_json_from_response(response):
    start = response.find('[')
    if start == -1:
        print("Aucun tableau JSON trouvé")
        return []
    
    candidate = response[start:]
    
    brace_count = 0
    fixed = ""
    for char in candidate:
        fixed += char
        if char == '{':
            brace_count += 1
        elif char == '}':
            brace_count -= 1
            if brace_count == 0 and fixed.endswith('}'):
                if not fixed.strip().endswith(']'):
                    fixed = fixed.rstrip() + ','
    
    if not fixed.strip().endswith(']'):
        fixed = fixed.rstrip(',') + ']'
    
    fixed = re.sub(r',\s*]', ']', fixed)
    fixed = re.sub(r'\}\s*,?\s*$', '}', fixed)
    
    try:
        qa_list = json.loads(fixed)
        valid_qa = [item for item in qa_list if isinstance(item, dict) and "question" in item and "answer" in item and item["answer"].strip()]
        if valid_qa:
            return valid_qa
        else:
            print("Aucun objet valide après filtre")
            return []
    except json.JSONDecodeError as e:
        print(f"Erreur même après réparation : {e}")
        print("JSON tenté :", fixed[:500])
        return []

In [23]:
from unsloth import FastLanguageModel
import json
import re
from tqdm.auto import tqdm
import torch

# === Chargement du modèle (déjà fait dans ton code) ===
model_name = "unsloth/gemma-2-9b-it-bnb-4bit"  # ou autre
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=4096,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

# === Fonction de génération (améliorée) ===
def generate_qa(chunk):
    prompt = f"""Tu es un générateur QA pour digital forensics. Pour ce chunk de texte UNIQUEMENT :

- Génère EXACTEMENT 6 à 8 questions-réponses différentes et pointues.
- Questions variées : outils, commandes, formats de fichiers, artefacts par OS, étapes d'analyse, timelines, registres, prefetch, etc.
- INTERDICTION ABSOLUE : ne répète JAMAIS une question ou un thème similaire à ceux déjà générés auparavant.
- Ne pose PAS de questions sur Volatility pslist, mémoire vive ou listes de processus sauf si le chunk en parle explicitement de façon nouvelle.
- Réponses : courtes, extraites mot pour mot ou presque du texte.
- Réponds UNIQUEMENT avec le JSON valide. Pas de texte avant/après. Pas de markdown.

Texte du chunk :
{chunk[:3000]}

Génère UNIQUEMENT le JSON valide maintenant. Varie fortement les questions.
Assure-toi de générer EXACTEMENT 6 à 8 paires complètes. Termine toujours par ] et rien après.
Tu DOIS générer exactement 6 paires complètes. Chaque paire doit être un objet JSON complet avec "question" et "answer" valides. Ne laisse JAMAIS un objet ouvert ou une virgule en trop. Si le chunk est court, génère moins mais toujours complet."""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1200,             # Monte beaucoup plus haut (tes paires sont longues)
            temperature=0.5,                 # Équilibre : pas trop bas
            top_p=0.9,
            do_sample=True,                  # Nécessaire pour éviter coupures déterministes
            repetition_penalty=1.3,          # Pénalise fortement les répétitions / boucles
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response  # on retourne la raw response, parsing après

# === Fonction de parsing robuste (améliorée) ===

def extract_json_from_response(response):
    # Étape 1 : Trouver le début et la fin potentielle du tableau
    start = response.find('[')
    if start == -1:
        print("Aucun [ trouvé → pas de JSON")
        return []
    
    candidate = response[start:]
    
    # Étape 2 : Nettoyage agressif
    # Supprimer virgules multiples ,, ou trailing ,
    candidate = re.sub(r',,+', ',', candidate)
    candidate = re.sub(r',\s*([}\]])', r'\1', candidate)  # virgule avant } ou ]
    
    # Étape 3 : Équilibrer les accolades et crochets
    brace_open = candidate.count('{')
    brace_close = candidate.count('}')
    bracket_open = candidate.count('[')
    bracket_close = candidate.count(']')
    
    if brace_open > brace_close:
        candidate += '}' * (brace_open - brace_close)
    if bracket_open > bracket_close:
        candidate += ']' * (bracket_open - bracket_close)
    
    # Étape 4 : Couper au premier ] valide (éviter texte parasite après)
    end_match = re.search(r'\].*?$', candidate, re.DOTALL)
    if end_match:
        candidate = candidate[:end_match.end()]
    
    # Étape 5 : Tentative de parse
    try:
        data = json.loads(candidate)
        if isinstance(data, list):
            # Garder seulement les paires complètes
            valid_pairs = []
            for item in data:
                if (isinstance(item, dict) and 
                    "question" in item and isinstance(item["question"], str) and 
                    "answer" in item and isinstance(item["answer"], str) and 
                    item["answer"].strip()):  # answer non vide
                    valid_pairs.append(item)
            if valid_pairs:
                return valid_pairs
            else:
                print("Aucune paire valide après filtre")
        else:
            print("Pas une liste")
    except json.JSONDecodeError as e:
        print(f"Erreur JSON finale : {e}")
        print("JSON après réparation (premiers 600 chars) :", candidate[:600])
    
    # Dernier recours : extraction manuelle basique des paires
    manual_pairs = []
    pattern = r'\{\s*"question"\s*:\s*"([^"]*)"\s*,\s*"answer"\s*:\s*"([^"]*)"\s*\}'
    for match in re.finditer(pattern, response):
        q, a = match.groups()
        if a.strip():
            manual_pairs.append({"question": q, "answer": a})
    
    if manual_pairs:
        print(f"Récupéré {len(manual_pairs)} paires via regex fallback")
        return manual_pairs
    
    return []

def fallback_regex_extraction(response):
    pairs = []
    # Pattern plus souple : accepte guillemets simples ou doubles, espaces variables
    pattern = r'{\s*["\']?question["\']?\s*:\s*["\']([^"\']*)["\']\s*,\s*["\']?answer["\']?\s*:\s*["\']([^"\']*)["\']\s*}'
    for match in re.finditer(pattern, response, re.IGNORECASE | re.DOTALL):
        q, a = match.groups()
        q = q.strip().replace('\\', '')  # enlever escapes inutiles
        a = a.strip().replace('\\', '')
        if q and a:
            pairs.append({"question": q, "answer": a})
    return pairs

# === Boucle de génération CORRECTE ===
all_qa_pairs = []

for chunk in tqdm(chunks, desc="Génération Q/R"):
    if not chunk.strip():  # skip chunks vides
        continue
        
    raw_response = generate_qa(chunk)
    qa_list = extract_json_from_response(raw_response)
    
    if qa_list:
        all_qa_pairs.extend(qa_list)
    else:
        print(f"Échec parsing sur un chunk → ignoré")

# === Sauvegarde ===
with open("forensic_qa_dataset.json", "w", encoding="utf-8") as f:
    json.dump(all_qa_pairs, f, ensure_ascii=False, indent=2)

print(f"Total Q/R générées et parsées : {len(all_qa_pairs)}")

==((====))==  Unsloth 2026.1.4: Fast Gemma2 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Génération Q/R:   0%|          | 0/981 [00:00<?, ?it/s]

Erreur JSON finale : Expecting ':' delimiter: line 1 column 695 (char 694)
JSON après réparation (premiers 600 chars) : [{"question": "Quelles sont les versions des systèmes d'exploitation utilisés dans ces tests ?", "answer": "FreeBSD 4.4"}, {"question": "Quels types de partitions ont été représentés lors des tests ?", "answer": "FAT16, FAT32, NTFS ou Linux EXT2"}, {"question": "Où se situent les laboratoires où ces tests ont eu lieu ?", "answer": "Computer Forensics Tool Testing Lab at the National Institute of Standards and Technology"}, {"question": "Quel était l'objectif principal de ce document ?", "answer": "Provide enough information about the testing process"}, {"question": "Qui devrait lire ce documen
Récupéré 5 paires via regex fallback
Erreur JSON finale : Expecting ',' delimiter: line 1 column 380 (char 379)
JSON après réparation (premiers 600 chars) : [{"question": "Quels sont des exemples de marques déposées mentionnées dans le document ?", "answer": "\"Power Quest Corpo

In [2]:
import json
import random
from collections import Counter
import re
from pathlib import Path
from IPython.display import FileLink
import pandas as pd
import numpy as np
#Questions reponses Path
json_path = Path("/kaggle/input/qa-clean/questions_reponses_300.json")

if not json_path.exists():
    raise FileNotFoundError("Le fichier JSON n'est pas trouvé. Vérifie le chemin.")

with open(json_path, 'r', encoding='utf-8') as f:
    raw_qa = json.load(f)

print(f"Total paires brutes chargées : {len(raw_qa)}")

def is_valid_pair(item):
    if not isinstance(item, dict):
        return False
    q = item.get("question", "").strip()
    a = item.get("answer", "").strip()
    return bool(q and a and len(q) > 10 and len(a) > 2)

seen = {}
clean_qa = []

for item in raw_qa:
    if not is_valid_pair(item):
        continue
    q_norm = re.sub(r'\s+', ' ', item["question"].lower().strip())
    if q_norm not in seen:
        seen[q_norm] = True
        clean_qa.append({
            "question": item["question"].strip(),
            "answer": item["answer"].strip()
        })

print(f"Après nettoyage et dédoublonnage : {len(clean_qa)} paires uniques valides")
# Clean & Fully ready dataset Q/A
clean_path = "/kaggle/working/forensic_qa_clean.json"
with open(clean_path, "w", encoding="utf-8") as f:
    json.dump(clean_qa, f, ensure_ascii=False, indent=2)

FileLink('forensic_qa_clean.json')

# 300 Q/A Samples as dataset
if len(clean_qa) >= 300:
    sampled_qa = random.sample(clean_qa, 300)
else:
    sampled_qa = clean_qa
    print(f"Moins de 300 paires uniques → on prend tout ({len(sampled_qa)})")

questions = [item["question"] for item in sampled_qa]
ground_truths = [item["answer"] for item in sampled_qa]

print(f"Prêt à tester {len(questions)} questions.")

#Check stats
q_counts = Counter(questions)
dupl = {q: c for q, c in q_counts.items() if c > 1}
if dupl:
    print(f"\nIl reste {len(dupl)} questions dupliquées (max {max(dupl.values())} fois)")
else:
    print("Aucun doublon restant après nettoyage.")

Total paires brutes chargées : 300
Après nettoyage et dédoublonnage : 0 paires uniques valides
Moins de 300 paires uniques → on prend tout (0)
Prêt à tester 0 questions.
Aucun doublon restant après nettoyage.


In [5]:
import json
import random
from pathlib import Path

json_path = Path("/kaggle/input/qa-clean/questions_reponses_300.json")

if not json_path.exists():
    print("Fichier non trouvé ! Vérifie le chemin.")
else:
    with open(json_path, 'r', encoding='utf-8') as f:
        all_qa = json.load(f)
    
    print(f"Total paires chargées : {len(all_qa)}")
    
    if all_qa and isinstance(all_qa[0], dict) and "question" in all_qa[0] and "answer" in all_qa[0]:
        print("Exemple première paire :", all_qa[0])
    else:
        print("Structure inattendue ! Vérifie le format.")
    
    if len(all_qa) >= 300:
        sampled_qa = random.sample(all_qa, 300)
    else:
        sampled_qa = all_qa
        print(f"Moins de 300 paires → on prend tout ({len(sampled_qa)})")
    
    questions = [item["question"] for item in sampled_qa]
    ground_truths = [item["reponse"] for item in sampled_qa]
    
    print(f"Prêt à tester {len(questions)} questions.")

Total paires chargées : 300
Structure inattendue ! Vérifie le format.
Prêt à tester 300 questions.


In [6]:
from unsloth import FastLanguageModel
import torch
from tqdm.auto import tqdm

def load_model(model_name):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)
    return model, tokenizer

def generate_answer(model, tokenizer, question):
    prompt = f"""Tu es un expert en digital forensics. Réponds **UNIQUEMENT** par la réponse courte exacte. Pas de phrase complète, pas d'explication, pas d'introduction, pas de conclusion. Juste le contenu factuel attendu.

Question : {question}

Réponse :"""
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            temperature=0.0,
            do_sample=False,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "Réponse :" in response:
        return response.split("Réponse :")[-1].strip()
    else:
        return response.strip().replace(prompt, "").strip()

models_to_test = {
    "Gemma-2-9B": "unsloth/gemma-2-9b-it-bnb-4bit",
    "Qwen2.5-7B": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    "Llama-3.2-3B": "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
}

results = {}

for name, model_path in models_to_test.items():
    print(f"\n=== Test de {name} ===")
    
    model, tokenizer = load_model(model_path)
    
    predictions = []
    for q in tqdm(questions, desc=f"Génération {name}"):
        pred = generate_answer(model, tokenizer, q)
        predictions.append(pred)
    
    results[name] = predictions
    
    del model, tokenizer
    torch.cuda.empty_cache()
    
    print(f"{name} terminé ({len(predictions)} réponses générées)")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!

=== Test de Gemma-2-9B ===
==((====))==  Unsloth 2026.1.4: Fast Gemma2 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Génération Gemma-2-9B:   0%|          | 0/300 [00:00<?, ?it/s]

Gemma-2-9B terminé (300 réponses générées)

=== Test de Qwen2.5-7B ===
==((====))==  Unsloth 2026.1.4: Fast Qwen2 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Génération Qwen2.5-7B:   0%|          | 0/300 [00:00<?, ?it/s]

Qwen2.5-7B terminé (300 réponses générées)

=== Test de Llama-3.2-3B ===
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Génération Llama-3.2-3B:   0%|          | 0/300 [00:00<?, ?it/s]

Llama-3.2-3B terminé (300 réponses générées)


In [7]:
!pip install rouge-score bert-score nltk

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [9]:
def clean_response(text):
    text = text.strip()
    prefixes = ["La réponse est", "Il s'agit de", "C'est", "Réponse :", "L'outil est", "L'action est"]
    for p in prefixes:
        text = text.replace(p, "", 1).strip()
    text = text.split('\n')[0].split('.')[0].strip()
    text = re.sub(r'`|\*|_', '', text).strip()
    return text


for model_name in results:
    results[model_name] = [clean_response(p) for p in results[model_name]]

In [10]:
import re
import string
from collections import Counter
from rouge_score import rouge_scorer
from bert_score import score
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def normalize_answer(s):
    s = s.lower()
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    s = re.sub(r'[^a-z0-9\s]', '', s)
    s = ' '.join(s.split())
    return s.strip()

def exact_match(pred, gt):
    return normalize_answer(pred) == normalize_answer(gt)

def f1_token(pred, gt):
    p = normalize_answer(pred).split()
    g = normalize_answer(gt).split()
    common = Counter(p) & Counter(g)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = num_same / len(p)
    recall = num_same / len(g)
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Calcul scores
scores = {}

for model_name, preds in results.items():
    em = []
    f1 = []
    rougeL = []
    
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    
    for p, gt in zip(preds, ground_truths):
        em.append(1 if exact_match(p, gt) else 0)
        f1.append(f1_token(p, gt))
        rougeL.append(scorer.score(gt, p)['rougeL'].fmeasure)
    
    _, _, bert_f1 = score(preds, ground_truths, lang="en", rescale_with_baseline=True)
    
    scores[model_name] = {
        "Exact Match": np.mean(em),
        "F1 Token": np.mean(f1),
        "ROUGE-L": np.mean(rougeL),
        "BERTScore F1": bert_f1.mean().item()
    }
print("Exemples de prédictions vs ground truth (10 premiers) :\n")
for i in range(min(10, len(questions))):
    print(f"Question : {questions[i]}")
    print(f"Ground truth : {ground_truths[i]}")
    print(f"Gemma-2-9B   : {results['Gemma-2-9B'][i]}")
    print(f"Qwen2.5-7B   : {results['Qwen2.5-7B'][i]}")
    print(f"Llama-3.2-3B : {results['Llama-3.2-3B'][i]}")
    print("-" * 80)

# Metrics details for each model
import pandas as pd
df = pd.DataFrame(scores).T
df = df.round(4)
print("\n=== Résultats comparatifs ===")
print(df)


best_model = df["BERTScore F1"].idxmax()
print(f"\nMeilleur modèle selon BERTScore : {best_model} ({df.loc[best_model, 'BERTScore F1']:.4f})")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exemples de prédictions vs ground truth (10 premiers) :

Question : (freebsddd-setup-procs.pdf) Quel intitulé est donné exactement dans «freebsddd-setup-procs.pdf» ? Repère: "1. /bin/csh -f"
Ground truth : 1. #!/bin/csh -f
Gemma-2-9B   : FreeBSD setup procedures
Qwen2.5-7B   : mem
Llama-3.2-3B : csh
--------------------------------------------------------------------------------
Question : (The_new_ISO_IEC_27037_acquisition_and_pr.pdf) Quel intitulé est donné exactement dans «The_new_ISO_IEC_27037_acquisition_and_pr.pdf» ? Repère: "oltre ad essere in grado di assolvere i"
Ground truth : oltre ad essere in grado di assolvere i compiti del
Gemma-2-9B   : Acquisition and Preservation of Digital Evidence
Qwen2.5-7B   : Acquisition and Preservation of Evidence for Digital Forensic Investigations
Llama-3.2-3B : Acquisition and preservation of digital evidence
--------------------------------------------------------------------------------
Question : (SWGDE_Best_Practices_for_Digital_Forensic



In [11]:
with open("/kaggle/working/predictions.json", "w") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [12]:
!pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [13]:
import fitz
import os

# Knowledge base Path
pdf_folder = "/kaggle/input/forensic"
chunks = []

for pdf_file in os.listdir(pdf_folder):
    if pdf_file.lower().endswith(".pdf"):
        path = os.path.join(pdf_folder, pdf_file)
        doc = fitz.open(path)
        text = ""
        for page in doc:
            text += page.get_text("text") + "\n\n"
        doc.close()

        current = ""
        for line in text.split("\n"):
            if len(current) + len(line) > 1200:
                chunks.append(current.strip())
                current = line
            else:
                current += " " + line
        if current:
            chunks.append(current.strip())

print(f"Base de connaissance prête : {len(chunks)} chunks")

Base de connaissance prête : 986 chunks


In [14]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

print("Embedding des chunks...")
chunk_embeddings = embedder.encode(chunks, show_progress_bar=True, batch_size=32)
chunk_embeddings = np.array(chunk_embeddings).astype('float32')

dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
faiss.normalize_L2(chunk_embeddings)
index.add(chunk_embeddings)

print(f"Index FAISS créé avec {index.ntotal} vecteurs")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding des chunks...


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Index FAISS créé avec 986 vecteurs


In [15]:
def rag_generate(question, model, tokenizer, top_k=3):
    q_emb = embedder.encode([question])[0].astype('float32')
    faiss.normalize_L2(q_emb.reshape(1, -1))
    

    distances, indices = index.search(q_emb.reshape(1, -1), top_k)
    
    retrieved = []
    for idx in indices[0]:
        if idx != -1:
            retrieved.append(chunks[idx])
    
    context = "\n\n".join(retrieved)
    
    # Prompt RAG
    prompt = f"""Tu es un expert en digital forensics. Utilise UNIQUEMENT les informations suivantes pour répondre.

Contexte extrait :
{context}

Question : {question}

Réponds UNIQUEMENT par la réponse courte et exacte, sans explication :
Réponse :"""
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            temperature=0.0,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "Réponse :" in response:
        return response.split("Réponse :")[-1].strip()
    return response.strip()

In [18]:
from unsloth import FastLanguageModel

model_name = "unsloth/gemma-2-9b-it-bnb-4bit"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

rag_predictions = []
for q in tqdm(questions, desc="Génération RAG"):
    pred = rag_generate(q, model, tokenizer, top_k=3)
    rag_predictions.append(pred)

del model, tokenizer
torch.cuda.empty_cache()

==((====))==  Unsloth 2026.1.4: Fast Gemma2 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Génération RAG:   0%|          | 0/300 [00:00<?, ?it/s]

In [19]:
rag_em = [exact_match(p, gt) for p, gt in zip(rag_predictions, ground_truths)]
rag_f1 = [f1_token(p, gt) for p, gt in zip(rag_predictions, ground_truths)]
rag_rouge = [scorer.score(gt, p)['rougeL'].fmeasure for p, gt in zip(rag_predictions, ground_truths)]
_, _, rag_bert = score(rag_predictions, ground_truths, lang="fr" if "fr" in questions[0].lower() else "en")

rag_scores = {
    "Exact Match": np.mean(rag_em),
    "F1 Token": np.mean(rag_f1),
    "ROUGE-L": np.mean(rag_rouge),
    "BERTScore F1": rag_bert.mean().item()
}

print("\nScores SANS RAG (précédents) :")
print(df)

print("\nScores AVEC RAG :")
print(pd.Series(rag_scores).to_frame().T.round(4))

print(f"Gain BERTScore : {rag_scores['BERTScore F1'] - df.loc['Llama-3.2-3B', 'BERTScore F1']:.4f}")


Scores SANS RAG (précédents) :
              Exact Match  F1 Token  ROUGE-L  BERTScore F1
Gemma-2-9B         0.0933    0.3598   0.3856        0.0059
Qwen2.5-7B         0.0433    0.2252   0.2431       -0.1662
Llama-3.2-3B       0.0900    0.3555   0.3812       -0.1462

Scores AVEC RAG :
   Exact Match  F1 Token  ROUGE-L  BERTScore F1
0       0.1833    0.4432   0.4755        0.7378
Gain BERTScore : 0.8840


