# Dependencies

In [5]:
%pip install --upgrade pip -q

Note: you may need to restart the kernel to use updated packages.




In [6]:
# Install the notebook's third-party dependencies into the kernel's environment (quiet mode).
# NOTE(review): no versions are pinned, so a fresh install may resolve to different releases
# than the ones this notebook was last executed with.
%pip install langchain-core langchain langchain-google-genai langchain-community chromadb pypdf pillow google-genai google-generativeai streamlit tqdm python-dotenv ipython -q

Note: you may need to restart the kernel to use updated packages.




# Library imports

In [22]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pickle
from collections import defaultdict
from typing import List, Tuple, Optional
import pandas as pd
import re
import time
from tqdm import tqdm
from IPython.display import display, Markdown


from langchain import PromptTemplate, hub
from langchain.docstore.document import Document
from langchain.schema import Document as SchemaDocument
from langchain.prompts import PromptTemplate
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader as CommunityPDFLoader
from langchain_community.vectorstores import Chroma as CommunityChroma


import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.api_core.exceptions as exceptions

# Workspace Configuration

In [8]:
# Create a .env file in the working directory containing:
# GOOGLE_API_KEY=<your Google AI Studio API key>

# --- Credentials -------------------------------------------------------------
load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("Variable GOOGLE_API_KEY non trouvée dans .env")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


# --- Paths -------------------------------------------------------------------
# The project root is the current working directory (changing into it again
# would be a no-op, so no chdir is performed).
PROJECT_DIR = Path.cwd()
print(f"Dossier courant : {PROJECT_DIR}")
print("Contenu :", [p.name for p in PROJECT_DIR.iterdir()])


PDF_DIR = PROJECT_DIR / "compiled_pdfs"
PICKLE_DIR = PROJECT_DIR / "pickles"
PICKLE_DIR.mkdir(exist_ok=True)
PICKLE_DOCS = PICKLE_DIR / "docs.pkl"
PICKLE_BY_FILE = PICKLE_DIR / "docs_par_fichier.pkl"


# --- Parsed PDF pages (cached) -----------------------------------------------
# docs:         flat list of every parsed page
# docs_by_file: PDF file name -> list of that file's pages
docs = []
docs_by_file = defaultdict(list)


if PICKLE_DOCS.exists() and PICKLE_BY_FILE.exists():
    # NOTE(security): pickle.load can execute arbitrary code; only load cache
    # files that this notebook wrote itself.
    # NOTE(review): the cache is never invalidated; delete the pickles after
    # adding or changing PDFs in PDF_DIR.
    with open(PICKLE_DOCS, "rb") as f:
        docs = pickle.load(f)
    with open(PICKLE_BY_FILE, "rb") as f:
        docs_by_file = pickle.load(f)
    print("PDFs chargés depuis le cache")
else:
    # Sorted so the page order does not depend on filesystem listing order.
    for pdf_path in sorted(PDF_DIR.glob("*.pdf")):
        loader = PyPDFLoader(str(pdf_path))
        pages = loader.load()
        for doc in pages:
            # Store only the file name so the metadata is location-independent.
            doc.metadata["source"] = pdf_path.name
            docs.append(doc)
            docs_by_file[pdf_path.name].append(doc)

    if docs:
        with open(PICKLE_DOCS, "wb") as f:
            pickle.dump(docs, f)
        with open(PICKLE_BY_FILE, "wb") as f:
            pickle.dump(docs_by_file, f)
        print("PDFs parsés et mis en cache")
    else:
        # Do not persist an empty result: it would be picked up as a valid
        # cache on every later run and hide the missing/empty PDF folder.
        print(f"Aucun PDF trouvé dans '{PDF_DIR}' : cache non écrit")


genai.configure(api_key=GOOGLE_API_KEY)
print("Client Google Generative AI initialisé")


Dossier courant : c:\Users\y455\Desktop\LO17_App
Contenu : ['.env', 'app.py', 'chroma_db', 'CleanPDFs', 'clean_up_test', 'compiled_pdfs', 'csv', 'LO17_Project.ipynb', 'LO17_Projet.ipynb', 'pickles', 'question.xlsx', 'Rapport RAG.docx']
PDFs chargés depuis le cache
Client Google Generative AI initialisé


# First Test

In [9]:
# Smoke test: extract one PDF page by page and dump the raw text to a CSV.
csv_folder = "csv"
pdf_dir = "compiled_pdfs"
target_pdf_filename = "Sales et al. - 2016 - Exploiting academic records for predicting student drop out A case study in Brazilian higher educat.pdf"
target_pdf_path = Path(pdf_dir) / target_pdf_filename

if not target_pdf_path.exists():
    print(f"Erreur : fichier {target_pdf_filename} non trouvé.")
else:
    loader = PyPDFLoader(str(target_pdf_path))
    loaded_docs = loader.load()

    # One record per page: source file name plus stripped page text.
    data = [
        {"source": target_pdf_filename, "page_content": page.page_content.strip()}
        for page in loaded_docs
    ]
    df = pd.DataFrame(data)

    output_csv_path = os.path.join(csv_folder, "extracted_docs.csv")
    df.to_csv(output_csv_path, index=False)

    print(f"{len(df)} pages extraites et sauvegardées dans '{output_csv_path}'.")

16 pages extraites et sauvegardées dans 'csv\extracted_docs.csv'.


# Prompt for cleaning a file

In [10]:
# Prompt template used to ask Gemini to clean one chunk of raw PDF text.
# `{context}` is the template's single placeholder; it receives the raw chunk.
llm_prompt_template = """You are a professional document cleaner specialized in preparing text for both human readability and further LLM processing.

I will provide you with a chunk of raw document text. Your task is to clean and extract only the meaningful narrative content by applying the following rules:
Remove:

    Text that appears to be misinterpreted or garbled output from figures, graphs, or images (e.g., axis labels, chart legends, OCR artifacts).

    Mathematical equations, whether inline or block format.

    Page headers and footers (e.g., repeated titles, page numbers, author names).

    Academic references and citations, especially reference lists typically found at the end of academic papers or embedded in text (e.g., "[12]", "(Smith, 2018)").

Keep:

    All narrative text that explains figures, graphs, or tables.

    Section titles and headings, even if they are standalone.

Output format:

    Return only the cleaned text with no extra commentary, metadata, or formatting beyond the cleaned content.

Input Text Chunk:
{context}"""

# Wrap the raw string in a LangChain PromptTemplate.
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

# Show the parsed template object (input variables and template text) for inspection.
print(llm_prompt)

input_variables=['context'] input_types={} partial_variables={} template='You are a professional document cleaner specialized in preparing text for both human readability and further LLM processing.\n\nI will provide you with a chunk of raw document text. Your task is to clean and extract only the meaningful narrative content by applying the following rules:\nRemove:\n\n    Text that appears to be misinterpreted or garbled output from figures, graphs, or images (e.g., axis labels, chart legends, OCR artifacts).\n\n    Mathematical equations, whether inline or block format.\n\n    Page headers and footers (e.g., repeated titles, page numbers, author names).\n\n    Academic references and citations, especially reference lists typically found at the end of academic papers or embedded in text (e.g., "[12]", "(Smith, 2018)").\n\nKeep:\n\n    All narrative text that explains figures, graphs, or tables.\n\n    Section titles and headings, even if they are standalone.\n\nOutput format:\n\n    

# Models

In [11]:
# Gemini model identifier. The trailing `@param` comment is Colab form syntax listing the selectable values.
MODEL_ID = "gemini-2.0-flash" # @param ["gemini-2.0-flash-lite","gemini-2.0-flash","gemini-2.5-flash-preview-05-20","gemini-2.5-pro-preview-05-06"] {"allow-input":true, isTemplate: true}

# Cleaning one file

In [12]:
# Folder configuration: source PDF (name given without its .pdf extension) and output folder.
pdf_dir = "compiled_pdfs"
target_pdf_filename = "Jayaprakash - 2014 - Early Alert of Academically At-Risk Students An Open Source Analytics Initiative"
target_pdf_path = Path(pdf_dir) / f"{target_pdf_filename}.pdf"
csv_folder = "./clean_up_test"

# Create the output folder if it does not exist yet.
Path(csv_folder).mkdir(exist_ok=True)

# Check that the API key is available in the environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None:
    raise ValueError("La clé GOOGLE_API_KEY n'est pas définie dans les variables d'environnement.")

# Configure the Google Generative AI client with that key.
genai.configure(api_key=GOOGLE_API_KEY)

# Prompt template for the cleaning step; `{context}` receives the raw page text.
llm_prompt_template = """You are a professional document cleaner specialized in preparing text for both human readability and further LLM processing.

I will provide you with a chunk of raw document text. Your task is to clean and extract only the meaningful narrative content by applying the following rules:
Remove:

    Text that appears to be misinterpreted or garbled output from figures, graphs, images, or tables (e.g., axis labels, chart legends, OCR artifacts).

    Mathematical equations, whether inline or block format.

    Page headers and footers (e.g., repeated titles, page numbers, author names).

    Academic references and citations, especially reference lists typically found at the end of academic papers or embedded in text (e.g., "[12]", "(Smith, 2018)").

Keep:

    All narrative text that explains figures, graphs, or tables.

    Section titles and headings, even if they are standalone.

Output format:

    Return only the cleaned text with no extra commentary, metadata, or formatting beyond the cleaned content.

Input Text Chunk:
{context}"""

# Wrap the raw string in a LangChain PromptTemplate.
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

def clean_text_chunk(text_chunk, chunk_idx, total_chunks,
                     model_name='gemini-2.0-flash', max_retries=10, base_delay=5):
    """
    Clean one chunk of raw text with the Gemini API, retrying on failure.

    Args:
        text_chunk: Raw text to clean.
        chunk_idx: Zero-based position of the chunk (used for progress messages).
        total_chunks: Total number of chunks (used for progress messages).
        model_name: Gemini model to query.
        max_retries: Maximum number of attempts before giving up.
        base_delay: Initial back-off delay in seconds; it grows with each attempt.

    Returns:
        The cleaned text on success. If every attempt fails, or if there is
        nothing to clean, `text_chunk` is returned unchanged.
    """
    # Nothing to clean: blank text, or a non-string such as the NaN pandas
    # yields for an empty CSV cell. Skip the API call instead of sending an
    # empty (or "nan") prompt.
    if not isinstance(text_chunk, str) or not text_chunk.strip():
        return text_chunk

    prompt_text = llm_prompt.format(context=text_chunk)
    chunk_no = chunk_idx + 1
    # Loop-invariant: build the model handle once rather than on every attempt.
    model = genai.GenerativeModel(model_name)

    def wait_or_give_up(attempt, delay, wait_message, failure_message):
        """Sleep before the next attempt, or report the final failure."""
        if attempt < max_retries - 1:
            print(wait_message)
            time.sleep(delay)
        else:
            print(failure_message)

    for attempt in range(max_retries):
        attempt_no = attempt + 1
        try:
            print(f"Traitement du chunk {chunk_no}/{total_chunks} - Tentative {attempt_no}")
            response = model.generate_content(contents=prompt_text)

            try:
                cleaned_text = response.text.strip()
            except (ValueError, AttributeError) as e:
                # `response.text` fails when the response carries no usable text
                # part (e.g. the candidate was stopped for recitation). Back off
                # before retrying instead of looping immediately.
                retry_delay = min(base_delay * (1.5 ** attempt), 60)
                print(f"⚠ Réponse sans texte exploitable (tentative {attempt_no}/{max_retries}): {e}")
                wait_or_give_up(
                    attempt, retry_delay,
                    f"⏳ Attente de {retry_delay:.1f} secondes...",
                    f"❌ Erreur persistante après {max_retries} tentatives pour le chunk {chunk_no}",
                )
                continue

            print(f"✓ Chunk {chunk_no} traité avec succès")
            return cleaned_text

        except exceptions.ServerError as e:
            retry_delay = min(base_delay * (2 ** attempt), 120)  # capped at 2 minutes
            print(f"⚠ Erreur serveur (tentative {attempt_no}/{max_retries}): {e}")
            wait_or_give_up(
                attempt, retry_delay,
                f"⏳ Attente de {retry_delay} secondes avant nouvelle tentative...",
                f"❌ Échec après {max_retries} tentatives pour le chunk {chunk_no} - Erreur Serveur",
            )

        except exceptions.ResourceExhausted as e:
            retry_delay = min(base_delay * (2 ** attempt), 300)  # capped at 5 minutes
            print(f"⚠ Quota épuisé (tentative {attempt_no}/{max_retries}): {e}")
            wait_or_give_up(
                attempt, retry_delay,
                f"⏳ Attente de {retry_delay} secondes (quota épuisé)...",
                f"❌ Quota épuisé après {max_retries} tentatives pour le chunk {chunk_no}",
            )

        except Exception as e:
            # Best effort: any other error is retried with a gentler back-off.
            retry_delay = min(base_delay * (1.5 ** attempt), 60)
            print(f"⚠ Erreur inattendue (tentative {attempt_no}/{max_retries}): {e}")
            wait_or_give_up(
                attempt, retry_delay,
                f"⏳ Attente de {retry_delay:.1f} secondes...",
                f"❌ Erreur persistante après {max_retries} tentatives pour le chunk {chunk_no}",
            )

    # Every attempt failed: fall back to the raw text so the page is not lost.
    return text_chunk

# Extract the target PDF page by page and persist the raw text as a CSV.
if not target_pdf_path.exists():
    raise FileNotFoundError(f"Fichier PDF {target_pdf_filename} non trouvé dans {pdf_dir}")

loader = PyPDFLoader(str(target_pdf_path))
loaded_docs = loader.load()

# One record per page: source name plus stripped page text.
data = [
    {"source": target_pdf_filename, "page_content": doc.page_content.strip()}
    for doc in loaded_docs
]

df = pd.DataFrame(data)
input_csv_path = os.path.join(csv_folder, "extracted_docs.csv")
df.to_csv(input_csv_path, index=False)
print(f"{len(df)} pages extraites et sauvegardées dans '{input_csv_path}'.")

# Reload from disk. keep_default_na=False keeps empty pages as "" instead of
# NaN; a NaN would otherwise be formatted into the prompt as the text "nan".
df = pd.read_csv(input_csv_path, keep_default_na=False)

# Clean every page; clean_text_chunk builds the prompt itself.
cleaned_texts = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    text_chunk = row["page_content"]
    cleaned_chunk = clean_text_chunk(text_chunk, idx, len(df))
    cleaned_texts.append(cleaned_chunk)

# Store the cleaned text next to the raw text and save the result.
df_result = df.copy()
df_result["cleaned_page_content"] = cleaned_texts
output_csv_path = os.path.join(csv_folder, "extracted_docs_cleaned.csv")
df_result.to_csv(output_csv_path, index=False)
print(f"✅ Nettoyage terminé, résultats sauvegardés dans '{output_csv_path}'.")

42 pages extraites et sauvegardées dans './clean_up_test\extracted_docs.csv'.


  0%|          | 0/42 [00:00<?, ?it/s]

Traitement du chunk 1/42 - Tentative 1


  2%|▏         | 1/42 [00:04<03:18,  4.83s/it]

✓ Chunk 1 traité avec succès
Traitement du chunk 2/42 - Tentative 1


  5%|▍         | 2/42 [00:07<02:28,  3.72s/it]

✓ Chunk 2 traité avec succès
Traitement du chunk 3/42 - Tentative 1


  7%|▋         | 3/42 [00:11<02:27,  3.77s/it]

✓ Chunk 3 traité avec succès
Traitement du chunk 4/42 - Tentative 1


 10%|▉         | 4/42 [00:20<03:35,  5.66s/it]

✓ Chunk 4 traité avec succès
Traitement du chunk 5/42 - Tentative 1


 12%|█▏        | 5/42 [00:29<04:17,  6.95s/it]

✓ Chunk 5 traité avec succès
Traitement du chunk 6/42 - Tentative 1


 14%|█▍        | 6/42 [00:36<04:11,  6.99s/it]

✓ Chunk 6 traité avec succès
Traitement du chunk 7/42 - Tentative 1


 17%|█▋        | 7/42 [00:45<04:25,  7.58s/it]

✓ Chunk 7 traité avec succès
Traitement du chunk 8/42 - Tentative 1


 19%|█▉        | 8/42 [00:58<05:22,  9.49s/it]

✓ Chunk 8 traité avec succès
Traitement du chunk 9/42 - Tentative 1


 21%|██▏       | 9/42 [01:06<04:58,  9.04s/it]

✓ Chunk 9 traité avec succès
Traitement du chunk 10/42 - Tentative 1


 24%|██▍       | 10/42 [01:10<03:56,  7.38s/it]

✓ Chunk 10 traité avec succès
Traitement du chunk 11/42 - Tentative 1


 26%|██▌       | 11/42 [01:12<03:01,  5.85s/it]

✓ Chunk 11 traité avec succès
Traitement du chunk 12/42 - Tentative 1


 29%|██▊       | 12/42 [01:21<03:21,  6.71s/it]

✓ Chunk 12 traité avec succès
Traitement du chunk 13/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 13/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 13/42 - Tentative 3
⚠ Erreur serveur (tentative 3/10): 500 Internal error encountered.
⏳ Attente de 20 secondes avant nouvelle tentative...
Traitement du chunk 13/42 - Tentative 4
⚠ Erreur serveur (tentative 4/10): 500 Internal error encountered.
⏳ Attente de 40 secondes avant nouvelle tentative...
Traitement du chunk 13/42 - Tentative 5
⚠ Erreur serveur (tentative 5/10): 500 Internal error encountered.
⏳ Attente de 80 secondes avant nouvelle tentative...
Traitement du chunk 13/42 - Tentative 6
⚠ Erreur serveur (tentative 6/10): 500 Internal error encountered.
⏳ Attente de 120 secondes avant nouvelle tentative...
Traitement

 31%|███       | 13/42 [13:28<1:48:39, 224.82s/it]

⚠ Erreur serveur (tentative 10/10): 500 Internal error encountered.
❌ Échec après 10 tentatives pour le chunk 13 - Erreur Serveur
Traitement du chunk 14/42 - Tentative 1


 33%|███▎      | 14/42 [13:38<1:14:38, 159.96s/it]

✓ Chunk 14 traité avec succès
Traitement du chunk 15/42 - Tentative 1


 36%|███▌      | 15/42 [13:47<51:28, 114.38s/it]  

✓ Chunk 15 traité avec succès
Traitement du chunk 16/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 16/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 16/42 - Tentative 3


 38%|███▊      | 16/42 [14:25<39:38, 91.49s/it] 

✓ Chunk 16 traité avec succès
Traitement du chunk 17/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 17/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 17/42 - Tentative 3
⚠ Erreur serveur (tentative 3/10): 500 Internal error encountered.
⏳ Attente de 20 secondes avant nouvelle tentative...
Traitement du chunk 17/42 - Tentative 4
⚠ Erreur serveur (tentative 4/10): 500 Internal error encountered.
⏳ Attente de 40 secondes avant nouvelle tentative...
Traitement du chunk 17/42 - Tentative 5
⚠ Erreur serveur (tentative 5/10): 500 Internal error encountered.
⏳ Attente de 80 secondes avant nouvelle tentative...
Traitement du chunk 17/42 - Tentative 6
⚠ Erreur serveur (tentative 6/10): 500 Internal error encountered.
⏳ Attente de 120 secondes avant nouvelle tentative...
Traitement

 40%|████      | 17/42 [25:42<1:51:27, 267.51s/it]

⚠ Erreur serveur (tentative 10/10): 500 Internal error encountered.
❌ Échec après 10 tentatives pour le chunk 17 - Erreur Serveur
Traitement du chunk 18/42 - Tentative 1


 43%|████▎     | 18/42 [25:45<1:15:10, 187.94s/it]

✓ Chunk 18 traité avec succès
Traitement du chunk 19/42 - Tentative 1


 45%|████▌     | 19/42 [25:49<50:56, 132.89s/it]  

✓ Chunk 19 traité avec succès
Traitement du chunk 20/42 - Tentative 1


 48%|████▊     | 20/42 [25:54<34:35, 94.32s/it] 

✓ Chunk 20 traité avec succès
Traitement du chunk 21/42 - Tentative 1


 50%|█████     | 21/42 [25:57<23:25, 66.92s/it]

✓ Chunk 21 traité avec succès
Traitement du chunk 22/42 - Tentative 1


 52%|█████▏    | 22/42 [26:02<16:08, 48.44s/it]

✓ Chunk 22 traité avec succès
Traitement du chunk 23/42 - Tentative 1


 55%|█████▍    | 23/42 [26:04<10:57, 34.62s/it]

✓ Chunk 23 traité avec succès
Traitement du chunk 24/42 - Tentative 1
⚠ Erreur inattendue (tentative 1/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
⏳ Attente de 5.0 secondes...
Traitement du chunk 24/42 - Tentative 2
⚠ Erreur inattendue (tentative 2/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
⏳ Attente de 7.5 secondes...
Traitement du chunk 24/42 - Tentative 3


 57%|█████▋    | 24/42 [27:05<12:46, 42.56s/it]

✓ Chunk 24 traité avec succès
Traitement du chunk 25/42 - Tentative 1


 60%|█████▉    | 25/42 [27:07<08:33, 30.20s/it]

✓ Chunk 25 traité avec succès
Traitement du chunk 26/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 26/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 26/42 - Tentative 3
⚠ Erreur serveur (tentative 3/10): 500 Internal error encountered.
⏳ Attente de 20 secondes avant nouvelle tentative...
Traitement du chunk 26/42 - Tentative 4
⚠ Erreur serveur (tentative 4/10): 500 Internal error encountered.
⏳ Attente de 40 secondes avant nouvelle tentative...
Traitement du chunk 26/42 - Tentative 5
⚠ Erreur serveur (tentative 5/10): 500 Internal error encountered.
⏳ Attente de 80 secondes avant nouvelle tentative...
Traitement du chunk 26/42 - Tentative 6
⚠ Erreur serveur (tentative 6/10): 500 Internal error encountered.
⏳ Attente de 120 secondes avant nouvelle tentative...
Traitement

 62%|██████▏   | 26/42 [38:54<1:02:14, 233.42s/it]

✓ Chunk 26 traité avec succès
Traitement du chunk 27/42 - Tentative 1


 64%|██████▍   | 27/42 [39:03<41:30, 166.02s/it]  

✓ Chunk 27 traité avec succès
Traitement du chunk 28/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 28/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 28/42 - Tentative 3
⚠ Erreur serveur (tentative 3/10): 500 Internal error encountered.
⏳ Attente de 20 secondes avant nouvelle tentative...
Traitement du chunk 28/42 - Tentative 4
⚠ Erreur serveur (tentative 4/10): 500 Internal error encountered.
⏳ Attente de 40 secondes avant nouvelle tentative...
Traitement du chunk 28/42 - Tentative 5
⚠ Erreur serveur (tentative 5/10): 500 Internal error encountered.
⏳ Attente de 80 secondes avant nouvelle tentative...
Traitement du chunk 28/42 - Tentative 6
⚠ Erreur serveur (tentative 6/10): 500 Internal error encountered.
⏳ Attente de 120 secondes avant nouvelle tentative...
Traitement

 67%|██████▋   | 28/42 [50:51<1:16:41, 328.70s/it]

⚠ Erreur serveur (tentative 10/10): 500 Internal error encountered.
❌ Échec après 10 tentatives pour le chunk 28 - Erreur Serveur
Traitement du chunk 29/42 - Tentative 1


 69%|██████▉   | 29/42 [51:00<50:24, 232.69s/it]  

✓ Chunk 29 traité avec succès
Traitement du chunk 30/42 - Tentative 1


 71%|███████▏  | 30/42 [51:04<32:49, 164.10s/it]

✓ Chunk 30 traité avec succès
Traitement du chunk 31/42 - Tentative 1


 74%|███████▍  | 31/42 [51:13<21:32, 117.53s/it]

✓ Chunk 31 traité avec succès
Traitement du chunk 32/42 - Tentative 1


 76%|███████▌  | 32/42 [51:17<13:54, 83.40s/it] 

✓ Chunk 32 traité avec succès
Traitement du chunk 33/42 - Tentative 1


 79%|███████▊  | 33/42 [51:20<08:53, 59.22s/it]

✓ Chunk 33 traité avec succès
Traitement du chunk 34/42 - Tentative 1
⚠ Erreur inattendue (tentative 1/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
⏳ Attente de 5.0 secondes...
Traitement du chunk 34/42 - Tentative 2
⚠ Erreur inattendue (tentative 2/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
⏳ Attente de 7.5 secondes...
Traitement du chunk 34/42 - Tentative 3
⚠ Erreur inattendue (tentative 3/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but n

 81%|████████  | 34/42 [58:39<23:06, 173.33s/it]

⚠ Erreur inattendue (tentative 10/10): Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
❌ Erreur persistante après 10 tentatives pour le chunk 34
Traitement du chunk 35/42 - Tentative 1
⚠ Erreur serveur (tentative 1/10): 500 Internal error encountered.
⏳ Attente de 5 secondes avant nouvelle tentative...
Traitement du chunk 35/42 - Tentative 2
⚠ Erreur serveur (tentative 2/10): 500 Internal error encountered.
⏳ Attente de 10 secondes avant nouvelle tentative...
Traitement du chunk 35/42 - Tentative 3


 83%|████████▎ | 35/42 [59:15<15:24, 132.06s/it]

✓ Chunk 35 traité avec succès
Traitement du chunk 36/42 - Tentative 1


 86%|████████▌ | 36/42 [59:26<09:34, 95.74s/it] 

✓ Chunk 36 traité avec succès
Traitement du chunk 37/42 - Tentative 1


 88%|████████▊ | 37/42 [59:34<05:46, 69.36s/it]

✓ Chunk 37 traité avec succès
Traitement du chunk 38/42 - Tentative 1


 90%|█████████ | 38/42 [59:34<03:14, 48.68s/it]

✓ Chunk 38 traité avec succès
Traitement du chunk 39/42 - Tentative 1


 93%|█████████▎| 39/42 [59:35<01:42, 34.23s/it]

✓ Chunk 39 traité avec succès
Traitement du chunk 40/42 - Tentative 1


 95%|█████████▌| 40/42 [59:35<00:48, 24.18s/it]

✓ Chunk 40 traité avec succès
Traitement du chunk 41/42 - Tentative 1


 98%|█████████▊| 41/42 [59:43<00:19, 19.15s/it]

✓ Chunk 41 traité avec succès
Traitement du chunk 42/42 - Tentative 1


100%|██████████| 42/42 [59:44<00:00, 85.34s/it]

✓ Chunk 42 traité avec succès
✅ Nettoyage terminé, résultats sauvegardés dans './clean_up_test\extracted_docs_cleaned.csv'.





In [13]:
# Folder configuration: directory of PDFs to clean and directory for the resulting CSVs.
pdf_source_dir = "./CleanPDFs/CleanPDF16"
output_dir = "./clean_up_test"

# Create the output folder if it does not exist yet.
Path(output_dir).mkdir(exist_ok=True)

# Check that the API key is available in the environment.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if GOOGLE_API_KEY is None:
    raise ValueError("La clé GOOGLE_API_KEY n'est pas définie dans les variables d'environnement.")

# Configure the Google Generative AI client with that key.
genai.configure(api_key=GOOGLE_API_KEY)

# Prompt template for the cleaning step; `{context}` receives the raw page text.
llm_prompt_template = """You are a professional document cleaner specialized in preparing text for both human readability and further LLM processing.

I will provide you with a chunk of raw document text. Your task is to clean and extract only the meaningful narrative content by applying the following rules:
Remove:

    Text that appears to be misinterpreted or garbled output from figures, graphs, images, or tables (e.g., axis labels, chart legends, OCR artifacts).

    Mathematical equations, whether inline or block format.

    Page headers and footers (e.g., repeated titles, page numbers, author names).

    Academic references and citations, especially reference lists typically found at the end of academic papers or embedded in text (e.g., "[12]", "(Smith, 2018)").

Keep:

    All narrative text that explains figures, graphs, or tables.

    Section titles and headings, even if they are standalone.

Output format:

    Return only the cleaned text with no extra commentary, metadata, or formatting beyond the cleaned content.

Input Text Chunk:
{context}"""

# Wrap the raw string in a LangChain PromptTemplate.
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

def clean_text_chunk(text_chunk, chunk_idx, total_chunks, pdf_name,
                     model_name='gemini-2.0-flash-exp', max_retries=10, base_delay=5):
    """
    Clean one chunk of raw text with the Gemini API, retrying on failure.

    Args:
        text_chunk: Raw text to clean.
        chunk_idx: Zero-based position of the chunk (used for progress messages).
        total_chunks: Total number of chunks (used for progress messages).
        pdf_name: Label of the PDF being processed, prefixed to every message.
        model_name: Gemini model to query.
        max_retries: Maximum number of attempts before giving up.
        base_delay: Initial back-off delay in seconds; it grows with each attempt.

    Returns:
        The cleaned text on success. If every attempt fails, or if there is
        nothing to clean, `text_chunk` is returned unchanged.
    """
    # Nothing to clean: blank text or a non-string value. Skip the API call
    # instead of sending an empty prompt.
    if not isinstance(text_chunk, str) or not text_chunk.strip():
        return text_chunk

    prompt_text = llm_prompt.format(context=text_chunk)
    chunk_no = chunk_idx + 1
    # Loop-invariant: build the model handle once rather than on every attempt.
    model = genai.GenerativeModel(model_name)

    def wait_or_give_up(attempt, delay, wait_message, failure_message):
        """Sleep before the next attempt, or report the final failure."""
        if attempt < max_retries - 1:
            print(wait_message)
            time.sleep(delay)
        else:
            print(failure_message)

    for attempt in range(max_retries):
        attempt_no = attempt + 1
        try:
            print(f"  [{pdf_name}] Traitement du chunk {chunk_no}/{total_chunks} - Tentative {attempt_no}")
            response = model.generate_content(prompt_text)

            try:
                cleaned_text = response.text.strip()
            except (ValueError, AttributeError) as e:
                # `response.text` fails when the response carries no usable text
                # part (e.g. the candidate was stopped for recitation). Back off
                # before retrying instead of looping immediately.
                retry_delay = min(base_delay * (1.5 ** attempt), 60)
                print(f"  ⚠ [{pdf_name}] Réponse sans texte exploitable (tentative {attempt_no}/{max_retries}): {e}")
                wait_or_give_up(
                    attempt, retry_delay,
                    f"  ⏳ [{pdf_name}] Attente de {retry_delay:.1f} secondes...",
                    f"  ❌ [{pdf_name}] Erreur persistante après {max_retries} tentatives pour le chunk {chunk_no}",
                )
                continue

            print(f"  ✓ [{pdf_name}] Chunk {chunk_no} traité avec succès")
            return cleaned_text

        except genai.types.BlockedPromptException as e:
            retry_delay = min(base_delay * (1.5 ** attempt), 60)
            print(f"  ⚠ [{pdf_name}] Prompt bloqué (tentative {attempt_no}/{max_retries}): {e}")
            wait_or_give_up(
                attempt, retry_delay,
                f"  ⏳ [{pdf_name}] Attente de {retry_delay:.1f} secondes...",
                f"  ❌ [{pdf_name}] Prompt persistant bloqué après {max_retries} tentatives pour le chunk {chunk_no}",
            )

        except genai.types.StopCandidateException as e:
            retry_delay = min(base_delay * (2 ** attempt), 120)
            print(f"  ⚠ [{pdf_name}] Réponse stoppée (tentative {attempt_no}/{max_retries}): {e}")
            wait_or_give_up(
                attempt, retry_delay,
                f"  ⏳ [{pdf_name}] Attente de {retry_delay} secondes avant nouvelle tentative...",
                f"  ❌ [{pdf_name}] Échec après {max_retries} tentatives pour le chunk {chunk_no} - Réponse Stoppée",
            )

        except Exception as e:
            # Quota errors get the longest back-off. Check the typed exception
            # first and keep the message match as a fallback.
            error_text = str(e).lower()
            quota_exhausted = (
                isinstance(e, exceptions.ResourceExhausted)
                or "quota" in error_text
                or "resource_exhausted" in error_text
            )
            if quota_exhausted:
                retry_delay = min(base_delay * (2 ** attempt), 300)
                print(f"  ⚠ [{pdf_name}] Quota épuisé (tentative {attempt_no}/{max_retries}): {e}")
                wait_or_give_up(
                    attempt, retry_delay,
                    f"  ⏳ [{pdf_name}] Attente de {retry_delay} secondes (quota épuisé)...",
                    f"  ❌ [{pdf_name}] Quota épuisé après {max_retries} tentatives pour le chunk {chunk_no}",
                )
            else:
                retry_delay = min(base_delay * (1.5 ** attempt), 60)
                print(f"  ⚠ [{pdf_name}] Erreur inattendue (tentative {attempt_no}/{max_retries}): {e}")
                wait_or_give_up(
                    attempt, retry_delay,
                    f"  ⏳ [{pdf_name}] Attente de {retry_delay:.1f} secondes...",
                    f"  ❌ [{pdf_name}] Erreur persistante après {max_retries} tentatives pour le chunk {chunk_no}",
                )

    # Every attempt failed: fall back to the raw text so the page is not lost.
    return text_chunk

def process_single_pdf(pdf_path):
    """
    Load one PDF, clean each of its pages, and return the result.

    Args:
        pdf_path: Path object of the PDF to process.

    Returns:
        A `(df_result, pdf_name)` tuple. `df_result` holds the columns
        `source`, `page_content` and `cleaned_page_content`; it is None when
        any step raised, in which case the error is printed.
    """
    pdf_name = pdf_path.stem
    print(f"\n🔄 Traitement du PDF: {pdf_name}")

    try:
        # Load the PDF and build one record per page.
        pages = PyPDFLoader(str(pdf_path)).load()
        records = [
            {"source": pdf_path.name, "page_content": page.page_content.strip()}
            for page in pages
        ]
        df = pd.DataFrame(records)
        page_count = len(df)
        print(f"  📄 {page_count} pages extraites de {pdf_name}")

        # Clean the pages one by one, with a progress bar.
        print(f"  🧹 Début du nettoyage...")
        progress = tqdm(df.iterrows(), total=page_count, desc=f"Nettoyage {pdf_name}")
        cleaned_texts = [
            clean_text_chunk(row["page_content"], idx, page_count, pdf_name)
            for idx, row in progress
        ]

        # Keep the raw text and add the cleaned text alongside it.
        df_result = df.copy()
        df_result["cleaned_page_content"] = cleaned_texts
        return df_result, pdf_name

    except Exception as e:
        print(f"  ❌ Erreur lors du traitement de {pdf_name}: {e}")
        return None, pdf_name

def main():
    """
    Clean every PDF found in the source folder and write one CSV per PDF.

    Reads the module-level settings ``pdf_source_dir`` (folder scanned for
    ``*.pdf`` files) and ``output_dir`` (folder receiving the
    ``cleanedup_<pdf name>.csv`` files). Each PDF is handled by
    ``process_single_pdf``; a failure on one file is printed and counted, and
    the remaining files are still processed. A summary is printed at the end.

    The output folder is created up front if it is missing, so a missing
    folder is detected before any page is cleaned rather than at save time.

    Raises:
        FileNotFoundError: if ``pdf_source_dir`` does not exist.
        OSError: if ``output_dir`` cannot be created.
    """
    pdf_source_path = Path(pdf_source_dir)

    # Make sure the source folder exists
    if not pdf_source_path.exists():
        raise FileNotFoundError(f"Le dossier source '{pdf_source_dir}' n'existe pas.")

    # Collect the PDF files; sorted because glob order is not guaranteed and
    # a stable order keeps runs and logs reproducible.
    pdf_files = sorted(pdf_source_path.glob("*.pdf"))

    if not pdf_files:
        print(f"❌ Aucun fichier PDF trouvé dans '{pdf_source_dir}'")
        return

    print(f"📚 {len(pdf_files)} fichiers PDF trouvés à traiter")

    # Create the destination before the expensive clean-up starts; otherwise
    # every to_csv call would fail only after its PDF had been fully processed.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    successful_processes = 0
    failed_processes = 0

    # Process each PDF independently
    for pdf_file in pdf_files:
        try:
            df_result, pdf_name = process_single_pdf(pdf_file)

            if df_result is not None:
                # Save the cleaned pages as CSV
                output_csv_name = f"cleanedup_{pdf_name}.csv"
                output_csv_path = output_path / output_csv_name

                df_result.to_csv(output_csv_path, index=False)
                print(f"  ✅ Résultats sauvegardés dans '{output_csv_path}'")
                successful_processes += 1
            else:
                failed_processes += 1

        except Exception as e:
            # Keep going: one broken file must not abort the whole batch.
            print(f"  ❌ Erreur critique pour {pdf_file.name}: {e}")
            failed_processes += 1

    # Final summary
    print(f"\n📊 RÉSUMÉ DU TRAITEMENT:")
    print(f"  ✅ Succès: {successful_processes} fichiers")
    print(f"  ❌ Échecs: {failed_processes} fichiers")
    print(f"  📁 Résultats sauvegardés dans '{output_dir}'")

# Entry-point guard: launch the batch clean-up only when this code is executed
# directly (as it is when the cell is run), not when it is imported.
if __name__ == "__main__":
    main()

📚 7 fichiers PDF trouvés à traiter

🔄 Traitement du PDF: 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course
  📄 10 pages extraites de 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course
  🧹 Début du nettoyage...


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:   0%|          | 0/10 [00:00<?, ?it/s]

  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 1/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  10%|█         | 1/10 [00:03<00:35,  3.93s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 1 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 2/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  20%|██        | 2/10 [00:11<00:50,  6.28s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 2 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 3/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  30%|███       | 3/10 [00:19<00:48,  6.87s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 3 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 4/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  40%|████      | 4/10 [00:24<00:37,  6.33s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 4 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 5/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  50%|█████     | 5/10 [00:28<00:25,  5.17s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 5 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 6/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  60%|██████    | 6/10 [00:30<00:17,  4.34s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 6 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 7/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  70%|███████   | 7/10 [00:33<00:11,  3.95s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 7 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 8/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  80%|████████  | 8/10 [00:40<00:09,  4.94s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 8 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 9/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course:  90%|█████████ | 9/10 [00:43<00:04,  4.19s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 9 traité avec succès
  [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Traitement du chunk 10/10 - Tentative 1


Nettoyage 2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course: 100%|██████████| 10/10 [00:43<00:00,  4.40s/it]

  ✓ [2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course] Chunk 10 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course.csv'

🔄 Traitement du PDF: 2017 - MOOC Dropout Prediction





  📄 9 pages extraites de 2017 - MOOC Dropout Prediction
  🧹 Début du nettoyage...


Nettoyage 2017 - MOOC Dropout Prediction:   0%|          | 0/9 [00:00<?, ?it/s]

  [2017 - MOOC Dropout Prediction] Traitement du chunk 1/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  11%|█         | 1/9 [00:04<00:34,  4.26s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 1 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 2/9 - Tentative 1
  ⚠ [2017 - MOOC Dropout Prediction] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 52
}
]
  ⏳ [2017 - MOOC Dropout Prediction] Attente de 5 secondes (quota épuisé)...
  [2017 - MOOC Dropout Prediction] Traitement du chunk

Nettoyage 2017 - MOOC Dropout Prediction:  22%|██▏       | 2/9 [00:27<01:46, 15.21s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 2 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 3/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  33%|███▎      | 3/9 [00:34<01:11, 11.84s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 3 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 4/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  44%|████▍     | 4/9 [00:40<00:46,  9.31s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 4 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 5/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  56%|█████▌    | 5/9 [00:46<00:32,  8.10s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 5 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 6/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  67%|██████▋   | 6/9 [00:53<00:23,  7.76s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 6 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 7/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  78%|███████▊  | 7/9 [00:58<00:13,  6.76s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 7 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 8/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction:  89%|████████▉ | 8/9 [01:01<00:05,  5.67s/it]

  ✓ [2017 - MOOC Dropout Prediction] Chunk 8 traité avec succès
  [2017 - MOOC Dropout Prediction] Traitement du chunk 9/9 - Tentative 1


Nettoyage 2017 - MOOC Dropout Prediction: 100%|██████████| 9/9 [01:02<00:00,  6.89s/it]


  ✓ [2017 - MOOC Dropout Prediction] Chunk 9 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_2017 - MOOC Dropout Prediction.csv'

🔄 Traitement du PDF: 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes
  📄 5 pages extraites de 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes
  🧹 Début du nettoyage...


Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes:   0%|          | 0/5 [00:00<?, ?it/s]

  [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Traitement du chunk 1/5 - Tentative 1


Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes:  20%|██        | 1/5 [00:02<00:11,  2.78s/it]

  ✓ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Chunk 1 traité avec succès
  [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Traitement du chunk 2/5 - Tentative 1


Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes:  40%|████      | 2/5 [00:06<00:10,  3.49s/it]

  ✓ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Chunk 2 traité avec succès
  [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Traitement du chunk 3/5 - Tentative 1


Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes:  60%|██████    | 3/5 [00:10<00:07,  3.56s/it]

  ✓ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Chunk 3 traité avec succès
  [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Traitement du chunk 4/5 - Tentative 1
  ⚠ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_dela

Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes:  80%|████████  | 4/5 [00:18<00:05,  5.53s/it]

  ✓ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Chunk 4 traité avec succès
  [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Traitement du chunk 5/5 - Tentative 1


Nettoyage 2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes: 100%|██████████| 5/5 [00:19<00:00,  3.94s/it]

  ✓ [2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes] Chunk 5 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_2018 - Leveraging Non-cognitive Student Self-reports to Predict Learning Outcomes.csv'

🔄 Traitement du PDF: 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di





  📄 30 pages extraites de 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di
  🧹 Début du nettoyage...


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:   0%|          | 0/30 [00:00<?, ?it/s]

  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 1/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:   3%|▎         | 1/30 [00:01<00:46,  1.62s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 1 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 2/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:   7%|▋         | 2/30 [00:05<01:20,  2.88s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 2 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 3/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  10%|█         | 3/30 [00:09<01:28,  3.28s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 3 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 4/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  13%|█▎        | 4/30 [00:12<01:23,  3.22s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 4 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 5/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  17%|█▋        | 5/30 [00:15<01:20,  3.23s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 5 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 6/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  20%|██        | 6/30 [00:18<01:19,  3.30s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 6 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 7/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  23%|██▎       | 7/30 [00:22<01:15,  3.28s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 7 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 8/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  27%|██▋       | 8/30 [00:23<00:55,  2.50s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 8 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 9/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  30%|███       | 9/30 [00:26<01:00,  2.86s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 9 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 10/30 - Tentative 1
  ⚠ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API q

Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  33%|███▎      | 10/30 [01:05<04:41, 14.06s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 10 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 11/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  37%|███▋      | 11/30 [01:08<03:22, 10.66s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 11 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 12/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  40%|████      | 12/30 [01:12<02:31,  8.40s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 12 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 13/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  43%|████▎     | 13/30 [01:14<01:51,  6.59s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 13 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 14/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  47%|████▋     | 14/30 [01:15<01:18,  4.88s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 14 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 15/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  50%|█████     | 15/30 [01:19<01:11,  4.79s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 15 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 16/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  53%|█████▎    | 16/30 [01:21<00:51,  3.70s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 16 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 17/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  57%|█████▋    | 17/30 [01:22<00:40,  3.12s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 17 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 18/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  60%|██████    | 18/30 [01:26<00:39,  3.28s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 18 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 19/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  63%|██████▎   | 19/30 [01:28<00:31,  2.89s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 19 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 20/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  67%|██████▋   | 20/30 [01:30<00:27,  2.72s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 20 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 21/30 - Tentative 1
  ⚠ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API 

Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  70%|███████   | 21/30 [02:11<02:06, 14.07s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 21 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 22/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  73%|███████▎  | 22/30 [02:13<01:24, 10.57s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 22 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 23/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  77%|███████▋  | 23/30 [02:16<00:58,  8.36s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 23 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 24/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  80%|████████  | 24/30 [02:20<00:40,  6.82s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 24 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 25/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  83%|████████▎ | 25/30 [02:23<00:29,  5.87s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 25 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 26/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  87%|████████▋ | 26/30 [02:27<00:20,  5.19s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 26 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 27/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  90%|█████████ | 27/30 [02:28<00:11,  3.99s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 27 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 28/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  93%|█████████▎| 28/30 [02:29<00:06,  3.01s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 28 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 29/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di:  97%|█████████▋| 29/30 [02:30<00:02,  2.45s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 29 traité avec succès
  [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Traitement du chunk 30/30 - Tentative 1


Nettoyage 2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di: 100%|██████████| 30/30 [02:31<00:00,  5.06s/it]

  ✓ [2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di] Chunk 30 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_2019 - Detecting students-at-risk in computer programming classes with learning analytics from students’ di.csv'

🔄 Traitement du PDF: 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout





  📄 21 pages extraites de 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout
  🧹 Début du nettoyage...


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:   0%|          | 0/21 [00:00<?, ?it/s]

  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 1/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:   5%|▍         | 1/21 [00:03<01:05,  3.28s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 1 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 2/21 - Tentative 1
  ⚠ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, ret

Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  10%|▉         | 2/21 [00:23<04:15, 13.45s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 2 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 3/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  14%|█▍        | 3/21 [00:29<02:56,  9.79s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 3 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 4/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  19%|█▉        | 4/21 [00:33<02:08,  7.57s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 4 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 5/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  24%|██▍       | 5/21 [00:37<01:41,  6.31s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 5 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 6/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  29%|██▊       | 6/21 [00:40<01:20,  5.34s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 6 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 7/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  33%|███▎      | 7/21 [00:46<01:15,  5.37s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 7 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 8/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  38%|███▊      | 8/21 [00:49<01:02,  4.79s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 8 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 9/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  43%|████▎     | 9/21 [00:53<00:54,  4.54s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 9 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 10/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  48%|████▊     | 10/21 [00:57<00:46,  4.20s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 10 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 11/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  52%|█████▏    | 11/21 [01:00<00:37,  3.77s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 11 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 12/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  57%|█████▋    | 12/21 [01:03<00:33,  3.77s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 12 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 13/21 - Tentative 1
  ⚠ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, r

Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  62%|██████▏   | 13/21 [01:20<01:02,  7.79s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 13 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 14/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  67%|██████▋   | 14/21 [01:23<00:43,  6.18s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 14 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 15/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  71%|███████▏  | 15/21 [01:28<00:34,  5.75s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 15 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 16/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  76%|███████▌  | 16/21 [01:32<00:26,  5.40s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 16 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 17/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  81%|████████  | 17/21 [01:34<00:16,  4.15s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 17 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 18/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  86%|████████▌ | 18/21 [01:38<00:13,  4.35s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 18 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 19/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  90%|█████████ | 19/21 [01:40<00:07,  3.61s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 19 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 20/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout:  95%|█████████▌| 20/21 [01:41<00:02,  2.65s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 20 traité avec succès
  [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Traitement du chunk 21/21 - Tentative 1


Nettoyage 2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout: 100%|██████████| 21/21 [01:42<00:00,  4.87s/it]

  ✓ [2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout] Chunk 21 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_2022 - On the Use of eXplainable Artificial Intelligence to Evaluate School Dropout.csv'

🔄 Traitement du PDF: Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study





  📄 7 pages extraites de Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study
  🧹 Début du nettoyage...


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:   0%|          | 0/7 [00:00<?, ?it/s]

  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 1/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  14%|█▍        | 1/7 [00:06<00:37,  6.22s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 1 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 2/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  29%|██▊       | 2/7 [00:13<00:33,  6.68s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 2 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 3/7 - Tentative 1
  ⚠ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Quota épuisé (tentative 1/10): 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash-exp"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_d

Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  43%|████▎     | 3/7 [00:53<01:27, 21.85s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 3 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 4/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  57%|█████▋    | 4/7 [00:59<00:47, 15.78s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 4 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 5/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  71%|███████▏  | 5/7 [01:05<00:24, 12.19s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 5 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 6/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study:  86%|████████▌ | 6/7 [01:09<00:09,  9.44s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 6 traité avec succès
  [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Traitement du chunk 7/7 - Tentative 1


Nettoyage Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study: 100%|██████████| 7/7 [01:12<00:00, 10.33s/it]

  ✓ [Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study] Chunk 7 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_Djulovic and Li - 2013 - Towards freshman retention prediction A comparative study.csv'

🔄 Traitement du PDF: Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study





  📄 14 pages extraites de Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study
  🧹 Début du nettoyage...


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:   0%|          | 0/14 [00:00<?, ?it/s]

  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 1/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:   7%|▋         | 1/14 [00:00<00:09,  1.35it/s]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 1 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 2/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  14%|█▍        | 2/14 [00:05<00:35,  2.95s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 2 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 3/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  21%|██▏       | 3/14 [00:11<00:46,  4.27s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 3 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 4/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  29%|██▊       | 4/14 [00:17<00:51,  5.10s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 4 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 5/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  36%|███▌      | 5/14 [00:24<00:51,  5.75s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 5 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 6/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  43%|████▎     | 6/14 [00:29<00:44,  5.57s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 6 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 7/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  50%|█████     | 7/14 [00:35<00:38,  5.56s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 7 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 8/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  57%|█████▋    | 8/14 [00:39<00:30,  5.05s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 8 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 9/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  64%|██████▍   | 9/14 [00:43<00:24,  4.88s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 9 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 10/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  71%|███████▏  | 10/14 [00:49<00:20,  5.24s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 10 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 11/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  79%|███████▊  | 11/14 [00:53<00:14,  4.69s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 11 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 12/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  86%|████████▌ | 12/14 [00:59<00:10,  5.30s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 12 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 13/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study:  93%|█████████▎| 13/14 [01:01<00:04,  4.30s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 13 traité avec succès
  [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Traitement du chunk 14/14 - Tentative 1


Nettoyage Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study: 100%|██████████| 14/14 [01:03<00:00,  4.53s/it]

  ✓ [Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study] Chunk 14 traité avec succès
  ✅ Résultats sauvegardés dans 'clean_up_test\cleanedup_Mitra and Goldstein - 2015 - Designing early detection and intervention techniques via predictive statistical models—A case study.csv'

📊 RÉSUMÉ DU TRAITEMENT:
  ✅ Succès: 7 fichiers
  ❌ Échecs: 0 fichiers
  📁 Résultats sauvegardés dans './clean_up_test'





# Affichage des 5 premiers documents dans docs

In [14]:
# Sanity check: show the source and the first 300 characters of the first
# five entries of `docs` (loaded in an earlier cell).
print("Affichage des 5 premiers documents dans docs :\n")
for i, doc in enumerate(docs[:5]):
    source_label = doc.metadata.get('source', 'inconnu')
    print(f"📄 Document {i + 1} (source: {source_label})")
    preview = doc.page_content[:300]
    # One call emits the preview, the separator and the trailing blank line.
    print(preview, "---\n", sep="\n")

Affichage des 5 premiers documents dans docs :

📄 Document 1 (source: Sales et al. - 2016 - Exploiting academic records for predicting student drop out A case study in Brazilian higher educat.pdf)
Exploiting Academic Records for Predicting Student Drop
Out: a case study in Brazilian higher education
Allan Sales, Leandro Balby, Adalberto Cajueiro
Universidade Federal de Campina Grande, Brazil
allan.melo@ccc.ufcg.edu.br
{lbmarinho,adalberto}@computacao.ufcg.edu.br
Abstract. Students’ drop out i
---

📄 Document 2 (source: Sales et al. - 2016 - Exploiting academic records for predicting student drop out A case study in Brazilian higher educat.pdf)
Exploiting Academic Records for Predicting Student Drop Out: a case study in Brazilian higher education· 167
0
200
400
600
800
1 2 3 4 5 6 7 8 9 10
Semester
Dropouts
0
500000
1000000
1500000
1 2 3 4 5 6 7 8 9 10
Semester
Cost
Fig. 1. 1) Number of drop outs per semester enrolled, and 2) cost of drop 
---

📄 Document 3 (source: Sales et al. - 2016 

# Testing that everything is set up correctly

In [24]:
"""response = client.models.generate_content(model=MODEL_ID, contents="Dis qqch, n'importe quoi, je veux apprendre un fait nouveau.")

display(Markdown(response.text))"""

# Smoke test: send one prompt to Gemini and render the reply as Markdown.
smoke_test_prompt = "Dis qqch, n'importe quoi, je veux apprendre un fait nouveau. Une réponse en français et une autre en anglais sur un fait différent."
model = genai.GenerativeModel("gemini-2.0-flash")
response = model.generate_content(smoke_test_prompt)
display(Markdown(response.text))

Absolument ! Voici deux faits surprenants, un en français et un en anglais :

**En français :**

Saviez-vous que le mot "queue" est le seul mot de la langue française qui se prononce en entier quand on retire sa dernière lettre ? Si on enlève le "e" à la fin de "queue", on obtient "queu", qui se prononce exactement comme "queue".

**In English:**

Did you know that an octopus has three hearts? Two of these hearts pump blood through the gills, and the third pumps blood to the rest of the body. The third heart stops beating when the octopus swims, which is why it prefers to crawl along the seafloor.


# Database processing (To do)

In [None]:
# Single location for the on-disk Chroma database used by this cell.
CHROMA_DB_DIR = "./chroma_db"

# Embedding model used both to index the documents and to embed queries.
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Embed the loaded `docs` and store them in the database directory.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=gemini_embeddings,
    persist_directory=CHROMA_DB_DIR,
)

# Open the store again from the same directory, with the same embedding model.
vectorstore_disk = Chroma(
    persist_directory=CHROMA_DB_DIR,
    embedding_function=gemini_embeddings,
)

# Retriever configured with k=1 (a single result per query).
retriever = vectorstore_disk.as_retriever(search_kwargs=dict(k=1))

# Loading the cleaned text chunks into ChromaDB

In [25]:
def _documents_from_csv(csv_file, min_chars: int = 50):
    """Turn one cleaned-chunk CSV into a list of LangChain ``Document`` objects.

    Args:
        csv_file: ``pathlib.Path`` of a CSV expected to contain the columns
            ``cleaned_page_content`` and ``source``.
        min_chars: Rows whose stripped text is shorter than this are skipped.

    Returns:
        A list of ``Document`` (possibly empty), or ``None`` when a required
        column is missing (a warning is printed in that case).
    """
    df = pd.read_csv(csv_file)

    for column in ('cleaned_page_content', 'source'):
        if column not in df.columns:
            print(f"  ⚠️ Colonne '{column}' manquante dans {csv_file.name}")
            return None

    documents = []
    for idx, row in df.iterrows():
        cleaned_text = row['cleaned_page_content']

        # Skip empty cells and chunks too short to be useful.
        if pd.isna(cleaned_text):
            continue
        text = str(cleaned_text).strip()
        if len(text) < min_chars:
            continue

        documents.append(Document(
            page_content=text,
            metadata={
                'source': row['source'],
                'csv_file': csv_file.name,
                'page_index': idx,
                # Stable per-row identifier, also used as the Chroma id.
                'chunk_id': f"{csv_file.stem}_{idx}",
            },
        ))
    return documents


def load_cleaned_csvs_to_chroma(csv_dir: str = "clean_up_test",
                               chroma_dir: str = "chroma_db",
                               batch_size: int = 50):
    """
    Load every cleaned CSV found in ``csv_dir`` into a ChromaDB store.

    Each usable CSV row becomes one document. Documents are inserted in
    batches of ``batch_size`` using their ``chunk_id`` as the Chroma id, so
    running the function again on the same CSVs overwrites those entries
    instead of adding a second copy. Entries written by earlier runs under
    other (random) ids are not removed.

    Args:
        csv_dir: Directory containing the cleaned CSV files.
        chroma_dir: Directory of the ChromaDB database (created if missing).
        batch_size: Number of documents embedded and inserted per call.

    Returns:
        ``(vectorstore, retriever)`` on success, or ``None`` when ``csv_dir``
        holds no CSV file or no row is usable.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If ``csv_dir`` does not exist.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print("🔄 Initialisation du chargement des CSV vers ChromaDB...")

    # Validate the inputs before creating any API client or database.
    csv_path = Path(csv_dir)
    if not csv_path.exists():
        raise FileNotFoundError(f"Le dossier '{csv_dir}' n'existe pas.")

    # Sorted so that the processing order is the same on every platform.
    csv_files = sorted(csv_path.glob("*.csv"))

    if not csv_files:
        print(f"❌ Aucun fichier CSV trouvé dans '{csv_dir}'")
        return None

    print(f"📁 {len(csv_files)} fichiers CSV trouvés")

    # Collect the documents of every file; one bad file does not stop the run.
    all_documents = []
    for csv_file in csv_files:
        print(f"\n📄 Traitement de {csv_file.name}...")

        try:
            file_documents = _documents_from_csv(csv_file)
        except Exception as e:
            print(f"  ❌ Erreur lors du traitement de {csv_file.name}: {e}")
            continue

        if file_documents is None:  # required column missing, already reported
            continue

        all_documents.extend(file_documents)
        print(f"  ✅ {len(file_documents)} chunks extraits de {csv_file.name}")

    if not all_documents:
        print("❌ Aucun document valide trouvé dans les fichiers CSV")
        return None

    total_chunks = len(all_documents)
    print(f"\n📊 Total: {total_chunks} chunks à indexer")

    # Create the ChromaDB directory (and any missing parent) if needed.
    chroma_path = Path(chroma_dir)
    chroma_path.mkdir(parents=True, exist_ok=True)

    print(f"🆕 Création d'une nouvelle base de données ChromaDB dans '{chroma_dir}'")
    print("📤 Création de la base vectorielle avec embeddings Gemini...")
    print("⏳ Cela peut prendre quelques minutes selon le nombre de documents...")

    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectorstore = Chroma(
        persist_directory=chroma_dir,
        embedding_function=gemini_embeddings
    )

    # Insert batch by batch, keyed by chunk_id so that re-runs do not duplicate.
    for start in range(0, total_chunks, batch_size):
        batch = all_documents[start:start + batch_size]
        vectorstore.add_texts(
            texts=[document.page_content for document in batch],
            metadatas=[document.metadata for document in batch],
            ids=[document.metadata['chunk_id'] for document in batch],
        )

    print("💾 Sauvegarde et finalisation de la base de données...")
    # Only call persist() when the installed Chroma wrapper still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    # Retriever used for searches.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    print(f"\n✅ Base de données ChromaDB créée avec succès!")
    print(f"📊 Statistiques finales:")
    print(f"  • {len(csv_files)} fichiers CSV traités")
    print(f"  • {total_chunks} chunks indexés dans ChromaDB")
    print(f"  • Base de données créée dans '{chroma_dir}'")
    print(f"  • Retriever configuré pour k=5 résultats")

    return vectorstore, retriever

def test_vectorstore(retriever, test_query: str = "student performance"):
    """
    Test simple de la base vectorielle
    """
    print(f"\n🧪 Test de recherche avec la requête: '{test_query}'")

    try:
        results = retriever.get_relevant_documents(test_query)
        print(f"📋 {len(results)} résultats trouvés:")

        for i, doc in enumerate(results[:3], 1):  # Afficher seulement les 3 premiers
            print(f"\n  Résultat {i}:")
            print(f"    Source: {doc.metadata.get('source', 'N/A')}")
            print(f"    CSV: {doc.metadata.get('csv_file', 'N/A')}")
            print(f"    Contenu: {doc.page_content[:200]}...")

    except Exception as e:
        print(f"❌ Erreur lors du test: {e}")

def main():
    """
    Build the ChromaDB store from the cleaned CSVs, then run a sample search.

    Returns:
        ``(vectorstore, retriever)`` on success, ``(None, None)`` when the
        loader had nothing to index or when any step raised an exception
        (the error is printed, not re-raised).
    """
    try:
        # The loader returns None (not a pair) when there is nothing to index,
        # so the result must be checked before it is unpacked.
        loaded = load_cleaned_csvs_to_chroma()
        if loaded is None:
            print("❌ Échec du chargement")
            return None, None

        vectorstore, retriever = loaded
        if vectorstore is None or retriever is None:
            print("❌ Échec du chargement")
            return None, None

        # Optional sanity check of the freshly built store.
        test_vectorstore(retriever)

        # The store is now ready to be used.
        print("\n🎉 Nouvelle base vectorielle ChromaDB créée!")
        print("💡 Vous pouvez maintenant utiliser:")
        print("   - 'vectorstore' pour accéder directement à la base")
        print("   - 'retriever' pour effectuer des recherches")
        print("📁 Base sauvegardée dans: ./chroma_db")

        return vectorstore, retriever

    except Exception as e:
        print(f"❌ Erreur critique: {e}")
        return None, None

# Run the pipeline and keep the results as notebook-level variables.
# When the cell is executed in the notebook the guard is true (the captured
# output below shows the run); it only prevents execution if this code is
# imported as a module.
if __name__ == "__main__":
    vectorstore, retriever = main()

🔄 Initialisation du chargement des CSV vers ChromaDB...
📁 134 fichiers CSV trouvés

📄 Traitement de cleanedup_2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course.csv...
  ✅ 9 chunks extraits de cleanedup_2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course.csv

📄 Traitement de cleanedup_2012 - Monitoring student progress using virtual appliances A case study.csv...
  ✅ 10 chunks extraits de cleanedup_2012 - Monitoring student progress using virtual appliances A case study.csv

📄 Traitement de cleanedup_2012 - Predicting Student Outcome Measures Using the ASCA National Model Program Audit.csv...
  ✅ 8 chunks extraits de cleanedup_2012 - Predicting Student Outcome Measures Using the ASCA National Model Program Audit.csv

📄 Traitement de cleanedup_2012 - The effects of achievement goals and self-regulated learning behaviors on reading comprehension in t.csv...
  ✅ 14 chunks extraits de cleanedup_2012 - The effects of achie

  vectorstore.persist()
  results = retriever.get_relevant_documents(test_query)


📋 5 résultats trouvés:

  Résultat 1:
    Source: Rafique - 2021 - Integrating Learning Analytics and Collaborative Learning for Improving Student_s Academic Performan.pdf
    CSV: cleanedup_Rafique - 2021 - Integrating Learning Analytics and Collaborative Learning for Improving Student_s Academic Performan.csv
    Contenu: RQ2. WHICH ARE THE MOST IMPORTANT FEATURES
THAT HELP ACCURATE PREDICTION OF STUDENT’S
PERFORMANCE?
The second question determines important features to make
accurate predictions of students’ performan...

  Résultat 2:
    Source: Rafique - 2021 - Integrating Learning Analytics and Collaborative Learning for Improving Student_s Academic Performan.pdf
    CSV: cleanedup_Rafique - 2021 - Integrating Learning Analytics and Collaborative Learning for Improving Student_s Academic Performan.csv
    Contenu: RQ2. WHICH ARE THE MOST IMPORTANT FEATURES
THAT HELP ACCURATE PREDICTION OF STUDENT’S
PERFORMANCE?
The second question determines important features to make
accurate p

# Testing RAG

In [26]:
def load_cleaned_csvs_to_chroma(csv_dir: str = "clean_up_test",
                               chroma_dir: str = "chroma_db",
                               batch_size: int = 50):
    """
    Load every cleaned CSV found in ``csv_dir`` into the Chroma store at ``chroma_dir``.

    If the store directory already contains files, the store is opened and only
    rows whose ``chunk_id`` (``"<csv stem>_<row index>"``) is absent from the
    stored metadata are embedded and added. Otherwise a new store is created.

    Rows are ignored when ``cleaned_page_content`` is missing or shorter than
    50 characters once stripped. Files lacking the ``cleaned_page_content`` or
    ``source`` column are skipped entirely.

    Args:
        csv_dir: Directory containing the cleaned CSV files.
        chroma_dir: Directory of the persistent Chroma database.
        batch_size: Accepted for interface compatibility; the current
            implementation hands all new documents to Chroma in a single call
            and does not read this value.

    Returns:
        A ``(vectorstore, retriever)`` tuple on every path; the retriever is
        built with ``search_kwargs={"k": 5}``. Both items are ``None`` when
        there is nothing to load and no existing store could be opened.

    Raises:
        FileNotFoundError: If ``csv_dir`` does not exist.
    """

    print("🔄 Initialisation du chargement des CSV vers ChromaDB...")

    # Embedding model used both to open an existing store and to embed new chunks.
    gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # The CSV directory must already exist.
    csv_path = Path(csv_dir)
    if not csv_path.exists():
        raise FileNotFoundError(f"Le dossier '{csv_dir}' n'existe pas.")

    # The Chroma directory is created on demand (including missing parents).
    chroma_path = Path(chroma_dir)
    chroma_path.mkdir(parents=True, exist_ok=True)

    existing_vectorstore = None
    existing_chunk_ids = set()

    # Any entry in the directory is taken as evidence of an existing database.
    database_exists = any(chroma_path.glob("*"))

    if database_exists:
        try:
            print("🔍 Base ChromaDB existante détectée, chargement...")
            existing_vectorstore = Chroma(
                persist_directory=chroma_dir,
                embedding_function=gemini_embeddings
            )

            # Collect the chunk ids already stored so they are not embedded again.
            try:
                existing_docs = existing_vectorstore.get()
                if existing_docs and 'metadatas' in existing_docs:
                    existing_chunk_ids.update(
                        metadata['chunk_id']
                        for metadata in existing_docs['metadatas']
                        if metadata and 'chunk_id' in metadata
                    )

                print(f"📊 {len(existing_chunk_ids)} chunks existants trouvés dans la base")
            except Exception as e:
                # Best effort: without the ids, no row can be recognised as a duplicate.
                print(f"⚠️ Impossible de récupérer les métadonnées existantes: {e}")
                print("🔄 Continuons avec une vérification basique...")

        except Exception as e:
            print(f"⚠️ Erreur lors du chargement de la base existante: {e}")
            print("🔄 Création d'une nouvelle base...")
            existing_vectorstore = None
            database_exists = False

    # Find every CSV file to ingest.
    csv_files = list(csv_path.glob("*.csv"))

    if not csv_files:
        print(f"❌ Aucun fichier CSV trouvé dans '{csv_dir}'")
        # Return the same (vectorstore, retriever) shape as every other path so
        # that callers can always unpack the result.
        if existing_vectorstore is not None:
            return existing_vectorstore, existing_vectorstore.as_retriever(search_kwargs={"k": 5})
        return None, None

    print(f"📁 {len(csv_files)} fichiers CSV trouvés")

    # Accumulate the documents that are not yet in the store.
    new_documents = []
    total_new_chunks = 0
    skipped_chunks = 0

    for csv_file in csv_files:
        print(f"\n📄 Traitement de {csv_file.name}...")

        try:
            df = pd.read_csv(csv_file)

            # Both columns are required to build a document.
            if 'cleaned_page_content' not in df.columns:
                print(f"  ⚠️ Colonne 'cleaned_page_content' manquante dans {csv_file.name}")
                continue

            if 'source' not in df.columns:
                print(f"  ⚠️ Colonne 'source' manquante dans {csv_file.name}")
                continue

            file_documents = []
            file_skipped = 0

            for idx, row in df.iterrows():
                cleaned_text = row['cleaned_page_content']

                # Ignore empty or very short chunks.
                if pd.isna(cleaned_text) or len(str(cleaned_text).strip()) < 50:
                    continue

                # Identifier derived from the file name and the row index.
                chunk_id = f"{csv_file.stem}_{idx}"

                # Skip chunks that are already stored.
                if chunk_id in existing_chunk_ids:
                    file_skipped += 1
                    continue

                file_documents.append(
                    Document(
                        page_content=str(cleaned_text).strip(),
                        metadata={
                            'source': row['source'],
                            'csv_file': csv_file.name,
                            'page_index': idx,
                            'chunk_id': chunk_id
                        }
                    )
                )

            new_documents.extend(file_documents)
            total_new_chunks += len(file_documents)
            skipped_chunks += file_skipped

            print(f"  ✅ {len(file_documents)} nouveaux chunks extraits de {csv_file.name}")
            if file_skipped > 0:
                print(f"  ⏭️ {file_skipped} chunks ignorés (déjà existants)")

        except Exception as e:
            # A broken file must not abort the whole import.
            print(f"  ❌ Erreur lors du traitement de {csv_file.name}: {e}")
            continue

    # Summary of what is about to be written.
    print(f"\n📊 Résumé:")
    print(f"  • {total_new_chunks} nouveaux chunks à ajouter")
    print(f"  • {skipped_chunks} chunks ignorés (doublons)")
    print(f"  • {len(existing_chunk_ids)} chunks déjà en base")

    # Nothing new: hand back the existing store, if any.
    if not new_documents:
        print("ℹ️ Aucun nouveau document à ajouter")
        if existing_vectorstore is not None:
            retriever = existing_vectorstore.as_retriever(search_kwargs={"k": 5})
            print("✅ Utilisation de la base existante")
            return existing_vectorstore, retriever
        print("❌ Aucune base existante et aucun nouveau document")
        return None, None

    if database_exists and existing_vectorstore is not None:
        print(f"➕ Ajout de {total_new_chunks} nouveaux chunks à la base existante...")
        print("⏳ Génération des embeddings et ajout à la base...")

        try:
            # All new documents are added in one call.
            existing_vectorstore.add_documents(new_documents)
            vectorstore = existing_vectorstore

        except Exception as e:
            print(f"❌ Erreur lors de l'ajout: {e}")
            print("🔄 Tentative de création d'une nouvelle base...")
            # Fallback: build a store from the new documents only; the previously
            # stored ones are not re-read here.
            vectorstore = Chroma.from_documents(
                documents=new_documents,
                embedding=gemini_embeddings,
                persist_directory=chroma_dir
            )
    else:
        print(f"🆕 Création d'une nouvelle base de données ChromaDB dans '{chroma_dir}'")
        print("📤 Création de la base vectorielle avec embeddings Gemini...")
        print("⏳ Cela peut prendre quelques minutes selon le nombre de documents...")

        vectorstore = Chroma.from_documents(
            documents=new_documents,
            embedding=gemini_embeddings,
            persist_directory=chroma_dir
        )

    print("💾 Sauvegarde et finalisation de la base de données...")
    # persist() is deprecated in recent Chroma wrappers and absent from some;
    # call it only where it exists so the function works with both.
    if hasattr(vectorstore, "persist"):
        vectorstore.persist()

    # Retriever used for searches.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    # Final statistics (the total counts distinct known chunk ids plus the new chunks).
    total_in_db = len(existing_chunk_ids) + total_new_chunks

    print(f"\n✅ Base de données ChromaDB mise à jour avec succès!")
    print(f"📊 Statistiques finales:")
    print(f"  • {len(csv_files)} fichiers CSV traités")
    print(f"  • {total_new_chunks} nouveaux chunks ajoutés")
    print(f"  • {total_in_db} chunks totaux dans la base")
    print(f"  • Base de données dans '{chroma_dir}'")
    print(f"  • Retriever configuré pour k=5 résultats")

    return vectorstore, retriever

def get_database_stats(chroma_dir: str = "chroma_db") -> dict:
    """
    Summarise the contents of the Chroma store located at ``chroma_dir``.

    Returns:
        A dict with ``total_documents``, the sets ``csv_files`` and ``sources``
        gathered from the stored metadata, and their sizes as
        ``unique_csv_files`` / ``unique_sources``. If anything fails, a dict
        with the single key ``error`` holding the exception message.
    """
    try:
        embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        store = Chroma(persist_directory=chroma_dir, embedding_function=embeddings)

        # Everything the store holds: ids, metadatas, ...
        payload = store.get()
        stored_ids = payload['ids']

        csv_names = set()
        source_names = set()
        for meta in payload.get('metadatas') or []:
            if not meta:
                continue
            if 'csv_file' in meta:
                csv_names.add(meta['csv_file'])
            if 'source' in meta:
                source_names.add(meta['source'])

        return {
            'total_documents': len(stored_ids) if stored_ids else 0,
            'csv_files': csv_names,
            'sources': source_names,
            'unique_csv_files': len(csv_names),
            'unique_sources': len(source_names),
        }

    except Exception as e:
        # Errors are reported as data so the caller can print them.
        return {'error': str(e)}

def test_vectorstore(retriever, test_query: str = "student performance"):
    """
    Test simple de la base vectorielle
    """
    print(f"\n🧪 Test de recherche avec la requête: '{test_query}'")

    try:
        results = retriever.get_relevant_documents(test_query)
        print(f"📋 {len(results)} résultats trouvés:")

        for i, doc in enumerate(results[:3], 1):  # Afficher seulement les 3 premiers
            print(f"\n  Résultat {i}:")
            print(f"    Source: {doc.metadata.get('source', 'N/A')}")
            print(f"    CSV: {doc.metadata.get('csv_file', 'N/A')}")
            print(f"    Chunk ID: {doc.metadata.get('chunk_id', 'N/A')}")
            print(f"    Contenu: {doc.page_content[:200]}...")

    except Exception as e:
        print(f"❌ Erreur lors du test: {e}")

def main():
    """
    Entry point: report on the existing store, ingest the CSVs, run a smoke search.

    Returns:
        ``(vectorstore, retriever)`` on success, ``(None, None)`` when loading
        fails or any exception is raised.
    """
    try:
        chroma_dir = "chroma_db"
        db_path = Path(chroma_dir)

        # Show statistics only when the store directory exists and is not empty.
        if db_path.exists() and list(db_path.glob("*")):
            print("📊 Statistiques de la base existante:")
            stats = get_database_stats(chroma_dir)
            if 'error' in stats:
                print(f"  ⚠️ Erreur lors de la lecture des stats: {stats['error']}")
            else:
                print(f"  • {stats['total_documents']} documents en base")
                print(f"  • {stats['unique_csv_files']} fichiers CSV uniques")
                print(f"  • {stats['unique_sources']} sources uniques")

        # Incremental load of the CSV files into Chroma.
        vectorstore, retriever = load_cleaned_csvs_to_chroma()

        if not (vectorstore and retriever):
            print("❌ Échec du chargement")
            return None, None

        # Smoke test of the retriever.
        test_vectorstore(retriever)

        # The store is ready to use.
        print("\n🎉 Base vectorielle ChromaDB mise à jour!")
        print("💡 Vous pouvez maintenant utiliser:")
        print("   - 'vectorstore' pour accéder directement à la base")
        print("   - 'retriever' pour effectuer des recherches")
        print("📁 Base sauvegardée dans: ./chroma_db")

        return vectorstore, retriever

    except Exception as e:
        print(f"❌ Erreur critique: {e}")
        return None, None

# Runs main() when the cell executes and binds its result to the notebook-level
# names `vectorstore` and `retriever`, which later cells use directly.
if __name__ == "__main__":
    vectorstore, retriever = main()

📊 Statistiques de la base existante:


  vectorstore = Chroma(


  • 3588 documents en base
  • 136 fichiers CSV uniques
  • 133 sources uniques
🔄 Initialisation du chargement des CSV vers ChromaDB...
🔍 Base ChromaDB existante détectée, chargement...
📊 1802 chunks existants trouvés dans la base
📁 134 fichiers CSV trouvés

📄 Traitement de cleanedup_2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course.csv...
  ✅ 0 nouveaux chunks extraits de cleanedup_2012 - Do Situational Academic Emotions Predict Academic Outcomes in a Lecture Course.csv
  ⏭️ 9 chunks ignorés (déjà existants)

📄 Traitement de cleanedup_2012 - Monitoring student progress using virtual appliances A case study.csv...
  ✅ 0 nouveaux chunks extraits de cleanedup_2012 - Monitoring student progress using virtual appliances A case study.csv
  ⏭️ 10 chunks ignorés (déjà existants)

📄 Traitement de cleanedup_2012 - Predicting Student Outcome Measures Using the ASCA National Model Program Audit.csv...
  ✅ 0 nouveaux chunks extraits de cleanedup_2012 - Predictin

In [27]:
# Gemini chat model used by the RAG chain (temperature=0, top_p=1, up to 3 retries).
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    top_p=1,
    max_retries=3,
)

In [74]:
# Prompt template for the RAG chain: {question} receives the user query and
# {context} the retrieved passages.
# NOTE(review): the off-topic reply below is abusive and user-facing. It cannot be
# reworded in isolation because later cells detect refusals by searching the answer
# for the "blablabla" marker. A neutral wording such as
# "This question is outside the scope of the provided documents." should replace it
# together with those checks.


LLM_CONTEXT = """
Prompt:

You are a research question-answering assistant with access to a curated set of academic papers on student performance.

You must strictly follow these instructions:

Answer only based on the provided context. Never speculate or rely on external knowledge.

If the answer is not found in the context, respond exactly:

The answer is not available in the provided documents.

If the question is off-topic (not about student performance), respond exactly:

Your question is so blablabla stupid, you piece of shit. I'm here only to answer questions about student performances.

Keep your answer concise, using a maximum of five sentences.

Format your answer in Markdown.

Respond in the same language as the question.

Example

Question:
Quels sont les facteurs les plus corrélés à la réussite des étudiants selon les études disponibles ?

Answer:
Plusieurs études indiquent que l’assiduité, le niveau socio-économique et la motivation intrinsèque sont des facteurs fortement corrélés à la réussite des étudiants. Par exemple, le document student_success_analysis.pdf met en avant l’impact positif d’une forte participation en classe.


Input:

Question: {question}
Context: {context}
Answer:
"""
# Builds the template; the input variables are inferred from the {placeholders} above.
llm_prompt = PromptTemplate.from_template(LLM_CONTEXT)

print(llm_prompt)


input_variables=['context', 'question'] input_types={} partial_variables={} template="\nPrompt:\n\nYou are a research question-answering assistant with access to a curated set of academic papers on student performance.\n\nYou must strictly follow these instructions:\n\nAnswer only based on the provided context. Never speculate or rely on external knowledge.\n\nIf the answer is not found in the context, respond exactly:\n\nThe answer is not available in the provided documents.\n\nIf the question is off-topic (not about student performance), respond exactly:\n\nYour question is so blablabla stupid, you piece of shit. I'm here only to answer questions about student performances.\n\nKeep your answer concise, using a maximum of five sentences.\n\nFormat your answer in Markdown.\n\nRespond in the same language as the question.\n\nExample\n\nQuestion:\nQuels sont les facteurs les plus corrélés à la réussite des étudiants selon les études disponibles ?\n\nAnswer:\nPlusieurs études indiquent qu

In [33]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, AIMessage

# Stand-alone interactive chat with Gemini; no retrieval is involved in this cell.
chat = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5)

# Running transcript of the session; it grows by two messages per turn.
messages = []

print("Bienvenue dans le chat avec Gemini ! Tape 'exit' pour quitter.\n")

while True:
    user_input = input("Toi : ")
    # "exit" or "quit" (any casing) ends the session.
    if user_input.lower() in ["exit", "quit"]:
        print("Fin de la session.")
        break

    print(f"\n Vous : {user_input}")
    messages.append(HumanMessage(content=user_input))

    # The whole transcript is sent on every turn so the model sees prior exchanges.
    response = chat.invoke(messages)
    print(f"Gemini : {response.content}\n")

    messages.append(AIMessage(content=response.content))


Bienvenue dans le chat avec Gemini ! Tape 'exit' pour quitter.


 Vous : Salut ! Je fais un ptit test
Gemini : Salut ! Je suis prêt pour ton test. Pose tes questions ! 😊


 Vous : Donne moi un fait insolite, un fait en français et un en anglais !
Gemini : Parfait, voici :

*   **Insolite :** Les loutres de mer se tiennent la main en dormant pour ne pas dériver.

*   **En français :** Le mot "squelette" est le seul mot de la langue française qui se termine par les lettres "ete" et qui se prononce "èt".

*   **En anglais :** "Dreamt" is the only English word that ends in "mt". (Dreamt est le seul mot anglais qui se termine par "mt").


 Vous : Nickel ! 
Gemini : Content que ça te plaise ! Tu as d'autres questions ou tests pour moi ? 😊


 Vous : Non
Gemini : D'accord ! N'hésite pas si tu as besoin de quoi que ce soit d'autre plus tard. Bonne journée ! 😊

Fin de la session.


In [None]:
def format_docs_with_sources(docs):
    """Render retrieved documents as a single context string for the RAG prompt.

    Each document becomes ``Document <n> (Source: <source>):`` followed by its
    text, with ``N/A`` when the metadata has no ``source``. Consecutive
    documents are separated by a blank line, a rule of 80 ``=`` characters and
    another blank line.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and ``page_content``.

    Returns:
        The formatted string; empty when ``docs`` is empty.
    """
    # The separator is built first and then joined: writing
    # "\n\n" + "=" * 80 + "\n\n".join(...) would only prepend the rule once,
    # because the method call binds tighter than "+".
    separator = "\n\n" + "=" * 80 + "\n\n"
    return separator.join(
        f"Document {i} (Source: {doc.metadata.get('source', 'N/A')}):\n{doc.page_content}"
        for i, doc in enumerate(docs, 1)
    )

def get_sources_used(prompt_question, retriever, top_k=5):
    """Print and return the sources of the first ``top_k`` documents retrieved for a question.

    Returns:
        List of ``source`` metadata values (``'N/A'`` when missing), in retrieval order.
    """
    retrieved = retriever.get_relevant_documents(prompt_question)

    print(f"\nSources utilisées pour répondre à la question (Top {top_k}):")
    print("-" * 60)

    sources_used = [doc.metadata.get('source', 'N/A') for doc in retrieved[:top_k]]
    for rank, source in enumerate(sources_used, 1):
        print(f"{rank}. {source}")
    return sources_used

In [None]:
def format_docs(docs):
    """Concatenate the text of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# RAG pipeline: the incoming question is forwarded unchanged as "question" and is
# also sent to `retriever`, whose documents format_docs_with_sources turns into
# "context". Both fill the prompt, which goes to the LLM; the reply is parsed to str.
rag_chain = (
    {"context": retriever | format_docs_with_sources, "question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Example on-topic question kept for manual runs:
#prompt_question = "As a teacher, how can I use learning analytics to collect data for tracking, monitoring, and enhancing students’ performance?"
# Disabled one-shot off-topic check:
# prompt_question = "tu t'appelles comment gros?"
# print(rag_chain.invoke(prompt_question))
# sources = get_sources_used(prompt_question, retriever)

# Ask until the chain produces a real answer. A reply containing one of the two
# refusal markers from the prompt counts as "no answer". The markers are matched
# case-insensitively, like the later copy of this loop, so a change of casing in
# the model's reply is still recognised.
# NOTE(review): get_sources_and_scores is defined in a later cell; that cell must
# have been run before this one.
t = True
i = 0
while t:
    i += 1
    prompt_question = str(input("pose ta question: "))
    reponse = rag_chain.invoke(prompt_question)
    if "blablabla" not in str(reponse).lower() and "not available in the provided documents" not in str(reponse).lower():
        print(reponse)
        #sources = get_sources_used(prompt_question, retriever)
        sources = get_sources_and_scores(prompt_question, vectorstore)
        t = False
    else:
        # Escalating hints; from the 10th failed attempt the raw reply is shown.
        if i < 5:
            print("try again: out of context question or not covered / rententez: question hors sujet ou non couverte")
        elif i < 10:
            print("the subject is student's performance and the question must match available research. Be precise.")
        else:
            print(reponse)

In [75]:
def format_docs_with_sources_and_scores(docs_with_scores):
    """Render scored documents as a single context string for the RAG prompt.

    Each ``(doc, score)`` pair becomes
    ``Document <n> (Source: <source>, Score: <score to 4 decimals>):`` followed
    by the document text. Consecutive documents are separated by a blank line,
    a rule of 80 ``=`` characters and another blank line.

    Args:
        docs_with_scores: Iterable of ``(doc, score)`` pairs where ``doc``
            exposes ``metadata`` (a dict) and ``page_content``.

    Returns:
        The formatted string; empty when there are no documents.
    """
    # Build the separator before joining: "\n\n" + "=" * 80 + "\n\n".join(...)
    # would only prepend the rule once, because the method call binds tighter than "+".
    separator = "\n\n" + "=" * 80 + "\n\n"
    formatted_docs = []
    for i, (doc, score) in enumerate(docs_with_scores, 1):
        source = doc.metadata.get('source', 'N/A')
        formatted_docs.append(
            f"Document {i} (Source: {source}, Score: {score:.4f}):\n{doc.page_content}"
        )
    return separator.join(formatted_docs)

def get_sources_and_scores(prompt_question, retriever, top_k=5, print_excerpt=True):
    """
    Search for the ``top_k`` best-scoring documents and report their sources.

    ``retriever`` must expose ``similarity_search_with_score(query, k=...)``.
    For every hit the source and score are printed; when ``print_excerpt`` is
    true, the first 150 characters of the text follow (newlines replaced by spaces).

    Returns:
        List of ``(source, score)`` tuples in result order.
    """
    hits = retriever.similarity_search_with_score(prompt_question, k=top_k)

    print(f"\nSources utilisées pour répondre à la question (Top {top_k}):")
    print("-" * 60)

    sources_scores = []
    rank = 0
    for doc, score in hits:
        rank += 1
        source = doc.metadata.get('source', 'N/A')
        sources_scores.append((source, score))
        print(f"{rank}. Source: {source} | Score: {score:.4f}")
        if not print_excerpt:
            continue
        flattened = doc.page_content[:150].replace('\n', ' ')
        print(f"   Extrait: {flattened}...\n")

    return sources_scores


In [76]:
from langchain.schema.runnable import RunnableLambda

def retrieve_and_format(query):
    """Fetch the 5 best-scoring documents from the notebook-level ``vectorstore`` and format them as prompt context."""
    scored_docs = vectorstore.similarity_search_with_score(query, k=5)
    context = format_docs_with_sources_and_scores(scored_docs)
    return context

# Rebuilds the RAG pipeline so that "context" comes from retrieve_and_format
# (scored similarity search on `vectorstore`) instead of the plain retriever.
# The question is still forwarded unchanged as "question".
rag_chain = (
    {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)

In [81]:
import difflib

def is_similar(a, b, threshold=0.9):
    """Return True when the difflib similarity ratio of ``a`` and ``b`` is strictly above ``threshold`` (0.0 to 1.0)."""
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    similarity = matcher.ratio()
    return similarity > threshold

def format_docs_with_sources_and_scores(docs_with_scores, remove_similar=False):
    """Render scored documents as prompt context, dropping repeated passages.

    A document is dropped when its stripped text was already seen: exact match
    by default, or ``is_similar`` match against any kept text when
    ``remove_similar`` is true. The kept documents are numbered from 1 and
    rendered as ``Document <n> (Source: <source>, Score: <score to 4 decimals>):``
    followed by the original (unstripped) text. Consecutive documents are
    separated by a blank line, a rule of 80 ``=`` characters and another blank line.

    Args:
        docs_with_scores: Iterable of ``(doc, score)`` pairs where ``doc``
            exposes ``metadata`` (a dict) and ``page_content``.
        remove_similar: Use fuzzy matching instead of exact matching for deduplication.

    Returns:
        The formatted string; empty when no document is kept.
    """
    seen_texts = []
    unique_results = []

    for doc, score in docs_with_scores:
        snippet = doc.page_content.strip()
        if remove_similar:
            already_seen = any(is_similar(snippet, seen) for seen in seen_texts)
        else:
            already_seen = snippet in seen_texts
        if already_seen:
            continue
        seen_texts.append(snippet)
        unique_results.append((doc, score))

    # Build the separator before joining: "\n\n" + "=" * 80 + "\n\n".join(...)
    # would only prepend the rule once, because the method call binds tighter than "+".
    separator = "\n\n" + "=" * 80 + "\n\n"
    formatted_docs = []
    for i, (doc, score) in enumerate(unique_results, 1):
        source = doc.metadata.get('source', 'N/A')
        formatted_docs.append(
            f"Document {i} (Source: {source}, Score: {score:.4f}):\n{doc.page_content}"
        )
    return separator.join(formatted_docs)

def get_sources_and_scores(prompt_question, retriever, top_k=5, print_excerpt=True, remove_similar=True):
    """
    Search for relevant documents, drop repeated passages and report the sources.

    ``retriever`` must expose ``similarity_search_with_score(query, k=...)``;
    the calling loop passes the vector store. ``top_k + 10`` candidates are
    requested, then candidates whose stripped text was already kept are
    discarded (``is_similar`` match when ``remove_similar`` is true, exact match
    otherwise) until ``top_k`` unique hits are collected.

    The source and score of every kept hit are always printed; the 150-character
    excerpt is added only when ``print_excerpt`` is true.

    Returns:
        List of ``(source, score)`` tuples for the kept hits, in result order.
    """
    # Over-fetch so that top_k unique hits can still be reached after deduplication.
    buffer_k = top_k + 10
    results_with_scores = retriever.similarity_search_with_score(prompt_question, k=buffer_k)

    seen_texts = []
    unique_results = []

    for doc, score in results_with_scores:
        snippet = doc.page_content.strip()
        if remove_similar:
            already_seen = any(is_similar(snippet, seen) for seen in seen_texts)
        else:
            already_seen = snippet in seen_texts
        if already_seen:
            continue
        seen_texts.append(snippet)
        unique_results.append((doc, score))
        if len(unique_results) >= top_k:
            break

    print(f"\nSources utilisées pour répondre à la question (Top {len(unique_results)} uniques):")
    print("-" * 60)

    for i, (doc, score) in enumerate(unique_results, 1):
        source = doc.metadata.get('source', 'N/A')
        # The source line is printed unconditionally; print_excerpt only controls
        # the excerpt, so disabling it no longer leaves an empty list under the header.
        print(f"{i}. Source: {source} | Score: {score:.4f}")
        if print_excerpt:
            extrait = doc.page_content[:150].replace('\n', ' ') + "..."
            print(f"   Extrait: {extrait}\n")

    return [(doc.metadata.get('source', 'N/A'), score) for doc, score in unique_results]

def retrieve_and_format(query, retriever, top_k=5, remove_similar=False):
    """Run a scored similarity search on ``retriever`` and format the hits as prompt context.

    ``retriever`` must expose ``similarity_search_with_score(query, k=...)``.
    Repeated passages are filtered by the formatter; ``remove_similar`` is
    forwarded to it.
    """
    scored_docs = retriever.similarity_search_with_score(query, k=top_k)
    return format_docs_with_sources_and_scores(scored_docs, remove_similar=remove_similar)


# Example usage: keep asking until the chain returns a real answer.

t = True
i = 0
while t:
    i += 1
    prompt_question = str(input("pose ta question: "))

    # The chain only needs the question; retrieval happens inside it.
    reponse = rag_chain.invoke(prompt_question)

    # A reply containing one of the refusal markers (any casing) counts as "no answer".
    if any(
        marker in str(reponse).lower()
        for marker in ("blablabla", "not available in the provided documents")
    ):
        # Escalating hints; from the 10th failed attempt the raw reply is shown.
        if i < 5:
            print("try again: out of context question or not covered / rententez: question hors sujet ou non couverte")
        elif i < 10:
            print("the subject is student's performance and the question must match available research. Be precise.")
        else:
            print(reponse)
        continue

    print("\nRéponse générée :\n", reponse)

    # Show the deduplicated sources and their scores.
    sources = get_sources_and_scores(prompt_question, vectorstore, top_k=5, print_excerpt=True, remove_similar=True)

    t = False

try again: out of context question or not covered / rententez: question hors sujet ou non couverte

Réponse générée :
 Les critères de réussite d'une étudiante incluent les résultats semestriels cumulés de performance académique, le nombre d'ECTS réussis et échoués, les notes maximales, moyennes et minimales obtenues. D'autres critères comprennent le nombre d'évaluations échouées, le nombre d'UE réussies et échouées, et la différence de performance entre les semestres. L'âge de l'étudiante au moment de l'inscription, sa nationalité et son genre sont également considérés.

Sources utilisées pour répondre à la question (Top 5 uniques):
------------------------------------------------------------
1. Source: A Data Mining Approach for Predicting Academic Success – A Case Study Helping Teachers Develop Rese.pdf | Score: 0.5788
   Extrait: T able 1. List of variables sustaining the model. Id Attribute Cat Type Min..max Meaning 1 curricular year s C Discrete 1..4 Student’s course year in ...


In [None]:
from datetime import datetime
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import PromptTemplate
from typing import List, Dict, Tuple
import json
# Add these imports at the top of your file
import time
import random
import logging

# Rebinds the notebook-level `llm`, this time with temperature=0.5 and top_p=0.9
# (the earlier instance used temperature=0 and top_p=1).
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5, top_p =0.9, max_retries=3)
# Configure root logging at INFO and create the module logger used by the classes below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Throttling / retry wrapper placed in front of a retriever.
class RateLimitedRetriever:
    """Delegates to a retriever's ``get_relevant_documents`` with spacing and retries.

    A call waits until at least ``min_interval`` seconds have passed since the
    previous request, then tries up to ``max_retries + 1`` times. Errors whose
    message mentions ``rate_limit_exceeded``, ``429`` or ``quota`` are retried
    after an exponential backoff with jitter; any other error is re-raised at once.
    """

    def __init__(self, retriever, max_retries=3, base_delay=1.0, max_delay=60.0):
        self.retriever = retriever
        self.max_retries = max_retries
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.last_request_time = 0
        # Minimum number of seconds between two requests.
        self.min_interval = 0.5

    def _wait_if_needed(self):
        """Sleep for whatever remains of ``min_interval`` since the last request."""
        remaining = self.min_interval - (time.time() - self.last_request_time)
        if remaining > 0:
            logger.info(f"Rate limiting: waiting {remaining:.2f}s")
            time.sleep(remaining)

    def _exponential_backoff(self, attempt):
        """Return ``base_delay * 2**attempt`` plus up to 1s of jitter, capped at ``max_delay``."""
        jittered = self.base_delay * (2 ** attempt) + random.uniform(0, 1)
        return min(jittered, self.max_delay)

    def get_relevant_documents(self, query: str):
        """Fetch documents for ``query``, retrying when the backend reports rate limiting.

        Raises:
            Exception: with a fixed message once every retry hit a rate limit;
                otherwise the original non-rate-limit exception.
        """
        self._wait_if_needed()

        for attempt in range(self.max_retries + 1):
            try:
                self.last_request_time = time.time()
                documents = self.retriever.get_relevant_documents(query)
                logger.info(f"Successfully retrieved {len(documents)} documents")
                return documents

            except Exception as e:
                lowered = str(e).lower()
                throttled = any(
                    marker in lowered
                    for marker in ("rate_limit_exceeded", "429", "quota")
                )

                if not throttled:
                    # Not a rate-limit problem: surface it immediately.
                    logger.error(f"Non-rate-limit error: {e}")
                    raise e

                if attempt >= self.max_retries:
                    logger.error("Max retries exceeded for rate limiting")
                    raise Exception("Rate limit exceeded after all retries. Please wait before making more requests.")

                delay = self._exponential_backoff(attempt)
                logger.warning(f"Rate limit hit. Attempt {attempt + 1}/{self.max_retries + 1}. Waiting {delay:.2f}s")
                time.sleep(delay)

        return []

class ConversationMemory:
    """Bounded history of question/answer exchanges used as conversational context.

    Only the ``max_history`` most recent exchanges are kept; older ones are
    dropped as new ones arrive.
    """

    def __init__(self, max_history=3, max_context_length=900):
        # Each entry: {question, response, timestamp, sources}
        self.history: List[Dict] = []
        self.max_history = max_history
        # NOTE(review): stored but not read by any method of this class.
        self.max_context_length = max_context_length

    def add_exchange(self, question: str, response: str, sources: Optional[List[str]] = None):
        """Record one exchange, evicting the oldest entry when the history is full.

        Args:
            question: The user's question.
            response: The generated answer.
            sources: Sources backing the answer; stored as an empty list when omitted.
        """
        self.history.append({
            'question': question,
            'response': response,
            'timestamp': datetime.now().isoformat(),
            'sources': sources or []
        })

        # Keep at most max_history entries.
        if len(self.history) > self.max_history:
            self.history.pop(0)

    def get_context_summary(self) -> str:
        """Return a text summary of the last three exchanges.

        Each exchange is rendered as ``Exchange <n>``, the question and the
        first 300 characters of the response. The ``...`` marker is appended
        only when the response was actually cut, so a short answer is not
        presented as truncated.

        Returns:
            The summary, or ``"No previous conversation."`` when the history is empty.
        """
        if not self.history:
            return "No previous conversation."

        context_parts = []
        for i, exchange in enumerate(self.history[-3:], 1):  # Last 3 exchanges
            response = exchange['response']
            excerpt = response[:300] + "..." if len(response) > 300 else response
            context_parts.append(
                f"Exchange {i}:\n"
                f"Q: {exchange['question']}\n"
                f"A: {excerpt}"
            )

        return "\n\n".join(context_parts)

class ContextualRAG:
    """Retrieval-augmented question answering that always passes chat history.

    For every question the retriever is queried, the retrieved documents and
    the memory's history summary are inserted into a fixed prompt, the LLM is
    called, and the resulting exchange is stored back into the memory.
    """

    def __init__(self, retriever, llm, memory: "ConversationMemory"):
        """Store the collaborators and build the prompt template.

        Args:
            retriever: Object exposing ``get_relevant_documents(question)``.
            llm: Object exposing ``invoke(prompt_value)``.
            memory: Conversation memory providing ``get_context_summary()``
                and ``add_exchange(question, response, sources)``.
        """
        self.retriever = retriever
        self.llm = llm
        self.memory = memory

        # NOTE(review): the off-topic reply in rule 5 is kept verbatim because
        # other code in this file matches on its wording ("blablabla"). Its
        # abusive phrasing should be revisited together with those checks.
        self.LLM_CONTEXT = """
Conversation History:
{conversation_history}

You are a research question-answering assistant with access to a curated set of academic papers on student performance.
You are an expert on everything related to student performance, including grades, lifestyle, mental health, study habits, educational psychology, learning strategies, academic motivation, and student well-being.

The conversation history above is provided for context. Use it if the current question references previous exchanges, but ignore it if the question is completely independent.

INSTRUCTIONS (follow in order of priority):

1. **PRIMARY SOURCE**: Always prioritize information from the provided context/documents when available.

2. **SUPPLEMENTARY KNOWLEDGE**: If the question is about student performance or closely related topics (education, learning, academic success, student well-being, etc.) BUT the provided context lacks sufficient detail or doesn't fully address the question, you may supplement with your expert knowledge to provide a complete answer.

3. **INTEGRATION**: When using both sources, clearly indicate what comes from the documents vs. your additional expertise. Use phrases like:
   - "Based on the provided research..." (for document content)
   - "Additionally, research in this area suggests..." (for supplementary knowledge)

4. **UNAVAILABLE INFORMATION**: If the question is about student performance but neither the documents nor your knowledge can adequately answer it, respond exactly:
   "The answer is not available in the provided documents, and I don't have sufficient additional knowledge on this specific aspect."

5. **OFF-TOPIC QUESTIONS**: If the question is completely unrelated to student performance, education, or learning, respond exactly:
   "Your question is so blablabla stupid, you piece of shit. I'm here only to answer questions about student performances."

**FORMATTING REQUIREMENTS**:
- Keep your answer concise but comprehensive, using a maximum of seven sentences when combining sources
- Format your answer in Markdown
- Respond in the same language as the question
- When citing documents, mention the source when possible

Current Question: {question}
Context: {context}
Answer:
"""

        self.llm_prompt = PromptTemplate.from_template(self.LLM_CONTEXT)

    def format_docs_with_sources(self, docs):
        """Join documents into one string, each prefixed by its source tag.

        Args:
            docs: Iterable of objects with ``metadata`` (dict) and
                ``page_content`` attributes.

        Returns:
            ``"[Source: <source>]\\n<content>"`` sections separated by blank
            lines; ``N/A`` is used when the metadata has no ``source`` key.
        """
        return "\n\n".join(
            f"[Source: {doc.metadata.get('source', 'N/A')}]\n{doc.page_content}"
            for doc in docs
        )

    def invoke(self, question: str) -> str:
        """Answer ``question`` using retrieved documents and the chat history.

        Args:
            question: The user's question; it is also used as the retrieval query.

        Returns:
            The LLM's answer. On failure a Markdown-formatted message is
            returned instead of raising: a rate-limit notice when the error
            text mentions "rate limit" or "quota", otherwise the error text.
            Failed calls are not added to the memory.
        """
        try:
            # 1. Retrieve relevant documents using the original question.
            #    A retriever that yields None is treated as "nothing found".
            relevant_docs = self.retriever.get_relevant_documents(question) or []

            # 2. Prepare contexts
            document_context = self.format_docs_with_sources(relevant_docs)
            conversation_history = self.memory.get_context_summary()

            # 3. Fill the prompt; history is always included
            prompt_value = self.llm_prompt.invoke({
                "conversation_history": conversation_history,
                "question": question,
                "context": document_context
            })

            # 4. Call the LLM. Message-style results carry the text in
            #    `.content`; a result without that attribute is used as-is.
            llm_output = self.llm.invoke(prompt_value)
            final_response = getattr(llm_output, "content", llm_output)

            # 5. Sources of the top three documents, without repeating a
            #    source when several chunks come from the same document.
            sources = []
            for doc in relevant_docs[:3]:
                source = doc.metadata.get('source', 'N/A')
                if source not in sources:
                    sources.append(source)

            # 6. Update memory
            self.memory.add_exchange(question, final_response, sources)

            return final_response

        except Exception as e:
            error_msg = str(e)
            if "rate limit" in error_msg.lower() or "quota" in error_msg.lower():
                return "⚠️ **Rate limit reached**. Please wait a moment before asking another question."
            else:
                logger.error(f"Unexpected error: {e}")
                return f"❌ **Error occurred**: {error_msg}"

def get_sources_with_context(question: str, retriever):
    """Print and return the sources of the top documents for ``question``.

    The retriever is queried with ``question`` and the ``source`` metadata of
    the first three documents is collected. A source that appears on several
    of those documents is listed only once, in first-seen order.

    Args:
        question: Query passed to ``retriever.get_relevant_documents``.
        retriever: Object exposing ``get_relevant_documents(question)``.

    Returns:
        The list of distinct sources that was printed, or ``[]`` if retrieval
        failed (the error is printed rather than raised).
    """
    try:
        # A retriever that yields None is treated as "nothing found".
        results = retriever.get_relevant_documents(question) or []

        sources_used = []
        for doc in results[:3]:
            source = doc.metadata.get('source', 'N/A')
            if source not in sources_used:
                sources_used.append(source)

        print("\nSources used:")
        print("-" * 70)

        for i, source in enumerate(sources_used, 1):
            print(f"{i}. {source}")
            print()

        return sources_used
    except Exception as e:
        print(f"❌ Error retrieving sources: {e}")
        return []

# Helper: sources are only listed when the model produced a real answer
def should_show_sources(response: str) -> bool:
    """Tell whether the sources list should accompany ``response``.

    The response is lower-cased and searched for a fixed set of markers that
    identify refusals, rate-limit notices and error messages.

    Returns:
        ``False`` if any marker occurs in the response, ``True`` otherwise.
    """
    non_answer_markers = (
        "the answer is not available in the provided documents",
        "your question is so blablabla stupid",
        "rate limit reached",
        "error occurred",
        "⚠️",
        "❌",
    )

    normalized = response.lower().strip()
    return not any(marker in normalized for marker in non_answer_markers)

# Interactive entry point for the conversational RAG loop
def run_conversational_rag():
    """Run the interactive question/answer loop on standard input.

    Builds a ``ConversationMemory`` (five exchanges) and a ``ContextualRAG``
    from the module-level ``retriever`` and ``llm``, then reads questions
    until the user quits.

    Commands: ``quit`` / ``exit`` / ``q`` end the session; ``history`` prints
    the stored exchanges; empty input is ignored.

    The number shown in the prompt counts attempts since the last displayed
    answer (or ``history`` command). Replies containing the off-topic marker
    "blablabla" are hidden and replaced by escalating notices; from the tenth
    attempt on the raw reply is printed.

    The session ends cleanly on Ctrl-C, on end-of-input, or when reading from
    standard input fails for any other reason.
    """

    # Initialization
    memory = ConversationMemory(max_history=5)
    contextual_rag = ContextualRAG(retriever, llm, memory)

    print("=== Conversational RAG Started ===")
    print("Type 'quit' to exit, 'history' to view conversation history")
    print("You can ask questions in any language - I'll respond in the same language!")
    print("Conversation history is always included - the LLM will use it when relevant.")
    print("-" * 70)

    attempt = 0
    while True:
        attempt += 1

        # Reading input is handled separately from processing: if stdin is
        # closed or unavailable, retrying would fail the same way forever,
        # so any failure here ends the session instead of looping.
        try:
            prompt_question = input(f"\n[{attempt}] Your question: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\n\nInterruption detected. Goodbye!")
            break
        except Exception as e:
            print(f"\n❌ Error: {e}")
            break

        try:
            command = prompt_question.lower()

            # Special commands
            if command in ('quit', 'exit', 'q'):
                print("Goodbye!")
                break
            if command == 'history':
                print("\n=== CONVERSATION HISTORY ===")
                for j, exchange in enumerate(memory.history, 1):
                    print(f"{j}. Q: {exchange['question']}")
                    print(f"   A: {exchange['response'][:100]}...")
                    # str() guards against non-string source metadata.
                    print(f"   Sources: {', '.join(map(str, exchange['sources']))}")
                    print()
                # Reset once, whether or not there was anything to list.
                attempt = 0
                continue
            if not prompt_question:
                continue

            # Process question
            print("\n⏳ Processing...")
            response = contextual_rag.invoke(prompt_question)

            # Content filtering: replies carrying the off-topic marker are hidden
            if "blablabla" not in str(response).lower():
                print("\n Response:")
                print("-" * 40)
                print(response)

                # Only display sources if the LLM provided a real answer.
                # Note: this queries the retriever a second time.
                if should_show_sources(response):
                    get_sources_with_context(prompt_question, retriever)
                else:
                    print("\n(No sources displayed - no answer provided)")

                attempt = 0

            else:
                if attempt < 5:
                    print("\n❌ Try again: off-topic question")
                elif attempt < 10:
                    print("\n❌ The topic concerns student performance. Please be serious.")
                else:
                    print("\n⚠️ Response (after multiple attempts):")
                    print(response)
                    # Don't show sources for filtered responses either

        except KeyboardInterrupt:
            print("\n\nInterruption detected. Goodbye!")
            break
        except Exception as e:
            print(f"\n❌ Error: {e}")
            continue

# Entry point: start the full conversational interface when executed directly
if __name__ == "__main__":
    # `retriever` and `llm` must already exist at module level; swap in your own.
    run_conversational_rag()