In [None]:

import os
import psycopg
from pgvector import Vector
from pgvector.psycopg import register_vector
from transformers import pipeline, AutoTokenizer, AutoModel
import torch

# Directory scanned for the ``.txt`` files that make up the corpus.
DATA_DIR = "data/TRANS_TXT"
# PostgreSQL connection settings. Each entry can be overridden with the
# standard libpq environment variable; the literals are the previously
# hard-coded values, kept as defaults so existing setups keep working.
# NOTE(review): a password default committed to source is a credential leak --
# rotate it and rely on PGPASSWORD once the environment provides it.
db_params = {
    'dbname': os.environ.get('PGDATABASE', 'rag_chatbot'),
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', '11649303'),
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
}
# Width of the stored embedding vectors; the table DDL in this file declares
# the ``embedding`` column with the same dimension (384).
VECTOR_DIM = 384


# Load the embedding checkpoint once at start-up; the tokenizer and the
# encoder must come from the same checkpoint name.
_EMBEDDING_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"

print("Chargement du modèle d'embedding...")
tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CHECKPOINT)
embedding_model = AutoModel.from_pretrained(_EMBEDDING_CHECKPOINT)
print("Modèle d'embedding chargé !")

# Text-to-text pipeline used to write the final answer from the prompt.
print("Chargement du modèle de génération...")
generator = pipeline(task='text2text-generation', model='google/flan-t5-base')
print("Modèle de génération chargé !")



def embed_text(text: str):
    """Embed ``text`` with the module-level tokenizer and encoder.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``embedding_model`` with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over dimension 1.

    Args:
        text: The string to embed.

    Returns:
        The pooled embedding converted to a plain Python list.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = embedding_model(**encoded)
        pooled = model_output.last_hidden_state.mean(dim=1)
    # squeeze() drops the size-1 dimensions left after pooling a single text.
    flat = pooled.squeeze()
    return flat.tolist()


def create_corpus_list(data_dir: str):
    """Collect the usable text lines of every ``.txt`` file in a directory.

    Files are read in sorted file-name order so the resulting corpus is
    reproducible across runs and platforms. Each file is decoded as latin-1.
    A line is kept when it is not blank and its raw form (before stripping)
    does not begin with ``<``; kept lines are returned with surrounding
    whitespace removed.

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        A flat list of stripped lines, grouped by file in sorted name order.

    Raises:
        OSError: If the directory or one of its files cannot be read.
    """
    corpus_list = []
    # os.listdir() returns names in arbitrary order; sorting makes the corpus
    # order deterministic.
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith(".txt"):
            continue
        file_path = os.path.join(data_dir, file_name)
        with open(file_path, "r", encoding="latin-1") as f:
            # The "<" test deliberately looks at the raw line, so only lines
            # with "<" in the very first column are skipped. strip() already
            # removes any leading indentation, so no prefix removal is needed.
            corpus_list.extend(
                line.strip()
                for line in f
                if line.strip() and not line.startswith("<")
            )
    return corpus_list


def save_embedding(text: str, embedding: list, cursor):
    """Insert one ``(corpus, embedding)`` row into the ``embeddings`` table.

    Args:
        text: The source text stored in the ``corpus`` column.
        embedding: The embedding values; wrapped in a pgvector ``Vector``
            before being sent to the database.
        cursor: An open database cursor used to run the INSERT. Committing is
            left to the caller.
    """
    insert_sql = "INSERT INTO embeddings (corpus, embedding) VALUES (%s, %s)"
    row = (text, Vector(embedding))
    cursor.execute(insert_sql, row)


def search_pgvector(question: str, top_k: int = 3):
    """Return the stored texts closest to ``question``.

    The question is embedded with ``embed_text`` and compared against the
    ``embeddings`` table with pgvector's ``<->`` distance operator.

    Args:
        question: The query text to embed and search for.
        top_k: Maximum number of rows to return.

    Returns:
        A list of ``(corpus, distance)`` rows, smallest distance first.

    Raises:
        psycopg.Error: If connecting or running the query fails.
    """
    question_vector = Vector(embed_text(question))
    # Keyword arguments instead of a hand-built conninfo string: values that
    # contain spaces or quotes (e.g. a password) are escaped by psycopg.
    with psycopg.connect(**db_params) as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            # The explicit ``::vector`` casts fix "operator does not exist:
            # double precision[] <-> vector", which is raised when the
            # existing table stores ``embedding`` as a float array rather than
            # a VECTOR column. On a real VECTOR column the cast is expected to
            # be a no-op. The distance is computed once and reused by name in
            # ORDER BY, so the query vector is passed a single time.
            cur.execute(
                """
                SELECT corpus, embedding::vector <-> %s::vector AS distance
                FROM embeddings
                ORDER BY distance
                LIMIT %s
                """,
                (question_vector, top_k),
            )
            return cur.fetchall()


def llm_answer(question: str, top_k: int = 3):
    """Answer ``question`` from retrieved passages and the generation model.

    Args:
        question: The user's question.
        top_k: Number of passages requested from ``search_pgvector``.

    Returns:
        The generated answer text, or a fixed fallback sentence when the
        search returns no rows.
    """
    hits = search_pgvector(question, top_k)
    if hits:
        # Only the passage text goes into the prompt; the distance is unused.
        context = " ".join(passage for passage, _distance in hits)
        prompt = f"Question: {question}\nContexte: {context}\nRéponds clairement :"
        generations = generator(prompt, max_length=200)
        return generations[0]['generated_text']
    return "Je n'ai pas trouvé de réponse pertinente."



# One-time database setup: make sure the pgvector extension and the
# ``embeddings`` table exist, then embed and store the corpus if the table is
# still empty.
if __name__ == "__main__":
    try:
        # Keyword arguments let psycopg build and escape the conninfo string.
        # The connection is left in its default transactional mode: the block
        # commits on a clean exit and rolls back on any exception, so a failed
        # ingestion never leaves a partially filled table that later runs
        # would mistake for a complete one.
        with psycopg.connect(**db_params) as conn:
            with conn.cursor() as cur:
                # The extension must exist before register_vector() is called,
                # because registration looks the "vector" type up in the
                # database and fails on a fresh database otherwise.
                cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
                register_vector(conn)
                # VECTOR_DIM is a module constant (not user input), so
                # formatting it into the DDL is safe; int() guards the type.
                cur.execute(f"""
                    CREATE TABLE IF NOT EXISTS embeddings (
                        id SERIAL PRIMARY KEY,
                        corpus TEXT,
                        embedding VECTOR({int(VECTOR_DIM)})
                    )
                """)
                cur.execute("SELECT COUNT(*) FROM embeddings;")
                if cur.fetchone()[0] == 0:
                    print(f"Lecture des fichiers dans {DATA_DIR}...")
                    corpus_list = create_corpus_list(DATA_DIR)
                    total = len(corpus_list)
                    print(f"{total} textes trouvés, calcul des embeddings...")
                    for i, text in enumerate(corpus_list, 1):
                        embedding = embed_text(text)
                        save_embedding(text, embedding, cur)
                        print(f"[{i}/{total}] traité")
                    print("Tous les embeddings sont sauvegardés !")
                else:
                    print("Embeddings déjà présents dans la table.")

    except psycopg.Error as e:
        print(f"Erreur PostgreSQL : {e}")
    except Exception as e:
        # Top-level boundary: still report instead of crashing, but do not
        # mislabel file or model errors as database errors.
        print(f"Erreur lors de l'initialisation ({type(e).__name__}) : {e}")


# Interactive question/answer loop. Guarded like the setup code so that
# importing this module does not block on input().
if __name__ == "__main__":
    print("\n=== CHATBOT RAG PRÊT ===")
    while True:
        try:
            question = input("Votre question : ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of a traceback.
            print("\nAu revoir !")
            break
        # Strip first so that inputs such as " quit " are recognised.
        question = question.strip()
        if question.lower() in {'quit', 'exit', 'q'}:
            print("Au revoir !")
            break
        if not question:
            continue
        try:
            response = llm_answer(question, top_k=3)
        except psycopg.Error as e:
            # A failed query should not end the whole chat session.
            print(f"Erreur PostgreSQL : {e}")
            continue
        print("\nRéponse :")
        print(response)
        print("="*70)


Chargement du modèle d'embedding...
Modèle d'embedding chargé !
Chargement du modèle de génération...


Device set to use cpu


Modèle de génération chargé !
Embeddings déjà présents dans la table.

=== CHATBOT RAG PRÊT ===


UndefinedFunction: l'opérateur n'existe pas : double precision[] <-> vector
LINE 2:                 SELECT corpus, embedding <-> $1 AS distance
                                                 ^
HINT:  Aucun opérateur ne correspond au nom donné et aux types d'arguments.
Vous devez ajouter des conversions explicites de type.