# INSTALLER LES PACKAGES SUIVANTS

In [None]:
#!pip install tqdm numpy networkx spacy scikit-learn langchain langchain-community fastembed huggingface-hub transformers datasets ragas bitsandbytes
#!python -m spacy download en_core_web_sm
#!pip install torch
#!pip install langchain_huggingface
#!pip install unstructured
#!pip install "unstructured[docx]"
#!pip install langchain_experimental
#!pip install langchain_community
#!pip install --upgrade datasets 
#!pip install --upgrade langchain langchain_experimental
#!pip install evaluate
#!pip install rouge_score


In [None]:
import os
import torch
import numpy as np
import networkx as nx
import spacy
from tqdm import tqdm
import pandas as pd
from datasets import Dataset

from langchain.document_loaders import UnstructuredFileLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

from huggingface_hub import login
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from langchain.llms import HuggingFacePipeline


###############################################################################
# 1. Chargement / préparation Dataset
###############################################################################

docs_path = "/kaggle/input/dataset10/Dataset1"  # <-- Path to your .doc/.docx files

# Collect the Word documents found directly inside docs_path.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, and later steps select chunks by position (semantic_chunks[10:20]);
# a stable file order keeps those selections reproducible between runs.
# The extension check is case-insensitive so ".DOCX" files are not missed.
doc_files = [
    os.path.join(docs_path, file)
    for file in sorted(os.listdir(docs_path))
    if file.lower().endswith((".doc", ".docx"))
]
print(f"Nombre de fichiers .doc/.docx trouvés : {len(doc_files)}")

# Load every file; a loader may return several Document objects per file,
# so the results are flattened into a single list.
documents = []
for doc_file in tqdm(doc_files, desc="Chargement des fichiers"):
    loader = UnstructuredFileLoader(doc_file)
    documents.extend(loader.load())

print(f"\nNombre total de documents chargés : {len(documents)}")

# Drop documents that are empty or have 10 or fewer non-whitespace-trimmed characters.
documents = [doc for doc in documents if doc.page_content and len(doc.page_content.strip()) > 10]
print(f"Nombre de documents après filtrage : {len(documents)}")


###############################################################################
# 2. Semantic Chunking (utilisation des embeddings BAAI/bge-base-en-v1.5)
###############################################################################

# Modèle d'embeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Semantic chunker driven by the embedding model above; breakpoints are
# chosen with the "percentile" threshold mode.
semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")

# Feed the documents to the chunker in groups of batch_size.
batch_size = 10
batches = [
    documents[start:start + batch_size]
    for start in range(0, len(documents), batch_size)
]

semantic_chunks = []
for batch in tqdm(batches, desc="Processing documents in batches"):
    batch_content = [doc.page_content for doc in batch]
    semantic_chunks += semantic_chunker.create_documents(batch_content)

# Keep only chunks whose stripped text is longer than 10 characters.
semantic_chunks = [
    piece
    for piece in semantic_chunks
    if piece.page_content and len(piece.page_content.strip()) > 10
]
print(f"Nombre de chunks sémantiques : {len(semantic_chunks)}")


###############################################################################
# 3. Construction d'un graphe avec les embeddings neuronaux
###############################################################################
# Load the `en_core_web_sm` spaCy pipeline; extract_keywords() below reads the
# entities (`.ents`) it produces for each chunk.
nlp = spacy.load("en_core_web_sm")

def extract_keywords(chunk, nlp):
    """Return the set of lower-cased entity texts found in a chunk.

    Args:
        chunk: Object exposing the text to analyse as ``page_content``.
        nlp: Callable that parses a string and returns an object whose
            ``ents`` attribute iterates over entities having a ``text``.

    Returns:
        A ``set`` of entity strings, lower-cased (duplicates collapse).
    """
    parsed = nlp(chunk.page_content)
    return {entity.text.lower() for entity in parsed.ents}

# Build the graph: one node per chunk, identified by the chunk's position in
# semantic_chunks and carrying its text and extracted keywords as attributes.
G = nx.Graph()
for idx, chunk in enumerate(semantic_chunks):
    G.add_node(
        idx,
        text=chunk.page_content,
        keywords=extract_keywords(chunk, nlp),
    )

# Calculer les embeddings de tous les chunks (une seule fois)
def create_chunk_embeddings(chunks, embed_model):
    """Embed the text of every chunk and return the vectors as one array.

    Args:
        chunks: Iterable of objects exposing their text as ``page_content``.
        embed_model: Object whose ``embed_documents(texts)`` returns one
            vector per input text.

    Returns:
        ``np.ndarray`` built from the vectors, in the same order as ``chunks``.
    """
    vectors = embed_model.embed_documents([item.page_content for item in chunks])
    return np.array(vectors)

# Embed all chunks once; the resulting matrix is reused both to add graph
# edges and to score chunks against each question at retrieval time.
chunk_embeddings = create_chunk_embeddings(semantic_chunks, embed_model)

# Ajouter des arêtes dans le graphe en fonction de la similarité cosinus
def enhance_graph_with_embeddings(G, chunk_embeddings, similarity_threshold=0.2):
    """Connect chunks whose embeddings are sufficiently similar.

    The pairwise cosine similarity of ``chunk_embeddings`` is computed and,
    for every unordered pair of distinct rows whose score is strictly greater
    than ``similarity_threshold``, an edge is added between the two row
    indices with the score stored in the ``embedding_weight`` attribute.

    Args:
        G: Graph to add edges to (modified in place).
        chunk_embeddings: 2-D collection of embedding vectors, one per node.
        similarity_threshold: Minimum (exclusive) similarity for an edge.

    Returns:
        The same graph object ``G``.
    """
    scores = cosine_similarity(chunk_embeddings)

    # Indices strictly above the diagonal: each pair (src < dst) appears once,
    # in row-major order, and self-pairs are excluded.
    upper_rows, upper_cols = np.triu_indices(len(scores), k=1)
    for src, dst in zip(upper_rows.tolist(), upper_cols.tolist()):
        weight = scores[src][dst]
        if weight > similarity_threshold:
            G.add_edge(src, dst, embedding_weight=weight)
    return G

G = enhance_graph_with_embeddings(G, chunk_embeddings, similarity_threshold=0.2)


###############################################################################
# 4. Fonction de retrieval : retrouver les nœuds (chunks) pertinents
###############################################################################

def semantic_node_retrieval(question, embed_model, chunk_embeddings):
    """Rank every chunk by cosine similarity to the question.

    Args:
        question: Query text passed to ``embed_model.embed_query``.
        embed_model: Embedding model providing ``embed_query``.
        chunk_embeddings: Matrix of chunk vectors, one row per chunk.

    Returns:
        List of ``(chunk_index, similarity)`` tuples covering all chunks,
        ordered from highest to lowest similarity.
    """
    # cosine_similarity expects 2-D input, so the query becomes a 1-row matrix.
    query = np.array(embed_model.embed_query(question)).reshape(1, -1)
    scores = cosine_similarity(query, chunk_embeddings)[0]

    ranked = list(enumerate(scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


###############################################################################
# 5. Chargement/Préparation du LLM Hugging Face (Zephyr 4-bit)
###############################################################################
# Authenticate with the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that a real
# secret never has to be written into (and shared with) the notebook. The
# placeholder is only a fallback and must be replaced if HF_TOKEN is unset.
login(os.environ.get("HF_TOKEN", "Your_Token"))

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_name = "HuggingFaceH4/zephyr-7b-beta"

# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally — confirm it is actually needed for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Sampling text-generation pipeline; return_full_text=False means only the
# newly generated continuation is returned, not the prompt.
reader_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1,
    return_full_text=False,
    max_new_tokens=500,
)

# LangChain wrapper around the HF pipeline
chat_model = HuggingFacePipeline(pipeline=reader_pipeline)


###############################################################################
# 6. RAG Workflow : fonction qui fait le retrieval + la génération
###############################################################################

def clean_text(text):
    """Trim whitespace and remove one leading "answer:" / "question:" label.

    The label match is case-insensitive. Only the first matching label is
    removed ("answer:" is checked before "question:"), and the remainder is
    stripped again.

    Args:
        text: Raw string, typically model output.

    Returns:
        The cleaned string.
    """
    stripped = text.strip()
    lowered = stripped.lower()
    for label in ("answer:", "question:"):
        if lowered.startswith(label):
            return stripped[len(label):].strip()
    return stripped

def graph_rag_retrieval(question, top_k=3):
    """Answer a question with the chunks most similar to it as context.

    Uses the module-level ``embed_model``, ``chunk_embeddings``, graph ``G``
    and ``chat_model``.

    Args:
        question: The question to answer.
        top_k: Number of best-ranked chunks to place in the prompt
            (defaults to 3, the value previously hard-coded).

    Returns:
        A tuple ``(rag_response, context_chunks)``: the value returned by
        ``chat_model.invoke`` and the list of chunk texts used as context.
    """
    # 1) Rank all chunks against the question.
    relevant_nodes = semantic_node_retrieval(question, embed_model, chunk_embeddings)

    # 2) Take the text stored on the graph node of each of the top_k chunks.
    context_chunks = [
        G.nodes[node_idx]["text"] for node_idx, _score in relevant_nodes[:top_k]
    ]

    # 3) Build the prompt for the LLM.
    context_for_qa = "\n\n".join(context_chunks)
    final_prompt = (
        "Using the following context, answer the question:\n\n"
        f"Question: {question}\n\n"
        f"Context:\n{context_for_qa}\n\n"
        "Answer:"
    )

    # 4) Generate the answer.
    rag_response = chat_model.invoke(final_prompt)

    return rag_response, context_chunks


###############################################################################
# 7. Exemple d'utilisation
###############################################################################
# Run the retrieval + generation pipeline once on a sample question.
question = "How are 3GPP specifications numbered and what do the specific fields represent?"
response, context = graph_rag_retrieval(question)

# `response` is the generated answer; `context` is the list of chunk texts
# that were placed in the prompt.
print("Response:", response)
print("\nContexts Used:", context)


###############################################################################
# 8. (Optionnel) Génération d'un mini dataset Q&A pour évaluation
###############################################################################
# Parallel columns of the evaluation set; one entry is appended per chunk.
questions, ground_truths, contexts, answers = [], [], [], []

# Produce one (question, ground truth, RAG answer, contexts) row for each of
# the chunks at positions 10..19.
for chunk in semantic_chunks[10:20]:
    # a) Ask the LLM for a question that this chunk can answer.
    question_prompt = (
        "Create a question that can be answered by the following context:\n\n"
        f"{chunk.page_content}\n\nQuestion:"
    )
    question_text = clean_text(chat_model.invoke(question_prompt))

    # b) Reference answer: the LLM answers using ONLY this chunk.
    ground_truth_prompt = (
        "Answer this question using ONLY the following context:\n\n"
        f"Question: {question_text}\n\nContext: {chunk.page_content}\n\nAnswer:"
    )
    ground_truth_text = clean_text(chat_model.invoke(ground_truth_prompt))

    # c) Answer produced by the graph RAG pipeline, with the contexts it used.
    rag_response, used_context = graph_rag_retrieval(question_text)
    answer_text = clean_text(rag_response)

    questions.append(question_text)
    ground_truths.append(ground_truth_text)
    answers.append(answer_text)
    contexts.append(used_context)

# Assemble the rows and build a Hugging Face Dataset from them.
qagc_list = [
    dict(question=q, answer=a, contexts=c, ground_truth=gt)
    for q, a, c, gt in zip(questions, answers, contexts, ground_truths)
]
eval_dataset = Dataset.from_list(qagc_list)

# Also keep a pandas copy and persist it as CSV.
eval_df = eval_dataset.to_pandas()
eval_df.to_csv("semantic_graph_rag_dataset.csv", index=False)

print("\nDataset and CSV generated successfully!")


Nombre de fichiers .doc/.docx trouvés : 1


  loader = UnstructuredFileLoader(doc_file)
Chargement des fichiers: 100%|██████████| 1/1 [00:04<00:00,  4.76s/it]



Nombre total de documents chargés : 1
Nombre de documents après filtrage : 1


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

Processing documents in batches: 100%|██████████| 1/1 [05:39<00:00, 339.02s/it]


Nombre de chunks sémantiques : 40


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0
  chat_model = HuggingFacePipeline(pipeline=reader_pipeline)


Response: 

The 3GPP specifications are numbered using a specific scheme that includes a major version number, a minor version number, and a release number. The major version number indicates significant changes to the specification, while the minor version number denotes less significant changes. The release number refers to the specific version of the specification within a major or minor version. The numbering scheme is as follows:

- Major version: x.y.z
  - x represents the major version number, which is incremented when significant changes are made to the specification.
  - y represents the minor version number, which is incremented when less significant changes are made.
  - z represents the release number, which is incremented for each new release of the specification within a major or minor version.

The version nomenclature follows the same numbering scheme, with each version having a major, minor, and release number. The release control mechanism involves creating new Releas

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Dataset and CSV generated successfully!


In [15]:
# Preview the first rows of the generated evaluation DataFrame.
eval_df.head()

Unnamed: 0,question,answer,contexts,ground_truth
0,What is the role of the Support Team in managi...,The role of the Support Team in managing speci...,"[Known in some groups as ""text proposal"". spec...",The role of the Support Team in managing speci...
1,How is the handling of specifications regulate...,The handling of specifications in 3GPP is regu...,[z\tthe third digit is incremented when editor...,How is the handling of specifications regulate...
2,What is the significance of the three fields i...,The three fields in the version number associa...,[As distinct from those of the aa.8bb series. ...,The three fields in the version number associa...
3,Can you provide a summary of the relationship ...,"In the context of mobile systems, specificatio...",[4.0B\tReleases\n\nSpecifications are grouped ...,"In the context of mobile systems, specificatio..."
4,What is the purpose of performing a feasibilit...,The purpose of performing a feasibility study ...,"[In any complex engineering venture, it is nec...",The purpose of performing a feasibility study ...


In [16]:
from ragas import evaluate, RunConfig
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

# Run configuration: timeout of 1000 seconds and a single worker, so
# evaluation jobs are not run concurrently.
run_config = RunConfig(timeout=1000, max_workers=1)

result = evaluate(
    eval_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=chat_model,
    embeddings=embed_model,
    # False: a failing metric does not abort the run. Set to True while
    # debugging to surface the underlying error instead.
    raise_exceptions=False,
    run_config=run_config,
)

print("Résultats RAGAS :", result)


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Résultats RAGAS : {'context_precision': 0.9167, 'faithfulness': 1.0000, 'answer_relevancy': 0.8963, 'context_recall': 0.9000}


In [18]:
import evaluate
import rouge_score  # Add this import
import evaluate

def evaluate_rag_performance(dataset):
    """Score generated answers against the ground truth with ROUGE and BLEU.

    Args:
        dataset: Mapping-like object whose ``'answer'`` column holds the
            predictions and whose ``'ground_truth'`` column holds the
            references.

    Returns:
        Dict with keys ``"rouge_score"`` and ``"bleu_score"``, each holding
        the result of the corresponding metric's ``compute`` call.
    """
    # Imported locally under an alias: this notebook also binds the bare name
    # `evaluate` to ragas.evaluate, so depending on which cell ran last the
    # global `evaluate` may be that function, which has no `.load`.
    import evaluate as hf_evaluate

    rouge = hf_evaluate.load("rouge")
    bleu = hf_evaluate.load("bleu")

    predictions = dataset['answer']
    references = dataset['ground_truth']

    results = {
        "rouge_score": rouge.compute(predictions=predictions, references=references),
        "bleu_score": bleu.compute(predictions=predictions, references=references),
    }
    return results

# Example usage:
results = evaluate_rag_performance(eval_dataset)
print(results)

{'rouge_score': {'rouge1': 0.38174094668318037, 'rouge2': 0.1460659773926784, 'rougeL': 0.22439053122319286, 'rougeLsum': 0.27384987450096243}, 'bleu_score': {'bleu': 0.12265498542614062, 'precisions': [0.36469221835075494, 0.14263505635445006, 0.07608271556769411, 0.05718762240501371], 'brevity_penalty': 1.0, 'length_ratio': 1.1454545454545455, 'translation_length': 2583, 'reference_length': 2255}}


In [20]:
import pandas as pd

# Allow column contents to be displayed in full (no maximum column width)
pd.set_option('display.max_colwidth', None)

# Then print the row at positional index 2, i.e. the third row
print(eval_df.iloc[2])


question                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                