### Installation des dépendances

In [1]:
# library installations:
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in this cell's captured output.
!pip install langchain \
    langchain-community \
    langchain-pinecone \
    transformers \
    peft \
    torch \
    accelerate \
    streamlit \
    pinecone-client \
    sentence-transformers \
    fastapi \
    uvicorn \
    pyngrok \
    nest-asyncio \
    bitsandbytes \
    pypdf

# for GPU optimizations:
# These commands upgrade bitsandbytes again and replace the transformers, peft
# and accelerate packages installed above with builds taken from the default
# branch of their GitHub repositories.
# NOTE(review): the captured output reports a resolver conflict after the git
# install (sentence-transformers 5.1.2 requires transformers<5.0.0, but
# 5.0.0.dev0 gets installed) — confirm the combination is intended.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.13-py3-none-any.whl.metadata (8.6 kB)
Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pinecone-client
  Downloading pinecone_client-6.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting pypdf
  Downloading pypdf-6.4.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-commun

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m521.0/521.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, but you have transformers 5.0.0.dev0 which is incompatible.[0m[31m
[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hd

In [None]:
# Quiet install of the packages the Pinecone setup cell imports from
# (pinecone, langchain_huggingface, langchain_community, langchain_text_splitters) plus pypdf.
# NOTE(review): the first install cell installs `pinecone-client` while this one
# installs `pinecone` — confirm that having both is intended.
!pip install -q pinecone langchain-huggingface langchain-community langchain-text-splitters pypdf


In [None]:
# # üì• Setup Pinecone dans Google Colab (Version corrig√©e)

# %%
# %% Imports
import os
from pathlib import Path
from getpass import getpass

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from pinecone import Pinecone, ServerlessSpec
import time

# %% [markdown]
# ## üîê Cl√© API Pinecone

# %%
# Prompt for the Pinecone key without echoing it. Surrounding whitespace picked
# up while pasting is stripped so that the prefix check below does not reject
# an otherwise valid key with a misleading message.
PINECONE_API_KEY = getpass("üîë Collez votre cl√© API Pinecone (pcsk_...): ").strip()
if not PINECONE_API_KEY.startswith("pcsk_"):
    raise ValueError("‚ùå La cl√© API Pinecone doit commencer par 'pcsk_'")

# Name of the Pinecone index that this cell (re)creates and fills.
INDEX_NAME = "mental-health-raft"

# %% [markdown]
# ## üìÅ Charger les PDFs depuis /content

# %%
# Source PDFs; every one of them must already exist under /content.
pdf_paths = [
    "/content/EmotionalIntelligence.pdf",
    "/content/Managing-Stress-Principles-and-Strategies-for-Health-and-Wellbeing.pdf",
    "/content/the-social-skills-guidebook-fhc-dr-notes.pdf",
]

# Fail fast on the first missing file, before any loading starts.
for path in pdf_paths:
    if Path(path).exists():
        continue
    raise FileNotFoundError(f"‚ùå Fichier manquant : {path}")

print("üìÑ Chargement des PDFs...")
# Accumulate the documents returned by each loader, in pdf_paths order.
documents = []
for path in pdf_paths:
    documents += PyPDFLoader(path).load()

# %% [markdown]
# ## ‚úÇÔ∏è D√©coupage en chunks

# %%
# Split the loaded documents into chunks (chunk_size=500, chunk_overlap=50)
# and report how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)
print(f"‚úÖ {len(texts)} chunks cr√©√©s.")

# %% [markdown]
# ## üß† G√©n√©ration des embeddings (384D)

# %%
# Instantiate the sentence-transformers embedding model used for both indexing
# and querying.
print("üß† G√©n√©ration des embeddings...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # -> 384-D vectors

# Dimension check (optional but useful): embed a dummy query and print the
# vector length so it can be compared with the index dimension used below.
test_emb = embeddings.embed_query("test")
print(f"‚úÖ Dimension des embeddings : {len(test_emb)}")

# %% [markdown]
# ## üì° Connexion √† Pinecone + cr√©ation de l'index (384D)

# %%
print("üì° Connexion √† Pinecone...")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Delete the index if it already exists (avoids dimension conflicts with a
# previously created index). NOTE: this discards every vector stored in it.
if INDEX_NAME in pc.list_indexes().names():
    print(f"üóëÔ∏è Suppression de l'index existant '{INDEX_NAME}'...")
    pc.delete_index(INDEX_NAME)

# Create a fresh 384-dimensional index
print("üÜï Cr√©ation de l'index en 384D...")
pc.create_index(
    name=INDEX_NAME,
    dimension=384,  # must match the embedding model's output dimension
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Wait until the index reports ready, but give up after a bounded delay so the
# cell cannot spin forever if the index never becomes available.
INDEX_READY_TIMEOUT_S = 300
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(INDEX_NAME).status["ready"]:
    if time.monotonic() > ready_deadline:
        raise TimeoutError(
            f"Index '{INDEX_NAME}' non disponible au bout de {INDEX_READY_TIMEOUT_S} s."
        )
    time.sleep(1)
print("üü¢ Index pr√™t.")

# %% [markdown]
# ## üì§ Insertion des vecteurs (upsert)

# %%
index = pc.Index(INDEX_NAME)
batch_size = 100
print("üì§ Insertion dans Pinecone...")

# Walk the chunks in slices of batch_size. Each vector id is the chunk's
# position in `texts`; the chunk text is stored as metadata under "text".
for start in range(0, len(texts), batch_size):
    batch = texts[start:start + batch_size]
    contents = [chunk.page_content for chunk in batch]
    embeds = embeddings.embed_documents(contents)
    vectors = [
        (str(vec_id), emb, {"text": text})
        for vec_id, (emb, text) in enumerate(zip(embeds, contents), start=start)
    ]
    index.upsert(vectors=vectors)

print(f"‚úÖ {len(texts)} chunks index√©s dans '{INDEX_NAME}'.")

üîë Collez votre cl√© API Pinecone (pcsk_...): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
üìÑ Chargement des PDFs...
‚úÖ 7234 chunks cr√©√©s.
üß† G√©n√©ration des embeddings...
‚úÖ Dimension des embeddings : 384
üì° Connexion √† Pinecone...
üÜï Cr√©ation de l'index en 384D...
üü¢ Index pr√™t.
üì§ Insertion dans Pinecone...
‚úÖ 7234 chunks index√©s dans 'mental-health-raft'.


In [None]:
# CELLULE 2 : IMPORTS
import os
from getpass import getpass

import nest_asyncio
import torch
import uvicorn
from fastapi import FastAPI
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFaceHub  # <- now OK
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from pydantic import BaseModel
from pyngrok import ngrok
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

### Imports et configuration des clés API

In [None]:
# NOTE(review): bitsandbytes and accelerate are already installed by the first
# install cell of this notebook — this looks redundant; confirm before removing.
!pip install -q bitsandbytes accelerate

In [None]:
# API keys. Secrets are never written in the notebook: a value already present
# in the environment is kept, otherwise it is requested with a hidden prompt.
for _secret_name in ("PINECONE_API_KEY", "HF_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ").strip()

### Chargement du modèle LLaMA-2-7b-chat-hf




In [None]:
# Load LLaMA-2-7b-chat-hf with 8-bit quantization
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

print("üß† Chargement de LLaMA-2-7b-chat-hf (sans fine-tuning)...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=os.environ["HF_TOKEN"]
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    # `torch_dtype` and `load_in_8bit` both trigger deprecation warnings with
    # the transformers build this notebook installs (see the cell output):
    # pass `dtype` and a BitsAndBytesConfig instead.
    dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    token=os.environ["HF_TOKEN"]
)
print("‚úÖ Mod√®le charg√© sur GPU.")

üß† Chargement de LLaMA-2-7b-chat-hf (sans fine-tuning)...


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

‚úÖ Mod√®le charg√© sur GPU.


### Connexion à Pinecone

In [None]:
# Initialise the all-MiniLM-L6-v2 embedding model (384-D) and the Pinecone client
print("üì° Connexion √† Pinecone...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Refuse to continue when the index has not been created yet.
index_name = "mental-health-raft"
if index_name not in pc.list_indexes().names():
    raise ValueError(f"‚ùå Index '{index_name}' n'existe pas. Ex√©cute setup_pinecone.py d'abord !")

# Vector store used by the retrieval functions and the API endpoint.
# NOTE(review): no API key is passed here — presumably it is read from the
# PINECONE_API_KEY environment variable; confirm.
docsearch = PineconeVectorStore(index_name=index_name, embedding=embeddings)
print("‚úÖ Pinecone pr√™t.")

üì° Connexion √† Pinecone...
‚úÖ Pinecone pr√™t.


### Définition de l’API FastAPI

In [None]:
# FastAPI endpoint
app = FastAPI()

class Query(BaseModel):
    """Request body of POST /generate: carries the user's question as text."""
    question: str

@app.post("/generate")
async def generate_response(query: Query):
    """Answer a question with the LLM, using retrieved chunks as context.

    The 3 chunks returned by ``docsearch.similarity_search`` for the question
    are inserted into a LLaMA-2 chat prompt, and up to 256 new tokens are
    sampled from the model.

    Args:
        query: Request body holding the user's ``question``.

    Returns:
        ``{"response": <generated text>}``. If anything raises, the same shape
        is returned with the text ``"Erreur: <message>"`` instead of failing
        the request.
    """
    # NOTE(review): retrieval and generation are synchronous calls inside an
    # async handler, so the event loop is blocked while they run.
    try:
        # Retrieve the relevant context
        docs = docsearch.similarity_search(query.question, k=3)
        context = "\n\n".join(doc.page_content for doc in docs)

        # Prompt formatted for LLaMA-2-Chat
        # NOTE(review): the literal <s> may duplicate the BOS token if the
        # tokenizer already adds one — confirm.
        prompt = f"""<s>[INST] <<SYS>>
You are a compassionate, non-judgmental emotional support companion. Use ONLY the expert advice below to answer.
<</SYS>>

Expert advice:
{context}

Question: {query.question} [/INST]"""

        # Generation
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                top_p=0.9,
                top_k=50,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True
            )

        # Decode only the tokens generated after the prompt. Splitting the full
        # decoded text on "[/INST]" echoes the whole prompt back whenever
        # truncation has dropped that marker, and misfires if the marker
        # appears in the generated answer itself.
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        return {"response": response}

    except Exception as e:
        # Deliberate best-effort: callers always receive the same JSON shape.
        return {"response": f"Erreur: {str(e)}"}

### Lancement du serveur avec ngrok

In [None]:
from pyngrok import ngrok

# Optional: list the currently active tunnels
tunnels = ngrok.get_tunnels()
print("Tunnels actifs :", [t.public_url for t in tunnels])

# Close all tunnels
ngrok.kill()  # Warning: this stops ALL ngrok sessions on this machine/notebook

In [None]:
# Open a tunnel to local port 8000 and print the returned tunnel object.
# NOTE(review): the server-launch cell that follows sets the ngrok auth token
# and calls ngrok.connect(8000) again — confirm this earlier tunnel is needed.
public_url = ngrok.connect(8000)
print("üåç URL publique :", public_url)

In [None]:
# Start the server (Colab-compatible)
if __name__ == "__main__":
    # Configure ngrok. The auth token is a credential, so it is read from the
    # NGROK_AUTHTOKEN environment variable or requested with a hidden prompt
    # instead of being stored in the notebook.
    # NOTE: the token that used to be hard-coded here has been exposed and
    # should be revoked in the ngrok dashboard.
    ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Collez votre jeton ngrok : ")
    ngrok.set_auth_token(ngrok_token.strip())
    public_url = ngrok.connect(8000)
    print(f"üåç URL publique : {public_url.public_url}")
    print("‚úÖ Copie cette URL dans chatbot_app.py")

    # Allow nested use of the notebook's already-running event loop
    nest_asyncio.apply()

    # Run uvicorn WITHOUT asyncio.run()
    import asyncio
    config = uvicorn.Config(app, host="0.0.0.0", port=8000, loop="asyncio")
    server = uvicorn.Server(config)

    # Schedule the server on the existing loop. The task is kept in a variable
    # because the event loop only holds a weak reference to it; an unreferenced
    # task may be garbage-collected before it finishes.
    loop = asyncio.get_event_loop()
    server_task = loop.create_task(server.serve())
    print("‚úÖ Serveur d√©marr√©. Ne fermez pas cette cellule.")

üåç URL publique : https://shrubbier-ripely-carolyn.ngrok-free.dev
‚úÖ Copie cette URL dans chatbot_app.py
‚úÖ Serveur d√©marr√©. Ne fermez pas cette cellule.


In [None]:

# ## üîç Imports pour les strat√©gies avanc√©es de RAG

# %%
from sentence_transformers import CrossEncoder
import numpy as np
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# ## üß† Strat√©gies de r√©cup√©ration de documents

# %%
# 1. Baseline (naive) RAG
def retrieve_naive(question: str, k=3):
    """Return the ``k`` results of ``docsearch.similarity_search`` for ``question``."""
    top_hits = docsearch.similarity_search(question, k=k)
    return top_hits

# 2. RAG with re-ranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def retrieve_rerank(question: str, k=3, fetch_k=10):
    """Retrieve a candidate pool, re-score it with the cross-encoder, keep the best ``k``.

    Args:
        question: User question used for both retrieval and re-scoring.
        k: Number of documents to return.
        fetch_k: Size of the candidate pool requested from the vector store
            (default 10, the previously hard-coded value). The pool is never
            smaller than ``k``, so asking for more than ``fetch_k`` documents
            is no longer silently capped.

    Returns:
        Up to ``k`` documents ordered by descending cross-encoder score, or an
        empty list when retrieval returns nothing.
    """
    candidates = docsearch.similarity_search(question, k=max(fetch_k, k))
    if not candidates:
        return []
    pairs = [(question, doc.page_content) for doc in candidates]
    scores = reranker.predict(pairs)
    # sorted() is stable, so candidates with equal scores keep retrieval order.
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked[:k]]

# 3. RAG with query fusion
def generate_queries(question: str, num=3):
    """Ask the LLM for up to ``num`` reformulations of ``question``.

    Args:
        question: Original user question.
        num: Maximum number of reformulations to return.

    Returns:
        A list of at most ``num`` non-empty lines generated by the model, or
        ``[question]`` when the model produced no usable line.
    """
    prompt = f"""Given the user's question, generate {num} diverse reformulations for better document retrieval.

User question: {question}

Reformulations (one per line):"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        # temperature is only applied when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them; otherwise the first non-empty lines are the instruction
    # text itself and get returned as the "reformulations".
    prompt_len = inputs["input_ids"].shape[1]
    text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    queries = [q.strip() for q in text.split('\n') if q.strip()]
    return queries[:num] or [question]

def retrieve_query_fusion(question: str, k=3):
    """Merge the results of several reformulated queries and return ``k`` of them.

    Calls ``generate_queries`` for 3 reformulations, fetches 2 documents for
    each, drops documents whose text was already seen, and returns the first
    ``k`` survivors in the order they were retrieved.
    """
    first_seen = {}
    for reformulation in generate_queries(question, num=3):
        for hit in docsearch.similarity_search(reformulation, k=2):
            # Remove duplicates: setdefault keeps the earliest document for
            # each distinct page_content, and dicts preserve insertion order.
            first_seen.setdefault(hit.page_content, hit)
    return list(first_seen.values())[:k]

In [None]:
# ## üìä √âvaluation des strat√©gies RAG

# %%
# Test set: each item pairs a user question with a short reference ("gold")
# answer that the evaluation compares the retrieved text against.
test_questions = [
    {"question": "I feel anxious before social events. What can I do?", "gold": "Use breathing techniques and gradual exposure."},
    {"question": "How to manage overwhelming stress at work?", "gold": "Break tasks into smaller steps and take breaks."},
    {"question": "I keep ruminating on past mistakes.", "gold": "Practice self-compassion and refocus on the present."}
]

def evaluate_strategy(retrieve_fn, questions):
    """Score a retrieval function against reference answers.

    For every item, the texts of the retrieved documents are joined with
    spaces, embedded, and compared by cosine similarity with the embedding of
    the item's ``"gold"`` answer. Each score is printed as it is computed.

    Args:
        retrieve_fn: Callable taking a question string and returning documents
            that expose ``page_content``.
        questions: Iterable of dicts with ``"question"`` and ``"gold"`` keys.

    Returns:
        The mean of the per-question similarities (``np.mean``).
    """
    def _cosine(u, v):
        # Dot product divided by the product of the two vector norms.
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

    scores = []
    for item in questions:
        retrieved_text = " ".join(d.page_content for d in retrieve_fn(item["question"]))
        gold_text = item["gold"]
        similarity = _cosine(
            embeddings.embed_query(retrieved_text),
            embeddings.embed_query(gold_text),
        )
        scores.append(similarity)
        print(f"‚Ä¢ {item['question'][:50]}... ‚Üí {similarity:.3f}")

    avg = np.mean(scores)
    print(f"‚úÖ Score moyen : {avg:.3f}\n")
    return avg

# Evaluate each strategy
print("üîç √âvaluation du RAG de base (Naive) :")
score_naive = evaluate_strategy(retrieve_naive, test_questions)

print("üîç √âvaluation du RAG + R√©-rank :")
score_rerank = evaluate_strategy(retrieve_rerank, test_questions)

print("üîç √âvaluation du RAG + Fusion de requ√™tes :")
score_fusion = evaluate_strategy(retrieve_query_fusion, test_questions)

# Summary. The delta is the absolute score difference versus the naive
# baseline, multiplied by 100. The "+" format flag prints the real sign, so a
# regression shows as "-1.2%" rather than "+-1.2%".
print("üìä R√©sultats finaux :")
print(f"  - Naive RAG      : {score_naive:.3f}")
print(f"  - RAG + R√©-rank  : {score_rerank:.3f} ({(score_rerank - score_naive)*100:+.1f}%)")
print(f"  - RAG + Fusion   : {score_fusion:.3f} ({(score_fusion - score_naive)*100:+.1f}%)")

In [None]:
# ## üß™ Test manuel : Comparaison des r√©ponses

# %%
test_question = "I feel overwhelmed and can't stop thinking about work."


def _answer_with_strategy(retrieve_fn, question):
    """Run one retrieve -> prompt -> generate -> decode pass for ``question``.

    Args:
        retrieve_fn: Retrieval function returning documents with ``page_content``.
        question: Question inserted into the prompt.

    Returns:
        Tuple ``(docs, context, prompt, inputs, output, response)`` so callers
        can inspect every intermediate value; ``response`` is the decoded text
        after the last ``[/INST]`` marker, stripped.
    """
    docs = retrieve_fn(question)
    context = "\n\n".join([d.page_content for d in docs])
    prompt = f"<s>[INST] You are a mental health companion. Use this context:\n{context}\n\nQuestion: {question} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(model.device)
    # NOTE(review): temperature is passed without do_sample=True — confirm
    # whether sampling was intended here.
    output = model.generate(**inputs, max_new_tokens=150, temperature=0.7, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    return docs, context, prompt, inputs, output, response


print("‚ùì Question :", test_question)

# The numbered module-level names are kept so later cells can still use them.
print("\n1. RAG de base :")
docs1, context1, prompt1, inputs1, output1, response1 = _answer_with_strategy(retrieve_naive, test_question)
print("‚Üí", response1)

print("\n2. RAG + R√©-rank :")
docs2, context2, prompt2, inputs2, output2, response2 = _answer_with_strategy(retrieve_rerank, test_question)
print("‚Üí", response2)