In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install ollama

# ETAPA 3 — Sistema QA / RAG híbrido
Objetivo: implementar um mecanismo de recuperação (TF-IDF) e um fluxo híbrido RAG que usa o contexto recuperado como `context` para o modelo local Ollama. O notebook contém funções para:
- construir índice TF-IDF
- recuperar top-k trechos relevantes
- gerar resposta com TF-IDF (concatenação+resumo simples) ou com Ollama (RAG)
- guardar resultados (JSON)


In [14]:
import os
import json
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm

# Load spaCy's small English pipeline (installed by the setup cell).
# NOTE(review): `nlp` is not referenced by any cell visible in this section —
# confirm it is still needed before keeping this (comparatively slow) load.
nlp = spacy.load("en_core_web_sm")


from ollama import chat

# Dataset path. Set the ICVS_DATA_CSV environment variable to point at the CSV
# on another machine; when it is unset the original local path is used.
DATA_CSV = os.environ.get(
    "ICVS_DATA_CSV",
    "C:\\Users\\gonsa\\OneDrive\\Microsoft Teams Chat Files\\Desktop\\chatbot\\etapa_i\\icvs_dataset.csv",
)

# Retrieval parameters
TOP_K = 3   # number of passages to retrieve per question
TFIDF_MAX_FEATURES = 5000  # upper bound on the TF-IDF vocabulary size


In [15]:
# Read the dataset from DATA_CSV, report how many rows it has, and leave a
# preview of the first rows as the cell's displayed output.
df = pd.read_csv(DATA_CSV)
print("Total linhas:", df.shape[0])
df.head()


Total linhas: 196


Unnamed: 0,section,text
0,Welcome Message,Welcome to ICVS!
1,Welcome Message,We’re excited to have you join our community o...
2,Welcome Message,"Together, we strive to conduct research of exc..."
3,Welcome Message,Welcome aboard!
4,Introduction,This guidebook was designed to facilitate your...


In [16]:
# Build the retrieval corpus from the `text` column. Missing values become
# empty strings so every row still maps to exactly one corpus entry.
if 'text' not in df.columns:
    # The previous if/else ran the same statement in both branches, so a
    # missing column surfaced as a bare KeyError; fail with a clear message.
    raise KeyError(
        "Column 'text' not found in dataset; available columns: "
        + ", ".join(map(str, df.columns))
    )
corpus = df['text'].fillna("").astype(str).tolist()

# Keep per-passage metadata (section name + cleaned text), row-aligned with `corpus`.
meta = df[['section']].copy()
meta['text'] = corpus


In [17]:
# Fit the TF-IDF vocabulary on the corpus and vectorise every passage in one
# step; the resulting matrix has one row per corpus entry.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=TFIDF_MAX_FEATURES,
)
X_tfidf = vectorizer.fit_transform(corpus)
print(f"TF-IDF matrix shape: {X_tfidf.shape}")


TF-IDF matrix shape: (196, 1312)


In [18]:
def retrieve_top_k(question, k=TOP_K):
    """Return the ``k`` corpus passages most similar to ``question``.

    The question is transformed with the fitted ``vectorizer`` and compared
    against every row of ``X_tfidf`` using cosine similarity.

    Args:
        question: Query text; non-string values are converted with ``str()``.
        k: Maximum number of passages to return. Values <= 0 yield ``[]``.

    Returns:
        A list of dicts ordered by decreasing similarity, each with the keys
        ``idx`` (row position in ``meta``), ``score``, ``section`` and ``text``.
    """
    # With k == 0 the slice ``[-0:]`` below would select *every* row, and a
    # negative k would select an arbitrary tail, so handle both explicitly.
    if k <= 0:
        return []

    q = question if isinstance(question, str) else str(question)
    q_vec = vectorizer.transform([q])
    sims = cosine_similarity(q_vec, X_tfidf).flatten()
    # argsort is ascending: take the last k positions and reverse them so the
    # best match comes first.
    top_idx = sims.argsort()[-k:][::-1]
    results = []
    for idx in top_idx:
        row = meta.iloc[idx]
        results.append({
            "idx": int(idx),
            "score": float(sims[idx]),
            "section": row['section'],
            "text": row['text']
        })
    return results


In [19]:
def answer_with_tfidf(question, k=TOP_K):
    """Baseline answer: return the top-``k`` retrieved passages as evidence.

    No generation takes place; the retrieved texts are joined with blank
    lines and prefixed with a fixed header to form the ``answer`` field.

    Returns:
        Dict with keys ``method`` ("tfidf"), ``question``, ``hits`` (the
        output of ``retrieve_top_k``) and ``answer``.
    """
    hits = retrieve_top_k(question, k)
    passages = [hit['text'] for hit in hits]
    # The "answer" is simply the retrieved context presented as evidence.
    evidence = "Context retrieved (evidence):\n" + "\n\n".join(passages)
    return dict(method="tfidf", question=question, hits=hits, answer=evidence)


In [24]:
def answer_with_ollama_rag(question, k=TOP_K, model_name="llama3.2:1b", temperature=0.25):
    """Answer ``question`` with an Ollama model grounded on TF-IDF hits.

    The top-``k`` passages from ``retrieve_top_k`` are formatted (each tagged
    with its section name) into the user prompt, the model reply is streamed
    to stdout as it arrives, and the full text is returned.

    Args:
        question: The user question.
        k: Number of passages to retrieve and pass as context.
        model_name: Name of the Ollama model passed to ``chat``.
        temperature: Sampling temperature forwarded in ``options``.

    Returns:
        Dict with keys ``method`` ("rag_ollama"), ``question``, ``hits`` and
        ``answer`` (the concatenated streamed reply).
    """
    # Retrieve the context passages
    hits = retrieve_top_k(question, k)
    context = "\n\n".join(
        [f"[Section: {h['section']}]\n{h['text']}" for h in hits]
    )

    system_msg = (
        "You are an academic question-answering assistant.\n\n"
        "Your task is to answer the user's question using ONLY the provided context.\n"
        "You are allowed to COMBINE and SUMMARIZE information from multiple excerpts.\n"
        "You must NOT use any external knowledge or assumptions.\n\n"
        "Rules:\n"
        "- Use only facts explicitly stated in the context.\n"
        "- You may infer an answer by combining multiple excerpts.\n"
        "- If the context truly does not contain enough information, say:\n"
        "  \"The document does not provide sufficient information to answer this question.\"\n"
        "- Keep the answer concise and factual.\n"
        "- Use the same language as the question.\n"
        "- Do not repeat the question.\n"
        "- Do not add recommendations beyond what is stated."
    )

    user_prompt = (
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer briefly. After the answer, include:\n"
        "Evidence: list of section names used."
    )

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_prompt}
    ]

    stream = chat(
        model=model_name,
        messages=messages,
        stream=True,
        options={"temperature": temperature}
    )

    pieces = []
    for chunk in stream:
        # A chunk's content can be present but None (the ollama client's
        # message type appears to declare it optional — verify against the
        # installed version); normalise to "" so nothing prints as "None" and
        # the concatenation below cannot fail.
        content = chunk.get("message", {}).get("content", "") or ""
        print(content, end="", flush=True)
        pieces.append(content)
    print()

    return {
        "method": "rag_ollama",
        "question": question,
        "hits": hits,
        "answer": "".join(pieces)
    }


In [26]:
# Demo questions run through both pipelines.
examples = [
    "Who should I contact about safety training?",
    "What do I need to do to obtain the ICVS ID card?",
    "How do I request access to the microscopy facility??"
]

# One entry per question: {"question", "tfidf", "rag"}; saved to JSON later.
results = []
for q in examples:
    print("QUESTION:", q)
    # First, the TF-IDF baseline. Uses the configured TOP_K rather than a
    # repeated literal so the demo follows the config cell.
    base = answer_with_tfidf(q, k=TOP_K)
    print("\n--- TF-IDF baseline evidence ---")
    for h in base['hits']:
        # Show only the first 200 characters of each passage.
        print(f"[{h['score']:.4f}] {h['section']}: {h['text'][:200]}...")
    # Then the hybrid version with Ollama (the reply is streamed to stdout).
    print("\n--- Ollama RAG answer (streaming) ---")
    rag = answer_with_ollama_rag(q, k=TOP_K, model_name="llama3.2:1b", temperature=0.35)
    results.append({"question": q, "tfidf": base, "rag": rag})
    print("\n" + "="*80 + "\n")


QUESTION: Who should I contact about safety training?

--- TF-IDF baseline evidence ---
[0.4109] Access to Specialized Facilities: If you are unsure whether a facility requires extra training, please contact training.icvs@med.uminho.pt...
[0.3938] Access to Specialized Facilities: Facility Safety Training – Process Flow...
[0.3914] Mandatory Safety Training: If at any point you have questions or doubts, please contact us at safety.icvs@med.uminho.pt...

--- Ollama RAG answer (streaming) ---
You should contact training.icvs@med.uminho.pt.

[Section: Access to Specialized Facilities]
If you are unsure whether a facility requires extra training, please contact training.icvs@med.uminho.pt

[Section: Mandatory Safety Training]
If at any point you have questions or doubts, please contact us at safety.icvs@med.uminho.pt

Evidence:
- Section 1: "If you are unsure..."
- Section 2: "...contact training.icvs@med.uminho.pt"


QUESTION: What do I need to do to obtain the ICVS ID card?

--- TF-IDF b

In [27]:
# Persist every question/answer record as pretty-printed UTF-8 JSON
# (non-ASCII characters are written as-is rather than escaped).
OUT_JSON = "icvs_qa_results4.json"
with open(OUT_JSON, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=2))

print(f"Results saved to: {OUT_JSON}")

Results saved to: icvs_qa_results4.json


In [28]:
# Manual spot-check: tabulate each question next to the first 300 characters
# of its RAG answer so the pairs can be reviewed in the notebook.
pd.DataFrame([
    dict(question=entry["question"], rag_answer=entry["rag"]["answer"][:300])
    for entry in results
])


Unnamed: 0,question,rag_answer
0,Who should I contact about safety training?,You should contact training.icvs@med.uminho.pt...
1,What do I need to do to obtain the ICVS ID card?,"To obtain the ICVS ID card, you need to:\n\n1...."
2,How do I request access to the microscopy faci...,"To request access to the microscopy facility, ..."
