<a href="https://colab.research.google.com/github/Simo01sp/Repository_Biavasco/blob/main/RAG_LLM_SITODIPLOME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================================================
#   CIMEA CHATBOT — FULL RAG WITH LLM FAQ FILTER
# =====================================================

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from openai import OpenAI

# =====================================================
#   CONFIGURAZIONE
# =====================================================

# Root URL of the site; main() passes it to crawl_website() as the crawl start.
BASE_URL = "https://cimea-diplome.it"
# NOTE(review): API_KEY is not assigned anywhere in the visible code.
# Presumably each user defines it beforehand (e.g. in an earlier notebook
# cell); if it is missing, the next line raises NameError. Confirm.
client = OpenAI(api_key=API_KEY)


# =====================================================
#   1 — CRAWLING DEL SITO
# =====================================================

def crawl_website(base_url):
    """Crawl the HTML pages reachable from *base_url* on the same site.

    Starting at ``base_url``, pages are fetched one by one and every
    ``<a href>`` found is followed as long as it stays on the same host
    (and under the same path prefix) as ``base_url``. URLs are compared
    without their ``#fragment`` and a bare host is treated as ``/``, so a
    page is fetched and returned only once.

    Args:
        base_url: Absolute http(s) URL where the crawl starts.

    Returns:
        A list of ``{"url": str, "text": str}`` dicts, one per page that
        answered with HTTP 200 and HTML content. ``text`` is the visible
        page text with one line per text node.

    Pages that fail to download (network error, timeout, non-200 status)
    are skipped silently; the crawl is best-effort.
    """
    # Local import: urljoin is the only urllib.parse name the file imports.
    from urllib.parse import urldefrag, urlparse

    def _canonical(raw_url):
        """Return *raw_url* without fragment and with "/" for an empty path."""
        cleaned, _fragment = urldefrag(raw_url)
        parts = urlparse(cleaned)
        if not parts.path:
            cleaned = parts._replace(path="/").geturl()
        return cleaned

    start_url = _canonical(base_url)
    start_parts = urlparse(start_url)
    site_host = start_parts.netloc.lower()
    site_path = start_parts.path

    # `seen` holds every URL ever queued, so nothing is queued twice.
    seen = {start_url}
    to_visit = [start_url]
    pages = []

    print("\nüîç Crawling del sito in corso...")

    while to_visit:
        url = to_visit.pop()

        try:
            res = requests.get(url, timeout=10)
        except requests.RequestException:
            # Best-effort crawl: an unreachable page must not stop the run,
            # but Ctrl-C and programming errors are no longer swallowed.
            continue
        if res.status_code != 200:
            continue

        # Skip PDFs, images, etc.; parsing them as HTML yields garbage text.
        content_type = res.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(res.text, "html.parser")
        page_text = soup.get_text(separator="\n", strip=True)
        pages.append({"url": url, "text": page_text})

        for link in soup.find_all("a", href=True):
            try:
                target = _canonical(urljoin(url, link["href"]))
                parts = urlparse(target)
            except ValueError:
                # Malformed href (e.g. broken IPv6 literal): ignore the link.
                continue

            if parts.scheme not in ("http", "https"):
                continue
            # Compare the parsed host instead of a substring test, which
            # would also accept e.g. "https://evil.example/?u=<base_url>".
            if parts.netloc.lower() != site_host:
                continue
            if not parts.path.startswith(site_path):
                continue
            if target not in seen:
                seen.add(target)
                to_visit.append(target)

    print(f"üìÑ Trovate {len(pages)} pagine.")
    return pages


# =====================================================
#   2 — CHUNKING
# =====================================================

def chunk_pages(pages, chunk_size=500):
    """Cut every page's text into consecutive, non-overlapping slices.

    Args:
        pages: Iterable of dicts with at least the keys ``"url"`` and
            ``"text"``.
        chunk_size: Maximum number of characters per slice.

    Returns:
        A list of ``{"url": ..., "chunk": ...}`` dicts in page order, then
        text order. The last slice of a page may be shorter than
        ``chunk_size``; a page with empty text contributes nothing.
    """
    print("\n‚úÇÔ∏è Creazione chunk...")

    pieces = [
        {"url": page["url"], "chunk": page["text"][start:start + chunk_size]}
        for page in pages
        for start in range(0, len(page["text"]), chunk_size)
    ]

    print(f"üîπ Creati {len(pieces)} chunk.")
    return pieces


# =====================================================
#   3 — EMBEDDING MODEL
# =====================================================

# Sentence-embedding model, instantiated once at module level and shared by
# every call to embed_texts().
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def embed_texts(text_list):
    """Encode *text_list* with the shared ``embedding_model``.

    The encoder is asked for NumPy output and for normalised embeddings,
    which is what lets the inner-product FAISS indexes built elsewhere in
    this file behave as cosine-similarity indexes.

    Args:
        text_list: The texts to embed.

    Returns:
        Whatever ``embedding_model.encode`` returns for these options
        (a NumPy array of embeddings).
    """
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    vectors = embedding_model.encode(text_list, **encode_options)
    return vectors


# =====================================================
#   4 — FAQ (EXCEL)
# =====================================================

def load_faq_embeddings(path="DB_QA.xlsx"):
    """Load the FAQ spreadsheet and build a similarity index over its questions.

    The sheet must contain the columns ``"Domanda"`` and
    ``"Answer (ENGLISH)"``; they are renamed to ``"question"`` and
    ``"answer"``. Rows where either value is missing are dropped, because an
    empty cell is read as NaN and can neither be embedded nor shown to the
    user.

    Args:
        path: Location of the Excel file. Defaults to ``"DB_QA.xlsx"`` in
            the working directory, as before.

    Returns:
        ``(df, index)`` where ``df`` is the cleaned FAQ table and ``index``
        is a ``faiss.IndexFlatIP`` whose i-th vector belongs to
        ``df.iloc[i]``.

    Raises:
        KeyError: If a required column is absent from the sheet.
        ValueError: If no usable FAQ row remains after cleaning.
    """
    print(f"\nüìò Carico {path}...")

    df = pd.read_excel(path)
    df = df.rename(columns={
        "Domanda": "question",
        "Answer (ENGLISH)": "answer"
    })

    # Fail here with a readable message rather than later with a bare KeyError.
    missing = [col for col in ("question", "answer") if col not in df.columns]
    if missing:
        raise KeyError(
            f"{path} is missing required column(s) after renaming: {missing}"
        )

    # Drop incomplete rows, then renumber so labels match index positions.
    df = df.dropna(subset=["question", "answer"]).reset_index(drop=True)
    if df.empty:
        raise ValueError(f"{path} contains no FAQ rows with question and answer")

    # astype(str) guards against cells that Excel stored as numbers/dates.
    faq_questions = df["question"].astype(str).tolist()
    faq_embeddings = embed_texts(faq_questions)

    dim = faq_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(faq_embeddings)

    print(f"üìö Indicizzate {index.ntotal} domande FAQ")
    return df, index


# =====================================================
#   5 — WEBSITE INDEX
# =====================================================

def build_website_index(chunks):
    """Embed every website chunk and store the vectors in a FAISS index.

    Args:
        chunks: Sequence of dicts with a ``"chunk"`` text entry, such as
            the list produced by ``chunk_pages``.

    Returns:
        A ``faiss.IndexFlatIP`` holding one vector per chunk, in the same
        order as ``chunks``.
    """
    vectors = embed_texts([entry["chunk"] for entry in chunks])

    # Vector width comes from the embeddings themselves.
    website_index = faiss.IndexFlatIP(vectors.shape[1])
    website_index.add(vectors)

    print(f"üï∏Ô∏è Indicizzati {website_index.ntotal} chunk del sito")
    return website_index


# =====================================================
#   6 — LLM FILTER: VERIFICA PERTINENZA DELLA FAQ
# =====================================================

def is_faq_relevant(user_question, faq_question):
    """Ask the LLM whether a retrieved FAQ question matches the user's question.

    Args:
        user_question: The question typed by the user.
        faq_question: The FAQ question retrieved by similarity search, or
            ``None`` when nothing was retrieved.

    Returns:
        ``True`` if the model's reply starts with "yes" (case-insensitive,
        ignoring surrounding whitespace, quotes and markdown emphasis);
        ``False`` otherwise, including when ``faq_question`` is ``None`` or
        the model returns no text.
    """
    if faq_question is None:
        return False

    prompt = f"""
You are a classifier.

Given a USER QUESTION and a FAQ QUESTION, answer ONLY with "yes" or "no".

USER QUESTION:
{user_question}

FAQ QUESTION:
{faq_question}

Is the FAQ QUESTION relevant?
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5,
        temperature=0
    )

    # The API may return no content; treat that as "not relevant".
    raw_reply = response.choices[0].message.content or ""
    verdict = raw_reply.strip().strip("\"'*`").lower()
    # Accept "Yes.", "yes, ..." etc.; an exact == "yes" rejected those.
    return verdict.startswith("yes")


# =====================================================
#   7 — RICERCA MISTA (FAQ + SITO)
# =====================================================

def ricerca_mista(user_question, faq_df, faq_index, chunks, index_chunks):
    """Retrieve the best FAQ entry and the top website chunks for a question.

    The question is embedded once. The single nearest FAQ question is kept
    only if ``is_faq_relevant`` approves it; independently, up to three
    nearest website chunks are collected.

    Args:
        user_question: The question typed by the user.
        faq_df: FAQ table with ``"question"`` and ``"answer"`` columns, row
            order matching ``faq_index``.
        faq_index: FAISS index over the FAQ questions.
        chunks: List of ``{"url", "chunk"}`` dicts, order matching
            ``index_chunks``.
        index_chunks: FAISS index over the website chunks.

    Returns:
        A dict with the keys ``faq_answer``, ``faq_question``, ``faq_score``
        (``None``/``None``/``0.0`` when no relevant FAQ exists) and
        ``sito_chunks``, ``sito_sources``, ``sito_score`` (empty lists and
        ``0.0`` when the website index returned nothing).
    """
    user_emb = embed_texts([user_question])

    # --- FAQ ---
    faq_answer = None
    faq_question_found = None
    faq_score = 0.0

    sim_faq, idx_faq = faq_index.search(user_emb, k=1)
    best_faq = int(idx_faq[0][0])
    # FAISS reports "no result" as label -1; indexing with it would
    # silently pick the LAST row instead.
    if best_faq >= 0:
        faq_row = faq_df.iloc[best_faq]
        # LLM filter: keep the FAQ entry only if it is judged relevant.
        if is_faq_relevant(user_question, faq_row["question"]):
            faq_answer = faq_row["answer"]
            faq_question_found = faq_row["question"]
            faq_score = float(sim_faq[0][0])

    # --- WEBSITE ---
    sim_sito, idx_sito = index_chunks.search(user_emb, k=3)
    # Drop the -1 padding FAISS adds when fewer than k vectors are indexed.
    hits = [
        (int(pos), float(score))
        for pos, score in zip(idx_sito[0], sim_sito[0])
        if pos >= 0
    ]
    sito_chunks = [chunks[pos]["chunk"] for pos, _ in hits]
    sito_sources = [chunks[pos]["url"] for pos, _ in hits]
    sito_score = hits[0][1] if hits else 0.0

    return {
        "faq_answer": faq_answer,
        "faq_question": faq_question_found,
        "faq_score": faq_score,

        "sito_chunks": sito_chunks,
        "sito_score": sito_score,
        "sito_sources": sito_sources
    }


# =====================================================
#   8 — RISPOSTA LLM
# =====================================================

def answer_with_llm(question, data):
    """Generate the final English answer from the retrieved context.

    Args:
        question: The question typed by the user.
        data: Retrieval result with at least ``"faq_answer"`` (text or
            ``None``) and ``"sito_chunks"`` (list of text snippets).

    Returns:
        The model's answer as a string; an empty string if the API returned
        no text.
    """
    faq_answer = data["faq_answer"]
    # A missing FAQ arrives as None (or NaN from an empty spreadsheet cell).
    # Say so explicitly instead of printing "None"/"nan" into the prompt as
    # if it were FAQ content. `x != x` is true only for NaN.
    faq_missing = faq_answer is None or (
        isinstance(faq_answer, float) and faq_answer != faq_answer
    )
    if faq_missing:
        faq_answer = "(no relevant FAQ entry found)"

    sito_chunks = "\n".join(data["sito_chunks"]) or "(no website content found)"

    prompt = f"""
User question:
{question}

FAQ information:
{faq_answer}

Website information:
{sito_chunks}

Write a final answer that:
- is clear, professional and helpful
- uses ONLY the information above
- does NOT invent facts
- MUST be written in English
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=600   # leaves room for a complete answer
    )

    # The API may return no content; never hand None back to the caller.
    return response.choices[0].message.content or ""


# =====================================================
#   9 — MAIN CHAT LOOP
# =====================================================

def main():
    """Build both indexes, then run the interactive question/answer loop.

    Start-up crawls ``BASE_URL``, chunks the pages, loads the FAQ
    spreadsheet and builds the website index. The loop then reads
    questions from standard input until the user types ``exit`` (in any
    case, surrounding spaces ignored) or closes the input with
    Ctrl-D / Ctrl-C. Blank lines are ignored so they do not trigger
    LLM calls.
    """
    pages = crawl_website(BASE_URL)
    chunks = chunk_pages(pages)
    faq_df, faq_index = load_faq_embeddings()
    index_chunks = build_website_index(chunks)

    print("\nü§ñ Chatbot attivo! Scrivi una domanda.\n")

    while True:
        try:
            q = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the chat without a traceback.
            print()
            break

        if not q:
            continue
        if q.lower() == "exit":
            break

        info = ricerca_mista(q, faq_df, faq_index, chunks, index_chunks)

        print("\n‚ùì FAQ Question Found:", info["faq_question"])
        print("üìò FAQ Score:", info["faq_score"])
        print("üìÑ Website Score:", info["sito_score"])

        print("üîó Website Sources:")
        for s in info["sito_sources"]:
            print("   -", s)

        answer = answer_with_llm(q, info)
        print("\nBot:", answer)


# =====================================================
#   AVVIO
# =====================================================

# Start the chatbot only when this code runs as the main program, not when
# it is imported as a module.
if __name__ == "__main__":
    main()



üîç Crawling del sito in corso...
üìÑ Trovate 23 pagine.

‚úÇÔ∏è Creazione chunk...
üîπ Creati 438 chunk.

üìò Carico DB_QA.xlsx...
üìö Indicizzate 180 domande FAQ
üï∏Ô∏è Indicizzati 438 chunk del sito

ü§ñ Chatbot attivo! Scrivi una domanda.


‚ùì FAQ Question Found: None
üìò FAQ Score: 0.0
üìÑ Website Score: 0.35085880756378174
üîó Website Sources:
   - https://cimea-diplome.it/
   - https://cimea-diplome.it/page-homepage
   - https://cimea-diplome.it

Bot: On the DiploMe platform, there are three different services available. Your qualification can be related to only one of these services. To determine the type of service relevant to your qualification and the required documents, please ensure that you accurately select the Country of Education System and the Level of Education System in the Service Finder.

‚ùì FAQ Question Found: How much do CIMEA services cost?
üìò FAQ Score: 0.47026458382606506
üìÑ Website Score: 0.29611802101135254
üîó Website Sources:
   - https:

In [None]:
pip install requests beautifulsoup4 pandas sentence-transformers faiss-cpu openai numpy


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.6/23.6 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [None]:
!pip install fastapi uvicorn nest_asyncio pyngrok
