<a href="https://colab.research.google.com/github/SBXTREME/Collab/blob/main/talk_to_your_pdf_without_KG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install dependencies
# The leading "!" is an IPython/Jupyter shell escape, so this line only works
# when run as a notebook cell (e.g. in Colab), not as plain Python.
!pip install langchain sentence-transformers PyPDF2 faiss-cpu

In [None]:
# 2. Imports
import os
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import PyPDF2
import faiss
import numpy as np

# 3. Load PDF and split into chunks
# Path of the PDF that gets indexed and queried below.
# NOTE(review): "/content" is presumably the Colab runtime's upload directory;
# adjust this path when running elsewhere.
pdf_path = '/content/metalyse-epar-product-information_en.pdf'

def load_pdf_text(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with a newline
        appended after each page. A page for which the extractor yields no
        text contributes only its newline.
    """
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Defensive `or ""`: treat a missing (None/empty) extraction result as
        # empty text instead of failing on `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    # A single join avoids repeated string concatenation in the page loop.
    return "".join(page_texts)

raw_text = load_pdf_text(pdf_path)

# Split the document into chunks of at most 500 (with 100 overlapping) units
# of the splitter's length measure, so text near a chunk boundary also appears
# in the neighbouring chunk.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs = splitter.split_text(raw_text)
if not docs:
    # Fail early with a clear message (e.g. for a scanned, image-only PDF);
    # otherwise the script fails further down when the embedding dimension is
    # read from an empty embedding array.
    raise ValueError(
        f"No extractable text found in {pdf_path!r}; cannot build the search index."
    )

# 4. Embed chunks using BAAI/bge-base-en-v1.5
embedder = SentenceTransformer('BAAI/bge-base-en-v1.5')
# One embedding row per chunk, in the same order as `docs`.
doc_embeddings = embedder.encode(docs, show_progress_bar=True, convert_to_numpy=True)

# 5. Build FAISS index
# Flat L2 index whose dimensionality is taken from the embedding matrix.
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# 6. Cosine similarity
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (NumPy array or array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        The dot product of the two vectors after each has been scaled to unit
        Euclidean norm. If either vector has zero norm the similarity is
        undefined; ``0.0`` is returned in that case instead of ``nan``.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Dividing by a zero norm would produce nan (plus a runtime warning), and
    # nan silently fails every `>= threshold` comparison downstream.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a / norm_a, b / norm_b)

# 7. Retrieval with similarity threshold
def retrieve_relevant_chunks(query, k=4, threshold=0.7):
    """Return the indexed chunks that are sufficiently similar to ``query``.

    The query is embedded and the FAISS index is asked for its ``k`` nearest
    neighbours. Each candidate is kept only if the cosine similarity between
    the query embedding and the chunk embedding is at least ``threshold``.

    Args:
        query: The question text to search for.
        k: Number of neighbours requested from the index.
        threshold: Minimum cosine similarity a chunk needs to be returned.

    Returns:
        A list of chunk strings, in the order the index returned them. The
        list is empty when no candidate reaches the threshold.
    """
    query_vectors = embedder.encode([query], convert_to_numpy=True)
    _, neighbour_ids = index.search(query_vectors, k)
    query_vec = query_vectors[0]
    # An id of -1 marks an empty result slot and is skipped.
    return [
        docs[i]
        for i in neighbour_ids[0]
        if i != -1 and cosine_similarity(query_vec, doc_embeddings[i]) >= threshold
    ]

# 8. Strict system prompt
# Instruction text that restricts the model to answering from the supplied PDF
# context and fixes the wording of the "not found" reply.
STRICT_SYSTEM_PROMPT = (
    "You are a helpful and kind AI assistant. You must only answer using "
    "the provided context from the PDF. If the answer is not in the "
    "context, say: 'Sorry, I couldn't find information about your question "
    "in the provided PDF.' Do not use any external knowledge. If asked to "
    "summarize or provide a gist, only use the PDF content."
)

# 9. LLM API call
def ask_llm(question, context, *, timeout=60, max_context_chars=2000):
    """Send a question plus supporting context to the hosted LLM endpoint.

    Args:
        question: The user's question; inserted verbatim into the prompt.
        context: Text the model is instructed to answer from. Only its first
            ``max_context_chars`` characters are sent.
        timeout: Seconds to wait for the HTTP request before giving up.
        max_context_chars: Maximum number of context characters included in
            the prompt.

    Returns:
        The answer extracted from the JSON response (looked up under
        ``content``, then ``data.output``, then ``output``). When the request
        fails, the body is not JSON, or the JSON has none of those fields, a
        descriptive error string is returned instead of raising.
    """
    url = "https://api.generative.engine.capgemini.com/v2/llm/invoke"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json",
        # NOTE(review): a credential committed to source should be considered
        # leaked. Set GENERATIVE_ENGINE_API_KEY in the environment instead;
        # the literal fallback is kept only so existing runs keep working and
        # should be rotated and removed.
        "x-api-key": os.environ.get(
            "GENERATIVE_ENGINE_API_KEY",
            "cTkevXu8Oc7M0EOWO4im8ajjhtfx1zKp91jlVxeK",
        ),
    }
    context = context[:max_context_chars]
    prompt = (
        "Use ONLY the following context to answer the question. "
        "If the answer is not in the context, say you don't know.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}"
    )
    payload = {
        "action": "run",
        "modelInterface": "langchain",
        "data": {
            "mode": "chain",
            "text": prompt,
            "files": [],
            "modelName": "openai.gpt-4o",
            "provider": "azure",
            "systemPrompt": STRICT_SYSTEM_PROMPT,
            "sessionId": "123e4567-e89b-12d3-a456-426614174000",
            "modelKwargs": {
                "maxTokens": 512,
                "temperature": 0.0,
                "streaming": False,
                "topP": 0.9
            }
        }
    }

    # Without a timeout a stalled connection would block the caller forever;
    # connection problems are reported as a string like every other failure
    # of this function.
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return f"Error calling API: {e}"

    # A JSON decoding failure is raised as a ValueError subclass.
    try:
        resp_json = response.json()
    except ValueError as e:
        return f"Error parsing API response: {e}, raw: {response.text}"

    if not isinstance(resp_json, dict):
        return f"Unexpected API response: {resp_json}"

    # Extract answer from 'content' if present
    if 'content' in resp_json:
        return resp_json['content']
    data = resp_json.get('data')
    if isinstance(data, dict) and 'output' in data:
        return data['output']
    if 'output' in resp_json:
        return resp_json['output']
    if 'message' in resp_json:
        return f"API Error: {resp_json['message']}"
    return f"Unexpected API response: {resp_json}"

# 10. Chat interface with guardrails and summary/gist support
def _is_summary_request(question):
    """Return True when ``question`` asks for a summary or gist."""
    import re  # local import: only this helper needs it

    # Whole-word match, so "gist" inside words such as "registered" or
    # "logistics" does not switch an ordinary question into summary mode.
    return re.search(r"\b(?:summar\w*|gist)\b", question, re.IGNORECASE) is not None


def chat():
    """Run an interactive question/answer loop over the indexed PDF.

    Reads questions from standard input until the user types ``exit`` (or
    input ends / is interrupted). Summary-style requests are answered from the
    first ten chunks of the document; every other question is answered from
    the chunks returned by ``retrieve_relevant_chunks``. When no chunk is
    relevant enough, a fixed "not found" reply is printed without calling the
    LLM. The context used is printed before each answer for debugging.
    """
    print("Ask questions about the PDF. Type 'exit' to quit.")
    while True:
        try:
            q = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session cleanly.
            break
        if q.lower() == 'exit':
            break
        if not q:
            # Nothing to look up; prompt again.
            continue

        # Detect summary/gist requests
        if _is_summary_request(q):
            # Only the first 10 chunks are used as summary context, to keep
            # the prompt short.
            context = "\n\n".join(docs[:10])
            print("\nDEBUG: Using first 10 chunks for summary/gist.\n")
            print(context)
            answer = ask_llm(q, context)
            print("\nAnswer:", answer)
            continue

        # Normal Q&A
        chunks = retrieve_relevant_chunks(q, k=4, threshold=0.6)
        if not chunks:
            print("\nAnswer: Sorry, I couldn't find information about your question in the provided PDF.")
            continue
        context = "\n\n".join(chunks)
        print("\nDEBUG: Retrieved context for your question:\n")
        print(context)
        answer = ask_llm(q, context)
        print("\nAnswer:", answer)

# 11. Run the chat interface
# Guarded so the interactive loop starts only when this file is executed
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    chat()