<a href="https://colab.research.google.com/github/SBXTREME/Collab/blob/main/talk_to_your_pdf_without_KG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain sentence-transformers PyPDF2 faiss-cpu


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from 

In [None]:
import os
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import PyPDF2
import faiss
import numpy as np
from google.colab import userdata

# API key for the LLM endpoint, read from the Colab secret named 'secretkey'
# so that the key itself never appears in the notebook source. It is sent as
# the "x-api-key" request header by ask_llm().
API_KEY = userdata.get('secretkey')

# 1. Load all PDFs from the folder
def load_all_pdfs(folder_path):
    """Extract and concatenate the text of every PDF directly inside a folder.

    Files are visited in sorted filename order so that the combined text (and
    anything derived from it, such as chunk order) is the same on every run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan. Only regular files whose name ends in
            ``.pdf`` (case-insensitive) are read; subdirectories are not
            searched.

    Returns:
        One string containing the text of every page that yielded text, each
        page followed by a newline. An empty string if nothing was extracted.
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        if not os.path.isfile(pdf_path):
            # A directory that merely happens to be named "*.pdf".
            continue
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                # Pages that yield no text are skipped.
                if page_text:
                    page_texts.append(page_text)
    # Join once at the end instead of growing a string page by page.
    return "".join(text + "\n" for text in page_texts)

# Folder scanned for PDFs.
pdf_folder = '/content/'
raw_text = load_all_pdfs(pdf_folder)

# 2. Split text into overlapping chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs = splitter.split_text(raw_text)
if not docs:
    # Fail early with a clear message: with no chunks there is nothing to
    # embed, and the index below cannot be sized.
    raise ValueError(
        f"No extractable text found in any PDF under {pdf_folder!r}; "
        "upload at least one text-based PDF before running this cell."
    )

# 3. Embed chunks (one embedding row per entry of `docs`)
embedder = SentenceTransformer('BAAI/bge-base-en-v1.5')
doc_embeddings = embedder.encode(docs, show_progress_bar=True, convert_to_numpy=True)

# 4. Build FAISS index, sized from the embedding dimension
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# 5. Cosine similarity
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like) of the same length as ``a``.

    Returns:
        The dot product of the two vectors after each is scaled to unit
        length. If either vector has zero norm the similarity is undefined,
        and 0.0 is returned instead of dividing by zero (which would yield
        NaN and a runtime warning).
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a / norm_a, b / norm_b)

# 6. Retrieval with similarity threshold
def retrieve_relevant_chunks(query, k=4, threshold=0.7):
    """Return the text chunks nearest to ``query`` that are similar enough.

    The query is embedded with the module-level ``embedder`` and the ``k``
    nearest entries are fetched from the module-level ``index``. Each hit is
    then kept only if its cosine similarity to the query is at least
    ``threshold``.

    Args:
        query: The user's question.
        k: Number of nearest neighbours requested from the index.
        threshold: Minimum cosine similarity for a chunk to be returned.

    Returns:
        A list of chunk strings taken from ``docs``, in the order the index
        returned them. May be empty.
    """
    query_vectors = embedder.encode([query], convert_to_numpy=True)
    _distances, neighbour_ids = index.search(query_vectors, k)
    query_vector = query_vectors[0]

    # Index slots reported as -1 carry no result and are dropped.
    valid_ids = [doc_id for doc_id in neighbour_ids[0] if doc_id != -1]
    return [
        docs[doc_id]
        for doc_id in valid_ids
        if cosine_similarity(query_vector, doc_embeddings[doc_id]) >= threshold
    ]

# 7. Strict system prompt
# Sent as the system prompt with every LLM request. Adjacent string literals
# are concatenated with nothing in between, so every fragment except the last
# must end with a space; otherwise sentences run together in the prompt.
STRICT_SYSTEM_PROMPT = (
    "You are a helpful and kind AI assistant. "
    "You must only answer using the provided context from the PDFs. "
    "If the answer is not in the context, say: "
    "'Sorry, I couldn't find information about your question in the provided PDFs.' "
    "Do not use any external knowledge. "
    "If asked to summarize or provide a gist, only use the PDF content. "
    "Don't say things like 'according to the provided context'. "
    "Don't say things like 'Based on the provided context'."
)

# 8. LLM API call
def ask_llm(question, context, timeout=60, max_context_chars=2000):
    """Send a question plus supporting context to the LLM endpoint.

    Args:
        question: The user's question.
        context: Text the model is told to answer from.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block indefinitely.
        max_context_chars: The context is cut to this many characters before
            it is placed in the prompt. ``None`` disables the cut.

    Returns:
        The reply text when the response JSON carries it under ``content``,
        ``data.output`` or ``output``. In every other case a human-readable
        string describing the problem (request failure, unparsable body,
        API-reported ``message``, or unrecognised response shape). Failures
        are reported through the return value rather than raised, so a caller
        can print the result unconditionally.
    """
    url = "https://api.generative.engine.capgemini.com/v2/llm/invoke"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json",
        "x-api-key": API_KEY
    }
    context = context[:max_context_chars]
    prompt = (
        "Use ONLY the following context to answer the question. "
        "If the answer is not in the context, say you don't know.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}"
    )
    payload = {
        "action": "run",
        "modelInterface": "langchain",
        "data": {
            "mode": "chain",
            "text": prompt,
            "files": [],
            "modelName": "openai.gpt-4o",
            "provider": "azure",
            "systemPrompt": STRICT_SYSTEM_PROMPT,
            # NOTE(review): the same session id is sent on every call —
            # confirm whether the service keeps per-session history.
            "sessionId": "123e4567-e89b-12d3-a456-426614174000",
            "modelKwargs": {
                "maxTokens": 512,
                "temperature": 0.7,
                "streaming": False,
                "topP": 0.9
            }
        }
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors, timeouts, etc.: report instead of crashing the caller.
        return f"API request failed: {e}"

    try:
        resp_json = response.json()
    except ValueError as e:
        # Body was not valid JSON (JSON decode errors are ValueError subclasses).
        return f"Error parsing API response: {e}, raw: {response.text}"

    if not isinstance(resp_json, dict):
        return f"Unexpected API response: {resp_json}"

    # The reply has been observed under several keys; check them in order.
    if 'content' in resp_json:
        return resp_json['content']
    data = resp_json.get('data')
    if isinstance(data, dict) and 'output' in data:
        return data['output']
    if 'output' in resp_json:
        return resp_json['output']
    if 'message' in resp_json:
        return f"API Error: {resp_json['message']}"
    return f"Unexpected API response: {resp_json}"

# 9. Chat interface
def chat():
    """Run an interactive question-answering loop over the PDF chunks.

    Each turn reads one line from the user:

    * ``exit`` (any case, surrounding whitespace ignored) ends the loop, as
      does end-of-input or an interrupt at the prompt.
    * Blank input is ignored and the prompt is shown again.
    * A question containing "summarize", "summary" or "gist" is answered from
      the first ten chunks of ``docs``.
    * Any other question is answered from the chunks returned by
      ``retrieve_relevant_chunks``; if none pass the similarity threshold a
      fixed apology is printed and the LLM is not called.

    The context handed to the LLM is printed before each answer for debugging.
    Returns ``None``.
    """
    print("Ask questions about the PDFs. Type 'exit' to quit.")
    while True:
        try:
            q = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the cell was interrupted at the prompt:
            # leave the loop quietly instead of showing a traceback.
            break
        if not q:
            continue
        if q.lower() == 'exit':
            break

        # Detect summary/gist requests
        if any(word in q.lower() for word in ["summarize", "summary", "gist"]):
            context = "\n\n".join(docs[:10])
            print("\nDEBUG: Using first 10 chunks for summary/gist.\n")
            print(context)
            answer = ask_llm(q, context)
            print("\nAnswer:", answer)
            continue

        # Normal Q&A
        chunks = retrieve_relevant_chunks(q, k=4, threshold=0.6)
        if not chunks:
            print("\nAnswer: Sorry, I couldn't find information about your question in the provided PDFs.")
            continue
        context = "\n\n".join(chunks)
        print("\nDEBUG: Retrieved context for your question:\n")
        print(context)
        answer = ask_llm(q, context)
        print("=================================== Generated Response ===================================")
        print("\nAnswer:", answer)

# 10. Run the chat interface
# Starts the interactive loop as soon as this cell executes; the cell then
# blocks waiting for user input (type 'exit' at the prompt to leave the loop).
chat()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]