In [1]:
# Imports & simple configuration
import os
import sys
import pickle
from typing import List, Dict, Any, Tuple
from pathlib import Path
from dataclasses import dataclass, asdict

# Set TOKENIZERS_PARALLELISM to "false" before any tokenizer-backed library is imported.
# NOTE(review): this variable is read by the Hugging Face `tokenizers` library to turn off
# its internal parallelism (and the associated fork warning); it does not affect GPU
# selection, so the earlier "GPU" wording looked inaccurate — confirm intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Helpful paths
# All paths are anchored at the current working directory, so they depend on where the
# kernel was started.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data" / "docs"
INDICES_DIR = BASE_DIR / "indices"
# Create both directories up front (no error if they already exist).
DATA_DIR.mkdir(parents=True, exist_ok=True)
INDICES_DIR.mkdir(parents=True, exist_ok=True)

# Small dataclass for docs
@dataclass
class DocChunk:
    """
    One chunk of document text together with where it came from.

    Instances are created per chunk by `index_pdfs_from_folder` and are pickled
    alongside the FAISS index as the index's metadata.
    """
    text: str  # the chunk's text content
    source: str  # filename or identifier
    # Free-form metadata, e.g., {'category': 'smartphone', 'page': 3};
    # the indexing code in this notebook stores 'category' and 'chunk_id'.
    meta: Dict[str, Any]


In [3]:
!pip install pdfplumber pdf2image pytesseract pillow

Collecting pdfplumber
  Using cached pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Using cached pdfplumber-0.11.7-py3-none-any.whl (60 kB)
Using cached pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Using cached pytesseract-0.3.13-py3-none-any.whl (14 kB)
Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)
Installing collected packages: pytesseract, pypdfium2, pdf2image, pdfminer.six, pdfplumber

   -------- ------------------------------- 1/5 [pypdfium2]
   -------- ------------------------------- 1/

In [4]:
# PDF extractor with OCR fallback
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import tempfile

def extract_text_from_pdf(path: str, ocr_if_empty: bool = True) -> str:
    """
    Return the text of a PDF, falling back to OCR when no text layer is found.

    The PDF is first read with pdfplumber; the stripped, non-empty page texts are
    joined with newlines. If that yields nothing and `ocr_if_empty` is True, every
    page is rasterised with pdf2image (200 dpi) and passed through pytesseract.

    Errors in either stage are printed, not raised; the function then returns
    whatever text it has (possibly an empty string).
    """
    path = str(path)  # callers may hand in a pathlib.Path

    # Stage 1: embedded text layer.
    try:
        with pdfplumber.open(path) as pdf:
            page_texts = [(page.extract_text() or "").strip() for page in pdf.pages]
    except Exception as e:
        print("pdfplumber error:", e)
        page_texts = []

    full_text = "\n".join(filter(None, page_texts))
    if full_text.strip() or not ocr_if_empty:
        return full_text

    # Stage 2: OCR fallback, only reached when stage 1 produced no text.
    print(f"[OCR] No text extracted; running OCR on {path} (this may be slow)...")
    try:
        page_images = convert_from_path(path, dpi=200)
        full_text = "\n".join(pytesseract.image_to_string(image) for image in page_images)
    except Exception as e:
        print("OCR conversion error:", e)
    return full_text


In [5]:
# Chunking function
import re
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """
    Naive chunker: split text into overlapping chunks of approximate char length `chunk_size`.

    Runs of newlines are collapsed to a single newline first. Each chunk is extended
    to the next sentence end (`.`, `?` or `!` followed by whitespace) when one occurs
    within 200 characters after the nominal cut, so chunks may be up to
    `chunk_size + 200` characters long.

    Args:
        text: the text to split.
        chunk_size: nominal chunk length in characters; must be positive.
        overlap: number of characters shared between consecutive chunks; must be
            smaller than `chunk_size`, otherwise the window could never advance.

    Returns:
        List of stripped, non-empty chunks. Empty (or whitespace-only) input gives
        an empty list; text no longer than `chunk_size` gives a single chunk.

    Raises:
        ValueError: if `chunk_size` is not positive or `overlap >= chunk_size`.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        # With overlap >= chunk_size the start position would not move forward,
        # which previously caused an endless loop.
        raise ValueError("overlap must be smaller than chunk_size")

    text = re.sub(r'\n+', '\n', text).strip()
    if not text:
        return []
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = start + chunk_size
        # try to extend to a sentence end (.) if within next 200 chars
        m = re.search(r'[.?!]\s', text[end:end + 200])
        if m:
            end += m.end()
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= length:
            # The whole text is covered; stepping back by `overlap` again would only
            # emit a trailing chunk that is fully contained in this one.
            break
        start = end - overlap
    return chunks


In [6]:
# Load PDFs from a folder and create DocChunk objects
def index_pdfs_from_folder(folder: str, category: str = "unknown"):
    """
    Turn every `*.pdf` directly inside `folder` into a flat list of DocChunk objects.

    Files are processed in sorted order. Each PDF's text is extracted, split into
    chunks (chunk_size=1200, overlap=200) and wrapped in DocChunk with the file name
    as `source` and `{'category': category, 'chunk_id': <position in file>}` as `meta`.
    PDFs that yield no text are reported and skipped. No embeddings are computed here.
    """
    folder = Path(folder)
    collected = []
    pdf_files = sorted(folder.glob("*.pdf"))
    for pdf_file in pdf_files:
        print("Loading:", pdf_file.name)
        extracted = extract_text_from_pdf(pdf_file)
        if extracted.strip():
            pieces = chunk_text(extracted, chunk_size=1200, overlap=200)
            collected.extend(
                DocChunk(text=piece, source=pdf_file.name, meta={'category': category, 'chunk_id': position})
                for position, piece in enumerate(pieces)
            )
        else:
            print(f"Warning: no text in {pdf_file.name}")
    print(f"Created {len(collected)} chunks from {folder}")
    return collected

# Example: index all PDFs placed in data/docs/
# `docs` is the list of DocChunk objects that the embedding/index cell consumes.
# If DATA_DIR contains no PDFs this is an empty list and no index gets built later.
docs = index_pdfs_from_folder(DATA_DIR, category="gadget")  # you can organize subfolders by category
# Show the number of chunks as the cell output.
len(docs)


Created 0 chunks from C:\Personal\Data Science Course\Data_Science Projects\Interactive-Document-Assistant\data\docs


0

In [8]:
!pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Using cached torch-2.8.0-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Using cached safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Using cached sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
Using cached transformers-4.57.0-py3-none-any.whl (12.0 MB)
Using cached huggingface_hub-0.

In [12]:
!pip install faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.12.0-cp313-cp313-win_amd64.whl.metadata (5.2 kB)
Using cached faiss_cpu-1.12.0-cp313-cp313-win_amd64.whl (18.2 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [13]:
# Embeddings (sentence-transformers) + build FAISS index
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Load small, fast embedding model
# `embed_model` is a module-level global used by both build_faiss_index and retrieve,
# so documents and queries are embedded with the same model.
EMB_MODEL_NAME = "all-MiniLM-L6-v2"  # good default balance
embed_model = SentenceTransformer(EMB_MODEL_NAME)

def build_faiss_index(doc_chunks: List[DocChunk], index_path: str = None):
    """
    Embed every chunk's text and build an exact (flat) L2 FAISS index over the embeddings.

    Args:
        doc_chunks: chunks to index. Row i of the index corresponds to doc_chunks[i].
        index_path: optional destination (str or Path). When given, the index is written
            there and the chunk list is pickled next to it as `<index_path>.metas.pkl`.

    Returns:
        Tuple `(index, embeddings, doc_chunks)` where `embeddings` is the float32
        matrix that was added to the index.

    Raises:
        ValueError: if `doc_chunks` is empty (there would be no embedding dimension
            to size the index with).
    """
    if not doc_chunks:
        raise ValueError("build_faiss_index requires at least one DocChunk; got an empty list.")

    texts = [dc.text for dc in doc_chunks]
    print("Computing embeddings for", len(texts), "chunks...")
    embeddings = embed_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix; enforce it rather than rely on the
    # encoder's output dtype.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    d = embeddings.shape[1]  # embedding dimensionality
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)
    # save index and metadata
    if index_path:
        faiss.write_index(index, str(index_path))
        with open(str(index_path) + ".metas.pkl", "wb") as f:
            pickle.dump(doc_chunks, f)
    return index, embeddings, doc_chunks

# If you have doc chunks from previous cell:
# build the index, persisting it (plus its pickled chunk list) under INDICES_DIR.
if len(docs) > 0:
    faiss_index, embeddings_matrix, doc_metas = build_faiss_index(docs, index_path=INDICES_DIR / "faiss_index.bin")
else:
    # Empty index
    # No chunks were produced, so there is nothing to embed. `faiss_index` stays None,
    # and later cells must check for that before searching.
    faiss_index = None
    embeddings_matrix = None
    doc_metas = []


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
# Retrieval
def retrieve(query: str, index: faiss.IndexFlatL2, doc_chunks: List[DocChunk], top_k: int = 5):
    """
    Return the chunks whose embeddings are closest to the query embedding.

    Args:
        query: free-text query; embedded with the same global `embed_model` used for indexing.
        index: FAISS index to search, or None when no index has been built.
        doc_chunks: chunk list aligned with the index rows (row i <-> doc_chunks[i]).
        top_k: maximum number of results; clamped to the number of vectors in the index.

    Returns:
        List of dicts with keys "rank" (1-based), "score", "text", "source", "meta",
        ordered as returned by the index search. "score" is the distance reported by
        the index (for IndexFlatL2 this is the squared L2 distance, so lower means
        more similar). An empty list is returned when there is nothing to search
        (no index, no chunks, or non-positive `top_k`).
    """
    # Nothing to search: the notebook sets the index to None when no documents exist.
    if index is None or not doc_chunks or top_k <= 0:
        return []
    # Asking for more neighbours than stored vectors only yields -1 padding.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    q_emb = embed_model.encode([query], convert_to_numpy=True)
    q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)  # dtype/layout FAISS expects
    D, I = index.search(q_emb, k)
    results = []
    for rank, idx in enumerate(I[0]):
        # Skip padding (-1) and ids that have no matching chunk in doc_chunks.
        if idx < 0 or idx >= len(doc_chunks):
            continue
        score = float(D[0][rank])
        chunk = doc_chunks[idx]
        results.append({
            "rank": rank+1,
            "score": score,
            "text": chunk.text,
            "source": chunk.source,
            "meta": chunk.meta
        })
    return results

# quick test:
# only runs when an index was actually built (faiss_index is None when there were no docs).
if faiss_index is not None:
    print(retrieve("battery life performance", faiss_index, doc_metas, top_k=3))


In [15]:
# LLM wrapper (pluggable)
import os
import json

def call_llm_openai(system_prompt: str, user_prompt: str, temperature: float = 0.0, max_tokens: int = 512, model: str = "gpt-3.5-turbo"):
    """
    Simple OpenAI Chat call. Requires OPENAI_API_KEY in environment.

    Works with both generations of the `openai` package: the client-based API
    (openai>=1.0, `OpenAI().chat.completions.create`) and the legacy module-level
    `openai.ChatCompletion.create`, which was removed in 1.0.

    Args:
        system_prompt: content of the system message.
        user_prompt: content of the user message.
        temperature: sampling temperature passed to the API.
        max_tokens: completion token limit passed to the API.
        model: chat model name; change if you have access to different models.

    Returns:
        The first choice's message content, stripped (empty string if the API
        returned no content).

    Raises:
        RuntimeError: if the `openai` package is not installed or OPENAI_API_KEY is unset.
    """
    try:
        import openai
    except ImportError as e:
        raise RuntimeError("openai package not installed. pip install openai") from e

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set OPENAI_API_KEY in environment before calling OpenAI.")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    if hasattr(openai, "OpenAI"):
        # openai>=1.0: explicit client object, attribute-style response.
        client = openai.OpenAI(api_key=api_key)
        resp = client.chat.completions.create(**request)
        content = resp.choices[0].message.content
    else:
        # Legacy openai<1.0: module-level key and dict-style response.
        openai.api_key = api_key
        resp = openai.ChatCompletion.create(**request)
        content = resp["choices"][0]["message"]["content"]

    # `content` can be None (e.g. a response without text), so guard before stripping.
    return (content or "").strip()

def call_llm_generic(system_prompt: str, user_prompt: str):
    """
    Stub for a non-OpenAI backend (Mistralai or a local transformer pipeline).

    Always raises NotImplementedError; swap the body for a real provider call
    that takes the same two prompts.
    """
    message = "Replace call_llm_generic with your Mistralai/Open-source model call per their SDK."
    raise NotImplementedError(message)


In [16]:
# Composing prompt from retrieved context and calling LLM
# System message handed to the LLM caller by answer_query. It restricts answers to the
# retrieved excerpts and asks for a trailing 'SOURCES:' section.
SYSTEM_PROMPT = (
    "You are a helpful assistant that answers user questions based ONLY on the provided document "
    "EXCERPTS below. If the answer is not contained in the excerpts, say you cannot find the answer. "
    "Always include a 'SOURCES:' section at the end with the filename(s) and chunk ids you used."
)

from textwrap import shorten

def answer_query(query: str, index: faiss.IndexFlatL2, doc_chunks: List[DocChunk], top_k: int = 5, llm_caller=call_llm_openai):
    """
    Answer `query` from the indexed documents: retrieve, build a prompt, call the LLM.

    The `top_k` retrieved chunks are rendered as excerpts (source, metadata and the
    chunk text flattened to one line and shortened to 400 characters) and sent,
    together with the question, to `llm_caller(SYSTEM_PROMPT, user_prompt)`.

    Returns a dict with "answer" (the LLM's text) and "retrieved" (the raw
    retrieval results).
    """
    def _render_excerpt(hit):
        # One excerpt block: header line, one-line snippet, separator.
        flattened = hit['text'].replace("\n", " ")
        return f"SOURCE: {hit['source']} | META: {hit['meta']}\n{shorten(flattened, 400)}\n---"

    hits = retrieve(query, index, doc_chunks, top_k=top_k)

    if hits:
        context = "\n".join(_render_excerpt(hit) for hit in hits)
    else:
        context = "No relevant excerpts found."

    user_prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer concisely and list SOURCES at the end."

    # The caller is pluggable: any callable taking (system_prompt, user_prompt).
    return {
        "answer": llm_caller(SYSTEM_PROMPT, user_prompt),
        "retrieved": hits
    }

# quick manual test (requires faiss_index and OPENAI key set)
# resp = answer_query("How long does battery last under heavy usage?", faiss_index, doc_metas, top_k=4)
# print(resp['answer'])
# print("Sources:", [r['source'] for r in resp['retrieved']])


In [17]:
# Persist index and doc metadata
def save_index(index: faiss.IndexFlatL2, doc_chunks: List[DocChunk], base_path: Path = INDICES_DIR / "faiss_index.bin"):
    """
    Write the FAISS index to `base_path` and pickle `doc_chunks` to `<base_path>.metas.pkl`.
    """
    index_file = str(base_path)
    metas_file = index_file + ".metas.pkl"

    faiss.write_index(index, index_file)
    with open(metas_file, "wb") as handle:
        pickle.dump(doc_chunks, handle)

    print("Saved index and metas to:", base_path)

def load_index(base_path: Path = INDICES_DIR / "faiss_index.bin"):
    """
    Load a FAISS index and its pickled chunk list from disk.

    Args:
        base_path: location of the index file (Path or str). The chunk list is read
            from `<base_path>.metas.pkl`.

    Returns:
        Tuple `(index, doc_chunks)`.

    Raises:
        RuntimeError: if the index file or the `.metas.pkl` sidecar does not exist.
    """
    base_path = Path(base_path)  # tolerate plain string paths
    metas_path = Path(str(base_path) + ".metas.pkl")
    if not base_path.exists():
        raise RuntimeError("Index file not found: " + str(base_path))
    if not metas_path.exists():
        # Report a missing sidecar the same way as a missing index.
        raise RuntimeError("Metadata file not found: " + str(metas_path))

    index = faiss.read_index(str(base_path))
    # SECURITY: pickle.load can execute arbitrary code; only load files this notebook
    # wrote itself, never ones from an untrusted source.
    # NOTE(review): unpickling presumably needs the DocChunk class to be defined in the
    # loading session — confirm when loading from a different notebook/script.
    with open(metas_path, "rb") as f:
        doc_chunks = pickle.load(f)
    return index, doc_chunks

# Example:
# save_index(faiss_index, doc_metas)
# loaded_index, loaded_metas = load_index()


In [23]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.4-py3-none-any.whl (2.0 MB)
   ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.0 MB 4.1 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/2.0 MB 3.8 MB/s eta 0:00:01
   ---------------------------------------- 2.0/2.0 MB 3.5 MB/s eta 0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.4.4
