In [1]:
import sqlite3
import os

def create_db():
    """Create ``db/database.db`` (and the ``db`` folder) with an empty ``chunks`` table.

    The table is created with ``IF NOT EXISTS``, so calling this repeatedly is
    safe and never drops existing rows.

    NOTE(review): a later cell defines ``create_db(DB_PATH)`` with two extra
    columns; once that cell has run it shadows this definition.
    """
    os.makedirs("db", exist_ok=True)

    conn = sqlite3.connect("db/database.db")
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            doc_id TEXT,
            chunk_id TEXT,
            chunk_text TEXT
        )
        """)
        conn.commit()
    finally:
        # Always release the file handle, even when the DDL raises.
        conn.close()

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# LangChain splitter configured with chunk_size=1000 and chunk_overlap=200.
# NOTE(review): its only use in this notebook is a commented-out line inside pdf();
# the active chunking is done by chunk_text().
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)


In [3]:
import sqlite3
import pdfplumber
from pathlib import Path

def create_db(DB_PATH):
    """Create (or upgrade in place) the ``chunks`` table in the SQLite file ``DB_PATH``.

    Args:
        DB_PATH: Path of the SQLite database file. Missing parent folders are
            created first, because ``sqlite3.connect`` cannot create them.

    The table holds one row per text chunk: ``doc_id``, ``chunk_id``,
    ``chunk_text`` plus the citation columns ``source_title`` and
    ``source_url``. If the table already exists with the older four-column
    layout, the two citation columns are added so later INSERTs that name
    them do not fail. Existing rows are never modified or removed.
    """
    citation_columns = {"source_title": "TEXT", "source_url": "TEXT"}

    if str(DB_PATH) != ":memory:":
        Path(str(DB_PATH)).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            doc_id TEXT,
            chunk_id TEXT,
            chunk_text TEXT,
            source_title TEXT,
            source_url TEXT
        )
        """)

        # CREATE TABLE IF NOT EXISTS leaves an older table untouched, so add
        # whichever citation columns are still missing.
        existing = {row[1] for row in cur.execute("PRAGMA table_info(chunks)").fetchall()}
        for column, sql_type in citation_columns.items():
            if column not in existing:
                cur.execute(f"ALTER TABLE chunks ADD COLUMN {column} {sql_type}")

        conn.commit()
    finally:
        # Release the file handle even when a statement raises.
        conn.close()

def chunk_text(text, max_len=500):
    """Greedily pack the non-empty lines of ``text`` into chunks of at most ``max_len`` characters.

    Args:
        text: Raw text; it is split on newlines and blank lines are dropped.
        max_len: Target maximum chunk length in characters, counting the single
            space used to join lines.

    Returns:
        A list of non-empty chunk strings in document order. A single line
        that is longer than ``max_len`` is kept whole as its own chunk (it is
        not split), so such a chunk may exceed ``max_len``.
    """
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    chunks = []
    buffer = ""

    for p in paragraphs:
        if not buffer:
            # Nothing accumulated yet: start a chunk without counting a
            # separator and without flushing an empty string.
            buffer = p
        elif len(buffer) + 1 + len(p) <= max_len:  # +1 for the joining space
            buffer = f"{buffer} {p}"
        else:
            chunks.append(buffer)
            buffer = p

    if buffer:
        chunks.append(buffer)

    return chunks


In [4]:
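# One-off migration, kept for reference only: adds the two source columns to a
# `chunks` table that was created with the older four-column schema.
# import sqlite3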
# conn = sqlite3.connect("db/database.db")
# c = conn.cursor()
# c.execute("ALTER TABLE chunks ADD COLUMN source_title TEXT")
# c.execute("ALTER TABLE chunks ADD COLUMN source_url TEXT")
# conn.commit()
# conn.close()

In [None]:
import pdfplumber, sqlite3, json
from pathlib import Path

# Folder that holds the PDFs (and an optional sources.json) to ingest.
# Raw string: "\G", "\Q" and "\p" are invalid escape sequences in a normal literal.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
DATA_DIR = r"E:\GenAI\QnA_pdf\pdf"

def pdf(DATA_DIR, db_path="db/database.db"):
    """Extract the text of every ``*.pdf`` in ``DATA_DIR`` and store it as chunks in SQLite.

    Args:
        DATA_DIR: Folder containing the PDF files. If it also contains a
            ``sources.json`` file, that file is read as a mapping of
            ``{pdf file name: {"title": ..., "url": ...}}`` and used for the
            citation columns; otherwise the file name is the title and the URL
            is empty.
        db_path: SQLite database file to write to. Defaults to the path the
            rest of the notebook reads from.

    Behaviour:
        * Each PDF is written in its own transaction, so a failure part-way
          through a document leaves no partial rows for it.
        * A PDF whose file name already appears in ``chunks.doc_id`` is
          skipped, so re-running the cell does not insert duplicates. To
          re-ingest a document, delete its rows first.
        * Pages for which no text can be extracted contribute an empty string.
    """
    data_dir = Path(DATA_DIR)

    # sqlite3 cannot create missing folders, so make sure the target one exists.
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    create_db(db_path)

    # The metadata file is shared by all PDFs: read it once, not once per file.
    sources_map = {}
    sources_path = data_dir / "sources.json"
    if sources_path.exists():
        sources_map = json.loads(sources_path.read_text(encoding="utf-8"))

    insert_sql = (
        "INSERT INTO chunks (doc_id, chunk_id, chunk_text, source_title, source_url) "
        "VALUES (?, ?, ?, ?, ?)"
    )

    conn = sqlite3.connect(db_path, timeout=30)
    try:
        curr = conn.cursor()
        # Sorted so that row ids are assigned in a reproducible order.
        for pdf_file in sorted(data_dir.glob("*.pdf")):
            curr.execute("SELECT 1 FROM chunks WHERE doc_id = ? LIMIT 1", (pdf_file.name,))
            if curr.fetchone() is not None:
                continue  # already ingested on a previous run

            with pdfplumber.open(pdf_file) as document:
                full_text = "\n".join(page.extract_text() or "" for page in document.pages)

            chunks = chunk_text(full_text)
            # Alternative splitter: chunks = text_splitter.split_text(full_text)

            meta = sources_map.get(pdf_file.name, {})
            title = meta.get("title") or pdf_file.name
            url = meta.get("url") or ""

            rows = [
                (pdf_file.name, f"{pdf_file.stem}-{idx}", chunk, title, url)
                for idx, chunk in enumerate(chunks)
            ]
            # `with conn` commits on success and rolls back on error; it does
            # NOT close the connection, which is why the finally block exists.
            with conn:
                curr.executemany(insert_sql, rows)
    finally:
        conn.close()
            

# Ingest every *.pdf found under DATA_DIR into the chunks table.
pdf(DATA_DIR)


Cannot set gray non-stroke color because /'P100' is an invalid float value
Cannot set gray non-stroke color because /'P103' is an invalid float value
Cannot set gray non-stroke color because /'P105' is an invalid float value
Cannot set gray non-stroke color because /'P109' is an invalid float value
Cannot set gray non-stroke color because /'P113' is an invalid float value
Cannot set gray non-stroke color because /'P114' is an invalid float value
Cannot set gray non-stroke color because /'P116' is an invalid float value
Cannot set gray non-stroke color because /'P119' is an invalid float value
Cannot set gray non-stroke color because /'P121' is an invalid float value
Cannot set gray non-stroke color because /'P123' is an invalid float value
Cannot set gray non-stroke color because /'P125' is an invalid float value
Cannot set gray non-stroke color because /'P127' is an invalid float value
Cannot set gray non-stroke color because /'P128' is an invalid float value
Cannot set gray non-strok

In [6]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# Raw string: "\G", "\Q" and "\p" are invalid escape sequences in a normal literal.
DATA_DIR = r"E:\GenAI\QnA_pdf\pdf"
# Cached FAISS index and the chunk ids stored alongside it.
INDEX_PATH = "db/faiss_index.bin"
IDS_PATH = "db/ids.npy"


def build_embeddings():
    """Load the cached FAISS index, or embed every stored chunk and build a new one.

    Returns:
        A tuple ``(ids, index, embeddings, model)`` where

        * ``ids`` is the list of ``chunks.id`` values; position ``i`` in the
          list corresponds to vector ``i`` in ``index``;
        * ``index`` is the FAISS index (inner product over L2-normalised
          vectors);
        * ``embeddings`` is the float32 matrix that was indexed, or ``None``
          when the index was loaded from disk instead of rebuilt;
        * ``model`` is the loaded ``SentenceTransformer``.

    Raises:
        ValueError: if the ``chunks`` table is empty, since there is nothing
            to index.

    The files at ``INDEX_PATH`` / ``IDS_PATH`` are reused only when the saved
    ids are exactly the ids currently in the database; otherwise the cache is
    treated as stale and rebuilt. Edits to the text of an existing row are not
    detected by this check.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    conn = sqlite3.connect("db/database.db")
    try:
        rows = conn.execute("SELECT id, chunk_text FROM chunks ORDER BY id").fetchall()
    finally:
        # Closed on every path, including the cached early return below.
        conn.close()

    ids = [row[0] for row in rows]
    texts = [row[1] for row in rows]

    # Try the on-disk cache first.
    if os.path.exists(INDEX_PATH) and os.path.exists(IDS_PATH):
        index = faiss.read_index(INDEX_PATH)
        try:
            # The ids are saved as a plain int64 array, so pickle support is
            # not needed (and unpickling files is unsafe).
            saved_ids = np.load(IDS_PATH, allow_pickle=False).tolist()
        except ValueError:
            saved_ids = None  # object-dtype file: treat the cache as stale
        if saved_ids == ids and index.ntotal == len(ids):
            return saved_ids, index, None, model

    if not texts:
        raise ValueError("No chunks found in db/database.db; run the PDF ingestion step first.")

    # encode() batches internally; float32 + C-contiguous is what FAISS expects.
    embed = np.ascontiguousarray(model.encode(texts, batch_size=32), dtype='float32')

    # Unit-length vectors make the inner product equal to cosine similarity.
    faiss.normalize_L2(embed)

    dim = embed.shape[1]
    index = faiss.IndexFlatIP(dim)  # IP = inner product
    index.add(embed)

    os.makedirs(os.path.dirname(INDEX_PATH) or ".", exist_ok=True)
    faiss.write_index(index, INDEX_PATH)
    np.save(IDS_PATH, np.array(ids, dtype=np.int64))

    return ids, index, embed, model


# Build the FAISS index (or load the cached one) for the cells below.
# The third value is None when a saved index was loaded from disk.
# NOTE(review): `emebedding` looks like a typo for `embedding`; it is not used in the visible cells.
ids, index, emebedding, model = build_embeddings()

In [46]:
import sqlite3


def search(query, model, index, ids, k=10, db_path="db/database.db"):
    """Return the stored chunks whose embeddings are closest to ``query``.

    Args:
        query: Question text to embed.
        model: Object with an ``encode(list_of_str)`` method returning an
            array of embeddings (the notebook passes a SentenceTransformer).
        index: FAISS index to search.
        ids: Sequence mapping a position in ``index`` to a ``chunks.id`` value.
        k: Maximum number of hits to request from the index.
        db_path: SQLite database holding the ``chunks`` table.

    Returns:
        A list of dicts in the order returned by the index, each with the keys
        ``db_id``, ``doc_id``, ``chunk_id``, ``text``, ``vector_score``,
        ``source_title`` and ``source_url``. Positions outside ``ids`` and ids
        with no matching row are skipped, so fewer than ``k`` items may be
        returned. An empty list is returned when ``k <= 0`` or ``ids`` is empty.
    """
    if k <= 0 or len(ids) == 0:
        return []

    query_s = model.encode([query]).astype('float32')

    # Same normalisation as at index time, so scores are cosine similarities.
    faiss.normalize_L2(query_s)
    scores, indices = index.search(query_s, k)

    select_sql = (
        "SELECT doc_id, chunk_id, chunk_text, source_title, source_url "
        "FROM chunks WHERE id=?"
    )
    results = []

    conn = sqlite3.connect(db_path)
    try:
        curr = conn.cursor()
        for pos, score in zip(indices[0], scores[0]):
            # FAISS pads with -1 when it has fewer than k vectors.
            if pos < 0 or pos >= len(ids):
                continue

            # Plain int: numpy integers are not accepted as SQL parameters.
            chunk_db_id = int(ids[pos])

            curr.execute(select_sql, (chunk_db_id,))
            row = curr.fetchone()
            if not row:
                continue

            doc_id, chunk_id, text, source_title, source_url = row
            results.append({
                "db_id": chunk_db_id,  # primary key in the chunks table
                "doc_id": doc_id,  # pdf file name
                "chunk_id": chunk_id,  # "<pdf stem>-<chunk index>"
                "text": text,
                "vector_score": float(score),
                "source_title": source_title,
                "source_url": source_url
            })
    finally:
        # Release the database handle even if a lookup raises.
        conn.close()

    return results

# NOTE(review): build_embeddings() already returned `model` in an earlier cell;
# this loads the same 'all-MiniLM-L6-v2' checkpoint a second time.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Retrieve up to 10 nearest chunks for a sample question.
results = search("What are  Robots?", model=model, index=index, ids=ids, k=10)


In [47]:
results

[{'db_id': 1546,
  'doc_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper.pdf',
  'chunk_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper-4',
  'text': 'improving safety and productivity.\nIndustrial robots have also become more accessible to small and medium-\nsized enterprises (SMEs) as the cost of industrial robots has decreased, and the\ninstallation process has become more straightforward. As a result, SMEs can now\nadopt automation and enhance their productivity and efficiency.\n2\nThe Exponential Rise in the\nUse of Industrial Robotics\nAs industrial robots are equipped with sophisticated sensors and better control technologies,\nthey are used for a wide range of applications across different industries. The two main reasons\nindustries have started using robots at a large scale are efficiency and safety.\nRobots are capable of carrying out tedious and\nhazardous operations with consistent efficiency 24/7.\nThese intellige

In [48]:
pip install rank-bm25 --quiet

Note: you may need to restart the kernel to use updated packages.


In [49]:
from rank_bm25 import BM25Okapi



def rerank(results, query, alpha=0.6):
    """Blend each hit's vector score with a BM25 score and sort by the blend.

    Args:
        results: List of hit dicts carrying at least ``text`` and
            ``vector_score``.
        query: Query string; tokenised by whitespace, as are the hit texts.
        alpha: Weight of the vector score; BM25 is weighted ``1 - alpha``.

    Returns:
        A new list with the same dicts ordered by descending ``total_score``.
        An empty ``results`` is returned as is.

    Side effect:
        Every dict in ``results`` is updated in place with ``bm25_score``,
        ``vector_score_norm``, ``bm25_score_norm`` and ``total_score``.

    NOTE(review): the two ``*_norm`` values are currently the raw scores; no
    normalisation is applied before they are blended.
    """
    if not results:
        return results

    # BM25 statistics are computed over the retrieved hits only.
    tokenized_hits = [hit['text'].split() for hit in results]
    lexical_scores = BM25Okapi(tokenized_hits).get_scores(query.split())

    semantic_scores = np.array([hit['vector_score'] for hit in results], dtype=float)

    # Placeholders: pass-through, not normalised.
    semantic_norm = semantic_scores
    lexical_norm = lexical_scores

    for hit, lexical, sem_n, lex_n in zip(results, lexical_scores, semantic_norm, lexical_norm):
        hit.update({
            'bm25_score': float(lexical),
            'vector_score_norm': float(sem_n),
            'bm25_score_norm': float(lex_n),
            'total_score': float(alpha * sem_n + (1 - alpha) * lex_n),
        })

    return sorted(results, key=lambda hit: hit['total_score'], reverse=True)



In [50]:
# NOTE(review): `results` was retrieved for "What are  Robots?"; here it is re-scored
# against a different question.
query = "What are  Indians?"
# rerank() adds the score fields to the dicts in `results` in place and returns a sorted list.
result = rerank(results, query)

In [51]:
result

[{'db_id': 1544,
  'doc_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper.pdf',
  'chunk_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper-2',
  'text': 'can also lead to serious accidents, making it crucial to address the risks, hazards, and safety\nstandards associated with them. By prioritizing safety and adhering to industry standards,\nwe can ensure that the benefits of industrial robotics are realized while minimizing potential\nhazards and risks.\nThis whitepaper offers an in-depth exploration of the fundamentals of industrial robotics and\naims to provide a comprehensive safety guide for addressing hazards and mitigating risks\nassociated with these machines.\n1\nWhat are\nIndustrial Robots?\nIndustrial robots are versatile machines that have revolutionized the industrial sector. These\nrobots are programmable and can execute various tasks in different industrial settings, such as\nmanufacturing, logistics, agriculture, an

In [52]:
result[0]

{'db_id': 1544,
 'doc_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper.pdf',
 'chunk_id': 'Hokuyo-USA_-_A_Safety_Guide_to_Industrial_Robotics_Hazards_-_Whitepaper-2',
 'text': 'can also lead to serious accidents, making it crucial to address the risks, hazards, and safety\nstandards associated with them. By prioritizing safety and adhering to industry standards,\nwe can ensure that the benefits of industrial robotics are realized while minimizing potential\nhazards and risks.\nThis whitepaper offers an in-depth exploration of the fundamentals of industrial robotics and\naims to provide a comprehensive safety guide for addressing hazards and mitigating risks\nassociated with these machines.\n1\nWhat are\nIndustrial Robots?\nIndustrial robots are versatile machines that have revolutionized the industrial sector. These\nrobots are programmable and can execute various tasks in different industrial settings, such as\nmanufacturing, logistics, agriculture, and he

In [55]:

def generate_answer(results, threshold=0.5):
    """Build a short cited answer from the best hit, or abstain.

    Args:
        results: Hit dicts with the best hit first, i.e. sorted by
            ``total_score`` if reranked, otherwise by ``vector_score``.
        threshold: Minimum score the first hit must reach; below it the
            function abstains.

    Returns:
        ``None`` when ``results`` is empty or the first hit scores below
        ``threshold``. Otherwise a string made of the first 330 characters of
        the hit text, a ``Source:`` line (title, or the document id when the
        title is empty) and a ``URL:`` line (blank when no URL is stored).
    """
    if not results:
        return None

    top = results[0]

    # Hits that were not reranked carry no total_score; fall back to the raw
    # vector score instead of raising KeyError.
    score_for_abstain = top["total_score"] if "total_score" in top else top["vector_score"]

    if score_for_abstain < threshold:
        return None

    snippet = top['text'][:330].strip()

    citation = f"{top.get('source_title') or top['doc_id']} (chunk: {top['chunk_id']})"
    url = top.get('source_url') or ""  # a NULL url must not render as "None"

    return f"{snippet}\n\nSource: {citation}\nURL: {url}"



In [56]:
# generate_answer() expects the best hit first, so pass the reranked (sorted) list
# `result`, not the unsorted `results` returned by search().
print(generate_answer(result, threshold=0.5))

None
