In [1]:
import os
import subprocess
import faiss
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
import numpy as np

In [2]:
# STEP 2 — PDF Text Extraction
# =========================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Each page's text is followed by a single newline, so the result ends with
    a newline whenever the document has at least one page.

    Args:
        pdf_path: Path (or any other source accepted by ``PdfReader``) of the
            PDF to read.

    Returns:
        The concatenated page texts; an empty string for a PDF with no pages.
    """
    reader = PdfReader(pdf_path)
    # Defensive: a page whose extraction yields None (or "") contributes only
    # its newline instead of raising TypeError on `None + "\n"`.
    # Joining once avoids rebuilding the growing string for every page.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

# Input document; a relative path, so it is resolved against the kernel's
# current working directory.
pdf_path = "BELT STOCK.pdf"  # Change this to your file path
# `full_text` is the raw extracted text that the chunking cell consumes.
full_text = extract_text_from_pdf(pdf_path)
# Quick sanity check that extraction produced some text.
print(f"Total characters extracted: {len(full_text)}")


Total characters extracted: 1123


In [3]:
# STEP 3 — Chunk the text
# =========================
def chunk_text(text, chunk_size=1500, overlap=300):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window may be shorter than ``chunk_size``.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the window never advance (endless loop);
    # a negative overlap would silently skip text between windows.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Split the extracted text into overlapping character windows; `chunks` is the
# list that is embedded below and indexed into by `ask`.
chunks = chunk_text(full_text, chunk_size=1500, overlap=300)
print(f"Total chunks created: {len(chunks)}")

Total chunks created: 1


In [4]:
# STEP 4 — Create FAISS Vector Store
# =========================
# Load the sentence-embedding model once; `ask` reuses it to embed queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Small & fast model
# Encode every chunk; the result is requested as a NumPy array.
embeddings = embedder.encode(chunks, convert_to_numpy=True)

# The second axis of the embedding array is the vector size the index is built with.
dimension = embeddings.shape[1]
# IndexFlatL2: presumably an exact (non-approximate) L2-distance index — see FAISS docs.
index = faiss.IndexFlatL2(dimension)
# Vectors are added in chunk order, so index position i corresponds to chunks[i].
index.add(embeddings)
print(f"FAISS index created with {index.ntotal} vectors.")

FAISS index created with 1 vectors.


In [5]:
# STEP 5 — Ollama query function
# =========================
def ollama_generate(prompt, model="llama3.1:latest"):
    """Run ``ollama run <model>`` with ``prompt`` on stdin and return its reply.

    Args:
        prompt: Text sent to the process's standard input (UTF-8 encoded).
        model: Model name passed to ``ollama run``.

    Returns:
        The process's standard output decoded as UTF-8, with surrounding
        whitespace stripped.

    Raises:
        RuntimeError: If the ``ollama`` process exits with a non-zero status;
            the message includes whatever the process wrote to stderr.
        FileNotFoundError: Propagated from ``subprocess.run`` when the
            ``ollama`` executable cannot be found.
    """
    result = subprocess.run(
        ["ollama", "run", model],
        input=prompt.encode("utf-8"),
        stdout=subprocess.PIPE,
        # Capture stderr so a failure can be reported instead of being lost.
        stderr=subprocess.PIPE,
    )
    # Without this check a failed run would be returned as an empty/partial
    # "answer" that is indistinguishable from a real model reply.
    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ollama run {model!r} exited with code {result.returncode}: {detail}"
        )
    # errors="replace" keeps a stray invalid byte from discarding the whole reply.
    return result.stdout.decode("utf-8", errors="replace").strip()


In [15]:
# STEP 6 — Ask Questions
# =========================
def ask(query, top_k=3):
    """Answer ``query`` from the indexed document using retrieval + Ollama.

    Uses the module-level ``embedder``, ``index`` and ``chunks`` to find the
    chunks nearest to the query, places them in a fixed extraction prompt and
    passes that prompt to ``ollama_generate``.

    Args:
        query: The question to answer.
        top_k: Maximum number of chunks to retrieve. Values larger than the
            number of indexed vectors are clamped to that number.

    Returns:
        Whatever ``ollama_generate`` returns for the built prompt.
    """
    # Step 6.1 — Embed query
    q_embedding = embedder.encode([query], convert_to_numpy=True)

    # Step 6.2 — Retrieve top chunks
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, and chunks[-1] would silently repeat the last chunk.
    k = min(top_k, index.ntotal)
    retrieved_chunks = []
    if k > 0:
        _, indices = index.search(q_embedding, k)
        # Belt and braces: drop any padding / out-of-range label.
        retrieved_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    # Step 6.3 — Build prompt
    context = "\n".join(retrieved_chunks)
    prompt = f"""
You are a highly precise data extraction and analysis tool. Your sole purpose is to provide a direct, factual answer based ONLY on the provided context.

**Instructions:**
1.  Carefully examine the 'Context' provided below. The context is a table.
2.  Find the exact data requested in the 'Question'.
3.  If the answer exists in the context, provide it as a short, direct answer. Do not add any extra commentary, apologies, or explanations.
4.  If the specific information requested in the 'Question' is not explicitly present in the 'Context', you MUST respond with a single, clear phrase: "The information is not available in the provided table."

**Context:**
{context}

**Question:** {query}

**Answer:**
"""

    # Step 6.4 — Run Ollama
    return ollama_generate(prompt)


In [16]:
# STEP 7 — Example Usage
# =========================
# Example query; top_k=3 requests up to three retrieved chunks as context.
question = "What is the stockbelt of StrngthST1600,width2000mm in meters?"
answer = ask(question, top_k=3)
# The leading "\n" puts the banner on its own line in the cell output.
print("\n--- Model Answer ---\n", answer)


--- Model Answer ---
 1739 

(From row "2 ST 1600 (2000 mm)")


In [17]:
# STEP 7 — Example Usage
# =========================
# Example query for a different belt; top_k=2 requests up to two retrieved chunks.
question = "What is the stockbelt of strengthEP800/4(2400MM)?"
answer = ask(question, top_k=2)
# The leading "\n" puts the banner on its own line in the cell output.
print("\n--- Model Answer ---\n", answer)


--- Model Answer ---
 The information is not available in the provided table.


In [18]:
# STEP 7 — Example Usage
# =========================
# Example query about belt drum details; top_k=1 uses only the single nearest chunk.
question = "BELT DRUM DETAILS OF EP800/4(2000mm)?"
answer = ask(question, top_k=1)
# The leading "\n" puts the banner on its own line in the cell output.
print("\n--- Model Answer ---\n", answer)


--- Model Answer ---
 The information is not available in the provided table.


In [19]:
# STEP 7 — Example Usage
# =========================
# Rephrasing of the belt drum query; top_k=1 uses only the single nearest chunk.
question = "available belt drum of EP800/4(2000mm) ?"
answer = ask(question, top_k=1)
# The leading "\n" puts the banner on its own line in the cell output.
print("\n--- Model Answer ---\n", answer)


--- Model Answer ---
 The information is not available in the provided table.
