In [2]:
import fitz 
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [3]:
def load_pdf(file_name):
    """Extract the text of every page of a PDF as one string.

    Args:
        file_name: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, concatenated in page
        order. An empty string if the document has no pages.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if extraction fails on some page.
    with fitz.open(file_name) as doc:
        # A single join avoids re-copying the accumulated text on every page.
        return "".join(page.get_text() for page in doc)

def split_text(text, chunk_size=500, chunk_overlap=100):
    """Break ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

In [6]:
import os

# Path of the PDF the rest of the notebook works on (relative to the current
# working directory).
pdf_path = "ML.pdf"

# Report whether the file is present. This only prints a status line; it does
# not stop execution when the file is missing.
if os.path.exists(pdf_path):
    print("✅ PDF file found. Proceeding...")
else:
    print("❌ PDF not found. Make sure ML.pdf is in the same folder as this script.")

✅ PDF file found. Proceeding...


In [7]:
from sentence_transformers import SentenceTransformer

# Instantiate the 'all-MiniLM-L6-v2' sentence-transformers model once at module
# level; get_embeddings() and retrieve_relevant_chunks() both encode with this
# shared instance.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embeddings(chunks):
    """Encode ``chunks`` with the shared ``embedding_model``.

    Args:
        chunks: The texts to embed; passed unchanged to ``encode``.

    Returns:
        The result of ``embedding_model.encode`` called with
        ``convert_to_tensor=True``.
    """
    encoded = embedding_model.encode(chunks, convert_to_tensor=True)
    return encoded


In [8]:
import chromadb
from chromadb.config import Settings

# Create the Chroma client and the "notes" collection shared by
# store_chunks_in_chromadb() and retrieve_relevant_chunks().
# NOTE(review): depending on the installed chromadb version, chromadb.Client
# with only persist_directory set may still be an in-memory client (older
# releases also needed chroma_db_impl, newer ones offer PersistentClient) --
# TODO confirm that data actually lands in ./chroma_db.
client = chromadb.Client(Settings(persist_directory="./chroma_db", anonymized_telemetry=False))
# No embedding function is passed here, so documents added without explicit
# embeddings are embedded by whatever default the collection uses.
collection = client.get_or_create_collection(name="notes")

def store_chunks_in_chromadb(chunks, batch_size=500):
    """Write text chunks and their embeddings to the shared ``collection``.

    Chunks are stored under the ids ``chunk_0``, ``chunk_1``, ... in input
    order. Embeddings are computed with ``embedding_model`` so that stored
    vectors come from the same model ``retrieve_relevant_chunks`` uses for the
    query, instead of relying on the collection's default embedding function.

    Args:
        chunks: Iterable of chunk strings. An empty iterable is a no-op.
        batch_size: Maximum number of chunks sent to the collection per call.

    Returns:
        None.
    """
    chunks = list(chunks)  # accept any iterable, and allow slicing below
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # upsert rather than add: the ids are positional, so running the
        # pipeline again would otherwise collide with ids already stored.
        # NOTE(review): entries with higher indices left over from an earlier,
        # longer document are not removed here.
        collection.upsert(
            ids=[f"chunk_{i}" for i in range(start, start + len(batch))],
            documents=batch,
            embeddings=embedding_model.encode(batch).tolist(),
        )

In [9]:
def retrieve_relevant_chunks(query, top_k=3):
    """Return the stored chunks closest to ``query``.

    Args:
        query: The question text; embedded with ``embedding_model``.
        top_k: Maximum number of chunks to return.

    Returns:
        list: The documents the collection returns for the query (at most
        ``top_k``), or an empty list when the collection holds no entries.
    """
    stored = collection.count()
    if stored == 0:
        # Nothing has been ingested yet, so there is nothing to search.
        return []

    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        # Never request more neighbours than exist; some chromadb versions
        # raise (others only warn) when n_results exceeds the stored count.
        n_results=min(top_k, stored),
    )
    # One query embedding was sent, so the matches are in the first result list.
    return results['documents'][0]

In [10]:
import subprocess

def query_ollama(context, question, model="llama3"):
    """Ask a local Ollama model a question, given supporting context.

    Runs ``ollama run <model> <prompt>`` as a subprocess and returns what it
    printed on standard output.

    Args:
        context: Text placed under the ``Context:`` heading of the prompt.
        question: Text placed after ``Question:`` in the prompt.
        model: Name of the Ollama model to run. Defaults to ``"llama3"``.

    Returns:
        str: The command's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the command exits with a non-zero status; the message
            includes the exit status and the command's stderr.
    """
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    result = subprocess.run(
        ["ollama", "run", model, prompt],
        capture_output=True,
        text=True,
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        encoding="utf-8",
    )
    if result.returncode != 0:
        # Surface failures instead of returning an empty answer that would be
        # indistinguishable from a model that said nothing.
        raise RuntimeError(
            f"ollama exited with status {result.returncode}: {result.stderr.strip()}"
        )
    return result.stdout.strip()

In [11]:
def ask_question_about_pdf(pdf_path, user_question):
    """Answer ``user_question`` from the contents of the PDF at ``pdf_path``.

    Steps, in order: extract the PDF text, split it into chunks, store the
    chunks, retrieve the chunks relevant to the question, join them with
    newlines into a context string, and pass context and question to
    ``query_ollama``.

    Args:
        pdf_path: Path of the PDF to read.
        user_question: The question to answer.

    Returns:
        The value returned by ``query_ollama``.
    """
    # Ingest: load -> split -> store.
    store_chunks_in_chromadb(split_text(load_pdf(pdf_path)))

    # Retrieve the relevant chunks and let the model answer from them.
    context = "\n".join(retrieve_relevant_chunks(user_question))
    return query_ollama(context, user_question)