In [25]:
import ollama
# NOTE(review): `!` hands this line to a shell, so the assignment lives only in
# that short-lived shell process and is not visible to this kernel or to later
# cells. If the variable is meant to persist, `%env OLLAMA_ACCELERATE=1` would
# be the notebook way to set it — TODO confirm the setting is needed at all.
!OLLAMA_ACCELERATE=1

In [26]:
# Pull the "llama3.2" model through the ollama client; this is the model name
# that the pipeline cell assigns to `local_model` for ChatOllama.
ollama.pull("llama3.2")

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [27]:
# Pull the "nomic-embed-text" model through the ollama client; the pipeline
# passes this name to OllamaEmbeddings when building the vector database.
ollama.pull("nomic-embed-text")


ProgressResponse(status='success', completed=None, total=None, digest=None)

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_ollama.chat_models import ChatOllama
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import StrOutputParser
from markdown import markdown

In [24]:
# Global variables shared by the pipeline functions defined in this cell.
# Collection name handed to Chroma when the vector database is created.
VECTOR_DB_NAME = "local-rag"
# Name of the Ollama chat model wrapped by `llm`.
local_model = "llama3.2"
# Single chat model instance, used both for generating alternative queries in
# get_retriever() and for producing the final answer in create_rag_chain().
llm = ChatOllama(model=local_model)
# Assigned by create_vector_db(); stays None until that function has run, which
# is what chat_with_pdf() checks before answering.
vector_db = None

# PDF loading
def load_pdf(file_path):
    """Load the PDF at ``file_path`` with ``PyPDFLoader``.

    Args:
        file_path: Location of the PDF to read.

    Returns:
        Whatever ``PyPDFLoader.load()`` returns (presumably one document
        per page — confirm against the loader's documentation).
    """
    return PyPDFLoader(file_path=file_path).load()

# Text chunking
def split_text(data, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split (e.g. the result of ``load_pdf``).
        chunk_size: Value forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

# Function to create vector database
def create_vector_db(
    chunks,
    embedding_model="nomic-embed-text",
    persist_directory="./chroma_db",
):
    """Embed ``chunks`` into a Chroma collection and publish it as ``vector_db``.

    The resulting store is assigned to the module-level ``vector_db`` so that
    ``get_retriever`` and ``chat_with_pdf`` can use it.

    Args:
        chunks: Documents to embed (e.g. the result of ``split_text``).
        embedding_model: Name of the Ollama embedding model to use.
            Defaults to the model this notebook pulls, "nomic-embed-text".
        persist_directory: Directory in which Chroma persists the embeddings.
            Pass ``None`` for in-memory storage.

    Returns:
        The status string "Vector database created successfully".
    """
    global vector_db
    # NOTE(review): with a persist directory, re-running this on the same
    # chunks presumably adds them to the existing collection again — confirm
    # and clear or de-duplicate the collection if that matters.
    vector_db = Chroma.from_documents(
        documents=chunks,
        embedding=OllamaEmbeddings(model=embedding_model),
        collection_name=VECTOR_DB_NAME,
        persist_directory=persist_directory,
    )
    return "Vector database created successfully"

# Retriever construction
def get_retriever():
    """Wrap the global ``vector_db`` in a ``MultiQueryRetriever``.

    The global ``llm`` is given a prompt asking for 2 alternative phrasings of
    the user's question; that prompt and the vector store's base retriever are
    handed to ``MultiQueryRetriever.from_llm``.

    Returns:
        The retriever built by ``MultiQueryRetriever.from_llm``.
    """
    rewrite_template = """You are an AI language model assistant. Generate 2 different versions of the given user question to retrieve relevant documents from a vector database. Provide these alternative questions separated by newlines. Original question: {question}"""
    rewrite_prompt = PromptTemplate(
        template=rewrite_template,
        input_variables=["question"],
    )
    base_retriever = vector_db.as_retriever()
    return MultiQueryRetriever.from_llm(
        retriever=base_retriever,
        llm=llm,
        prompt=rewrite_prompt,
    )

# RAG chain assembly
def create_rag_chain():
    """Compose the question-answering chain.

    The chain pipes a mapping of ``context`` (the retriever from
    ``get_retriever``) and ``question`` (passed through unchanged) into the
    answer prompt, then into the global ``llm``, then into a string parser.

    Returns:
        The composed chain object.
    """
    retriever = get_retriever()
    answer_prompt = ChatPromptTemplate.from_template(
        """Answer the question based ONLY on the following context:\n{context}\nQuestion: {question}"""
    )
    chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
    chain = chain_inputs | answer_prompt
    chain = chain | llm
    return chain | StrOutputParser()

# Question answering entry point
def chat_with_pdf(question):
    """Answer ``question`` against the current vector database.

    Args:
        question: The user's question, passed to the chain's ``invoke``.

    Returns:
        The chain's answer passed through ``markdown()``, or an error string
        when the global ``vector_db`` is falsy (e.g. still ``None``).
    """
    if vector_db:
        # A fresh chain is built for every question.
        answer = create_rag_chain().invoke(question)
        return markdown(answer)
    return "Error: No vector database found. Please upload and process a PDF first."

In [23]:
# # This could be used to load the embeddings of an already-processed PDF file,
# # so that we do not re-create the embeddings for the same file (saves processing).
# import hashlib

# def get_document_id(file_path):
#     """Generate a unique document ID based on the file content hash."""
#     with open(file_path, "rb") as f:
#         file_hash = hashlib.md5(f.read()).hexdigest()  # Create file hash
#     return file_hash

# def create_vector_db(chunks, file_path):
#     global vector_db

#     # Generate a unique document ID
#     doc_id = get_document_id(file_path)

#     # Check if the document already exists in the DB
#     existing_docs = vector_db.get(ids=[doc_id])
    
#     if existing_docs and existing_docs["documents"]:
#         print("📄 Document already exists. Loading from database...")
#         return vector_db  # Load existing embeddings

#     print("⚡ New document detected. Creating embeddings...")

#     # Store document with unique ID
#     vector_db = Chroma.from_documents(
#         documents=chunks,
#         embedding=OllamaEmbeddings(model="nomic-embed-text"),
#         ids=[doc_id]  # Store document ID in the DB
#     )
#     return vector_db


In [10]:
!pip install "posthog<3.0.0"


Defaulting to user installation because normal site-packages is not writeable
Collecting posthog<3.0.0
  Downloading posthog-2.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading posthog-2.5.0-py2.py3-none-any.whl (36 kB)
Installing collected packages: posthog
  Attempting uninstall: posthog
    Found existing installation: posthog 3.21.0
    Uninstalling posthog-3.21.0:
      Successfully uninstalled posthog-3.21.0
Successfully installed posthog-2.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
# Test the pipeline step by step: load -> split -> embed -> query.
# Relies on load_pdf, split_text, create_vector_db and chat_with_pdf having been
# defined by an earlier cell in this kernel session.
if __name__ == "__main__":
    pdf_path = "nep-6.pdf"  # Change to the actual file path
    
    # Load PDF
    data = load_pdf(pdf_path)
    print("PDF Loaded Successfully")
    
    # Split text
    # NOTE(review): only the first two items returned by load_pdf are split and
    # embedded — presumably to keep the demo fast; answers therefore reflect
    # just that slice of the document. Confirm this limit is intentional.
    chunks = split_text(data[:2])
    print("Text Split into Chunks")
    
    # Create Vector DB (also sets the global vector_db used by chat_with_pdf)
    print(create_vector_db(chunks))
    
    # Test query
    test_question = "What is this document about?"
    # chat_with_pdf returns the answer converted by markdown(), so the printed
    # response contains HTML tags.
    print("Response:", chat_with_pdf(test_question))

PDF Loaded Successfully
Text Split into Chunks
Vector database created successfully
Response: <p>This document appears to be a policy document related to education in India, specifically focusing on higher education. It outlines recommendations for reforming and revitalizing the country's higher education system, with an emphasis on quality, equity, and inclusion. The document discusses various issues facing the current system, such as fragmentation, lack of cognitive skills development, rigid specialization, and limited access, among others. It proposes a comprehensive set of changes to address these challenges and promote a high-quality, inclusive, and holistic higher education system that prepares students for meaningful lives and productive contributions to society.</p>


In [22]:
  # Test query
test_question = "Summarize NEP 2020?"
# Needs the vector database created by the pipeline test cell; otherwise
# chat_with_pdf returns its "No vector database found" error string.
print("Response:", chat_with_pdf(test_question))

Response: <p>Based on the provided context, here is a summary of the National Education Policy 2020 (NEP 2020):</p>
<p>The policy aims to overhaul and re-energize the higher education system in India to overcome existing challenges. The key vision includes:</p>
<ol>
<li>Consolidating into large multidisciplinary universities and colleges with at least one in or near every district, offering medium of instruction or programs in local/Indian languages.</li>
<li>Embracing a more multidisciplinary undergraduate education.</li>
<li>Granting faculty and institutional autonomy.</li>
<li>Revamping curriculum, pedagogy, assessment, and student support for enhanced student experiences.</li>
<li>Reinforcing the integrity of faculty and institutional leadership positions through merit-based appointments and career progression based on teaching, research, and service.</li>
<li>Establishing a National Research Foundation.</li>
</ol>
<p>The policy also highlights several challenges that need to be ad