In [120]:
# Cell 1: Imports and Setup (Run once per session)
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI
import psycopg2
from dotenv import load_dotenv
import os
load_dotenv()
key=os.getenv("OPENAI_API_KEY")
# Database connection parameters
DB_PARAMS = {
    "dbname": "new1",
    "user": "postgres",
    "password": "vidisha",
    "host": "localhost",
    "port": "5432"  # Change to "5433" if your docker-compose.yml uses "5433:5432"
}

# Initialize the embedding model (nomic-embed-text-v1 uses 768 dimensions)
embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={'trust_remote_code': True}
)

# Initialize the LLM (replace with your OpenAI API key)
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
    openai_api_key=key # Replace with your key
)

# Prompt template
prompt_template = PromptTemplate(
    input_variables=["context", "query"],
    template="""
            You are an AI assistant that helps users understand PDF documents. 
            
            **IMPORTANT: Response Format**  
            - If you don't find relevant information: **"No relevant information found in the provided PDFs."**  
            - After stating the sources, provide a detailed and structured answer.
            
            **Rules for Answering:**  
            - Use **only** the provided context to generate responses.   
            - If the answer is **not available**, state: **"Answer is not available in the context."**
            - If the query involves **calculations**, perform them and provide the exact result.    
            - If the query is **unclear**, ask for clarification instead of making assumptions.  
            

    Context:
    {context}

    Query:
    {query}

    Answer:
    """
)

print("Setup complete. Run the next cells to load data and query.")

<All keys matched successfully>


Setup complete. Run the next cells to load data and query.


In [121]:
# Cell 2: Database Connection and Table Setup (Run once, or when resetting the table)
conn = psycopg2.connect(**DB_PARAMS)
cur = conn.cursor()

# Create or reset the documents table (768 dimensions for nomic-embed-text-v1)
cur.execute("""
    DROP TABLE IF EXISTS documents;  -- Remove this line if you want to keep existing data
    CREATE TABLE documents (
        id SERIAL PRIMARY KEY,
        content TEXT NOT NULL,
        embedding VECTOR(768)
    );
""")

# Add an index for efficient semantic search
cur.execute("""
    CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 200);

""")
conn.commit()

print("Database table and index created. Run the next cell to load documents.")

Database table and index created. Run the next cell to load documents.


In [123]:
# Cell 3: Load and Store Documents (Run once per file, or skip if data is already loaded)
from PyPDF2 import PdfReader

file_path = "QUIZ 2017-18.pdf"  # Changed to PDF file

# Load and split the document
reader = PdfReader(file_path)
# Extract text from all pages
text = ""
for page in reader.pages:
    text += page.extract_text()

# Create a document object similar to what TextLoader would produce
from langchain.docstore.document import Document
documents = [Document(page_content=text)]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# Store in the database
# Store in the database with the correct vector format
for doc in texts:
    embedding = embedding_model.embed_query(doc.page_content)
    # Format the embedding as a PostgreSQL-compatible vector
    embedding_str = f"[{','.join(map(str, embedding))}]"

    cur.execute(
        "INSERT INTO documents (content, embedding) VALUES (%s, %s::vector)",
        (doc.page_content, embedding_str)
    )

conn.commit()
print(f"Loaded and stored {len(texts)} chunks.")

Loaded and stored 112 chunks.


In [126]:
# Cell 4: Query and LLM Response (Run this cell repeatedly with different queries)
query = " What is Characterology? "  

# Perform similarity search
query_embedding = embedding_model.embed_query(query)
query_embedding_str =f"[{','.join(map(str, query_embedding))}]"

cur.execute("SET hnsw.ef_search = 100;") 

# Cast the query embedding to VECTOR type in the query
cur.execute("""
    SET hnsw.ef_search = 100;
    SELECT content, embedding <-> %s::vector AS distance
    FROM documents
    ORDER BY embedding <-> %s::vector
    LIMIT 5;
""", (query_embedding_str, query_embedding_str))
similar = cur.fetchall()

# Display results
print("Top 5 similar document chunks:")
for content, distance in similar:
    print(f"Distance: {distance:.4f}\nContent: {content}\n")

# Prepare context for LLM
context = "\n\n".join([content for content, _ in similar])

# Get LLM response
llm_chain = LLMChain(llm=llm, prompt=prompt_template)
response = llm_chain.run({"context": context, "query": query})



Top 5 similar document chunks:
Distance: 1.0211
Content: Abhinav Bindra  
453) When is National Safety Day Observed?      
   4th March  
454) Which is the Highest Plat eau in the World?     
   Pamir ( Tibet ian Plateau)  
455) What is the Expansion of CSIR?       
   Council of Scientific and Industrial Research  
456) Who is the present CEO of Britannia?      
   Varun Berry  
457) What is Characterology?        
   Study of Personal  Character  
458) Who wrote the book, “The race of My Life”?     
   Milkha Singh

Distance: 1.0462
Content: Assam  
106) Which is the Highest Gallantry Award in India?    
   Param Vi r Chakra  
107) When is National Statistics Day Celebrated?     
   29th June 
108) What is the expansion of USB?       
   Universal Serial Bus  
109) Who was the first Indian to become the member of the British 
Parliament?           
   Dadabhai Naoroji  
110) What is the Total Duration of Foot ball Match?     
   2 Hours & 45 Minutes  
111) Who wrot e the book, “Disco

In [127]:
print("LLM Response:")
print(response)

LLM Response:
Sources:
- Context provided

Answer:
Characterology is the study of personal character.


In [None]:
# cur.close()
# conn.close()
# print("Database connection closed.")