In [None]:
# =====================Imported Packages====================== #
import os
from getpass import getpass

import psycopg2
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from pypdf import PdfReader as DocumentLoader
from sentence_transformers import SentenceTransformer
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# ========================Variables=========================== #
# Module-level page counter (zero-based), initialised to the first page.
index: int = 0
# Location of the PDF to ingest.
# NOTE(review): machine-specific absolute path; consider making it configurable.
document_path: str = '/Volumes/Data/books/An-Autobiography.pdf'

# Set the embedding model globally; createEmbeddings() reads it back from
# Settings.embed_model, so this assignment must run before any embedding call.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
) # The table's embedding column is sized for 384 dimensions, so this model is expected to emit 384-dim vectors (TODO confirm)


# Disabled alternative embedding backend, kept for reference.
# NOTE(review): the model name below contains a trailing space, and the
# "768 dimensions" figure should be verified against the model card before
# re-enabling, because the table column is sized for 384 dimensions.
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2 ") # Sentence Transformer uses 768 dimensions

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

# =========================Functions========================== #
def documnetPageIndex(document_reader):
  """
  Return the zero-based index of every page in the uploaded document.

  Args:
    document_reader: Object exposing a sized ``pages`` attribute
      (only ``len(document_reader.pages)`` is used).

  Returns:
    list[int]: ``[0, 1, ..., total_pages - 1]``; an empty list when the
    document has no pages.
  """

  # Get 'total-size' of uploaded document
  total_pages: int = len(document_reader.pages)

  # Build the list from a local range instead of advancing a module-level
  # counter: a shared counter is never reset, so every call after the first
  # (re-running the cell, indexing a second document) would return [].
  return list(range(total_pages))

# Helper: called by saveDocumentContent() and by the query cell to embed text
def createEmbeddings(text):
  """
  Embed `text` with the model registered on the global LlamaIndex `Settings`.

  Args:
    text: The string to embed.

  Returns:
    Whatever `Settings.embed_model.get_text_embedding` returns for `text`.
  """

  # Look the model up at call time so a later change to Settings.embed_model
  # is picked up without redefining this function.
  embedder = Settings.embed_model
  vector = embedder.get_text_embedding(text)
  return vector

def createDbConfiguration(host: str, username: str, password: str, database: str):
  """
  Bundle database connection parameters into a dictionary.

  Args:
    host: Database server host name.
    username: Login name; stored under the key ``"user"``.
    password: Login password.
    database: Name of the database to connect to.

  Returns:
    dict: Keys ``"host"``, ``"user"``, ``"database"`` and ``"password"``.
  """

  configuration = dict(
    host=host,
    user=username,
    database=database,
    password=password,
  )
  return configuration

def createTable(table_name: str, cursor, connection, embedding_dim: int = 384):
    """
    Create the content/embedding table if it does not exist yet.

    Args:
        table_name: Name of the table. It is interpolated directly into the
            SQL text, so it must be a trusted identifier, never user input.
        cursor: DB-API cursor used to execute the statement.
        connection: Connection owning `cursor`; committed on success and
            rolled back on failure.
        embedding_dim: Size of the `embedding` vector column. Defaults to 384
            (the previous hard-coded value) and must match the dimension of
            the embedding model in use.

    Raises:
        ValueError: If `embedding_dim` is not a positive integer.
        Exception: Any error raised by `cursor.execute` / `connection.commit`
            is re-raised after the transaction has been rolled back.
    """

    # Validate before formatting: the value ends up inside the SQL text.
    if isinstance(embedding_dim, bool) or not isinstance(embedding_dim, int) or embedding_dim <= 0:
        raise ValueError(f"embedding_dim must be a positive integer, got {embedding_dim!r}")

    create_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id SERIAL PRIMARY KEY,
        page_number INT,
        content_type TEXT,  -- 'Text' or 'Image'
        content TEXT,       -- Either actual text or file path to image
        embedding VECTOR({embedding_dim})  -- Assuming pgvector is installed; dimension must match the embedding model
    );
    """
    try:
        cursor.execute(create_query)
        connection.commit()
    except Exception:
        # A failed statement leaves the transaction aborted; roll back so the
        # connection stays usable for the caller, then surface the error.
        connection.rollback()
        raise

# Helper: called by saveDocumentContent() to persist one page
def saveToDatabase(table_name:str, page_number:int, content_type:str, content:str, embeddings, cursor, connection):
  """
  Insert one piece of extracted content and its embedding into `table_name`.

  Args:
    table_name: Target table. It is interpolated directly into the SQL text,
      so it must be a trusted identifier; the row values are bound as
      parameters.
    page_number: Page number stored with the row.
    content_type: Label for the kind of content (callers pass "Text").
    content: The content itself.
    embeddings: Embedding vector for `content`.
    cursor: DB-API cursor used to execute the INSERT.
    connection: Connection owning `cursor`; committed on success and rolled
      back on failure.

  Returns:
    bool: Always True when the row has been committed.

  Raises:
    Exception: Any error from `cursor.execute` / `connection.commit` is
      re-raised after the transaction has been rolled back.
  """
  query = f"""
  INSERT INTO {table_name} (page_number, content_type, content, embedding)
  VALUES (%s, %s, %s, %s);
  """
  try:
    cursor.execute(query, (page_number, content_type, content, embeddings))
    connection.commit()
  except Exception:
    # Without a rollback the connection stays in an aborted transaction and
    # every later statement on it fails until someone rolls back.
    connection.rollback()
    raise
  return True

def saveDocumentContent(index_array, document_reader, table_name, cursor, connection):
  """
  Extract, embed and store the text of each listed page.

  For every zero-based index in `index_array`:
    1. Extract the page text from `document_reader.pages[index]`.
    2. Skip the page when it has no non-whitespace text.
    3. Collapse the text to a single space-separated line.
    4. Embed it with `createEmbeddings` and store it through
       `saveToDatabase` as content type "Text" under page number
       `index + 1`.

  Any exception stops the loop and is reported with a printed message
  rather than propagated. Returns None.
  """

  try:
    for page_index in index_array:
      raw_text = document_reader.pages[page_index].extract_text()

      # Nothing to store for pages without extractable text.
      if not raw_text or not raw_text.strip():
        continue

      # Trim every line, drop the empty ones, and join with single spaces.
      trimmed_lines = (line.strip() for line in raw_text.split('\n'))
      flattened = ' '.join(piece for piece in trimmed_lines if piece)

      vector = createEmbeddings(flattened)
      # Stored page numbers are one-based.
      saveToDatabase(table_name, page_index + 1, "Text", flattened, vector, cursor, connection)

    print("All pages processed and saved successfully....!")

  except Exception as e:
    print(f"⚠️ Error while processing pages: {e}")

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #

In [38]:
# ===========================Usage============================ #
table_name = "chatbot_mahatma_gandhi_autobiography"

# Connection settings are read from the PGHOST / PGUSER / PGDATABASE /
# PGPASSWORD environment variables, falling back to the previous local
# defaults. The password is never stored in the notebook: when PGPASSWORD is
# not set it is prompted for without echo.
DB_Configuration = createDbConfiguration(
    os.environ.get("PGHOST", "localhost"),
    os.environ.get("PGUSER", "postgres"),
    os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    os.environ.get("PGDATABASE", "llm_chatbot"),
)
connection = psycopg2.connect(
    host=DB_Configuration["host"],
    user=DB_Configuration["user"],
    password=DB_Configuration["password"],
    database=DB_Configuration["database"]
)

# The cursor and connection stay open on purpose: the query cells further
# down reuse `dbCursor` and `connection`.
dbCursor = connection.cursor()
document_reader = DocumentLoader(document_path)

# Ingest: list the pages, make sure the table exists, then embed and store
# each page's text.
index_array = documnetPageIndex(document_reader)
createTable(table_name, dbCursor, connection)
saveDocumentContent(index_array, document_reader, table_name, dbCursor, connection)

All pages processed and saved successfully....!


In [50]:
from ollama import chat, ChatResponse

# print formatted boxed headings
def print_boxed_text(heading: str, content: str, width: int = 50):
    """
    Print `content` inside an ASCII box with a centred `heading`.

    Each line of `content` is wrapped on its own, so line breaks in the input
    (for example the blank line between numbered context chunks) survive in
    the box instead of being merged into one paragraph. Runs of blank lines
    collapse to a single blank row, and leading/trailing blank lines are
    dropped.

    Args:
        heading: Title shown centred on the first row of the box.
        content: Text to display; may contain newlines.
        width: Total width of the box in characters, borders included.

    Returns:
        None. The box is written to standard output.
    """
    import textwrap

    inner_width = width - 2  # space between the two border characters
    text_width = width - 4   # inner width minus one padding space per side

    border = "+" + "-" * inner_width + "+"
    empty_line = "|" + " " * inner_width + "|"

    # format heading centered
    heading_line = "|{:^{width}}|".format(heading, width=inner_width)

    # textwrap.wrap() turns newlines into plain spaces, so wrap line by line
    # and re-insert a single blank row wherever the input had blank lines.
    wrapped = []
    for paragraph in content.splitlines():
        pieces = textwrap.wrap(paragraph, width=text_width)
        if pieces:
            wrapped.extend(pieces)
        elif wrapped and wrapped[-1] != "":
            wrapped.append("")
    if wrapped and wrapped[-1] == "":
        wrapped.pop()

    content_lines = ["| {:<{width}} |".format(line, width=text_width) for line in wrapped]

    # assemble the box
    print(border)
    print(heading_line)
    print(empty_line)
    for line in content_lines:
        print(line)
    print(border)

def fetchTopVectorSimilarResults(search_embedding, table_name, k_results, cursor, connection):
  """
  Fetch the `k_results` rows of `table_name` closest to `search_embedding`.

  Rows are ordered by `embedding <=> search_embedding` (ascending) and each
  carries `1 - distance` as its similarity. `table_name` is interpolated
  directly into the SQL text and must be a trusted identifier; the embedding
  and the limit are bound as query parameters.

  Returns:
    The list of `(page_number, content, similarity)` rows from
    `cursor.fetchall()`. On any error the transaction is rolled back, the
    error is printed, and an empty list is returned.
  """
  sql_text = f"""
  SELECT page_number, content, 1 - (embedding <=> %s::vector) AS similarity
  FROM {table_name}
  ORDER BY embedding <=> %s::vector
  LIMIT %s;
  """
  # The embedding is bound twice: once for the similarity column and once
  # for the ORDER BY expression.
  bind_values = (search_embedding, search_embedding, k_results)

  try:
    cursor.execute(sql_text, bind_values)
    return cursor.fetchall()
  except Exception as sql_error:
    connection.rollback()
    print("❌ SQL Error:", sql_error)
    return []

def generateLlmResponse(top_results, user_prompt, set_similarity_threshold, llm_model):
    """
    Answer `user_prompt` with the LLM, using the retrieved rows as context.

    Args:
        top_results: Iterable of `(page_number, content, similarity)` rows.
        user_prompt: The user's question.
        set_similarity_threshold: Minimum value the best similarity must
            reach; below it the LLM is not called at all.
        llm_model: Model name passed to `chat`.

    Returns:
        str: The model's answer, or a fixed "out of context" message when the
        best similarity is below the threshold (including when there are no
        rows, in which case the best similarity counts as 0.0).

    Side effects:
        Prints the retrieved context, the question, the answer and the
        similarity summary (or the out-of-context notice).
    """
    retrieved = list(top_results)

    similarity_scores = [score for _page, _text, score in retrieved]
    # Chunks are numbered from 1 in retrieval order.
    formatted_chunks = [
        f"{position}. {text.strip()}"
        for position, (_page, text, _score) in enumerate(retrieved, start=1)
    ]

    max_similarity = max(similarity_scores, default=0.0)

    # Guard: refuse early when even the best match is too weak.
    if max_similarity < set_similarity_threshold:
        print(f"❌ Out of context. max similarity: {max_similarity:.4f} < threshold {set_similarity_threshold}")
        return "The question appears to be out of context based on the document content."

    joined_chunks = "\n\n".join(formatted_chunks)

    final_prompt = f"""
    You are an assistant answering questions using ONLY the provided context below.
    If the answer is not contained in the context, respond with "I don't know based on the provided context."

    === CONTEXT ===
    {joined_chunks}

    === QUESTION ===
    {user_prompt}

    Please provide your answer, and explicitly state whether you used the context or not.
    """

    # Single-turn conversation: the whole prompt is sent as one user message.
    reply = chat(model=llm_model, messages=[{'role': 'user', 'content': final_prompt}])
    answer = reply['message']['content']

    print_boxed_text("Retrieved Context", joined_chunks, width=100)
    print_boxed_text("User Question: ", user_prompt, width=100)
    print_boxed_text("Answer", answer, width=100)
    print(f"(Max similarity: {max_similarity:.4f}, Threshold: {set_similarity_threshold})")

    return answer

# LLM model name handed to generateLlmResponse()
model="mistral"

# User input: read the question interactively
user_prompt = input("Ask your question: ")

# Embed the question with the same model used for the stored pages
prompt_embedding = createEmbeddings(user_prompt)

# Fetch the 10 most similar results from DB (k_results=10).
# Relies on `table_name`, `dbCursor` and `connection` from the ingest cell.
top_results = fetchTopVectorSimilarResults(prompt_embedding, table_name, 10, dbCursor, connection)

# LLM response; 0.47 is the minimum best-match similarity required before the LLM is called
response_by_llm = generateLlmResponse(top_results=top_results, user_prompt=user_prompt, set_similarity_threshold=0.47, llm_model=model)

+--------------------------------------------------------------------------------------------------+
|                                        Retrieved Context                                         |
|                                                                                                  |
| 1. An Autobiography or My Experiments with Truth www.mkgandhi.org  Page 438 8. LAKSHMAN JHULA It |
| was a positive relief to reach the Gurukul and meet Mahatma Munshiramji with his giant frame. I  |
| at once felt the wonderful contrast between the peace of the Gurukul and the din and noise of    |
| Hardvar. The Mahatma overwhelmed me with affection. The Brahmacharis were all attention. It was  |
| here that I was first introduced to Acharya Ramadevji, and I could immediately see what a force  |
| and a power he must be. We had different viewpoints in several matters, nevertheless our         |
| acquaintance soon ripened into friendship. I had long discussions with Acharya Ramadevji 

In [48]:
# Re-run retrieval and generation for the current question. `prompt_embedding`,
# `user_prompt`, `model`, `table_name`, `dbCursor` and `connection` must
# already exist in the kernel.
# Fetch the 10 most similar results from DB (k_results=10)
top_results = fetchTopVectorSimilarResults(prompt_embedding, table_name, 10, dbCursor, connection)

# LLM response; 0.47 is the minimum best-match similarity required before the LLM is called
response_by_llm = generateLlmResponse(top_results=top_results, user_prompt=user_prompt, set_similarity_threshold=0.47, llm_model=model)

# print_boxed_text("User Question: ", user_prompt, width=100)
# print_boxed_text("LLM model: Mistral", response_by_llm, width=100)

+--------------------------------------------------------------------------------------------------+
|                                        Retrieved Context                                         |
|                                                                                                  |
| 1. An Autobiography or My Experiments with Truth www.mkgandhi.org  Page 22 1. BIRTH AND          |
| PARENTAGE The Gandhis belong to the Bania caste and seem to have been originally grocers. But    |
| for three generations, from my grandfather, they have been Prime Ministers in several Kathiawad  |
| States. Uttamchand Gandhi, alias Ota Gandhi, my grandfather, must have been a man of principle.  |
| State intrigues compelled him to leave Porbandar, where he was Diwan, and to seek refuge in      |
| Junagadh. There he saluted the Nawab with the left hand. Someone, noticing the apparent          |
| discourtesy, asked for an explanation, which was given thus : ‘The right hand is already 

In [28]:
# Close the database connection opened in the ingest cell. Run this last:
# the query cells use `connection` and `dbCursor` and need it open.
connection.close()

In [None]:
# generateResponse(top_results, user_prompt, 0.5)
# for result in top_results:
#     page_number, content, similarity = result
#     print(f"\n[Page {page_number}] (Score: {similarity:.4f})\n{content[:300]}...")

In [None]:
# Format the top chunks (numbered)
# formatted_chunks = "\n\n".join([f"{i+1}. {chunk}" for i, chunk in enumerate(top_chunks)])

# if not user_prompt:
#     print_boxed_text("Error", "You must enter a valid question.", width=60)
# else:
#     # Build the complete prompt
#     final_prompt = f"""You are an assistant. Use the following context from a PDF document to answer the user's question.

# === CONTEXT ===
# {formatted_chunks}

# === QUESTION ===
# {user_prompt}

# Provide a helpful and accurate answer based on the above context.
# """

#     # Call the LLM via Ollama
#     from ollama import chat
#     response = chat(model="llama3.2:1b", messages=[
#         {'role': 'user', 'content': final_prompt}
#     ], stream=True)

#     # Collect and format the response
#     llm_response = ""
#     for chunk in response:
#         llm_response += chunk["message"]["content"]

#     print_boxed_text("LLM Response", llm_response, width=100)