In [1]:
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import numpy as np
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import faiss
from langchain.schema import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
import pickle
from dotenv import load_dotenv
import os

In [2]:
load_dotenv()
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

In [3]:
import fitz

def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def chunk_text(text, chunk_size=512):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]


In [4]:
class SentenceTransformerEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents):
        return self.model.encode(documents, convert_to_numpy=True)

    def embed_query(self, query):
        return self.model.encode([query], convert_to_numpy=True)[0]


In [5]:
pdf_text = extract_text_from_pdf('Bhagavad-gita.pdf')
chunks = chunk_text(pdf_text)

In [6]:
chunks[0]

'This free for download ebook is a direct reproduction of the\noriginal bona fide personally approved and blessed by Srila\nPrabhupada.\nThis ebook was made by the official website for Srila\nPrabhupada’s original books:\nKrishnapath.org\nAll the content was directly taken from the original scans, is\nunchanged and intact.\nMore free downloads at:\nwww.krishnapath.org\nBhagavad-gītā \nAs It Is\nBhagavad-gītā \nAs It Is\nCOMPLETE EDITION\nwith original Sanskrit text,\nRoman transliteration, English equivalents,\ntranslation a'

In [8]:
embedding_model = SentenceTransformerEmbedding()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
chunk_embeddings = embedding_model.embed_documents(chunks)

In [12]:
chunk_embeddings[0]

array([-8.60100612e-02, -3.41308825e-02, -1.09034471e-01,  1.55170672e-02,
       -7.33080506e-02,  1.58233894e-03,  9.81454272e-03,  3.76618351e-03,
        1.77386254e-02,  5.39740287e-02,  9.19527262e-02,  2.19920930e-02,
        6.49828538e-02, -4.78510112e-02, -1.57244364e-03, -1.28372200e-02,
       -1.85714662e-02,  8.91471133e-02,  1.05936015e-02, -2.73640174e-02,
       -9.30150598e-03,  8.42301920e-02,  1.85012948e-02, -1.40110897e-02,
        3.88583343e-04,  8.21738446e-04,  3.46470246e-04, -7.73026198e-02,
       -1.98862925e-02, -1.48520116e-02,  2.39372402e-02, -2.09651086e-02,
       -1.80726424e-02, -5.58208628e-03,  3.60445672e-04,  5.61998747e-02,
       -8.81234854e-02,  3.37460004e-02,  3.24368775e-02,  7.85292499e-03,
       -2.67651044e-02, -2.37838570e-02, -2.32703015e-02, -3.44898030e-02,
        4.76781167e-02, -6.97358325e-02, -9.31292474e-02, -9.13184416e-03,
        2.00774893e-02, -3.25640216e-02, -1.21875003e-01, -5.95569611e-02,
       -7.15949535e-02,  

In [13]:
documents = [Document(page_content=chunk) for chunk in chunks]

index = faiss.IndexFlatL2(len(chunk_embeddings[0])) 

index.add(chunk_embeddings)

docstore = InMemoryDocstore({i: documents[i] for i in range(len(documents))})

vector_store = FAISS(
    embedding_function=embedding_model.embed_query,
    index=index,
    docstore=docstore,
    index_to_docstore_id={i: i for i in range(len(documents))},
)



`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [14]:
# Store
import faiss
import pickle

faiss_index_file = 'faiss_index.index'
faiss.write_index(vector_store.index, faiss_index_file)

docstore_file = 'in_memory_docstore.pkl'
with open(docstore_file, 'wb') as f:
    pickle.dump(vector_store.docstore, f)

index_to_docstore_id_file = 'index_to_docstore_id.pkl'
with open(index_to_docstore_id_file, 'wb') as f:
    pickle.dump(vector_store.index_to_docstore_id, f)


In [15]:
# Load
faiss_index_file = 'faiss_index.index'
index = faiss.read_index(faiss_index_file)

docstore_file = 'in_memory_docstore.pkl'
with open(docstore_file, 'rb') as f:
    docstore = pickle.load(f)

index_to_docstore_id_file = 'index_to_docstore_id.pkl'
with open(index_to_docstore_id_file, 'rb') as f:
    index_to_docstore_id = pickle.load(f)

vector_store = FAISS(
    embedding_function=embedding_model.embed_query, 
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [16]:
generation_config = {
  "temperature": 1.0,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  generation_config=generation_config)

def retrieve_relevant_chunks(query, k=5):
    docs = vector_store.similarity_search(query, k=k)
    return " ".join([doc.page_content for doc in docs])

def generate_with_google(context, query):
    response = model.generate_content(f"Context: {context}\nQuery: {query}\nAnswer:")
    return response.text

In [17]:
prompt_template = """Based on the context provided below, answer the following query.

Context: {context}
Query: {query}
Answer:"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "query"])

def run_rag_pipeline(query):
    retrieved_text = retrieve_relevant_chunks(query, k=5)
    response = generate_with_google(retrieved_text, query)
    return response

In [18]:
query = "Summarize what is true power"
final_answer = run_rag_pipeline(query)
print(final_answer)

True power, according to this text, lies in understanding and realizing the Absolute Truth (described in three phases: Brahman, Paramātmā, and Bhagavān).  This understanding frees one from material attachments and the illusory energy, allowing for unwavering devotional service to the Supreme Personality of Godhead, Krishna.  It's not about independence or equal power, but recognition of one's relationship with Krishna as a part and parcel of Him and acting accordingly.  The ultimate power is the Lord's, and living entities are His energies, subject to His control.

