In [1]:
import numpy as np
import pandas as pd
from langchain.llms import GooglePalm
from langchain.prompts import PromptTemplate
from langchain.embeddings import GooglePalmEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders.pdf import OnlinePDFLoader, PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory

In [3]:
import dotenv

# Load variables from the local .env file into the process environment
# before any Google client is constructed (presumably this is where the
# API key comes from - confirm against the .env file).
dotenv.load_dotenv()

# Embedding model used to vectorise the document chunks.
embedding = GooglePalmEmbeddings()
# Text-generation model that answers the questions.
llm = GooglePalm(temperature=0.5)

In [None]:
# Prerequisite (run once, in the kernel's environment): %pip install pypdf

In [10]:
# Load the sample PDF and display how many documents the loader produced.
loader = PyPDFLoader(file_path="../music.pdf")
data = loader.load()
len(data)

6

In [15]:
# Split the loaded documents into 500-character chunks with a 100-character
# overlap, trying the separators in the order listed.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_size=500,
    chunk_overlap=100,
)
docs = splitter.split_documents(data)

In [19]:
# Embed every chunk and build the FAISS index used for retrieval.
vectordb = FAISS.from_documents(documents=docs, embedding=embedding)

In [80]:
# Conversation memory: reads the user turn from the "question" input and
# exposes past turns to the prompt under the "history" variable.
memory = ConversationBufferMemory(
    input_key="question",
    memory_key="history",
    return_messages=True,
)

In [81]:
# Prompt text for the QA chain. It has three placeholders: {context},
# {history} and {question}.
base_prompt = '''You are a AI assistant of pdf's. You are supposed to answer related to data you have provided. If you don't know just say I don't know, don't make things up. If user greets you by formal hello or by telling their name, greet as well. Your name is PdfGPT. 
This is the context {context} 
This is the history {history}
This is the question : {question}
'''

In [82]:
# Wrap the raw prompt text in a template that declares its three variables.
prompt = PromptTemplate(
    input_variables=["context", "history", "question"],
    template=base_prompt,
)

In [83]:
# Retrieval QA chain of type "stuff" over the FAISS retriever. The custom
# prompt and the conversation memory are handed over through
# chain_type_kwargs, and the chain reads the user turn from "question".
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": prompt, "memory": memory},
    input_key="question",
)

In [84]:
def ask_question(q):
    """Run the notebook-level ``chain`` on one question.

    Args:
        q: The question text, sent to the chain under the "question" key.

    Returns:
        Whatever the chain returns when called with
        ``return_only_outputs=True``.
    """
    payload = {"question": q}
    outputs = chain(payload, return_only_outputs=True)
    return outputs

In [85]:
ask_question("Hello")["result"]

'Hello! How can I help you today?'

In [86]:
ask_question("My name is Paras")["result"]

'Hello Paras! How can I help you today?'

In [87]:
ask_question("What is your name ?")["result"]

'I am PdfGPT, an AI assistant that helps you with your PDFs.'

In [89]:
ask_question("What is user name ?")["result"]

"The user's name is Paras."

In [90]:
ask_question("What are frequency based features of audio")["result"]

'Mel-spectrogram and tempogram are two frequency based features of audio. Mel-spectrogram is a 2D representation of the frequency spectrum of a sound signal. It is computed by taking the short-time Fourier transform (STFT) of the audio signal and then applying a mel-scale filter bank to the resulting spectrum. The mel-scale filter bank is a set of overlapping band-pass filters that are logarithmically spaced in mel frequency. This results in a spectrogram that is more sensitive to the frequencies that are important for human hearing.\n\nTempogram is a 1D representation of the tempo of a sound signal. It is computed by taking the autocorrelation of the audio signal and then applying a low-pass filter to the resulting autocorrelation function. The low-pass filter removes the high-frequency components of the autocorrelation function, which correspond to the transients in the audio signal. This leaves a smooth curve that represents the tempo of the audio signal.\n\nBoth mel-spectrogram and

In [1]:
import os
import dotenv
from pypdf import PdfReader
from io import BytesIO
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.embeddings.google_palm import GooglePalmEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.document_loaders.pdf import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_core.runnables import RunnablePassthrough


dotenv.load_dotenv()

class BinaryPDFRecursiveSplitter:
    """Split the text of an in-memory PDF into chunks.

    The PDF arrives as raw bytes. The text of every page is extracted and
    concatenated, then handed to a ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, separators=None, chunk_size=1200, chunk_overlap=200):
        """Create the underlying text splitter.

        Args:
            separators: Separators to try, in order. ``None`` selects the
                default ``["\\n\\n", "\\n", ".", " "]``.
            chunk_size: Chunk size passed to the splitter.
            chunk_overlap: Chunk overlap passed to the splitter.
        """
        # A fresh list per instance: a list literal used as the default value
        # would be one object shared by every call.
        if separators is None:
            separators = ["\n\n", "\n", ".", " "]
        self.splitter = RecursiveCharacterTextSplitter(
            separators=list(separators),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def split_pdf(self, stream):
        """Extract the text of every page and split it into chunks.

        Args:
            stream: The PDF file content as a bytes-like object.

        Returns:
            The list returned by the splitter's ``split_text`` for the
            concatenated page texts.
        """
        # The buffer and reader are kept local so the uploaded bytes are not
        # retained on the splitter after the call returns.
        reader = PdfReader(BytesIO(stream))
        # Pages are concatenated without a separator. ``or ""`` is defensive,
        # in case a page yields no text object at all.
        full_text = "".join(page.extract_text() or "" for page in reader.pages)
        return self.splitter.split_text(full_text)

class PdfGPT:
    """Question answering over uploaded PDFs, with one FAISS index per chat.

    Every chat id maps to an index folder ``./faiss_indexes/<chat_id>``. A
    chat with no folder yet is seeded from ``sample_vectordb``, a one-entry
    store holding the assistant's self-description.
    """

    def __init__(self):
        """Initialise the attributes and build the models via ``create_llm_model``."""
        self.llm = None
        self.embedding = None
        self.sample_vectordb = None
        self.prompt = None
        self.create_llm_model()

    def create_llm_model(self, google_api_key=None):
        """Create the LLM, the embedding model, the seed store and the prompt.

        Args:
            google_api_key: API key for the Gemini model. ``None`` reads
                ``GOOGLE_API_KEY`` from the environment at call time.

        Raises:
            KeyError: No key was passed and ``GOOGLE_API_KEY`` is not set.
        """
        # Resolved here rather than in the signature: a default value is
        # evaluated once, when the class body runs, which would fail at
        # definition time without the variable and ignore later changes.
        if google_api_key is None:
            google_api_key = os.environ["GOOGLE_API_KEY"]

        self.llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=google_api_key, temperature=0.5)
        self.embedding = SentenceTransformerEmbeddings()

        # Seed content copied to disk for every chat that has no index yet.
        self.sample_vectordb = FAISS.from_texts(texts=["Your name is PdfGPT. You are an AI assistant of pdf's. You are developed by Paras Punjabi. You can help users to extract data from pdf's and help them by answering some questions related to that pdf. You are supposed to answer related to data you have provided. If you don't know just say I don't know and justify your answer with generic reason, don't make things up or throw an unknown error. If user greets you by formal hello or by telling their name, greet as well. History of chat will be provided with the question only. Understand that chat history which is between you and the client and answer according to that chat and data you have been trained."], embedding=self.embedding)

        base_prompt = '''Your name is PdfGPT. You are an AI assistant of pdf's. You can help users to extract data from pdf's and help them by answering some questions related to that pdf. You are supposed to answer related to data you have provided. If you don't know just say I don't know and justify your answer with generic reason, don't make things up or throw an unknown error. If user greets you by formal hello or by telling their name, greet as well. History of chat will be provided with the question only. Understand that chat history which is between you and the client and answer according to that chat and data you have been trained.
        This is the context {context} 
        This is the question : {question}
        Provide only answer, don't use any prefixes.
        '''

        self.prompt = PromptTemplate(template=base_prompt, input_variables=["context", "question"])

    @staticmethod
    def _index_folder(chat_id):
        """Return the folder that holds the FAISS index of ``chat_id``."""
        # NOTE(review): chat_id goes into the path unchecked. If it can come
        # from a client, validate it before it reaches this class.
        return f"./faiss_indexes/{chat_id}"

    def add_binary_pdf(self, stream, chat_id, filename):
        """Index a PDF given as raw bytes into the chat's store and save it.

        Args:
            stream: The PDF content as bytes.
            chat_id: Chat whose index receives the chunks.
            filename: Name recorded in a marker entry ahead of the chunks.
        """
        vectordb = self.load_vectordb_from_local(chat_id)
        splitter = BinaryPDFRecursiveSplitter()
        docs = splitter.split_pdf(stream)

        vectordb.add_texts([f"The pdf file name is {filename} and content of pdf is below"])
        vectordb.add_texts(docs)
        self.save_vectordb(vectordb, chat_id)

    def add_pdf(self, pdf_path, chat_id):
        """Index a PDF file from disk into the chat's store and save it.

        Args:
            pdf_path: Path of the PDF file; its base name is recorded in a
                marker entry ahead of the chunks.
            chat_id: Chat whose index receives the chunks.
        """
        vectordb = self.load_vectordb_from_local(chat_id)
        filename = os.path.basename(pdf_path)
        loader = PyPDFLoader(file_path=pdf_path, extract_images=True)
        data = loader.load()
        splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", ".", " "], chunk_size=1200, chunk_overlap=200)
        docs = splitter.split_documents(data)

        vectordb.add_texts([f"The pdf file name is {filename} and content of pdf is below"])
        vectordb.add_documents(docs)

        self.save_vectordb(vectordb, chat_id)

    def save_vectordb(self, vectordb, chat_id):
        """Write ``vectordb`` to the index folder of ``chat_id``."""
        vectordb.save_local(folder_path=self._index_folder(chat_id))

    def load_vectordb_from_local(self, chat_id):
        """Load the chat's FAISS store, seeding it first if it does not exist.

        Args:
            chat_id: Chat whose index is loaded.

        Returns:
            The store loaded from the chat's index folder.
        """
        folder = self._index_folder(chat_id)
        if not os.path.exists(folder):
            # First use of this chat: persist the seed store, then load it
            # below. Loading once (no recursive retry) means a save that
            # produced nothing surfaces as a load error, not endless recursion.
            self.save_vectordb(self.sample_vectordb, chat_id)

        # SECURITY: allow_dangerous_deserialization=True makes the load
        # unpickle data from disk. Only point this at index folders written
        # by this application.
        return FAISS.load_local(folder_path=folder, embeddings=self.embedding, allow_dangerous_deserialization=True)

    def rename_chat_id(self, old_chat_id, new_chat_id):
        """Move a chat's index folder from ``old_chat_id`` to ``new_chat_id``."""
        os.rename(self._index_folder(old_chat_id), self._index_folder(new_chat_id))

    def ask_question(self, chat_id, q, chat_history_str=""):
        """Answer ``q`` from the chat's index.

        Args:
            chat_id: Chat whose index supplies the context.
            q: The user's question.
            chat_history_str: Optional chat history appended to the question.
                Empty or ``None`` means no history.

        Returns:
            ``{"result": <answer>, "status": True}`` on success. On any
            exception the error is printed and
            ``{"result": "Sorry, I don't know", "status": False}`` is returned.
        """
        try:
            vectordb = self.load_vectordb_from_local(chat_id)

            chain = (
                {"context": vectordb.as_retriever(), "question": RunnablePassthrough()}
                | self.prompt
                | self.llm
            )

            # Truthiness instead of len(): a None history is treated as
            # "no history" rather than raising and producing a failure reply.
            if chat_history_str:
                q = q + f"\nThis is the history : {chat_history_str}"

            return {"result": chain.invoke(q), "status": True}

        except Exception as e:
            # Deliberate best effort: callers get a fixed fallback answer and
            # a False status instead of an exception.
            print(e)
            return {"result": "Sorry, I don't know", "status": False}

In [2]:
m = PdfGPT()

In [5]:
m.ask_question(chat_id="letter", q="What is the total ctc")

{'result': "I don't know. This document does not contain any information about total ctc.",
 'status': True}

In [40]:
m.add_pdf(chat_id="resume", pdf_path="./Paras_Punjabi_Resume.pdf")

In [43]:
m.add_pdf(chat_id="resume", pdf_path="./tcs_mail.pdf")

In [6]:
m.ask_question(chat_id="resume", q="What is two stream network ")

{'result': 'Two Stream Network is a model that will detect activities performed by humans in that video.',
 'status': True}

In [77]:
db = m.load_vectordb_from_local("letter")

In [58]:
# Concatenate the page_content of every document in the store, in the
# iteration order of index_to_docstore_id. One join replaces repeated
# string += inside a loop.
text = "".join(
    db.docstore.search(doc_id).page_content
    for doc_id in db.index_to_docstore_id.values()
)

{0: 'fde4af6e-82de-4029-962e-3ec48facf15b',
 1: 'd689f51b-a90b-465d-99d7-27d522b86970',
 2: '3ff32b5d-0220-4873-8cf8-ad6b6f045a3a',
 3: '6ccc2d99-03ae-427a-9759-ed2a49ac826b',
 4: '4a0dcc85-9599-4f9a-b530-a703942fb835',
 5: 'e67e83a4-9405-4da0-b723-51f700e57d35',
 6: 'd02a3455-36a7-4a28-bd9c-0fc39a628735',
 7: 'f382b069-483f-4471-b570-cfee8bd51d13',
 8: 'f2d0ba56-650f-422d-87c4-3844a2ed43ee',
 9: '87f903a5-2e0f-47e6-8444-92c1a1785ad1',
 10: '9256faa3-3c03-4d9d-a73d-f58f8bff87db',
 11: '8d721bb9-9c6b-4def-a977-3b1d89d600c6',
 12: '4d211ff9-3d95-4992-8e23-67500c6fd792',
 13: 'dbb44d3c-fba6-4f31-a31e-c799c1522088',
 14: '64f674fa-bdea-4144-bb18-243f955b9497',
 15: '8d1d6d05-b419-4696-b27c-377617266416',
 16: 'f1559415-4c9b-47a5-b018-b919b5610d3c',
 17: 'b0eb68a4-8440-48d6-a44a-70afe8636f35',
 18: '2efec435-a361-40bc-94e2-7a1af2284179',
 19: '3426968f-496b-49cf-b2dd-ae36c3cefcb8',
 20: '03278e9e-d5e1-495d-bdef-3ba317bdb2ba',
 21: '8ef9e35e-be53-46de-95d0-3f1f826015d9',
 22: 'f74e93ab-2001-

In [7]:
m.ask_question(chat_id="test", q="What can you do for me ?")

{'result': "I can help you to extract data from pdf's and help you by answering some questions related to that pdf.",
 'status': True}