In [27]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain import PromptTemplate, HuggingFaceHub
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.chains import LLMChain
from dotenv import load_dotenv
import os, logging
import streamlit as st

In [3]:
load_dotenv()

True

In [4]:
HUGGINGFACEHUB_API_KEY = os.getenv('HUGGINGFACEHUB_API_KEY')

In [5]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_KEY

In [6]:
loader = PyPDFDirectoryLoader(path='pdfs/',glob="**/*.pdf")
loader

<langchain_community.document_loaders.pdf.PyPDFDirectoryLoader at 0x208532b9810>

In [7]:
pdfs = loader.load()
len(pdfs)

381

In [8]:
pdfs[0]

Document(page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n___________________________________________\nFORM 10-K  \n___________________________________________\n(Mark One)\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2023  \nOR\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from              to             .\nCommission file number: 001-37580  \n___________________________________________\nAlphabet Inc.  \n(Exact name of registrant as specified in its charter)\n___________________________________________\nDelaware 61-1767919\n(State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification No.)\n1600 Amphitheatre Parkway  \nMountain View , CA 94043  \n(Address of principal executive offices, including zip code)\n(650) 253-0000  \n(Registrant\'s telephone number, in

In [9]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=0)
docs = splitter.split_documents(pdfs)

In [10]:
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
# # Initialize ChromaDB vector store
# db = Chroma.from_documents(docs,embeddings,persist_directory='db')
# db

In [12]:
db = Chroma(persist_directory='db',embedding_function=embeddings)
db

<langchain_community.vectorstores.chroma.Chroma at 0x2086febac50>

In [13]:
model_path = "openai-community/gpt2"

llm = HuggingFaceHub(repo_id=model_path,
                    model_kwargs={'temperature': 0.6, 'max_length': 200})
llm

  warn_deprecated(


HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', model_kwargs={'temperature': 0.6, 'max_length': 200})

In [14]:
template = """You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}"""

prompt_template = PromptTemplate(template=template,input_variables=['question'])
prompt_template

PromptTemplate(input_variables=['question'], template='You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}')

In [15]:
retriever = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=llm,prompt=prompt_template)
retriever

MultiQueryRetriever(retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002086FEBAC50>), llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['question'], template='You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: {question}'), llm=HuggingFaceHub(client=<InferenceClient(model='openai-community/gpt2', timeout=None)>, repo_id='openai-community/gpt2', task='text-generation', model_kwargs={'temperature': 0.6, 'max_length': 200}), output_parser=LineListOutputParser()))

In [16]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [17]:
sample_question = "What are the risk factors associated with Google and Tesla?"

unique_documents = retriever.get_relevant_documents(query=sample_question)
print(unique_documents)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: What are the risk factors associated with Google and Tesla?', '', 'Answer: A significant risk factor is the two-factor analysis described in the previous section. The risk factor is how many users are using the service. The risk factor is how many users are using the service at a given time. The risk factor is how many users are using the service at a given time. The risk factor is how many users are using the service at a given time. The risk factor is how many users are using the service at a given time. The risk factor is']


[Document(page_content="YouTube provides people with entertainment, information, and opportunities to learn something new. Google Assistant \noffers the best way to get things done seamlessly across different devices, providing intelligent help throughout a \nperson's day, no matter where they are. Google Cloud helps customers solve today’s business challenges, improve \nproductivity, reduce costs, and unlock new growth engines. We are continually innovating and building new products \nand features that will help our users, partners, customers, and communities and have invested more than $150 billion \nin research and development in the last five years in support of these efforts .\nMaking AI H elpful for Everyone\nAI is a transformational technology that can bring meaningful and positive change to people and societies across \nthe world, and for our business. At Google, we have been bringing AI into our products and services for more than a", metadata={'page': 4, 'source': 'pdfs\\goog

In [18]:
sample_question = "What is the total revenue for Google Search?"

unique_documents = retriever.get_relevant_documents(query=sample_question)
print(unique_documents)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: What is the total revenue for Google Search? Answer: $11.3 billion. What is the total revenue for Google Search? $11.3 billion. Question: What is the total revenue for Uber? Answer: $7 billion. What is the total revenue for Tesla? Answer: $5.4 billion. What is the total revenue for Uber? Answer: $4.4 billion. Question: What is the total revenue for Uber? Answer: $3.9 billion. What is the total revenue for Uber? Answer: $']


[Document(page_content='Costs and Expenses\nCost of Revenues\nThe following table presents cost of revenues, including TAC (in millions, except percentages):\n Year Ended December 31,\n 2021 2022 2023\nTAC $ 45,566 $ 48,955 $ 50,886 \nOther cost of revenues  65,373  77,248  82,446 \nTotal cost of revenues $ 110,939 $ 126,203 $ 133,332 \nTotal cost of revenues as a percentage of revenues  43 %  45 %  43 %\nCost of revenues increased  $7.1 billion  from 2022  to 2023  due to an increase  in other cost of revenues and TAC \nof $5.2 billion  and $1.9 billion , respectively.\nThe increase  in TAC from 2022  to 2023  was largely due to an increase in TAC paid to distribution partners, \nprimarily driven by growth in revenues subject to TAC. The TAC rate decreased from 21.8%  to 21.4%  from 2022  to \n2023  primarily due to a revenue mix shift from Google Network properties to Google Search & other properties. The \nTAC rate on Google Search & other revenues and the TAC rate on Google Network

In [19]:
sample_question = "What are the differences in the business of Tesla and Uber?"

unique_documents = retriever.get_relevant_documents(query=sample_question)
print(unique_documents)

INFO:langchain.retrievers.multi_query:Generated queries: ['You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: What are the differences in the business of Tesla and Uber? Answer: Uber is a startup that uses its own software to build its own vehicles. Tesla uses the same software to build its own vehicles. Uber uses the same software to build its own vehicles. Tesla has its own software to build its own vehicles. Tesla has its own software to build its own vehicles. There are many different ways to choose to build your own vehicles.', '', "Since you can't build your own car, you can't build your own software. So why do you need to know"]


[Document(page_content='UBER TECHNOLOGIES, INC.\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\nNote 1 – Description of Business and Summary of Significant Accounting Policies\nDescription of Business\nUber Technologies, Inc. (“Uber,” “we,” “our,” or “us”) was incorporated in Delaware in July 2010, and is headquartered in San Francisco, California. Uber is a\ntechnology platform that uses a massive network, leading technology, operational excellence and product expertise to power movement from point A to point B.\nUber develops and operates proprietary technology applications supporting a variety of offerings on its platform (“platform(s)” or “Platform(s)”). Uber connects\nconsumers (“Rider(s)”) with independent providers of ride services (“Mobility Driver(s)”) for ridesharing services, and connects Riders and other consumers\n(“Eaters”) with restaurants, grocers and other stores (collectively, “Merchants”) with delivery service providers (“Couriers”) for meal preparation, grocery and', m

In [20]:
# Chain
llm_chain = LLMChain(llm=llm,prompt=prompt_template)

In [21]:
output = llm_chain(inputs={
    "question": sample_question,
    "context": "\n-------------\n".join([d.page_content for d in unique_documents])
})
output['text']

  warn_deprecated(


"You are an AI assistant. You have access to the content of several PDF documents from Google, Uber and Tesla. Compare the information from these documents to answer the following question: Question: What are the differences in the business of Tesla and Uber? Answer: Uber is a startup that uses its own software to build its own vehicles. Tesla uses the same software to build its own vehicles. Uber uses the same software to build its own vehicles. Tesla has its own software to build its own vehicles. Tesla has its own software to build its own vehicles. There are many different ways to choose to build your own vehicles.\n\nSince you can't build your own car, you can't build your own software. So why do you need to know"

In [25]:
output['text'].split('Answer: ')[1]

"Uber is a startup that uses its own software to build its own vehicles. Tesla uses the same software to build its own vehicles. Uber uses the same software to build its own vehicles. Tesla has its own software to build its own vehicles. Tesla has its own software to build its own vehicles. There are many different ways to choose to build your own vehicles.\n\nSince you can't build your own car, you can't build your own software. So why do you need to know"

In [26]:
st.title("Content Engine for Comparing PDF Documents")

query = st.text_input("Enter your question:")

if query:
    docs = retriever.get_relevant_documents(query=query)
    output = llm_chain(inputs={
        "question": query,
        "context": "\n---------------\n".join([d.page_content for d in docs])
    })
    response = str(output['text'].split('Answer: ')[1])
    st.write("Response:")
    st.write(response)

2024-06-27 22:18:43.361 
  command:

    streamlit run C:\Users\ADMIN\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-06-27 22:18:43.388 Session state does not function when running a script without `streamlit run`
