In [1]:
import os
import pinecone 
import openai
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

  from tqdm.autonotebook import tqdm


In [2]:
PINECONE_API_KEY = "PINECONE_API_KEY"
PINECONE_ENV = "PINECONE_ENV"
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"

In [3]:
loader = DirectoryLoader(
    'Data/', # my local directory
    glob='**/*.pdf',     # we only get pdfs
    show_progress=True
)
docs = loader.load()

  0%|          | 0/29 [00:00<?, ?it/s]

100%|██████████| 29/29 [01:56<00:00,  4.03s/it]


In [4]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=0
)
docs_split = text_splitter.split_documents(docs)
#docs_split

Created a chunk of size 1294, which is longer than the specified 1000
Created a chunk of size 3329, which is longer than the specified 1000
Created a chunk of size 2874, which is longer than the specified 1000
Created a chunk of size 1748, which is longer than the specified 1000
Created a chunk of size 1031, which is longer than the specified 1000
Created a chunk of size 2521, which is longer than the specified 1000
Created a chunk of size 3362, which is longer than the specified 1000
Created a chunk of size 2018, which is longer than the specified 1000
Created a chunk of size 1996, which is longer than the specified 1000
Created a chunk of size 3949, which is longer than the specified 1000
Created a chunk of size 5521, which is longer than the specified 1000
Created a chunk of size 2088, which is longer than the specified 1000
Created a chunk of size 1209, which is longer than the specified 1000
Created a chunk of size 1306, which is longer than the specified 1000
Created a chunk of s

In [5]:
# we use the openAI embedding model
embeddings = OpenAIEmbeddings()
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

In [7]:
# it cost almost 0,10$
doc_db = Pinecone.from_documents(
    docs_split, 
    embeddings, 
    index_name='langchain-test'
)

In [8]:
# it cost almost 0,035$
query_ex1 = "What were the most important events for Google in 2021?"
search_docs = doc_db.similarity_search(query_ex1)
search_docs

[Document(page_content='To our investors,\n\n2022 was a year full of change and uncertainty around the world. In February, when war broke out in Ukraine, our teams worked around-the-clock to make sure our products were helpful to people who needed them, from providing trustworthy information on Search to disrupting cyberattacks to partnering with the government to deploy air raid alerts. In March, I traveled to Warsaw, Poland, where I met Googlers hosting families who sought refuge, talked with entrepreneurs using our office spaces, and saw how our products like Google Translate were helping Ukrainians find a bit of hope and connection.\n\nshared new generative models, including Imagen, our text-to-image model, and Phenaki, which can generate long, coherent videos from text prompts.', metadata={'source': 'Data/2022-alphabet-annual-report.pdf'}),
 Document(page_content='remain critical. Our products give people choice and help them find high-quality journalism — from international stori

In [9]:
# it cost almost 0,035$
query_ex2 = "How much benefits made Google en 2021?"
search_docs = doc_db.similarity_search(query_ex2)
search_docs

[Document(page_content='Google Search & other Google Search & other revenues increased $5,947 million from 2019 to 2020. The overall growth was primarily driven by interrelated factors including increases in search queries resulting from ongoing growth in user adoption and usage, primarily on mobile devices, growth in advertiser spending primarily in the second half of the year, and improvements we have made in ad formats and delivery. This increase was partially offset by a decline in advertiser spending primarily in the first half of the year driven by the impact of COVID-19.', metadata={'source': 'Data/2020-alphabet-annual-report.pdf'}),
 Document(page_content='Google Services total\n\n237,529\n\n253,528\n\nGoogle Cloud\n\n19,206\n\n26,280\n\nOther Bets\n\n753\n\n1,068\n\nHedging gains (losses)\n\n149\n\n1,960\n\nTotal revenues\n\n$\n\n257,637 $\n\n282,836\n\nGoogle Services\n\nGoogle advertising revenues\n\nGoogle Search & other\n\nGoogle Search & other revenues increased $13.5 bil

In [13]:
### use a LLM
from langchain import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [11]:
llm = OpenAI()
# or
# llm = ChatOpenAI()


In [14]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type='stuff',
    retriever=doc_db.as_retriever(),
)

In [15]:
query_q1 = "What were the earnings in 2022?"
result = qa.run(query_q1)

In [16]:
result

' The earnings in 2022 were $74,842 million.'