In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI 
from langchain.chains import RetrievalQA
from langchain.llms import OpenAIChat
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PagedPDFSplitter
from langchain.document_loaders import DirectoryLoader,PagedPDFSplitter

In [1]:
import os
# Credentials are taken from the process environment (export them in your shell
# or a .env loader before starting the kernel) -- never paste secrets into the
# notebook. `setdefault` only fills in an empty placeholder when a variable is
# missing, so a real key that is already exported is no longer overwritten,
# while later `os.environ[...]` lookups still find both names.
for _secret_name in ('OPENAI_API_KEY', 'ACTIVELOOP_TOKEN'):
    os.environ.setdefault(_secret_name, '')

In [5]:
import requests
import tqdm
from typing import List


# ANZ annual reports (2022, 2021, 2020); any list of PDF URLs can be used instead.
urls = [
    "https://www.anz.com/content/dam/anzcom/shareholder/2022-anz-annual-report.pdf",
    "https://www.anz.com/content/dam/anzcom/shareholder/2021-annual-report-double-page-view.pdf",
    "https://www.anz.com/content/dam/anzcom/shareholder/ANZ-2020-Annual-Report.pdf",
]

def load_reports(urls: List[str], timeout: float = 60.0) -> List:
    """Download every PDF in ``urls`` and return all of their pages.

    Each URL is fetched with ``requests``, written to the current working
    directory under the last ``/``-separated segment of the URL, and then
    split with ``PagedPDFSplitter(path).load_and_split()``.

    Args:
        urls: URLs that point at PDF files.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the cell
            indefinitely.

    Returns:
        The objects produced by ``load_and_split()`` for every PDF,
        concatenated in the order of ``urls`` (these are loader page
        objects, not plain strings). Empty list when ``urls`` is empty.

    Raises:
        requests.HTTPError: If a download answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    pages = []

    for url in tqdm.tqdm(urls):
        response = requests.get(url, timeout=timeout)
        # Stop here on a failed download; otherwise the server's error page
        # would be saved as a ".pdf" and only fail later inside the parser.
        response.raise_for_status()

        # Local file name = last path segment of the URL.
        path = url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.content)

        loader = PagedPDFSplitter(path)
        pages.extend(loader.load_and_split())
    return pages

# Download and split every report listed in `urls`.
pages = load_reports(urls=urls)


100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.88s/it]


In [7]:
# Cut the loaded pages into chunks (chunk_size=1000, no overlap).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(pages)

# Embedding function handed to the vector store.
embeddings = OpenAIEmbeddings()

# Deep Lake dataset that stores the chunks; the Activeloop token is read from
# the environment.
db = DeepLake(
    dataset_path="hub://l60jsrwc/anz_annual_reports",
    embedding_function=embeddings,
    token=os.environ['ACTIVELOOP_TOKEN'],
)

# Last expression of the cell, so the value returned by add_documents is shown
# as the cell output.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# dataset a second time -- confirm before re-executing.
db.add_documents(texts)


Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


|

Dataset(path='hub://l60jsrwc/anz_annual_reports', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape       dtype  compression
  -------    -------     -------     -------  ------- 
 embedding  embedding  (1203, 1536)  float32   None   
    id        text      (1203, 1)      str     None   
 metadata     json      (1203, 1)      str     None   
   text       text      (1203, 1)      str     None   


 

['ccddf619-13a5-11ee-b60b-e848b8c82000',
 'ccddf61a-13a5-11ee-98a7-e848b8c82000',
 'ccddf61b-13a5-11ee-b082-e848b8c82000',
 'ccddf61c-13a5-11ee-b8f9-e848b8c82000',
 'ccddf61d-13a5-11ee-9aea-e848b8c82000',
 'ccddf61e-13a5-11ee-8297-e848b8c82000',
 'ccddf61f-13a5-11ee-9092-e848b8c82000',
 'ccddf620-13a5-11ee-bdca-e848b8c82000',
 'ccddf621-13a5-11ee-a0b8-e848b8c82000',
 'ccddf622-13a5-11ee-9f19-e848b8c82000',
 'ccddf623-13a5-11ee-99ea-e848b8c82000',
 'ccddf624-13a5-11ee-bac4-e848b8c82000',
 'ccddf625-13a5-11ee-b229-e848b8c82000',
 'ccddf626-13a5-11ee-b51c-e848b8c82000',
 'ccddf627-13a5-11ee-9369-e848b8c82000',
 'ccddf628-13a5-11ee-bdf0-e848b8c82000',
 'ccddf629-13a5-11ee-9e80-e848b8c82000',
 'ccddf62a-13a5-11ee-96ea-e848b8c82000',
 'ccddf62b-13a5-11ee-a556-e848b8c82000',
 'ccddf62c-13a5-11ee-b910-e848b8c82000',
 'ccddf62d-13a5-11ee-ba1e-e848b8c82000',
 'ccddf62e-13a5-11ee-8dfa-e848b8c82000',
 'ccddf62f-13a5-11ee-a04c-e848b8c82000',
 'ccddf630-13a5-11ee-88e9-e848b8c82000',
 'ccddf631-13a5-

In [17]:
# Question-answering chain: a gpt-3.5-turbo chat model answering over the
# documents returned by the Deep Lake retriever.
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model='gpt-3.5-turbo'),
    chain_type='stuff',
    retriever=db.as_retriever(),
)


In [19]:
# Ask the chain a question; as the last expression of the cell, the returned
# answer is displayed as the cell output.
qa.run(
    "Describe ANZ's financial performance in 2020."
)


"ANZ's financial performance in 2020 'Met Expectations' when considering the objectives they set themselves. The net interest margin for the Australia Retail and New Zealand divisions were 2.59% and 2.26% respectively, while the operating expenses to operating income ratio was 45.1% for Australia Retail and 44.8% for New Zealand. ANZ demonstrated appropriate responses to the pandemic, supporting their customers and people while remaining well-managed, including through the demonstration of strong financial discipline. The cash profit from continuing operations in 2020 was $3.7 billion, compared to $6.5 billion in 2019. Overall, ANZ's performance was slightly below expectations, with areas for improvement in customer focus and slower than expected progress in regards to building a Group-wide non-financial risk framework."