In [2]:
# download files
# !mkdir data
# !wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
# !unzip data/UBER.zip -d data

In [3]:
# set text wrapping
from IPython.display import HTML, display

def set_css(*_event_args):
  """Inject a <style> block so that <pre> output wraps instead of overflowing.

  Registered below as a `pre_run_cell` callback, so it runs before each cell.
  Positional arguments are accepted and ignored: IPython may invoke the hook
  with an execution-info object, and a plain `set_css()` call still works.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Hook the CSS injection into the kernel so it is applied before every cell run.
get_ipython().events.register('pre_run_cell', set_css)

In [4]:
import os
# Keep a key that is already exported in the environment; only fall back to the
# empty placeholder when nothing is set. Unconditionally assigning "" here would
# wipe out a valid key on every "Run All". Provide the real key outside the
# notebook (e.g. `export OPENAI_API_KEY=...`) rather than pasting it into this cell.
os.environ.setdefault('OPENAI_API_KEY', "")

In [6]:
from llama_index import download_loader, GPTSimpleVectorIndex, ServiceContext
from pathlib import Path

### Ingest Unstructured Data Through the Unstructured.io Reader

We use Unstructured.io's HTML parsing to read each filing.
The reader is downloaded through LlamaHub.

In [7]:
# Fiscal years to ingest. Each value selects the input file
# ./data/UBER/UBER_{year}.html and the on-disk index index_{year}.json in later cells.
years = [2022, 2021, 2020, 2019]

In [8]:
# Fetch the UnstructuredReader loader class by name via download_loader.
# NOTE(review): refresh_cache=True presumably bypasses any cached copy and
# re-downloads the loader on every run — confirm, and drop it if a cached copy is fine.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

Collecting unstructured
  Downloading unstructured-0.5.8.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting argilla
  Downloading argilla-1.5.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting msg_parser
  Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openpyxl
  Downloading openpyxl-3.1.2-py2.

[?25hCollecting rfc3986[idna2008]<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: unstructured, python-docx, python-pptx, olefile
  Building wheel for unstructured (setup.py): started
  Building wheel for unstructured (setup.py): finished with status 'done'
  Created wheel for unstructured: filename=unstructured-0.5.8-py3-none-any.whl size=1319517 sha256=67785c57f4fe67c8032e8a458275327b49793459d65150084610968cc8c06021
  Stored in directory: /Users/raziel/Library/Caches/pip/wheels/80/cf/56/f90e201059fcd421ae55f0ecaed2

In [9]:
loader = UnstructuredReader()

# Parse one HTML filing per year, keyed by year (insertion order follows `years`).
doc_set = {
    year: loader.load_data(file=Path(f'./data/UBER/UBER_{year}.html'), split_documents=False)
    for year in years
}

# Tag every document with the fiscal year of the filing it came from.
for year, year_docs in doc_set.items():
    for d in year_docs:
        d.extra_info = {"year": year}

# Flat list of all documents, in the same year order as doc_set.
all_docs = [d for year_docs in doc_set.values() for d in year_docs]

[nltk_data] Downloading package punkt to /Users/raziel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/raziel/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...


### Set up a Vector Index for each SEC filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into the vector store.

In [10]:
# Build one simple vector index per filing year, caching each one on disk.
# Building an index embeds the whole filing (roughly 230-260k embedding tokens per
# year in the recorded output), so a year whose index_{year}.json already exists
# is loaded from disk instead of being rebuilt. Delete the JSON file to force a
# rebuild, e.g. after changing chunk_size_limit or the source documents.
index_set = {}
service_context = ServiceContext.from_defaults(chunk_size_limit=512)
for year in years:
    index_path = Path(f'index_{year}.json')
    if index_path.exists():
        # cached from a previous run: no embedding calls needed
        cur_index = GPTSimpleVectorIndex.load_from_disk(str(index_path))
    else:
        cur_index = GPTSimpleVectorIndex.from_documents(doc_set[year], service_context=service_context)
        cur_index.save_to_disk(str(index_path))
    index_set[year] = cur_index
    

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 232797 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 241424 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 257154 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 246480 tokens


In [11]:
# Rebuild index_set purely from the index_{year}.json files written earlier.
index_set = {
    year: GPTSimpleVectorIndex.load_from_disk(f'index_{year}.json')
    for year in years
}

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [13]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.indices.composability import ComposableGraph

In [14]:
# set summary text for each doc
# One summary string per entry of `years`, in the same order; passed as
# `index_summaries` when the graph is composed over the per-year indices.
index_summaries = [f"UBER 10-k Filing for {year} fiscal year" for year in years]

In [15]:
# set number of output tokens
# The predictor wraps an OpenAI LLM configured with temperature=0 and max_tokens=512.
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
# NOTE: this rebinds `service_context`, replacing the chunk_size_limit=512 context
# created in the index-building cell. From here on the name refers to the
# LLM-predictor context used for the graph.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [16]:
# define a list index over the vector indices
# allows us to synthesize information across each index
# The child indices and index_summaries are both ordered by `years`, so
# index_summaries[i] describes index_set[years[i]].
graph = ComposableGraph.from_indices(
    GPTListIndex, 
    [index_set[y] for y in years], 
    index_summaries=index_summaries,
    service_context=service_context
)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [17]:
# Persist the composed graph to 10k_graph.json in the working directory.
graph.save_to_disk('10k_graph.json')

In [18]:
# Reload the graph from 10k_graph.json, passing the current service_context again.
# NOTE(review): presumably the service context is not stored in the JSON file — confirm.
graph = ComposableGraph.load_from_disk('10k_graph.json', service_context=service_context)

## Setting up the Chatbot Agent

We use LangChain to define the outer chatbot abstraction, and LlamaIndex as a core Tool within that abstraction.

In [19]:
from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent

from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, GraphToolConfig

In [21]:
# Query transform applied to the vector sub-indices when the graph is queried.
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform

decompose_transform = DecomposeQueryTransform(llm_predictor, verbose=True)

# Per-index-type query settings for the graph, one entry per index struct type.
query_configs = [
    # vector ("simple_dict") sub-indices: top-1 retrieval, with the query
    # passed through the decompose transform ("include_summary" left disabled)
    dict(
        index_struct_type="simple_dict",
        query_mode="default",
        query_kwargs=dict(similarity_top_k=1),
        query_transform=decompose_transform,
    ),
    # root list index: tree-summarize the sub-index answers, verbosely
    dict(
        index_struct_type="list",
        query_mode="default",
        query_kwargs=dict(response_mode="tree_summarize", verbose=True),
    ),
]

In [22]:
# define toolkit
# One tool per filing year. The years come from `years` (sorted ascending, the
# same order the previous hard-coded range produced), so the tools always match
# the keys of index_set instead of a separately maintained range.
index_configs = []
for y in sorted(years):
    tool_config = IndexToolConfig(
        index=index_set[y],
        name=f"Vector Index {y}",
        description=f"useful for when you want to answer queries about the {y} SEC 10-K for Uber",
        index_query_kwargs={"similarity_top_k": 3},
        tool_kwargs={"return_direct": True}
    )
    index_configs.append(tool_config)

# graph config: a single tool over the composed graph for cross-year questions
graph_config = GraphToolConfig(
    graph=graph,
    name="Graph Index",
    description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber.",
    query_configs=query_configs,
    tool_kwargs={"return_direct": True}
)

# Bundle the per-year tools and the graph tool for the chat agent.
toolkit = LlamaToolkit(
    index_configs=index_configs,
    graph_configs=[graph_config]
)

In [23]:
# Conversation memory stored under the "chat_history" key.
memory = ConversationBufferMemory(memory_key="chat_history")
# LLM that drives the agent itself (temperature=0).
llm=OpenAI(temperature=0)
# Chat agent built over the toolkit's index/graph tools; verbose=True is passed
# so the agent's intermediate steps are printed.
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
    verbose=True
)

In [24]:
# Plain greeting; in the recorded run the agent replied without using a tool.
agent_chain.run(input="hi, i am bob")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? No
AI: Hi Bob, nice to meet you! How can I help you today?[0m

[1m> Finished chain.[0m


'Hi Bob, nice to meet you! How can I help you today?'

In [16]:
# Single-year question; in the recorded run the agent chose the "Vector Index 2020" tool.
agent_chain.run(input="What were some of the biggest risk factors in 2020 for Uber?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Vector Index 2020
Action Input: Risk Factors[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 2006 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 3 tokens



Observation: [33;1m[1;3m

Risk Factors

The COVID-19 pandemic and the impact of actions to mitigate the pandemic has adversely affected and continues to adversely affect our business, financial condition, and results of operations. The risks associated with the pandemic include, but are not limited to, the classification of Drivers as independent contractors, the economic impact of the pandemic, the potential cascading effects of the pandemic on our business, financial performance, and results of operations, the practices necessitated by the outbreak and related governmental actions, the increase in remote working and the associated privacy, cybersecurity and fraud risks, the legal and regulatory challenges associated with the pandemic, the workforce reductions announced in May 2020, the launch of new, or expansion of existing, services, features, or health and safety requirements in response to COVID-19, and the potential fines or other enforcement measures that could adversely imp

'\n\nRisk Factors\n\nThe COVID-19 pandemic and the impact of actions to mitigate the pandemic has adversely affected and continues to adversely affect our business, financial condition, and results of operations. The risks associated with the pandemic include, but are not limited to, the classification of Drivers as independent contractors, the economic impact of the pandemic, the potential cascading effects of the pandemic on our business, financial performance, and results of operations, the practices necessitated by the outbreak and related governmental actions, the increase in remote working and the associated privacy, cybersecurity and fraud risks, the legal and regulatory challenges associated with the pandemic, the workforce reductions announced in May 2020, the launch of new, or expansion of existing, services, features, or health and safety requirements in response to COVID-19, and the potential fines or other enforcement measures that could adversely impact our financial resu

In [31]:
# Cross-year question, intended to exercise the graph tool rather than a single-year index.
cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."
agent_chain.run(input=cross_query_str)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Graph Index
Action Input: Compare/contrast the risk factors described in the Uber 10-K across years.[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2022 fiscal year?
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2022 fiscal year?
[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 964 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens


[36;1m[1;3m> Got response: 
The risk factors described in the Uber 10-K for the 2022 fiscal year include: the potential for changes in the classification of Drivers, the potential for increased competition, the potential for...
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2021 fiscal year?
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2021 fiscal year?
[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 590 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens


[36;1m[1;3m> Got response: 
1. The COVID-19 pandemic and the impact of actions to mitigate the pandemic have adversely affected and may continue to adversely affect parts of our business.

2. Our business would be adversely ...
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2020 fiscal year?
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2020 fiscal year?
[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 516 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens


[36;1m[1;3m> Got response: 
The risk factors described in the Uber 10-K for the 2020 fiscal year include: the timing of widespread adoption of vaccines against the virus, additional actions that may be taken by governmental ...
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2019 fiscal year?
[0m[33;1m[1;3m> Current query: Compare/contrast the risk factors described in the Uber 10-K across years.
[0m[38;5;200m[1;3m> New query:  What are the risk factors described in the Uber 10-K for the 2019 fiscal year?
[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 1020 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 18 tokens
INFO:gpt_index.indices.common.tree.base:> Building index from nodes: 0 chunks


[36;1m[1;3m> Got response: 
Risk factors described in the Uber 10-K for the 2019 fiscal year include: competition from other transportation providers; the impact of government regulations; the impact of litigation; the impac...
[0m

INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 7039 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 72 tokens



Observation: [36;1m[1;3m
In 2020, the risk factors included the timing of widespread adoption of vaccines against the virus, additional actions that may be taken by governmental authorities, the further impact on the business of Drivers, Merchants, consumers, and business partners, and the legal challenges to the classification of Drivers as independent contractors. In 2019, the risk factors included competition from other transportation providers, the impact of government regulations, the impact of litigation, the impact of negative publicity, the impact of changes in tax laws, the impact of changes in labor laws, the impact of changes in technology, the impact of cybersecurity risks, the impact of changes in economic conditions, the impact of changes in the financial markets, and the impact of changes in the competitive landscape. In 2022, the risk factors include the potential for changes in the classification of Drivers, the potential for increased competition, the potential for

'\nIn 2020, the risk factors included the timing of widespread adoption of vaccines against the virus, additional actions that may be taken by governmental authorities, the further impact on the business of Drivers, Merchants, consumers, and business partners, and the legal challenges to the classification of Drivers as independent contractors. In 2019, the risk factors included competition from other transportation providers, the impact of government regulations, the impact of litigation, the impact of negative publicity, the impact of changes in tax laws, the impact of changes in labor laws, the impact of changes in technology, the impact of cybersecurity risks, the impact of changes in economic conditions, the impact of changes in the financial markets, and the impact of changes in the competitive landscape. In 2022, the risk factors include the potential for changes in the classification of Drivers, the potential for increased competition, the potential for changes in laws and regu

### Set Up a Chatbot Loop Within the Notebook

We'll keep a running loop so that you can converse with the agent. 

In [39]:
# reinitialize agent
# A new ConversationBufferMemory is created, so the chat history starts empty.
memory = ConversationBufferMemory(memory_key="chat_history")
llm=OpenAI(temperature=0)
# Unlike the earlier agent cell, verbose is not passed here.
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
)

In [None]:
# Minimal chat REPL: read a line, send it to the agent, print the reply.
# Type "exit" or "quit" to stop. Interrupting the kernel or sending EOF while
# the prompt is waiting also ends the loop cleanly instead of raising.
while True:
    try:
        text_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        break
    if text_input.strip().lower() in {"exit", "quit"}:
        break
    response = agent_chain.run(input=text_input)
    print(f'Agent: {response}')
    

User:  What were some of the legal proceedings against Uber in 2022?


INFO:gpt_index.token_counter.token_counter:> [query] Total LLM token usage: 1957 tokens
INFO:gpt_index.token_counter.token_counter:> [query] Total embedding token usage: 6 tokens


Agent: 

In 2022, legal proceedings against Uber include a motion to compel arbitration, an appeal of a ruling that Proposition 22 is unconstitutional, a complaint alleging that drivers are employees and entitled to protections under the wage and labor laws, a summary judgment motion, allegations of misclassification of drivers and related employment violations in New York, fraud related to certain deductions, class actions in Australia alleging that Uber entities conspired to injure the group members during the period 2014 to 2017 by either directly breaching transport legislation or commissioning offenses against transport legislation by UberX Drivers in Australia, and claims of lost income and decreased value of certain taxi. Additionally, Uber is facing a challenge in California Superior Court alleging that Proposition 22 is unconstitutional, and a preliminary injunction order prohibiting Uber from classifying Drivers as independent contractors and from violating various wage and h