In [2]:
import re
import os
import glob
import pandas as pd
import datetime
from sec_api import QueryApi, RenderApi

from llama_index import (
    download_loader,
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
    ListIndex,
    LLMPredictor,
    load_graph_from_storage,
)
from langchain.chains.conversation.memory import ConversationBufferMemory
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
from llama_index.langchain_helpers.agents import (
    LlamaToolkit,
    create_llama_chat_agent,
    IndexToolConfig,
)
from langchain import OpenAI
from llama_index.indices.composability import ComposableGraph
from pathlib import Path

from llama_index import (
    download_loader,
    ServiceContext,
    GPTVectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
    LLMPredictor,
)
from langchain.llms.openai import OpenAIChat

from langchain.chat_models import ChatOpenAI

from pathlib import Path
from dotenv import load_dotenv

In [3]:
# Directory holding the SEC 10-K HTML files. Defaults to the original local
# location; set the SEC_10K_DIR environment variable to run on another machine.
SEC_10K_DIR = os.environ.get(
    "SEC_10K_DIR",
    "/Users/rouzbeh/value-cumulation/value-cumulation-api/chat_finance/data/sec10K",
).rstrip("/")

# Paths of the Apple (AAPL) and Intel (INTC) filings inside that directory.
sec_path_apple = f"{SEC_10K_DIR}/tmp_AAPL.html"
sec_path_intel = f"{SEC_10K_DIR}/tmp_INTC.html"

In [8]:
# Configure LLM services.
# gpt-3.5-turbo is a chat model, so it is wrapped with ChatOpenAI instead of the
# completion-style OpenAI class. max_tokens is left at its default (no explicit
# cap), which matches the intent of the previous max_tokens=-1 setting.
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)



In [9]:
# Document loader: obtain the UnstructuredReader loader class through
# download_loader and create one instance of it for the cells below.
# NOTE(review): refresh_cache=True presumably bypasses any cached copy and
# re-fetches the loader on every run — confirm whether that is still needed.
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)
loader = UnstructuredReader()

[nltk_data] Downloading package punkt to /Users/rouzbeh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rouzbeh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
# Parse both filings with the shared loader; split_documents=True makes the
# loader return a list of documents per filing.
document_apple = loader.load_data(split_documents=True, file=sec_path_apple)
document_intel = loader.load_data(split_documents=True, file=sec_path_intel)

# Report how many documents each filing was split into.
print(
    f'Apple length is {len(document_apple)}',
    f'Intel length is {len(document_intel)}',
    sep='\n',
)

Apple length is 2748
Intel length is 5467


In [13]:
# Build one vector index per filing. The service_context configured earlier in
# the notebook is passed in explicitly; without it the indices (and the query
# engines derived from them) fall back to the library-default LLM and the
# configured gpt-3.5-turbo predictor is never used.
apple_index = GPTVectorStoreIndex.from_documents(document_apple, service_context=service_context)
print(f"finished building for apple. Length of the document is {len(apple_index.docstore.docs)}")
intel_index = GPTVectorStoreIndex.from_documents(document_intel, service_context=service_context)
print(f"finished building for intel. Length of the document is {len(intel_index.docstore.docs)}")


finished building for apple. Length of the document is 2748
finished building for intel. Length of the document is 5467


- **TODO:** Fix the HTML index creation process. It takes too long because the documents are too long. I need to find a way to shorten the documents and strip out the meaningless HTML tags.

In [14]:
# Derive a query engine from each filing index, both with similarity_top_k=3.
apple_engine, intel_engine = (
    filing_index.as_query_engine(similarity_top_k=3)
    for filing_index in (apple_index, intel_index)
)

In [15]:
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine

# Expose each per-company query engine as a named tool; the description is the
# text attached to the tool's metadata alongside its name.
query_engine_tools = [
    QueryEngineTool(
        query_engine=company_engine,
        metadata=ToolMetadata(name=tool_name, description=tool_description),
    )
    for company_engine, tool_name, tool_description in (
        (
            apple_engine,
            "apple_10k",
            "provides information about apple's financials in 2022 fiscal year",
        ),
        (
            intel_engine,
            "intel_10k",
            "provides information about intel's financials in 2022 fiscal year",
        ),
    )
]

# Combine the two tools into a single sub-question query engine.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
)


In [41]:
# Preparation for running queries: apply nest_asyncio's patch to asyncio.
# NOTE(review): presumably needed so the sub-question query engine can run its
# async work inside the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()


In [52]:
# Read the contents of the 10-K data directory and build a vector index from
# the first loaded document only.
# NOTE(review): `dir` shadows the Python builtin of the same name for the rest
# of the kernel session, and the value is a machine-specific absolute path.
dir = "/Users/rouzbeh/value-cumulation/value-cumulation-api/chat_finance/data/sec10K"
documents = SimpleDirectoryReader(dir).load_data()
# Show how many documents the reader produced.
print(len(documents))
# Only documents[0] is indexed here; the remaining documents are not used.
this_index = GPTVectorStoreIndex.from_documents([documents[0]])



7
