In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from pinecone import Pinecone

In [2]:
DOT_ENV_PATH = find_dotenv()
load_dotenv(DOT_ENV_PATH)

True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2")
PINECONE_INDEX_NAME = "equity-sense-vdb"
LANGCHAIN_PROJECT="defaultv2"
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

### Helper Function

In [4]:
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

### Instantiate LLM Model

In [5]:
# Instantiate LLM Model
llm = ChatOpenAI()

### Instantiate PineCone Client

In [6]:
# Declare PineCone Client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Get Index Object
index = pc.Index(PINECONE_INDEX_NAME)

### Load Embedding Model

In [7]:
# Initialize embedding model
embeddings = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY, 
    model='text-embedding-ada-002'
)

### Load VectorStore

In [8]:
# Create VectorStore
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [9]:
retriever = vector_store.as_retriever(k=10)

### Add Self-Querying Capabilities to Retriever

In [11]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

financial_metadata = \
    [
    AttributeInfo(
        name="ticker_id",
        description="Stock ticker symbol of the company",
        type="string",
    ),
    AttributeInfo(
        name="source",
        description="Source file of the financial data",
        type="string",
    )]

financial_content_description = "Financial statements and metrics for various companies in the Stock Market by year"

self_querying_retriever = SelfQueryRetriever(
    llm=llm,
    vector_store=vector_store,
    document_content_description=financial_content_description,
    metadata_field_info=financial_metadata,
)

KeyError: 'vectorstore'

### Add ReRanking to Retriever

In [12]:
compressor = CohereRerank(
    cohere_api_key=COHERE_API_KEY,
    model='rerank-english-v3.0',
    top_n=3
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

#### Test Traditional Retriever

In [13]:
retriever_docs = retriever.invoke(
    "What are META reported Operating Income?"
)
pretty_print_docs(retriever_docs)

Document 1:

The Meta Inc Total Operating Income As Reported For the date 2023-12-31 was 46.751 Billions USD. For the date 2022-12-31 was 28.944 Billions USD. For the date 2021-12-31 was 46.753 Billions USD. For the date 2020-12-31 was 32.671 Billions USD. For the date 2019-12-31 was nan Billions USD. 
----------------------------------------------------------------------------------------------------
Document 2:

The Meta Inc Operating Income For the date 2023-12-31 was 46.751 Billions USD. For the date 2022-12-31 was 28.944 Billions USD. For the date 2021-12-31 was 46.753 Billions USD. For the date 2020-12-31 was 32.671 Billions USD. For the date 2019-12-31 was nan Billions USD. 
----------------------------------------------------------------------------------------------------
Document 3:

The Meta Inc Operating Revenue For the date 2023-12-31 was 133.844 Billions USD. For the date 2022-12-31 was 115.801 Billions USD. For the date 2021-12-31 was 117.208 Billions USD. For the date 2

#### Test Re-Ranker Retriever

In [14]:
compressed_docs = compression_retriever.invoke(
    "What are NVIDIA reported Operating Income?"
)
pretty_print_docs(compressed_docs)

Document 1:

The NVIDIA Total Operating Income As Reported For the date 2024-01-31 was 32.972 Billions USD. For the date 2023-01-31 was 4.224 Billions USD. For the date 2022-01-31 was 10.041 Billions USD. For the date 2021-01-31 was 4.532 Billions USD. 
----------------------------------------------------------------------------------------------------
Document 2:

The NVIDIA Operating Income For the date 2024-01-31 was 32.972 Billions USD. For the date 2023-01-31 was 5.577 Billions USD. For the date 2022-01-31 was 10.041 Billions USD. For the date 2021-01-31 was 4.532 Billions USD. 
----------------------------------------------------------------------------------------------------
Document 3:

The NVIDIA Operating Revenue For the date 2024-01-31 was 60.922 Billions USD. For the date 2023-01-31 was 26.974 Billions USD. For the date 2022-01-31 was 26.914 Billions USD. For the date 2021-01-31 was 16.675 Billions USD. 


### Load Parser

In [15]:
parser = StrOutputParser()

### Create Prompt

In [16]:
system_prompt = (
    """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, say that you don't know. Use three sentences maximum and keep the answer concise. \n\n
    Context: {context}"""
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(compression_retriever, question_answer_chain)



#### Test Prompt:

In [17]:
response = rag_chain.invoke({"input": "Which year does NVIDIA has the highest Net Income?"})
response["answer"]

'NVIDIA had the highest Net Income in the year 2024 with a total of 29.76 Billion USD.'

In [18]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, compression_retriever, contextualize_q_prompt
)

In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [20]:
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [21]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []

question = "What is META Net Income?"
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

print(ai_msg_1["answer"] + "\n\n")

second_question = "Order these income from highest to lowest?"
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})

print(ai_msg_2["answer"])

The Meta Inc Net Income for 2023 was 39.098 Billion USD, for 2022 was 23.2 Billion USD, for 2021 was 39.37 Billion USD, and for 2020 was 29.146 Billion USD. The Net Income figures were not available for the year 2019.


The Net Income figures for Meta Inc from highest to lowest are as follows: 
- 2023: 39.098 Billion USD
- 2021: 39.37 Billion USD
- 2020: 29.146 Billion USD
- 2022: 23.2 Billion USD
- 2019: Not available


### Add Memory/Persistence to RAG System

In [22]:
from typing import Sequence

from langchain_core.messages import BaseMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, StateGraph
from langgraph.graph.message import add_messages
from typing_extensions import Annotated, TypedDict


# We define a dict representing the state of the application.
# This state has the same input and output keys as `rag_chain`.
class State(TypedDict):
    input: str
    chat_history: Annotated[Sequence[BaseMessage], add_messages]
    context: str
    answer: str

# We then define a simple node that runs the `rag_chain`.
# The `return` values of the node update the graph state, so here we just
# update the chat history with the input message and response.
def call_model(state: State):
    response = rag_chain.invoke(state)
    return {
        "chat_history": [
            HumanMessage(state["input"]),
            AIMessage(response["answer"]),
        ],
        "context": response["context"],
        "answer": response["answer"],
    }


# Our graph consists only of one node:
workflow = StateGraph(state_schema=State)
workflow.add_edge(START, "model")
workflow.add_node("model", call_model)

# Finally, we compile the graph with a checkpointer object.
# This persists the state, in this case in memory.
memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

In [23]:
config = {"configurable": {"thread_id": "abc125"}}

result = app.invoke(
    {"input": "What is NVIDIA Net Income?"},
    config=config,
)
print(result["answer"])

NVIDIA's Net Income for 2024-01-31 was 29.76 Billion USD, for 2023-01-31 was 4.368 Billion USD, for 2022-01-31 was 9.752 Billion USD, and for 2021-01-31 was 4.332 Billion USD.


In [24]:
result = app.invoke(
    {"input": "Order NVIDIA income from highest to lowest?"},
    config=config,
)
print(result["answer"])

The order of NVIDIA's income from highest to lowest is as follows: 2024-01-31 with 29.76 Billion USD, 2022-01-31 with 9.752 Billion USD, 2023-01-31 with 4.368 Billion USD, and 2021-01-31 with 4.332 Billion USD.


In [25]:
chat_history = app.get_state(config).values["chat_history"]
for message in chat_history:
    message.pretty_print()


What is NVIDIA Net Income?

NVIDIA's Net Income for 2024-01-31 was 29.76 Billion USD, for 2023-01-31 was 4.368 Billion USD, for 2022-01-31 was 9.752 Billion USD, and for 2021-01-31 was 4.332 Billion USD.

Order NVIDIA income from highest to lowest?

The order of NVIDIA's income from highest to lowest is as follows: 2024-01-31 with 29.76 Billion USD, 2022-01-31 with 9.752 Billion USD, 2023-01-31 with 4.368 Billion USD, and 2021-01-31 with 4.332 Billion USD.


In [26]:
result = app.invoke(
    {"input": "What company information is this? "},
    config=config,
)
print(result["answer"])

The provided information is related to the financial data of NVIDIA, a technology company known for its graphics processing units (GPUs) and semiconductor products.


### System Limitations and Proposal for Fine-Tuning

In [27]:
result = app.invoke(
    {"input": "What companies do you have financial information knowledge about?"},
    config=config,
)
print(result["answer"])

I have information about companies like NVIDIA, APPLE COMPUTER, and STARBUCKS Corp, among others.


In [66]:
chat_history = app.get_state(config).values["chat_history"]
for message in chat_history:
    message.pretty_print()


What is NVIDIA Net Income?

NVIDIA's Net Income for 2024-01-31 was 29.76 billion USD, for 2023-01-31 was 4.368 billion USD, for 2022-01-31 was 9.752 billion USD, and for 2021-01-31 was 4.332 billion USD.

Order NVIDIA income from highest to lowest?

The NVIDIA Net Income from highest to lowest is as follows: 29.76 billion USD for 2024-01-31, 9.752 billion USD for 2022-01-31, 4.368 billion USD for 2023-01-31, and 4.332 billion USD for 2021-01-31.

What company information is this? 

The information provided pertains to NVIDIA, not Apple.

What companies do you have financial information knowledge about?

I have financial information knowledge about various companies, including but not limited to NVIDIA, Apple, Microsoft, Tesla, Amazon, Google (Alphabet Inc.), and more.
