# Setup

In [1]:
%pip install -r requirements.txt --ignore-installed --no-dependencies 
# Colab already has some packages installed # Colab already has some packages installed 
# Do NOT restart Colab session 

Collecting aiohttp==3.9.4 (from -r requirements.txt (line 1))
  Using cached aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl.metadata (7.5 kB)
Collecting aiosignal==1.3.1 (from -r requirements.txt (line 2))
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting alembic==1.13.1 (from -r requirements.txt (line 3))
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting altair==5.3.0 (from -r requirements.txt (line 4))
  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting annotated-types==0.6.0 (from -r requirements.txt (line 5))
  Using cached annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)
Collecting anyio==4.3.0 (from -r requirements.txt (line 6))
  Using cached anyio-4.3.0-py3-none-any.whl.metadata (4.6 kB)
Collecting appnope==0.1.4 (from -r requirements.txt (line 7))
  Using cached appnope-0.1.4-py2.py3-none-any.whl.metadata (908 bytes)
Collecting asttokens==2.4.1 (from -r requirements.txt (line 8))
  Using cached

# Use LlamaIndex to build Node sentence window

Reference: https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/MetadataReplacementDemo/

## Helper functions to build Node Sentence Window

In [2]:
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

def build_sentence_window_index(
    document, llm, vector_store, embed_model="local:BAAI/bge-small-en-v1.5"
):
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context, storage_context=storage_context
    )

    return sentence_index

def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine

  from .autonotebook import tqdm as notebook_tqdm


# Trulens
## Answer relevance, Context relevance, Groundedness
Reference:

https://www.trulens.org/trulens_eval/evaluation/feedback_evaluations/answer_relevance_smoke_tests/

https://www.trulens.org/trulens_eval/evaluation/feedback_evaluations/context_relevance_smoke_tests/

https://www.trulens.org/trulens_eval/evaluation/feedback_evaluations/groundedness_smoke_tests/

:( I feel the definitions on trulens are not clear. Here's a better writing:

https://medium.com/@rajib76.gcp/rag-triad-introduction-4e5ecba26741

In [3]:
import os

import nest_asyncio
nest_asyncio.apply()

api_key = os.environ["OPENAI_API_KEY"]

# Using OpenAI as feedback provider since I only have OpenAI API key
# It's possible to use other providers, e.g. Cohere, Huggingface...
# As suggested here: https://www.trulens.org/trulens_eval/evaluation/feedback_evaluations/answer_relevance_smoke_tests/
# It's probably better to use other providers. Since we already use OpenAI to build sentence
# window index, we need to use other provider to give feedback, e.g. groundedness, answer relevance

# As you will see later, using OpenAI for both build sentence and window index and feedback
# leads to 100% groundedness and answer relevance, which doesn't seem reasonable
from trulens_eval import OpenAI as fOpenAI
provider = fOpenAI()

# Answer relevance
from trulens_eval import Feedback
f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance"
).on_input_output()

# Context relevance
from trulens_eval import TruLlama
context_selection = TruLlama.select_source_nodes().node.text
import numpy as np
f_qs_relevance = (
    Feedback(provider.qs_relevance,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

from trulens_eval.feedback import Groundedness
grounded = Groundedness(groundedness_provider=provider)
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons,
             name="Groundedness"
            )
    .on(context_selection)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)




✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input context will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


[nltk_data] Downloading package punkt to /Users/moonshine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read and merge documents

In [4]:
from trulens_eval import Tru

tru = Tru()
tru.reset_database()

from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader(
    "./AAPL", recursive=True
).load_data()

from llama_index.core import Document

document = Document(text="\n\n".\
                    join([doc.text for doc in documents]))


🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


## Build sentence window query engine

In [5]:
# This step is time consuming!

#from utils import build_sentence_window_index
from llama_index.llms.openai import OpenAI
from llama_index.core.vector_stores.types import BasePydanticVectorStore

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
sentence_index = build_sentence_window_index(
    document,
    llm,
    vector_store= None, #BasePydanticVectorStore,
    embed_model="local:BAAI/bge-small-en-v1.5"
)

  sentence_context = ServiceContext.from_defaults(


In [6]:
#from utils import get_sentence_window_query_engine
# This step is time consuming! 10 minutes to run

sentence_window_engine = \
get_sentence_window_query_engine(sentence_index)

## Evaluation of the llamaindex made RAG

In [7]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

tru_recorder = TruLlama(
    sentence_window_engine,
    app_id="App_1",
    feedbacks=[
        f_qa_relevance,
        f_qs_relevance,
        f_groundedness
    ]
)

In [75]:
# TruEra leverages GPT model to generate Answer relevance, Context relevance, groundedness
# The provided "standard answer" will not be used

question = " "
with tru_recorder as recording:
    sentence_window_engine.query(question)

print("Tru finishes recording. Now get the Tru result")

# Note that this step's result can be unstable. Sometimes there's no record/feedback generated
# Besides, since I use the same OpenAI model for build sentence window index and feedback,
# I see groundedness and answer relevance both be 1
records, feedback = tru.get_records_and_feedback(app_ids=[])

print(records)
print(f'feedback: {feedback}\n') #it doesn't have meaningful info
tru_evals = [records.loc[0, "output"], records.loc[0, "Context Relevance"], records.loc[0, "Answer Relevance"], records.loc[0, "Groundedness"]]
print(tru_evals)

# Clean up sample usage to have a clean start
tru.reset_database()

Tru finishes recording. Now get the Tru result
  app_id                                           app_json  \
0  App_1  {"tru_class_info": {"name": "TruLlama", "modul...   
1  App_1  {"tru_class_info": {"name": "TruLlama", "modul...   

                                                type  \
0  RetrieverQueryEngine(llama_index.core.query_en...   
1  RetrieverQueryEngine(llama_index.core.query_en...   

                                      record_id input  \
0  record_hash_4fc2814dcb3ae373197525c5749cf6cf   " "   
1  record_hash_6c7b05d3ea80035f72d53a88401f2cc6   " "   

                                              output tags  \
0  "Apple Inc.'s financial statements show the co...    -   
1  "Apple Inc.'s financial statements show the co...    -   

                                         record_json  \
0  {"record_id": "record_hash_4fc2814dcb3ae373197...   
1  {"record_id": "record_hash_6c7b05d3ea80035f72d...   

                                           cost_json  \
0  {"n_reques

## Consolidate above trulens code to a function for later use

In [88]:
def trulens_evals(query):
    with tru_recorder as recording:
        sentence_window_engine.query(query)

    records, feedback = tru.get_records_and_feedback(app_ids=[])

    #records.head()
    #tru_evals = [records.loc[0, "output"], records.loc[0, "Context Relevance"], records.loc[0, "Answer Relevance"], records.loc[0, "Groundedness"]]
    return records, feedback

# Knowledge graph based Evaluation

I tried to use ChatGPT to produce knowledge graph, but the result is undesirable
ChatGPT seems to only extract superficial information or hangs in the middle
So I decide to build an in-memory knowledge graph

```
knowledge_graph = {
    "entities": [
        {"id": 1, "type": "Organization", "name": "Apple Inc."},
        {"id": 2, "type": "Financial Metric", "name": "Net Sales"},
        {"id": 3, "type": "Financial Metric", "name": "Gross Margin"},
        {"id": 4, "type": "Financial Metric", "name": "Earnings per Share"},
        {"id": 5, "type": "Product", "name": "iPhone"},
        {"id": 6, "type": "Product", "name": "iPad"},
        {"id": 7, "type": "Product", "name": "Mac"},
        {"id": 8, "type": "Product", "name": "Services"},
        {"id": 9, "type": "Document", "name": "Form 10-Q"},
        {"id": 10, "type": "Document", "name": "SEC filings"},
        {"id": 11, "type": "Location", "name": "Cupertino, California"},
        {"id": 12, "type": "Location", "name": "Americas"},
        {"id": 13, "type": "Location", "name": "Europe"},
        {"id": 14, "type": "Location", "name": "Greater China"}
    ],
    "relations": [
        {"source": 1, "target": 2, "type": "reports"},
        {"source": 1, "target": 3, "type": "reports"},
        {"source": 1, "target": 4, "type": "reports"},
        {"source": 2, "target": 5, "type": "includes_sales_of"},
        {"source": 2, "target": 6, "type": "includes_sales_of"},
        {"source": 2, "target": 7, "type": "includes_sales_of"},
        {"source": 2, "target": 8, "type": "includes_sales_of"},
        {"source": 3, "target": 5, "type": "derived_from"},
        {"source": 3, "target": 6, "type": "derived_from"},
        {"source": 3, "target": 7, "type": "derived_from"},
        {"source": 3, "target": 8, "type": "derived_from"},
        {"source": 1, "target": 9, "type": "documented_in"},
        {"source": 1, "target": 10, "type": "documented_in"},
        {"source": 1, "target": 11, "type": "located_in"},
        {"source": 1, "target": 12, "type": "operates_in"},
        {"source": 1, "target": 13, "type": "operates_in"},
        {"source": 1, "target": 14, "type": "operates_in"}
    ]
}
```

In [169]:
! python -m spacy download en_core_web_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m382.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [170]:
# It took about 20 seconds to build an in-memory knowledge graph
import spacy
import fitz  # PyMuPDF
from collections import defaultdict

# Load the spaCy NLP model
nlp = spacy.load("en_core_web_sm")

# Define function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

# Define function to extract entities and relationships
def extract_entities_relations(text):
    doc = nlp(text)
    entities = []
    relations = []
    
    for ent in doc.ents:
        entities.append({"text": ent.text, "label": ent.label_})
    
    for token in doc:
        if token.dep_ in ("attr", "dobj"):
            subject = [w for w in token.head.lefts if w.dep_ == "nsubj"]
            if subject:
                subject = subject[0]
                relations.append({"subject": subject.text, "predicate": token.head.text, "object": token.text})
    
    return entities, relations

# Main function to create the knowledge graph
def create_knowledge_graph(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    entities, relations = extract_entities_relations(text)
    
    knowledge_graph = {
        "entities": entities,
        "relations": relations
    }
    return knowledge_graph

# Build knowledge graph based on APPL pdfs
pdf_path = "./AAPL/merged_AAPL.pdf"  # Update this to your PDF file path
knowledge_graph = create_knowledge_graph(pdf_path)
print(len(knowledge_graph["entities"]))
print(len(knowledge_graph["relations"]))
print(knowledge_graph)


8425
796
{'entities': [{'text': 'UNITED STATES', 'label': 'GPE'}, {'text': 'Washington', 'label': 'GPE'}, {'text': 'D.C.', 'label': 'GPE'}, {'text': 'Mark One', 'label': 'PERSON'}, {'text': 'THE SECURITIES EXCHANGE ACT OF 1934', 'label': 'ORG'}, {'text': 'the quarterly period ended June\xa025, 2022', 'label': 'DATE'}, {'text': 'THE SECURITIES EXCHANGE ACT OF 1934', 'label': 'ORG'}, {'text': '001-36743', 'label': 'CARDINAL'}, {'text': 'Apple Inc.', 'label': 'ORG'}, {'text': 'Registrant', 'label': 'ORG'}, {'text': 'California', 'label': 'GPE'}, {'text': '94-2404110', 'label': 'DATE'}, {'text': 'I.R.S. Employer Identification No', 'label': 'ORG'}, {'text': 'One', 'label': 'CARDINAL'}, {'text': 'Apple Park Way', 'label': 'FAC'}, {'text': 'Cupertino', 'label': 'GPE'}, {'text': 'California', 'label': 'GPE'}, {'text': '95014', 'label': 'DATE'}, {'text': 'Zip Code', 'label': 'PERSON'}, {'text': '408', 'label': 'CARDINAL'}, {'text': '996-1010', 'label': 'DATE'}, {'text': 'Registrant', 'label': 

In [171]:
def create_knowledge_graph(hypothesis):
    entities, relations = extract_entities_relations(hypothesis)
    
    knowledge_graph = {
        "entities": entities,
        "relations": relations
    }
    return knowledge_graph

In [172]:
import json
def kg_evaluate(hypothesis):
    rag_kg = create_knowledge_graph(hypothesis)
    
    rag_kg_entities_set = set(map(lambda d: json.dumps(d, sort_keys=True), rag_kg["entities"]))
    kg_entities_set = set(map(lambda d: json.dumps(d, sort_keys=True), knowledge_graph["entities"]))
    rag_kg_relations_set = set(map(lambda d: json.dumps(d, sort_keys=True), rag_kg["relations"]))
    kg_relations_set = set(map(lambda d: json.dumps(d, sort_keys=True), knowledge_graph["relations"]))


    common_entities = rag_kg_entities_set.intersection(kg_entities_set)
    common_relations = rag_kg_relations_set.intersection(kg_relations_set)
    hallucination_entities = rag_kg_entities_set.difference(kg_entities_set)
    hallucination_relations = rag_kg_relations_set.difference(kg_relations_set)
    # The missing* metrics don't make much sense, because the information needed to answer
    # a question is a very small subset of the whole knowledge graph
    # So large missing* metrics don't mean bad answer
    missing_entities = kg_entities_set.difference(rag_kg_entities_set)
    missing_relations = kg_relations_set.difference(rag_kg_relations_set)
    """
    print(common_entities)
    print(common_relations)
    print(len(common_entities))
    print(len(common_relations))
    print(hallucination_entities)
    print(hallucination_relations)
    print(len(hallucination_entities))
    print(len(hallucination_relations))
    print(missing_entities)
    print(missing_relations)
    print(len(missing_entities))
    print(len(missing_relations))
    """
    return common_entities, len(common_entities), common_relations, len(common_relations),\
        hallucination_entities, len(hallucination_entities), \
        hallucination_relations, len(hallucination_relations)

In [None]:
print(kg_evaluate("Apple's total net sales decreased by 5% from the three months ended December 25, 2021, to the three months ended December 31, 2022."))

({'{"label": "DATE", "text": "the three months ended December 31, 2022"}', '{"label": "PERCENT", "text": "5%"}', '{"label": "ORG", "text": "Apple"}'}, 3, set(), 0, {'{"label": "DATE", "text": "the three months ended December 25, 2021"}'}, 1, set(), 0)


# Bleu, Rouge, Meteor, BirtScore

In [35]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# bleu evaluation doesn't seem to be a fit here
# bleu compares the n-gram precision between reference and hypothesis
# so if the lengths of reference and hypothesis are different, bleu cannot function.
# I got error:
# AssertionError: The number of hypotheses and their reference(s) should be the same 
def bleu(reference, hypothesis):
    reference_tokens = word_tokenize(reference)
    hypothesis_tokens = word_tokenize(hypothesis)
    print(reference_tokens)
    print(hypothesis_tokens)
    bleu_score = corpus_bleu(reference_tokens, hypothesis_tokens)
    return bleu_score

# hypotheses="the cat was found under the bed", references="the cat was under the bed"
from rouge import Rouge
r = Rouge()
def rouge_score(references, hypothesis):
    rouge_score = r.get_scores(hypothesis, references)
    return rouge_score

from nltk.translate.meteor_score import meteor_score, single_meteor_score
nltk.download('wordnet')
def meteor_score(reference, hypothesis):
    tokenized_refs = word_tokenize(reference)
    tokenized_hypo = word_tokenize(hypothesis)
    
    # Calculating the METEOR score using tokenized text
    score = single_meteor_score(tokenized_refs, tokenized_hypo)
    return score

import bert_score
from bert_score import BERTScorer
def bert_score(references, hypothesis):
    scorer = BERTScorer(model_type='bert-base-uncased', num_layers=9, batch_size=3)
    P, R, F1 = scorer.score([references], [hypothesis])
    #print(f"BERTScore Precision: {P.mean()}, Recall: {R.mean()}, F1: {F1.mean()}")
    return (P, R, F1)

[nltk_data] Downloading package punkt to /Users/moonshine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/moonshine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Sample Usage

### How to interpret Rouge score
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) scores are a set of metrics used to evaluate the quality of summaries generated by automatic summarization systems, particularly in the field of natural language processing (NLP). These metrics compare the generated summary against one or more reference summaries, typically produced by humans.

ROUGE-N (N-gram Overlap): This metric measures the overlap of n-grams (sequences of n words) between the generated summary and the reference summaries. ROUGE-N is often computed for different values of N (e.g., ROUGE-1 for unigrams, ROUGE-2 for bigrams, etc.).

ROUGE-L (Longest Common Subsequence): ROUGE-L measures the longest common subsequence between the generated summary and the reference summaries. This metric is particularly useful when the order of words is important for the summary's coherence.

Higher scores indicate better performance: A higher ROUGE score generally indicates that the generated summary is more similar to the reference summaries, suggesting better quality.

### How to interpret Meteor score
A higher METEOR score indicates better performance. A score closer to 1 indicates a higher level of similarity between the generated translation and the reference translations.


### How to interpret Bert_score
BERTScore measures the similarity between the generated text and one or more reference texts. It considers the contextual embeddings of the text fragments to capture the semantic similarity between them.

Similar to other evaluation metrics, BERTScore typically ranges from 0 to 1, where 1 indicates a perfect match between the generated text and the reference text, and 0 indicates no similarity.

In [36]:
reference = "Based on the provided documents, Apple's total net sales have changed over time as follows: - For the quarterly period ended June 25, 2022, the total net sales were $82,959 million. (SOURCE: 2022 Q3 AAPL.pdf) - For the quarterly period ended December 31, 2022, the total net sales were $117,154 million. (SOURCE: 2023 Q1 AAPL.pdf) - For the quarterly period ended April 1, 2023, the total net sales were $94,836 million. (SOURCE: 2023 Q2 AAPL.pdf) - For the quarterly period ended July 1, 2023, the total net sales were $81,797 million. (SOURCE: 2023 Q3 AAPL.pdf) From these figures, it can be observed that there was an increase in total net sales from the quarter ended June 25, 2022, to the quarter ended December 31, 2022. However, there was a subsequent decrease in total net sales in the quarters ended April 1, 2023, and July 1, 2023. SOURCE(S): 2022 Q3 AAPL.pdf, 2023 Q1 AAPL.pdf, 2023 Q2 AAPL.pdf, 2023 Q3 AAPL.pdf"
# paraphrase of the reference:
hypothesis = "According to the provided documents, there have been fluctuations in Apple's total net sales over various quarters: the total net sales for the quarter ending June 25, 2022, were $82,959 million, as noted in the 2022 Q3 AAPL.pdf. This figure rose to $117,154 million by the quarter ending December 31, 2022, as per the 2023 Q1 AAPL.pdf. A decline was noted in subsequent quarters, with sales dropping to $94,836 million by April 1, 2023 (2023 Q2 AAPL.pdf), and further to $81,797 million by July 1, 2023 (2023 Q3 AAPL.pdf). This data shows an initial increase in sales followed by a decrease in the later quarters. Sources: 2022 Q3 AAPL.pdf, 2023 Q1 AAPL.pdf, 2023 Q2 AAPL.pdf, 2023 Q3 AAPL.pdf."

#print(f"bleu_score {bleu(reference, hypothesis)}")

print(f"rouge_score {rouge_score(reference, hypothesis)}")
print(f"meteor_score {meteor_score(reference, hypothesis)}")
print(f"bert_score {bert_score(reference, hypothesis)}")
# It takes 17s (not a cold start) to run 3 calls. 12s was spent on bert_score

rouge_score [{'rouge-1': {'r': 0.6617647058823529, 'p': 0.6164383561643836, 'f': 0.638297867346713}, 'rouge-2': {'r': 0.35106382978723405, 'p': 0.3235294117647059, 'f': 0.33673468888588093}, 'rouge-l': {'r': 0.5735294117647058, 'p': 0.5342465753424658, 'f': 0.5531914843679896}}]
meteor_score 0.49912847189285503
bert_score (tensor([0.7959]), tensor([0.8150]), tensor([0.8053]))


## Helper function

In [None]:
import pandas as pd

def extract_aapl_questions_answers(file_path):
    # Load the spreadsheet into a DataFrame
    data = pd.read_excel(file_path)
    
    # Filter rows where 'Source Docs' column contains 'AAPL'
    filtered_data = data[data['Source Docs'].str.contains("AAPL", na=False)]
    
    # Extract 'Question' and 'Answer' columns
    result = filtered_data[['Question', 'Answer']]
    
    return result

# Path to the spreadsheet file
file_path = 'qna_human_curated.xlsx'  # You should update this path
aapl = extract_aapl_questions_answers(file_path)
print(aapl)
print(aapl.iloc[0]["Question"], aapl.iloc[0]["Answer"])

                                              Question  \
0    How has Apple's total net sales changed over t...   
1    What are the major factors contributing to the...   
2    Has there been any significant change in Apple...   
3    How has Apple's revenue from iPhone sales fluc...   
4    Can any trends be identified in Apple's Servic...   
5    What is the impact of foreign exchange rates o...   
6    Are there any notable changes in Apple's liqui...   
7    How does Apple's R&D expenditure in the most r...   
8    What legal proceedings or contingencies are di...   
9    Has Apple engaged in any significant share rep...   
10   What is the effective tax rate reported by App...   
37   Does Apple report any significant new business...   
38   How have Apple's inventory levels changed acro...   
39   Are there any significant changes or new discl...   
40   Summarize the risk factors to Apple's business...   
68   What was the gross margin for Apple in the lat...   
69   How did A

## Evaluate Llamaindex produced RAG using AAPL* question/answer
It took about 18 minutes 12 seconds to go through 38 quesetions relevant to AAPL

In [185]:
# Clean up
# trulens persists the batch result in a database
# it has to be cleared up before next batch use
# not clearly documented :(
tru = Tru()
tru.reset_database()

evaluation = []
for index, qa in aapl.iterrows():
    question, answer = qa["Question"], qa["Answer"]

    try: 
        records, feedback = trulens_evals(question)
    except Exception as e:
        print(f"Error occurred: {e}")
    print(records)

    rouge = rouge_score(answer, rag_answer)
    meteor = meteor_score(answer, rag_answer)
    bert = bert_score(answer, rag_answer)

    evaluation.append({"question": question, "answer": answer,  \
        "rouge": rouge, "meteor": meteor, "bert": bert})
    #print(evaluation)
    """
    print(f"question: {question}")
    print(f"answer: {answer}")
    print(f"rag_answer: {rag_answer}")
    print(f"context_relevance: ${context_relevance}")  # Sometimes the field is not present
    print(f"answer_relevance: ${answer_relevance}")
    print(f"groundedness: ${groundedness}")
    print(f"rouge: ${rouge}")
    print(f"meteor: ${meteor}")
    print(f"bert: ${bert}")
    print(f"kg_evals: ${kg_evals}")
    print("\n")
    """


The document provided doesn't contain any information regarding the change in Apple's total net sales over time, so I'm unable to provide an answer based on the file referenced. If you need details on Apple's financials, consulting their annual reports or reliable financial news sources could be beneficial.
Empty DataFrame
Columns: [app_id, app_json, type, record_id, input, output, tags, record_json, cost_json, perf_json, ts, latency, total_tokens, total_cost]
Index: []
The file provided does not contain information regarding the factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to previous quarters. If you have the text or details from the 10-Q, please share them, and I can help analyze the factors for you.
Empty DataFrame
Columns: [app_id, app_json, type, record_id, input, output, tags, record_json, cost_json, perf_json, ts, latency, total_tokens, total_cost]
Index: []
The provided text does not contain any specific details or numbers concern

KeyboardInterrupt: 

In [184]:
# Tricky!
# trulens sometimes doesn't emit Context Relevance, Answer Relevance and Groundedness immediately
# in each query call; but the values will be ready later
# so I append the value to evaluation result after iterating through all queries
# I suspect there may be async functions inside trulens' or evaluator (OpenAI). 
# It's not clear documented :(
for index in range(len(evaluation)):
    rag_answer = records.iloc[index].get("output", None)
    trulens_scores = records.iloc[index][feedback]
    context_relevance = trulens_scores.get("Context Relevance", None)
    answer_relevance = trulens_scores.get("Answer Relevance", None)
    groundedness = trulens_scores.get("Groundedness", None)
    evaluation[index]["rag_answer"] = rag_answer
    evaluation[index]["context_relevance"] = context_relevance
    evaluation[index]["answer_relevance"] = answer_relevance
    evaluation[index]["groundedness"]= groundedness
    evaluation[index]["kg_evaluation"] = kg_evaluate(rag_answer)

df = pd.DataFrame(evaluation)
print(df)

df.to_csv('rag_llamaindex_evals.csv', index=False)

IndexError: single positional indexer is out-of-bounds

You could open rag_llamaindex_evals.csv for better viewing experience.

## My Observation
1. RAG generated answers are shorter, but still on the right track. However, RAG generated answers tend to lack details compare with curated answers. Besides, RAG generated answers do not provide reference to the source documents compared with curated answers. 

2. We talked about how the lengths of reference (curated answer) and hypothesis (rag generated answer) can skew the evalution metrics, e.g. Rouge, Bleu, Bert... so I picked one example which has higher scores. 

It seems if the lengths of reference and hypothesis are close, the scores tend to be higher.

Besides, RAG generated answer gives the wrong answer, but the groundedness is high. I may have to do with the data OpenAI model knows about.

```
Question: What effective tax rate did Apple report in its latest quarterly filing, and how does this compare to the statutory tax rate?	
Curated answer: Apple reported an effective tax rate of 12.5% for the third quarter of 2023, which is lower than the statutory federal income tax rate of 21%. SOURCE(S): 2023 Q3 AAPL.pdf	
Rouge: [{'rouge-1': {'r': 0.4074074074074074, 'p': 0.3055555555555556, 'f': 0.34920634430839004}, 'rouge-2': {'r': 0.20689655172413793, 'p': 0.13636363636363635, 'f': 0.1643835568549448}, 'rouge-l': {'r': 0.2962962962962963, 'p': 0.2222222222222222, 'f': 0.25396824907029486}}]	
Blue: 0.3581371191	
Bert: (tensor([0.6412]), tensor([0.7180]), tensor([0.6774]))	
Rag generated answer: "Apple reported an effective tax rate of 15.8% in its latest quarterly filing. This effective tax rate is lower than the statutory federal income tax rate of 21%."	
Context relevance: 0.45	
Answer relevance: 1	
Groundedness: 1																	
```
3. I looked at another example with high context_relevance score: 0.75
It seems the RAG generated answer mentions about some figures existing in curated answer, e.g. $19.0 billion, but it also mentioned figures that doens't exist in curated answer, e.g. $21.7 billion. The figures comes from `Copy of 2022 Q3 AAPL.pdf`: `The Company repurchased $21.7 billion of its common stock and paid dividends and dividend equivalents of $3.8 billion during the third quarter of 2022.` So `$21.7 billion` actually seems to be relevant and correct answer.

```
Has Apple engaged in any significant share repurchase activities in the reported quarters and what are the financial implications of these activities?	
Yes, Apple has engaged in significant share repurchase activities in the reported quarters. The financial implications of these activities include a reduction in the number of shares outstanding, which can lead to an increase in earnings per share (EPS) and potentially enhance shareholder value. The repurchase activities are as follows: - In the quarter ended June 25, 2022, Apple repurchased 408 million shares of its common stock for $65.0 billion under a share repurchase program authorized by the Board of Directors. - In the quarter ended December 31, 2022, Apple repurchased 133 million shares of its common stock under an authorized share repurchase program for $19.0 billion, excluding excise tax due under the Inflation Reduction Act of 2022. - In the quarter ended April 1, 2023, Apple repurchased 262 million shares of its common stock under an authorized share repurchase program for $38.1 billion, excluding excise tax due under the Inflation Reduction Act of 2022. - In the quarter ended July 1, 2023, Apple repurchased 365 million shares of its common stock for $56.1 billion, excluding excise tax due under the Inflation Reduction Act of 2022. These repurchases are part of Apple's capital return program, which includes share repurchases and dividends, aimed at returning excess cash to shareholders. SOURCE(S): 2022 Q3 AAPL.pdf, 2023 Q1 AAPL.pdf, 2023 Q2 AAPL.pdf, 2023 Q3 AAPL.pdf	
[{'rouge-1': {'r': 0.12962962962962962, 'p': 0.3888888888888889, 'f': 0.1944444406944445}, 'rouge-2': {'r': 0.013071895424836602, 'p': 0.045454545454545456, 'f': 0.020304565058621014}, 'rouge-l': {'r': 0.12037037037037036, 'p': 0.3611111111111111, 'f': 0.18055555180555563}}]	0.06932773109	
(tensor([0.4339]), tensor([0.5681]), tensor([0.4920]))	
"Apple engaged in significant share repurchase activities in both the first quarter of 2023 and the third quarter of 2022. The company repurchased $19.0 billion of its common stock in Q1 2023 and $21.7 billion in Q3 2022. These activities indicate that Apple is actively buying back its own shares from the market. The financial implications of these share repurchases include reducing the number of outstanding shares, potentially increasing earnings per share, and signaling confidence in the company's financial strength and future prospects."	
0.75	
1	
0.25																	
```

4. Overall, groundedness tends to have extreme values, e.g. 0 or 1. However, the rag generated answer with 0 groundedness seems decent.

```
What are the major factors contributing to the change in Apple's gross margin in the most recent 10-Q compared to the previous quarters?	
In the most recent 10-Q for the quarter ended July 1, 2023, the factors contributing to the change in Apple's gross margin compared to previous quarters include: 1. Weakness in foreign currencies relative to the U.S. dollar, which had an unfavorable impact on gross margin. 2. Lower Products volume, which decreased gross margin. 3. Cost savings and a different Products mix, which partially offset the decrease in gross margin. For the third quarter of 2023, the Products gross margin percentage increased compared to the same quarter in 2022 due to cost savings and a different Products mix, despite the negative impact of foreign currency weakness and decreased leverage. However, the year-over-year Products gross margin percentage for the first nine months of 2023 decreased due to the weakness in foreign currencies and decreased leverage, despite cost savings and a different Products mix. The Services gross margin increased due to higher Services net sales but was partially offset by the weakness in foreign currencies and higher Services costs. The Services gross margin percentage decreased due to higher Services costs, partially offset by improved leverage. SOURCE(S): 2023 Q3 AAPL.pdf	
[{'rouge-1': {'r': 0.20224719101123595, 'p': 0.5, 'f': 0.28799999589888003}, 'rouge-2': {'r': 0.07092198581560284, 'p': 0.22727272727272727, 'f': 0.10810810448268822}, 'rouge-l': {'r': 0.20224719101123595, 'p': 0.5, 'f': 0.28799999589888003}}]	0.1135391623	(tensor([0.4550]), tensor([0.6020]), tensor([0.5183]))	
"The change in Apple's gross margin in the most recent 10-Q compared to the previous quarters was primarily influenced by the weakness in foreign currencies relative to the U.S. dollar, higher Services costs, improved leverage, and a different Services mix."	
0.75	
0.9	
0																	
```
5. I feel inconclusive about the trulens metrics and doubt about the quality of human curated answers.
6. Overall, it's a very time-consuming process, either at building node setence window phase or at the retrival phase. Possible reason could be the response time of OpenAI API calls. Besides, it seems to be memory-intensive. On my environment, I had to close other applications to keep the VSCode running. So my weak equipment could also contribute to the long running time.

# GPT-based Approach
## Feed files and queries to GPT
Since the OpenAI Chat API only allows referencing 1 file, I had to merge all APPL* files togetehr

From experiment, I realized GPT also uses its own knowledge to answer the questions. To limit its source of reference, I used prompt engineering to limit its knowledge source: `You are a helpful assistant. Answer the questions ONLY based on the file referenced.`

In [108]:
from PyPDF2 import PdfReader, PdfWriter

def merge_pdfs_in_directory(directory_path, output_path):
    pdf_writer = PdfWriter()
    files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
    files.sort()  # Optional: sort the file names

    for filename in files:
        filepath = os.path.join(directory_path, filename)
        pdf_reader = PdfReader(filepath)
        num_pages = len(pdf_reader.pages)
        for page in range(num_pages):
            pdf_writer.add_page(pdf_reader.pages[page])

    # Write out the merged PDF
    with open(output_path, 'wb') as out:
        pdf_writer.write(out)

# Example usage:
merge_pdfs_in_directory('./AAPL', './AAPL/merged_AAPL.pdf')

In [136]:
import openai 

client = openai.OpenAI(api_key=api_key)
def upload_file(file_path):
    with open(file_path, 'rb') as file:
        response = client.files.create(file=file, purpose='assistants')
    return response.id

# Example: Upload a PDF and get the file ID
file_id = upload_file('./AAPL/merged_AAPL.pdf')
print("Uploaded File ID:", file_id)


Uploaded File ID: file-7z6y9S71qCn3aefZAapSMRYn


In [137]:
def query_model_with_file(file_id, question):
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            # I have to instruct the system to only use the file provided
            {"role": "system", "content": "You are a helpful assistant. Answer the questions ONLY based on the file referenced."},
            {"role": "user", "content": question}
        ],
        max_tokens=500
    )
    return response.choices[0].message.content.strip()

# Get a response using the uploaded file
result = query_model_with_file(file_id, "How has Apple's total net sales changed over time?")
print("GPT-4-turbo Output:", result)


GPT-4-turbo Output: The total net sales for Apple have changed over time as follows:

- In 2021, total net sales were $365,817 million.
- In 2022, total net sales increased to $394,328 million.


## Evaluation

In [187]:
file_path = 'qna_human_curated.xlsx'  # You should update this path
aapl = extract_aapl_questions_answers(file_path)

evaluation = []
for index, qa in aapl.iterrows():
    question, answer = qa["Question"], qa["Answer"]
    rag_answer = query_model_with_file(file_id, question)

    rouge = rouge_score(answer, rag_answer)
    meteor = meteor_score(answer, rag_answer)
    bert = bert_score(answer, rag_answer)
    kg_evals = kg_evaluate(rag_answer)

    evaluation.append({"question": question, "answer": answer,  "rag_answer": rag_answer, \
        "rouge": rouge, "meteor": meteor, "bert": bert, "kg_evals": kg_evals})

In [188]:
df = pd.DataFrame(evaluation)
print(df)

df.to_csv('rag_gpt4_evals.csv', index=False)

                                             question  \
0   How has Apple's total net sales changed over t...   
1   What are the major factors contributing to the...   
2   Has there been any significant change in Apple...   
3   How has Apple's revenue from iPhone sales fluc...   
4   Can any trends be identified in Apple's Servic...   
5   What is the impact of foreign exchange rates o...   
6   Are there any notable changes in Apple's liqui...   
7   How does Apple's R&D expenditure in the most r...   
8   What legal proceedings or contingencies are di...   
9   Has Apple engaged in any significant share rep...   
10  What is the effective tax rate reported by App...   
11  Does Apple report any significant new business...   
12  How have Apple's inventory levels changed acro...   
13  Are there any significant changes or new discl...   
14  Summarize the risk factors to Apple's business...   
15  What was the gross margin for Apple in the lat...   
16  How did Apple's operating e

## My Obervation
I've omitted the trulens metrics this time, given its non-ideal results before

1. I noticed the format of RAG produced answer tend to be bulleting rich, compare with the curated answers' plain text style.
2. Like the llamamindex based implementation, RAG generated answer (with GPT-4 model) tend to give concised answer with less details, e.g. omitting the quarter information. But for some test cases, curated answers provided unnecessary information:
```
curated answer
The Services segment contributed 21,213 million dollars to Apple's total net sales of 81,797 million dollars in Q3 2023. To calculate the percentage contribution of the Services segment to the total net sales: (21,213 / 81,797) * 100 = 25.93% Therefore, the Services segment contributed approximately 25.93% of Apple's total revenue in Q3 2023. SOURCE(S): 2023 Q3 AAPL.pdf

RAG generated answer
In Q3 2023, Apple's Services segment contributed 23.2% to its total revenue.
```
Perhaps I could do some prompt engineering to have GPT-4 write "chain of throughts" style answers.


3. GPT can fail to answer some questions honestly, which is good! 
```
What was the gross margin for Apple in the latest 10-Q report?	
The gross margin for Apple in the latest 10-Q report for the three months ended July 1, 2023, was $36,413 million. SOURCE(S): 2023 Q3 AAPL.pdf	
You need to provide me with the specific data or portions of the 10-Q report you are referencing, as I don't have access to external documents or databases directly. If you can share the relevant financial figures or sections from the report, I'd be happy to help calculate or analyze the gross margin for you!	
[{'rouge-1': {'r': 0.25, 'p': 0.13333333333333333, 'f': 0.173913038941399}, 'rouge-2': {'r': 0.12, 'p': 0.05555555555555555, 'f': 0.07594936276237807}, 'rouge-l': {'r': 0.25, 'p': 0.13333333333333333, 'f': 0.173913038941399}}]	
0.206102576	
(tensor([0.4429]), tensor([0.4186]), tensor([0.4304]))																				
```
Interestingly, previous Llamaindex implementation also failed on the same test case:
```
"The gross margin for Apple in the latest 10-Q report was not explicitly provided in the context information."
```
Perhaps it's because both implementation uses GPT-4 model. If alloted more time, I'd try using different model.

4. About knowledge graph based evalutions, I expected higher common_entities, common_relations would correlate to high Rouge, Bleu, Bert scores, however, I don't find the strong correlation. 
Besides, in some cases, knowledge graph built from rag_answer captures useless information, e.g. `{"label": "CARDINAL", "text": "13"}` which skews the common_entities metrcis to higher values. 
If alotted more time, I'd likee to try other embeddingg algorithms and filter out useless inforamtion, so that the knowledge graph only captures useful inforamtion.


# Application

Due to the slow speed of llamaindex producing RAG, it is more reasonable to leverage GPT based approach

I've recorded a demo video: https://www.loom.com/share/6d8a950f098d4a57aff8e3819b454cef?sid=287e92de-721e-4902-b2e8-687eb26fc1a6

In [199]:
import gradio as gr

# Define a global variable to store the file ID
global_file_id = None

def upload_and_query(file, question):
    global global_file_id
    if global_file_id is None:
        # Upload the file and get the file ID if not already uploaded
        global_file_id = upload_file(file)
    # Query the model with the uploaded file and the user's question
    answer = query_model_with_file(global_file_id, question)
    return answer

inputs = [
    gr.File(label="Upload PDF file"),
    gr.Textbox(lines=2, label="Enter your question")
]

output = gr.Textbox(label="Answer")

gr.Interface(upload_and_query, inputs, output, title="File Upload and GPT Query", theme="compact").launch()



Sorry, we can't find the page you are looking for.


Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


