# Introduction

This notebook implements a RAG pipeline and then evaluates its performance on an evaluation dataset.

In [1]:
import nest_asyncio

# Patch asyncio so nested event loops are allowed; the notebook already runs
# one and the llama_index pipeline below drives async jobs from it.
nest_asyncio.apply()

import os
import openai


# TODO: Replace ChatGPT with an open-source LLM
# Never hard-code the key in the notebook source: reuse an already exported
# OPENAI_API_KEY when there is one, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [2]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

# Setup

### Load LLM

In [3]:
# Shared OpenAI chat model handed to every LLM-backed extractor below
# (temperature 0.1, max_tokens 512).
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo", max_tokens=512)

## Implement Custom Feature Extractor

You can implement a custom way to extract features from the data using ChatGPT and LlamaIndex.

In [4]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    KeywordExtractor,
    BaseExtractor,
)
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import BaseNode, TextNode
from typing import Any, Dict, List, Optional, Sequence, cast
from llama_index.core.prompts import PromptTemplate
from llama_index.core.async_utils import DEFAULT_NUM_WORKERS, run_jobs
from llama_index.core.llms.llm import LLM
from llama_index.core.settings import Settings
from llama_index.core.service_context_elements.llm_predictor import (
    LLMPredictorType,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr

# First transformation of the ingestion pipeline: token-based chunking with
# chunk_size=512 and chunk_overlap=128, splitting on spaces.
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)

In [5]:
# Prompt that turns the summaries of three consecutive chunks into one page
# headline. Placeholders: {context_str_1}, {context_str_2}, {context_str_3}.
# The trailing backslashes are line continuations inside the string literal.
MASTER_SUMMARY_TEMPLATE = """\
Given summary for three consecutive parts, create Headline for the page:
Summary 1:
{context_str_1}
Summary 2:
{context_str_2}
Summary 3:
{context_str_3}

Headline for the page \

Headline: """

# Prompt that summarises a single chunk. Placeholder: {context_str}.
SUMMARY_TEMPLATE = """\
Here is the content of the section:
{context_str}

Summarize the key topics and entities of the section. \

Summary: """

class CustomExtractor(BaseExtractor):
    """Two-level summary extractor: a per-chunk summary plus a page headline.

    Every node is first summarised on its own with ``prompt_template``. The
    headline of a node is then generated with ``prompt_template_master`` from
    the node's own summary and the summaries of its two direct neighbours; a
    neighbour only contributes text when it has the same ``page_label``.
    Nodes that cannot get a generated headline (the first and last node, or a
    node whose neighbour has no summary) use their own summary as headline.

    TODO: summarize all nodes of a page; the current method only looks at
    three chunks.

    Args:
        llm (Optional[LLM]): LLM used for generation; falls back to
            ``llm_predictor`` and then ``Settings.llm``.
        llm_predictor (Optional[LLMPredictorType]): deprecated alias of ``llm``.
        prompt_template (str): template for the per-node summary.
        prompt_template_master (str): template for the headline built from
            three consecutive summaries.
        num_workers (int): worker count forwarded to ``run_jobs``.
    """

    llm: LLMPredictorType = Field(description="The LLM to use for generation.")
    prompt_template: str = Field(
        default=SUMMARY_TEMPLATE,
        description="Template to use when generating summaries.",
    )
    prompt_template_master: str = Field(
        default=MASTER_SUMMARY_TEMPLATE,
        description="Template to use when generating summaries.",
    )

    def __init__(
        self,
        llm: Optional[LLM] = None,
        # TODO: llm_predictor arg is deprecated
        llm_predictor: Optional[LLMPredictorType] = None,
        prompt_template: str = SUMMARY_TEMPLATE,
        prompt_template_master: str = MASTER_SUMMARY_TEMPLATE,
        num_workers: int = DEFAULT_NUM_WORKERS,
        **kwargs: Any,
    ):
        """Store the LLM and both prompt templates on the extractor."""
        super().__init__(
            llm=llm or llm_predictor or Settings.llm,
            prompt_template=prompt_template,
            prompt_template_master=prompt_template_master,
            num_workers=num_workers,
            **kwargs,
        )

    @staticmethod
    def _same_page(first: BaseNode, second: BaseNode) -> bool:
        """Return True when both nodes carry the same ``page_label`` metadata.

        Uses ``.get`` so nodes without a ``page_label`` do not raise; two
        unlabeled nodes compare equal and are treated as the same page.
        """
        return first.metadata.get("page_label") == second.metadata.get("page_label")

    async def _agenerate_combined_summary(
        self,
        context_str_1: str,
        context_str_2: str,
        context_str_3: str,
    ) -> str:
        """Generate a headline from three consecutive chunk summaries."""
        summary = await self.llm.apredict(
            PromptTemplate(template=self.prompt_template_master),
            context_str_1=context_str_1,
            context_str_2=context_str_2,
            context_str_3=context_str_3,
        )

        return summary.strip()

    async def _agenerate_node_summary(self, node: BaseNode) -> str:
        """Generate a summary for a node ("" for skipped non-text nodes)."""
        if self.is_text_node_only and not isinstance(node, TextNode):
            return ""

        context_str = node.get_content(metadata_mode=self.metadata_mode)
        summary = await self.llm.apredict(
            PromptTemplate(template=self.prompt_template), context_str=context_str
        )

        return summary.strip()

    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        """Return one metadata dict per node with ``summary`` and ``headline``.

        Args:
            nodes: nodes to process, in document order.

        Returns:
            A list aligned with ``nodes``. Each dict holds ``"summary"`` and
            ``"headline"`` when the respective text is non-empty.

        Raises:
            ValueError: if any node is not a ``TextNode``.
        """
        if not all(isinstance(node, TextNode) for node in nodes):
            raise ValueError("Only `TextNode` is allowed for `Summary` extractor")
        if not nodes:
            return []

        node_summaries = await run_jobs(
            [self._agenerate_node_summary(node) for node in nodes],
            show_progress=self.show_progress,
            workers=self.num_workers,
        )

        # Default headline is the node's own summary; generated headlines
        # overwrite it below by explicit position, so a skipped node can never
        # shift the headlines of the nodes that follow it.
        headlines = list(node_summaries)
        headline_positions: List[int] = []
        headline_jobs = []
        for i in range(1, len(nodes) - 1):
            # A headline is only generated when both neighbours have a summary.
            if not (node_summaries[i - 1] and node_summaries[i + 1]):
                continue

            # Neighbours from another page contribute an empty summary.
            prev_sum = (
                node_summaries[i - 1]
                if self._same_page(nodes[i - 1], nodes[i])
                else ""
            )
            next_sum = (
                node_summaries[i + 1]
                if self._same_page(nodes[i + 1], nodes[i])
                else ""
            )
            headline_positions.append(i)
            headline_jobs.append(
                self._agenerate_combined_summary(
                    prev_sum, node_summaries[i], next_sum
                )
            )

        if headline_jobs:
            generated = await run_jobs(
                headline_jobs,
                show_progress=self.show_progress,
                workers=self.num_workers,
            )
            for position, headline in zip(headline_positions, generated):
                headlines[position] = headline

        # Node-level metadata; empty texts are left out instead of stored.
        metadata_list: List[Dict] = [{} for _ in nodes]
        for metadata, headline, summary in zip(
            metadata_list, headlines, node_summaries
        ):
            if headline:
                metadata["headline"] = headline
            if summary:
                metadata["summary"] = summary

        return metadata_list

In [6]:
# Metadata extractors run on every chunk, in this order.
extractors = [
    QuestionsAnsweredExtractor(llm=llm, questions=3),
    # SummaryExtractor(summaries=["self"], llm=llm) is intentionally left out:
    # the custom extractor below already produces the per-chunk summary.
    KeywordExtractor(llm=llm, keywords=10),
    # Summarizes three chunks together for a given page.
    CustomExtractor(llm=llm),
]

# Chunk first, then apply each extractor to the resulting nodes.
transformations = [text_splitter, *extractors]

## read and process data

In [7]:
from llama_index.core import SimpleDirectoryReader

In [8]:
# Load the source PDF with llama_index's directory reader.
docs = SimpleDirectoryReader(input_files=["../data/Investment Case For Disruptive Innovation.pdf"]).load_data()

In [9]:
from llama_index.core.ingestion import IngestionPipeline

# Build the ingestion pipeline from the splitter + extractors defined above.
pipeline = IngestionPipeline(transformations=transformations)

# Run it over the loaded documents; `data_nodes` are the enriched chunks
# that get indexed further down.
data_nodes = pipeline.run(documents=docs)

100%|██████████| 25/25 [00:09<00:00,  2.72it/s]
100%|██████████| 25/25 [00:05<00:00,  4.65it/s]
100%|██████████| 25/25 [00:10<00:00,  2.33it/s]
100%|██████████| 23/23 [00:04<00:00,  5.73it/s]


In [38]:
# Content of one sample node rendered with MetadataMode.EMBED.
print(data_nodes[3].get_content(metadata_mode=MetadataMode.EMBED))

[Excerpt from document]
page_label: 3
file_path: ..\data\Investment Case For Disruptive Innovation.pdf
questions_this_excerpt_can_answer: 1. How will the adoption of Neural Networks impact the future of cloud datacenters and AI-specific compute hardware?
2. What potential transformations can be expected in the transportation industry due to declining costs of Advanced Battery Technology and electric drivetrain cost declines?
3. In what ways will robotics, catalyzed by artificial intelligence, revolutionize manufacturing processes and supply chains in the future?
excerpt_keywords: Neural Networks, Cloud datacenters, AI-specific compute hardware, Advanced Battery Technology, Electric drivetrain, Robotics, Artificial intelligence, Manufacturing processes, Supply chains, Disruptive Innovation
headline: "The Convergence of Innovation Platforms: Redefining Technology in the Modern Era"
summary: The key topics of the section include the adoption of Neural Networks and its impact on cloud data

In [39]:
# Same sample node rendered with MetadataMode.LLM, for comparison.
print(data_nodes[3].get_content(metadata_mode=MetadataMode.LLM))

[Excerpt from document]
page_label: 3
file_path: ..\data\Investment Case For Disruptive Innovation.pdf
questions_this_excerpt_can_answer: 1. How will the adoption of Neural Networks impact the future of cloud datacenters and AI-specific compute hardware?
2. What potential transformations can be expected in the transportation industry due to declining costs of Advanced Battery Technology and electric drivetrain cost declines?
3. In what ways will robotics, catalyzed by artificial intelligence, revolutionize manufacturing processes and supply chains in the future?
excerpt_keywords: Neural Networks, Cloud datacenters, AI-specific compute hardware, Advanced Battery Technology, Electric drivetrain, Robotics, Artificial intelligence, Manufacturing processes, Supply chains, Disruptive Innovation
headline: "The Convergence of Innovation Platforms: Redefining Technology in the Modern Era"
summary: The key topics of the section include the adoption of Neural Networks and its impact on cloud data

In [40]:
# Headline written by CustomExtractor for the sample node.
print(data_nodes[3].metadata["headline"])

"The Convergence of Innovation Platforms: Redefining Technology in the Modern Era"


In [41]:
# Per-chunk summary written by CustomExtractor for the sample node.
print(data_nodes[3].metadata["summary"])

The key topics of the section include the adoption of Neural Networks and its impact on cloud datacenters and AI-specific compute hardware, the potential transformations in the transportation industry due to declining costs of Advanced Battery Technology and electric drivetrain cost declines, and the revolutionization of manufacturing processes and supply chains through robotics catalyzed by artificial intelligence. The section discusses how these disruptive innovations will change the way people live, work, and play, as well as transform various sectors and business models. Key entities mentioned include Neural Networks, Cloud datacenters, AI-specific compute hardware, Advanced Battery Technology, Electric drivetrain, Robotics, Artificial intelligence, Manufacturing processes, and Supply chains.


In [42]:
# Questions metadata of the sample node (QuestionsAnsweredExtractor was
# configured with questions=3).
print(data_nodes[3].metadata["questions_this_excerpt_can_answer"])

1. How will the adoption of Neural Networks impact the future of cloud datacenters and AI-specific compute hardware?
2. What potential transformations can be expected in the transportation industry due to declining costs of Advanced Battery Technology and electric drivetrain cost declines?
3. In what ways will robotics, catalyzed by artificial intelligence, revolutionize manufacturing processes and supply chains in the future?


## Index docs using Open Source Model

In [14]:
from llama_index.core import VectorStoreIndex

In [15]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from llama_index.core import Settings

# An embedding model is required for decent indexing performance.
# A fine-tuned model might work better because of domain-specific keywords.
# Registered on the global Settings object.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

In [16]:
# Vector index built over the enriched nodes produced by the ingestion pipeline.
index = VectorStoreIndex(
    nodes=data_nodes,
)

# Create QA engine

In [17]:
# Rule of thumb: retrieve the 3 most similar chunks per query. More
# experimentation and evaluation are required to find a better setting.
query_engine = index.as_query_engine(similarity_top_k=3)

# Example runs

## Example 1

In [43]:
# Example question 1.
ques = "How does ARK ensure its investment strategies align with reality of disruptive innovation trends?"

In [44]:
# Ask the query engine and print the generated answer.
response = query_engine.query(ques)
print(response)

ARK ensures its investment strategies align with the reality of disruptive innovation trends by emphasizing the importance of understanding regulatory, market, sector, and company risks associated with investing in disruptive innovation. Additionally, ARK recommends a cross-sector understanding of technology and a combination of top-down and bottom-up research to mitigate risks effectively.


In [45]:
# Metadata of the first node returned by the retriever for this question.
query_engine.retrieve(ques)[0].metadata

{'page_label': '2',
 'file_name': 'c:/Users/singl/studies/Interviews/tifin/data/Investment Case For Disruptive Innovation.pdf',
 'file_path': '..\\data\\Investment Case For Disruptive Innovation.pdf',
 'file_type': 'application/pdf',
 'file_size': 1998242,
 'creation_date': '2024-03-02',
 'last_modified_date': '2024-03-02',
 'questions_this_excerpt_can_answer': '1. What risks are associated with investing in disruptive innovation according to ARK Investment Management LLC?\n2. How does ARK Investment Management LLC suggest investors approach understanding the regulatory, market, sector, and company risks of investing in innovation?\n3. What strategies does ARK Investment Management LLC recommend for investors to mitigate the risks of investing in disruptive innovation, considering factors such as regulatory hurdles and competitive landscape?',
 'excerpt_keywords': 'disruptive innovation, ARK Investment Management LLC, investment, risks, regulatory hurdles, competitive landscape, market

In [46]:
# Score of that first retrieved node (this runs the retrieval again).
query_engine.retrieve(ques)[0].score

0.8541514392779946

## Example 2

In [47]:
# Example question 2.
ques = "What is the core objective of investing in disruptive innovation according to ARK?"

In [48]:
# Ask the query engine and print the generated answer.
response = query_engine.query(ques)
print(response)

The core objective of investing in disruptive innovation according to ARK is to capture companies at the forefront of technology-enabled innovation that offer potential for long-term growth, portfolio diversification, and a moderate-to-high risk-reward profile that complements traditional investment strategies.


In [49]:
# Metadata attached to the response: a dict keyed by source-node id.
response.metadata

{'d12394fe-03e5-4316-b203-2b5a4c6cd3ea': {'page_label': '17',
  'file_name': 'c:/Users/singl/studies/Interviews/tifin/data/Investment Case For Disruptive Innovation.pdf',
  'file_path': '..\\data\\Investment Case For Disruptive Innovation.pdf',
  'file_type': 'application/pdf',
  'file_size': 1998242,
  'creation_date': '2024-03-02',
  'last_modified_date': '2024-03-02',
  'questions_this_excerpt_can_answer': '1. How does the ARK Innovation ETF (ARKK) aim to capture disruptive innovation in the market?\n2. What are the potential benefits for investors who choose to invest in companies at the forefront of technology-enabled innovation?\n3. How does the moderate-to-high risk-reward profile of investing in disruptive innovation complement traditional investment strategies for investors with a similar risk profile?',
  'excerpt_keywords': 'disruptive innovation, ARK Innovation ETF, technology-enabled, growth, portfolio diversification, risk-reward profile, secular changes, core portfolios,

# Evaluate QA System

## Retriever Performance

Ideally, labelled data would be required to evaluate the performance of the retriever.

Because of time and resource constraints, page-level ranking has been done for 20 evaluation questions.

In [12]:
import pandas as pd

# Evaluation set: one question per row plus the page expected to hold the answer.
df = pd.read_csv("../data/Evaluation_Questions.csv")
df

Unnamed: 0,question,answer_page
0,What is the core objective of investing in dis...,3
1,What are the significant risks associated with...,2
2,Can you list the converging innovation platfor...,4
3,How does ARK describe the impact of Artificial...,3
4,What transformative potential does Multiomic S...,3
5,What are the implications of declining battery...,3
6,How is the field of Robotics anticipated to ev...,3
7,What does the ARK’s Convergence Scoring Framew...,4
8,How do neural networks serve as a catalyst for...,3
9,What unique view does ARK have towards Autonom...,3


In [18]:
# Page label of the first retrieved node per question, cast to int so it can
# be compared with `answer_page`.
df["predicted_page"] = [int(query_engine.retrieve(q)[0].metadata["page_label"]) for q in df["question"]]

In [19]:
# Score of the first retrieved node per question.
# NOTE(review): this repeats the retrieval already done for `predicted_page`.
df["embedding score"] = [query_engine.retrieve(q)[0].score for q in df["question"]]

In [20]:
# Show the evaluation table with the retrieval columns added.
df

Unnamed: 0,question,answer_page,predicted_page,embedding score
0,What is the core objective of investing in dis...,3,17,0.825417
1,What are the significant risks associated with...,2,2,0.883151
2,Can you list the converging innovation platfor...,4,3,0.790328
3,How does ARK describe the impact of Artificial...,3,3,0.746459
4,What transformative potential does Multiomic S...,3,3,0.737435
5,What are the implications of declining battery...,3,3,0.739814
6,How is the field of Robotics anticipated to ev...,3,3,0.729169
7,What does the ARK’s Convergence Scoring Framew...,4,18,0.763357
8,How do neural networks serve as a catalyst for...,3,3,0.747692
9,What unique view does ARK have towards Autonom...,3,16,0.705184


In [21]:
# Retrieval accuracy: share of questions whose first retrieved page equals
# the labelled answer page.
performance = df["answer_page"].eq(df["predicted_page"]).mean()
print(f"retreviering performance {round(performance * 100)}%")

retreviering performance 60%


## Generation Eval

Possible options:
1. BERT Score
- Check similarity between the retrieved text and the generated response - faithfulness
- Check similarity between the question and the generated response - relevance (GPT can also be used)
- Can be deployed at run time and used to show a confidence score to the user, increasing their trust in the system

2. LLM
- An LLM can be leveraged for this. Since LLMs can hallucinate, for now we are leveraging external methodologies.
- Not possible to do in real time.

3. More sophisticated methods
- Use NER to detect entities and check for hallucinated entities
- Run CoT prompting 5-6 times, asking GPT to label the given answer each time. The mean and variance of the labels can help us understand faithfulness


#### Test code to prove embedding performance

In [22]:
from sentence_transformers import SentenceTransformer

# Separate sentence-embedding model used only for the generation evaluation
# below (the index itself uses the bge-small model configured earlier).
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

In [23]:
# Sanity probe: similarity between an unrelated sentence and a real
# evaluation question.
sentences_1 = [
    "helo, this is random stuff",
]
sentences_2 = [
    "What unique view does ARK have towards Autonomous Mobility and its market potential?",
]

# Both batches are encoded with normalisation switched on, so the matrix
# product below is a cosine similarity.
embeddings_1, embeddings_2 = (
    model.encode(batch, normalize_embeddings=True)
    for batch in (sentences_1, sentences_2)
)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.41846022]]


### Inference for all eval questions

In [24]:
# Answer every evaluation question with the query engine; responses stay in
# the same order as the rows of `df`.
response_list = [query_engine.query(ques) for ques in df["question"]]

In [25]:
# Collect, for every response, the text of the first source node it cites.
# NOTE: `docs` is reused here and from now on holds context strings, not the
# documents loaded from the PDF.

# Build the id -> text lookup once instead of scanning all nodes per response.
node_text_by_id = {node.id_: node.get_content() for node in data_nodes}

docs = []
for r in response_list:
    # Response metadata is keyed by source-node id; it may be empty or None.
    source_ids = list(r.metadata or {})
    first_source_id = source_ids[0] if source_ids else None
    # Use an empty context when the source cannot be resolved, rather than
    # silently reusing the context of the previous question.
    docs.append(node_text_by_id.get(first_source_id, ""))

In [26]:
# Context text collected for the first evaluation question.
docs[0]

'•17  1. Access to Growth Investors who seek to access companies at the forefront of technology-enabled innovation, in some of the most promising areas of the economy, with potential for long-term growth.2. Portfolio DiversificationPotentially Suited for investors who like to diversify their existing portfolio with strategies that offer low correlation to a number of core asset classes held in most investors’ portfolios.3. Moderate-to-High Risk-Reward ProfileA constant focus on secular changes and disruptive innovation can compliment traditional strategies and core portfolios. May be suited for investors who have a moderate-to-high risk profile and intend to stay invested for the medium-to-long term.ARK Seeks to Capture Disruptive Innovation\nThe information herein is general in nature and should not be considered financial advice. An investor should consult a financial professional regarding the investor’s specific situation. Diversification does not assure a profit. The ARK Innovatio

In [27]:
# Embed questions, source contexts and generated answers with the same model.
# Plain lists are passed on purpose: integer indexing on a pandas Series is
# label-based, so handing the Series to `encode` only works while the
# DataFrame index is exactly 0..n-1.
ques_embedding = model.encode(df["question"].tolist(), normalize_embeddings=True)
text_embedding = model.encode(docs, normalize_embeddings=True)
# A response without text is embedded as an empty string instead of failing.
ans_embedding = model.encode(
    [r.response or "" for r in response_list], normalize_embeddings=True
)

In [28]:
# Shape check for the question embeddings.
ques_embedding.shape

(20, 1024)

In [29]:
# Shape check for the source-context embeddings.
text_embedding.shape

(20, 1024)

In [30]:
# Shape check for the answer embeddings.
ans_embedding.shape

(20, 1024)

### Faithfulness

In [31]:
# Faithfulness proxy: dot product between each source-context embedding and
# the embedding of the answer generated for the same question.
similarity = [
    context_vec @ answer_vec
    for context_vec, answer_vec in zip(text_embedding, ans_embedding)
]
similarity

[0.8609694,
 0.83047324,
 0.78075856,
 0.8587421,
 0.7058568,
 0.79477906,
 0.75540984,
 0.6618123,
 0.79115546,
 0.47214276,
 0.6028527,
 0.50512916,
 0.6485937,
 0.6916897,
 0.634709,
 0.8471083,
 0.92862535,
 0.82982504,
 0.87315154,
 0.87826455]

In [32]:
# Mean faithfulness score over all questions, printed as a rounded percentage.
print(f"Faithfullness {round((sum(similarity)/len(similarity))*100)}%")

Faithfullness 75%


In [33]:
# Keep the per-question faithfulness scores next to the retrieval results.
df["faithfullness score"] = similarity

### Relevance

In [34]:
# Relevance proxy: dot product between each question embedding and the
# embedding of the answer generated for it.
similarity = [
    question_vec @ answer_vec
    for question_vec, answer_vec in zip(ques_embedding, ans_embedding)
]
similarity

[0.91578233,
 0.8718318,
 0.5898199,
 0.8435095,
 0.80790424,
 0.9008014,
 0.8618424,
 0.83895105,
 0.8732161,
 0.86262465,
 0.8687971,
 0.87979054,
 0.84388316,
 0.9451057,
 0.9230579,
 0.87716115,
 0.89752495,
 0.85258627,
 0.8986722,
 0.8957244]

In [35]:
# Keep the per-question relevance scores next to the other results.
df["relevance score"] = similarity

In [36]:
# Mean relevance score over all questions, printed as a rounded percentage.
print(f"Relevance {round((sum(similarity)/len(similarity))*100)}%")

Relevance 86%


In [37]:
# Final evaluation table: retrieval, faithfulness and relevance per question.
df

Unnamed: 0,question,answer_page,predicted_page,embedding score,faithfullness score,relevance score
0,What is the core objective of investing in dis...,3,17,0.825417,0.860969,0.915782
1,What are the significant risks associated with...,2,2,0.883151,0.830473,0.871832
2,Can you list the converging innovation platfor...,4,3,0.790328,0.780759,0.58982
3,How does ARK describe the impact of Artificial...,3,3,0.746459,0.858742,0.843509
4,What transformative potential does Multiomic S...,3,3,0.737435,0.705857,0.807904
5,What are the implications of declining battery...,3,3,0.739814,0.794779,0.900801
6,How is the field of Robotics anticipated to ev...,3,3,0.729169,0.75541,0.861842
7,What does the ARK’s Convergence Scoring Framew...,4,18,0.763357,0.661812,0.838951
8,How do neural networks serve as a catalyst for...,3,3,0.747692,0.791155,0.873216
9,What unique view does ARK have towards Autonom...,3,16,0.705184,0.472143,0.862625
