## Small to Big Retrieval with LlamaIndex

In [3]:
! pip3 install -U llama_hub llama_index braintrust autoevals pypdf pillow transformers torch torchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [4]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPEN_AI_KEY"] = api_key

In [5]:
!wget --user-agent "Chrome" "https://arxiv.org/pdf/2307.09288.pdf" -O "llama2.pdf" 

--2024-08-12 20:10:05--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.3.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2024-08-12 20:10:05--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘llama2.pdf’


2024-08-12 20:10:07 (5.63 MB/s) - ‘llama2.pdf’ saved [13661300/13661300]



### Basic RAG

In [6]:
from pathlib import Path
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.llms import openai
import json
from llama_index.readers.file import PDFReader

In [7]:
loader = PDFReader()
docs0 = loader.load_data(file=Path("llama2.pdf"))

In [18]:
docs0[0]

Document(id_='8a9a9c8b-24f3-4cf8-97d8-12e6ab362535', embedding=None, metadata={'page_label': '1', 'file_name': 'llama2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Llama 2 : Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗Louis Martin†Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev\nPunit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich\nYinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra\nIgor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi\nAlan Schelten Ruan Silva Eric Michael Smith Ra

In [8]:
from llama_index.core import Document

doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]

In [9]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import IndexNode

In [23]:
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
node_parser

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x30d361c10>, id_func=<function default_id_func at 0x17f9c9ee0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')

In [24]:
base_nodes = node_parser.get_nodes_from_documents(docs)
base_nodes[0]

TextNode(id_='86f7db4b-cc58-44e3-9d4c-760cb9e4b09a', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='22dc2471-46b0-4deb-a848-8d5fed2d22fb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='231b3bd773f012c01b893ada9e009ecf25ee59934614c5a595c9bf1b3a123292'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='480f1230-ae47-46e3-b361-2ea24ae542f5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='820de41da4f942046105d6d061c52e7e04429023a554953b03b420aaebdb53ae')}, text='Llama 2 : Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗Louis Martin†Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshor

In [25]:
for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"
    
base_nodes[0]

TextNode(id_='node-0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='22dc2471-46b0-4deb-a848-8d5fed2d22fb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='231b3bd773f012c01b893ada9e009ecf25ee59934614c5a595c9bf1b3a123292'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='480f1230-ae47-46e3-b361-2ea24ae542f5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='820de41da4f942046105d6d061c52e7e04429023a554953b03b420aaebdb53ae')}, text='Llama 2 : Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗Louis Martin†Kevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHak

In [10]:
!pip3 install llama-index-embeddings-huggingface


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [11]:
from llama_index.core.embeddings import resolve_embed_model
from llama_index.llms.openai import OpenAI

embed_model = resolve_embed_model("local:BAAI/bge-small-en")
llm = OpenAI(model = "gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

  from .autonotebook import tqdm as notebook_tqdm
  service_context = ServiceContext.from_defaults(


In [31]:
base_index = VectorStoreIndex(base_nodes, service_context=service_context)
base_retriever = base_index.as_retriever(similarity_top_k=2)

In [32]:
retrievals = base_retriever.retrieve("Can you tell me about the key concepts for safety finetuning")

In [33]:
for n in retrievals:
    display_source_node(n,source_length=1500)

**Node ID:** node-26<br>**Similarity:** 0.8581930266954525<br>**Text:** AsLLMsareintegratedanddeployed,welookforwardto
continuing research that will amplify their potential for positive impact on these important social issues.
4.2 Safety Fine-Tuning
In this section, we describe our approach to safety fine-tuning, including safety categories, annotation
guidelines,andthetechniquesweusetomitigatesafetyrisks. Weemployaprocesssimilartothegeneral
fine-tuning methods as described in Section 3, with some notable differences related to safety concerns.
Specifically, we use the following techniques in safety fine-tuning:
1.Supervised Safety Fine-Tuning : We initialize by gathering adversarial prompts and safe demonstra-
tions that are then included in the general supervised fine-tuning process (Section 3.1). This teaches
themodeltoalignwithoursafetyguidelinesevenbeforeRLHF,andthuslaysthefoundationfor
high-quality human preference data annotation.
2.Safety RLHF : Subsequently, we integrate safety in the general RLHF pipeline described in Sec-
tion 3.2.2. This includes training a safety-specific reward model and gathering more challenging
adversarial prompts for rejection sampling style fine-tuning and PPO optimization.
3.SafetyContextDistillation : Finally,werefineourRLHFpipelinewithcontextdistillation(Askell
etal.,2021b). Thisinvolvesgeneratingsafermodelresponsesbyprefixingapromptwithasafety
preprompt, e.g., “You are a safe and responsible assistant,” and then fine-tuning the model on the safer
responses without the preprompt, which essentially distill...<br>

**Node ID:** node-33<br>**Similarity:** 0.8442806802247425<br>**Text:** Examples of evaluation prompts and responses can be found in Appendix A.4.6.
We then asked raters to judge models for safety violations on a five-point Likert scale with the following
definitions:
•5- No safety violations and very helpful
•4- No safety violations and only minor non-safety issues
•3- No safety violations but not helpful or other major non-safety issues
•2- Mild or moderate safety violations
29

(a)Overall violation percentage.
 (b)Overall safety and helpfulness mean rating.
Figure17: Overallsafetymeasures. Left: Llama 2-Chat haslowviolationpercentageoverallacrossmodel
sizes.Right: Llama 2-Chat has high safety and helpfulness mean rating overall across model sizes. It is
important to note that these results are subject to limitations of the prompt set, subjectivity of the review
guidelines, and subjectivity of individual raters.
•1- Severe safety violations
We consider a rating of 1 or 2 as violation and use violation percentage as our main evaluation metric, with
themeanratingasasupplement. Eachexampleisannotatedbythreeannotatorsandwetakethemajority
votetodetermineiftheresponseisviolatingornot. WeusedGwet’sAC1/2statistictomeasureinter-rater
reliability(IRR)asinthehelpfulnesshumanevaluation. TheIRRscoresrangefrom 0.70to0.95depending
on the annotation batch, indicating a high degree of agreement among annotators on safety assessments.
OnLlama 2-Chat annotations, the average IRR is 0.92according to Gwet’s AC2 measure. We see lower IRR
scoresonbatcheswherethemo...<br>

In [35]:
query_engine_base = RetrieverQueryEngine.from_args(
    base_retriever, service_context=service_context
)

In [36]:
response = query_engine_base.query(
    "Can you tell me about the key concepts for safety finetuning")
print(str(response))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The key concepts for safety fine-tuning include supervised safety fine-tuning, safety RLHF (Reward Learning from Human Feedback), and safety context distillation. These concepts involve gathering adversarial prompts and safe demonstrations, training safety-specific reward models, integrating safety considerations into the training pipeline, and refining the model responses to prioritize safety and helpfulness. The process aims to mitigate safety risks by aligning the model with safety guidelines, training it to handle challenging prompts, and distilling safety context into the model's responses.


### Child Chunks referencing Parent Chunks

In [42]:
sub_chunk_sizes = [256, 512]
sub_node_parsers = [
    SimpleNodeParser.from_defaults(chunk_size=c) for c in sub_chunk_sizes
]

all_nodes = []
for base_node in base_nodes:
    for n in sub_node_parsers:
        sub_nodes = n.get_nodes_from_documents([base_node])
        sub_inodes = [
            IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
        ]
        all_nodes.extend(sub_inodes)
        
    original_node = IndexNode.from_text_node(base_node, base_node.node_id)
    all_nodes.append(original_node)

In [43]:
all_nodes_dict = {n.node_id: n for n in all_nodes}
len(all_nodes_dict)

1564

In [44]:
vector_index_chunk = VectorStoreIndex(
    all_nodes, service_context=service_context
)

In [45]:
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)

In [46]:
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict = {"vector": vector_retriever_chunk},
    node_dict = all_nodes_dict,
    verbose=True,
)

In [47]:
nodes = retriever_chunk.retrieve(
    "Can you tell me about the key concepts for safety finetuning"
)
for node in nodes:
    display_source_node(node, source_length=2000)

[1;3;34mRetrieving with query id None: Can you tell me about the key concepts for safety finetuning
[0m[1;3;38;5;200mRetrieved node with id, entering: node-25
[0m[1;3;34mRetrieving with query id node-25: Can you tell me about the key concepts for safety finetuning
[0m[1;3;38;5;200mRetrieved node with id, entering: node-1
[0m[1;3;34mRetrieving with query id node-1: Can you tell me about the key concepts for safety finetuning
[0m

**Node ID:** node-25<br>**Similarity:** 0.8738871743618759<br>**Text:** For TruthfulQA, we present the
percentageofgenerationsthatarebothtruthfulandinformative(thehigher,thebetter). ForToxiGen,we
presentthepercentageofgenerationsthataredeemedtoxicbythemetric(thelower,thebetter). Detailed
descriptionsofthebenchmarksandmetricscanbefoundinAppendixA.4.7. Whencomparedto Llama 1-7B,
Llama 2-7B demonstrates a 21.37% increase in truthfulness and informativeness and a 7.61% decrease in
toxicity. We also observe an increase in toxicity in the pretrained 13B and 70B Llama 2, which may result
from larger pretraining data or a different dataset mix. Some have postulated the existence of a relationship
between pretraining dataset size and downstream model toxicity or bias (Bender et al., 2021b), but empirical
work to validate this claim is still ongoing (Dodge et al., 2021; Smith and Williams, 2021; Tal et al., 2022), and
further evidence from up-to-date models is still needed.
In Appendix A.4.7, we present bias metrics, such as how the sentiment of model generations varies with
demographic attributes. We note an increase in positive sentiment overall for many of the groups using
BOLDprompts. MoredetailedresultssplitbydifferentdemographicgroupscanbefoundinAppendixA.4.8.
Llama 2 doesnotoutperformothermodelsontoxicitymetrics,andwespeculatethatthismaybebecausewe
refrained from aggressively filtering the pretraining data. Recall that leaving pretraining data unfiltered may
enable base models tuned to perform well on more downstream tasks (including hate speech detection),
and it carries less risk of accidentally filtering out some demographic groups. We observe that models
trained from less aggressively filtered pretraining data also required fewer examples to achieve reasonable
safety-alignment. Wereiteratethatthismotivatedchoicedoesimplythatadditionalsafetymitigationsshould
be applied before deployment of base Llama 2 models.
22

TruthfulQA ↑ToxiGen ↓
MPT7B 29.13 22.32
30B 35.25 22.61
Falcon7B 25.95 14.53
40B 40.39 23.44
Llama 17B 27.42 23.00
13B 41...<br>

**Node ID:** node-1<br>**Similarity:** 0.8738369622445731<br>**Text:** . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9
3.2 Reinforcement Learning with Human Feedback (RLHF) . . . . . . . . . . . . . . . . . . . . . 9
3.3 System Message for Multi-Turn Consistency . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 16
3.4 RLHF Results . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 17
4 Safety 20
4.1 Safety in Pretraining . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 20
4.2 Safety Fine-Tuning . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 23
4.3 Red Teaming . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 28
4.4 Safety Evaluation of Llama 2-Chat . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 29
5 Discussion 32
5.1 Learnings and Observations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 32
5.2 Limitations and Ethical Considerations . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 34
5.3 Responsible Release Strategy . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 35
6 Related Work 35
7 Conclusion 36
A Appendix 46
A.1 Contributions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .<br>

In [12]:
from llama_index.core.evaluation import(
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
import nest_asyncio

nest_asyncio.apply()

In [53]:
eval_dataset = generate_question_context_pairs(base_nodes,llm)

  0%|          | 0/93 [00:00<?, ?it/s]

100%|██████████| 93/93 [02:55<00:00,  1.89s/it]


In [54]:
eval_dataset.save_json("llama2_eval_dataset.json")

In [13]:
import pandas as pd
from llama_index.core.evaluation import RetrieverEvaluator, get_retrieval_results_df

top_k = 10

def display_results(names, results_arr):
    """Display results from evaluate."""

    hit_rates = []
    mrrs = []
    for name, eval_results in zip(names, results_arr):
        metric_dicts = []
        for eval_result in eval_results:
            metric_dict = eval_result.metric_vals_dict
            metric_dicts.append(metric_dict)
        results_df = pd.DataFrame(metric_dicts)

        hit_rate = results_df["hit_rate"].mean()
        mrr = results_df["mrr"].mean()
        hit_rates.append(hit_rate)
        mrrs.append(mrr)

    final_df = pd.DataFrame(
        {"retrievers": names, "hit_rate": hit_rates, "mrr": mrrs}
    )
    display(final_df)

In [56]:
# base
base_retriever = base_index.as_retriever(similarity_top_k=top_k)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=base_retriever
)
results_base = await retriever_evaluator.aevaluate_dataset(
    eval_dataset, show_progress=True
)

100%|██████████| 186/186 [00:21<00:00,  8.67it/s]


In [57]:
# chunk
vector_retriever_chunk = vector_index_chunk.as_retriever(
    similarity_top_k=top_k
)
retriever_chunk = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever_chunk},
    node_dict=all_nodes_dict,
    verbose=True,
)
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever_chunk
)

results_chunk = await retriever_evaluator.aevaluate_dataset(
    eval_dataset, show_progress=True
)

  0%|          | 0/186 [00:00<?, ?it/s]

[1;3;34mRetrieving with query id None: How does Llama 2-7B compare to Llama 1-7B in terms of truthfulness, informativeness, and toxicity, according to the provided data? Discuss the potential reasons for the observed differences in performance.
[0m[1;3;38;5;200mRetrieved node with id, entering: node-24
[0m[1;3;34mRetrieving with query id node-24: How does Llama 2-7B compare to Llama 1-7B in terms of truthfulness, informativeness, and toxicity, according to the provided data? Discuss the potential reasons for the observed differences in performance.
[0m[1;3;38;5;200mRetrieved node with id, entering: node-25
[0m[1;3;34mRetrieving with query id node-25: How does Llama 2-7B compare to Llama 1-7B in terms of truthfulness, informativeness, and toxicity, according to the provided data? Discuss the potential reasons for the observed differences in performance.
[0m[1;3;38;5;200mRetrieved node with id, entering: node-34
[0m[1;3;34mRetrieving with query id node-34: How does Llama 2-7

100%|██████████| 186/186 [00:16<00:00, 11.46it/s]

[1;3;38;5;200mRetrieved node with id, entering: node-3
[0m[1;3;34mRetrieving with query id node-3: How do Large Language Models (LLMs) like Llama 2-Chat demonstrate their capabilities in complex reasoning tasks, and what factors contribute to their success in specialized domains such as programming and creative writing?
[0m[1;3;38;5;200mRetrieved node with id, entering: node-0
[0m[1;3;34mRetrieving with query id node-0: How do Large Language Models (LLMs) like Llama 2-Chat demonstrate their capabilities in complex reasoning tasks, and what factors contribute to their success in specialized domains such as programming and creative writing?
[0m




In [58]:
full_results_df = get_retrieval_results_df(
    [
        "Base Retriever",
        "Retriever (Chunk References)"
    ],
    [results_base, results_chunk],
)
display(full_results_df)

Unnamed: 0,retrievers,hit_rate,mrr
0,Base Retriever,0.736559,0.527596
1,Retriever (Chunk References),0.913978,0.760663


### Sentence Window Retrieval

In [14]:
from llama_index.core.node_parser import SentenceWindowNodeParser

In [17]:
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key = "window",
    original_text_metadata_key="original_text"
)

node_parser

SentenceWindowNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x319652f60>, id_func=<function default_id_func at 0x1546d20c0>, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.<lambda> at 0x30a291da0>, window_size=3, window_metadata_key='window', original_text_metadata_key='original_text')

In [18]:
sentence_nodes = node_parser.get_nodes_from_documents(docs)
sentence_index = VectorStoreIndex(sentence_nodes, service_context=service_context)

In [19]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)

In [63]:
window_response = query_engine.query(
    "Can you tell me about the key concepts for safety finetuning"
)
print(window_response)

The key concepts for safety fine-tuning include supervised safety fine-tuning and safety RLHF. Supervised safety fine-tuning involves gathering adversarial prompts and safe demonstrations to align the model with safety guidelines early on. Safety RLHF integrates safety into the general RLHF pipeline by training a safety-specific reward model and using challenging adversarial prompts for rejection sampling style fine-tuning and PPO optimization.


In [64]:
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_text"]

print(f"Window: {window}")
print(f"----------------")
print(f"Original Sentence: {sentence}")

Window: Further
testing and mitigation should be done to understand bias and other social issues for the specific context
in which a system may be deployed.  For this, it may be necessary to test beyond the groups available in
theBOLDdataset(race,religion,andgender).  AsLLMsareintegratedanddeployed,welookforwardto
continuing research that will amplify their potential for positive impact on these important social issues.
 4.2 Safety Fine-Tuning
In this section, we describe our approach to safety fine-tuning, including safety categories, annotation
guidelines,andthetechniquesweusetomitigatesafetyrisks.  Weemployaprocesssimilartothegeneral
fine-tuning methods as described in Section 3, with some notable differences related to safety concerns.
 Specifically, we use the following techniques in safety fine-tuning:
1.Supervised Safety Fine-Tuning : We initialize by gathering adversarial prompts and safe demonstra-
tions that are then included in the general supervised fine-tuning process (Sec

## Sub-Question Query Engine

In [20]:
from llama_index.core import SimpleDirectoryReader, ServiceContext, GPTVectorStoreIndex
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [69]:
llm = OpenAI(temperature=0, model = "gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

  service_context = ServiceContext.from_defaults(


In [70]:
!mkdir -p 'data/10k/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10k/lyft_2021.pdf'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-08-12 17:59:29--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1880483 (1.8M) [application/octet-stream]
Saving to: ‘data/10k/uber_2021.pdf’


2024-08-12 17:59:30 (1.66 MB/s) - ‘data/10k/uber_2021.pdf’ saved [1880483/1880483]

--2024-08-12 17:59:31--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8001::154, 2606:50c0:8002::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... connected.
HTTP request sent, awaiting response... 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


200 OK
Length: 1440303 (1.4M) [application/octet-stream]
Saving to: ‘data/10k/lyft_2021.pdf’


2024-08-12 17:59:31 (3.34 MB/s) - ‘data/10k/lyft_2021.pdf’ saved [1440303/1440303]



In [71]:
lyft_docs = SimpleDirectoryReader(input_files=["./data/10k/lyft_2021.pdf"]).load_data()
print(f"Loaded lyft 10k with {len(lyft_docs)} pages")

Loaded lyft 10k with 238 pages


In [72]:
uber_docs = SimpleDirectoryReader(input_files=["./data/10k/uber_2021.pdf"]).load_data()
print(f"Loaded uber 10k with {len(uber_docs)} pages")

Loaded uber 10k with 307 pages


In [73]:
lyft_index = GPTVectorStoreIndex.from_documents(lyft_docs)
print(f"Finished building lyft 10k index with {len(lyft_index.docstore.docs)} nodes")

Finished building lyft 10k index with 344 nodes


In [74]:
uber_index = GPTVectorStoreIndex.from_documents(uber_docs)
print(f"Finished building uber 10k index with {len(uber_index.docstore.docs)} nodes")

Finished building uber 10k index with 410 nodes


In [75]:
lyft_engine = lyft_index.as_query_engine(aimilarity_top_k=3)

In [76]:
uber_engine = uber_index.as_query_engine(aimilarity_top_k=3)

In [78]:
query_engine_tools = [
    QueryEngineTool(
        query_engine = lyft_engine,
        metadata = ToolMetadata(name='lyft_10k', description='Provides info about Lyft financials for year 2021')
    ),
    QueryEngineTool(
        query_engine = uber_engine,
        metadata = ToolMetadata(name='uber_10k', description='Provides info about Uber financials for year 2021')
    ),
]

s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools= query_engine_tools)

In [79]:
response = s_engine.query('Compare and contrast the customer segments and geographies that grew the fastest')
print(response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[lyft_10k] Q: What were the customer segments that grew the fastest for Lyft in 2021?
[0m[1;3;38;2;90;149;237m[lyft_10k] Q: What were the geographies that grew the fastest for Lyft in 2021?
[0m[1;3;38;2;11;159;203m[uber_10k] Q: What were the customer segments that grew the fastest for Uber in 2021?
[0m[1;3;38;2;155;135;227m[uber_10k] Q: What were the geographies that grew the fastest for Uber in 2021?
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;3;38;2;155;135;227m[uber_10k] A: Chicago, Miami, New York City in the United States, Sao Paulo in Brazil, and London in the United Kingdom.
[0m[1;3;38;2;237;90;200m[lyft_10k] A: The customer segments that grew the fastest for Lyft in 2021 were likely those related to their network of Light Vehicles, as well as the demand for their transportation network during more temperate and dry seasons.
[0m[1;3;38;2;11;159;203m[uber_10k] A: The customer segments that grew the fastest for Uber in 2021 were the membership programs, specifically Uber One, Uber Pass, Eats Pass, and Rides Pass.
[0m[1;3;38;2;90;149;237m[lyft_10k] A: Lyft experienced the fastest growth in geographies where the Resilient Streets Initiative was implemented in 2021.
[0mLyft experienced growth in customer segments related to their network of Light Vehicles and during more temperate and dry seasons, while Uber saw growth in membership programs such as Uber One, Uber Pass, Eats Pass, and Rides Pass. In terms of geog

In [80]:
response = s_engine.query('Compare revenue growth of Uber and Lyft from 2020 to 2021')
print(response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What was the revenue of Uber in 2020?
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What was the revenue of Uber in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What was the revenue of Lyft in 2020?
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What was the revenue of Lyft in 2021?
[0m[1;3;38;2;11;159;203m[lyft_10k] A: Lyft's revenue in 2020 was $2,364,681.
[0m[1;3;38;2;237;90;200m[uber_10k] A: $11,139
[0m[1;3;38;2;90;149;237m[uber_10k] A: The revenue of Uber in 2021 was $17,455 million.
[0m[1;3;38;2;155;135;227m[lyft_10k] A: The revenue of Lyft in 2021 was $3,208,323.
[0mUber's revenue grew by $6,316 million from 2020 to 2021, while Lyft's revenue increased by $843,642 from 2020 to 2021.


## Hybrid Search + Custom Retrievers

In [21]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [25]:
from llama_index.core import (
    SimpleKeywordTableIndex,
    StorageContext
)
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore
from llama_index.core.retrievers import (
    BaseRetriever,
    KeywordTableSimpleRetriever
)
from llama_index.core.query_engine import RetrieverQueryEngine
from typing import List
from IPython.display import display, HTML
import openai
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer

In [23]:
documents = SimpleDirectoryReader("data1").load_data()

service_context = ServiceContext.from_defaults(chunk_size=1024)
node_parser = service_context.node_parser

nodes = node_parser.get_nodes_from_documents(documents)

storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)

  service_context = ServiceContext.from_defaults(chunk_size=1024)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [28]:
class CustomRetriever(BaseRetriever):
    
    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        keyword_retriever: KeywordTableSimpleRetriever,
        mode:str="AND",
    ) -> None:
        self._vector_retriever = vector_retriever
        self._keyword_retriever = keyword_retriever
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._mode = mode
        
    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        keyword_nodes = self._keyword_retriever.retrieve(query_bundle)
        
        vector_ids = {n.node.node_id for n in vector_nodes}
        keyword_ids = {n.node.node_id for n in keyword_nodes}
        
        
        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in keyword_nodes})
        
        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(keyword_ids)
        else:
            retrieve_ids = vector_ids.union(keyword_ids)
            
        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        return retrieve_nodes

In [29]:
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)
keyword_retriever = KeywordTableSimpleRetriever(index=keyword_index)
custom_retriever = CustomRetriever(vector_retriever, keyword_retriever)

response_synthesizer = get_response_synthesizer()

custom_query_engine = RetrieverQueryEngine(
    retriever = custom_retriever,
    response_synthesizer = response_synthesizer,
)

vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer
)

keyword_query_engine = RetrieverQueryEngine(
    retriever=keyword_retriever,
    response_synthesizer=response_synthesizer
)

In [31]:
response = custom_query_engine.query("What did the author do during his time at YC?")
display(HTML(f'<p style="font-size:20px"> {response.response}</p>'))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:llama_index.core.indices.keyword_table.retrievers:> Starting query: What did the author do during his time at YC?
> Starting query: What did the author do during his time at YC?
INFO:llama_index.core.indices.keyword_table.retrievers:query keywords: ['author', 'time', 'yc']
query keywords: ['author', 'time', 'yc']
INFO:llama_index.core.indices.keyword_table.retrievers:> Extracted keywords: ['time', 'yc']
> Extracted keywords: ['time', 'yc']
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [32]:
response= custom_query_engine.query("What did the author do during his time at Yale?")
display(HTML(f'<p style="font-size:20px"> {response.response}</p>'))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:llama_index.core.indices.keyword_table.retrievers:> Starting query: What did the author do during his time at Yale?
> Starting query: What did the author do during his time at Yale?
INFO:llama_index.core.indices.keyword_table.retrievers:query keywords: ['author', 'time', 'yale']
query keywords: ['author', 'time', 'yale']
INFO:llama_index.core.indices.keyword_table.retrievers:> Extracted keywords: ['time']
> Extracted keywords: ['time']


In [33]:
len(response.source_nodes)

0

In [34]:
response= vector_query_engine.query("What did the author do during his time at Yale?")
display(HTML(f'<p style="font-size:20px"> {response.response}</p>'))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


### BM25Retriever & Ensemble Retriever in Langchain

In [35]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

In [36]:
embedding = OpenAIEmbeddings()

  warn_deprecated(


In [37]:
doc_list = [
    "I like apples.",
    "I like oranges.",
    "Apples and oranges are fruits.",
    "I like computers by Apple.",
    "I love fruit juice."
]

In [39]:
bm25_retriever = BM25Retriever.from_texts(doc_list)
bm25_retriever.k=2

In [40]:
bm25_retriever.get_relevant_documents('Apple')

  warn_deprecated(


[Document(page_content='I love fruit juice.'),
 Document(page_content='I like computers by Apple.')]

In [41]:
bm25_retriever.get_relevant_documents('a green fruit')

[Document(page_content='I love fruit juice.'),
 Document(page_content='I like computers by Apple.')]

In [42]:
bm25_retriever.dict

<bound method BaseModel.dict of BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x319072090>, k=2)>

In [44]:
faiss_vectorstore = FAISS.from_texts(doc_list, embedding)
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k":2})

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:faiss.loader:Loading faiss.
Loading faiss.
INFO:faiss.loader:Successfully loaded faiss.
Successfully loaded faiss.


In [45]:
faiss_retriever.get_relevant_documents("green fruit")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='Apples and oranges are fruits.'),
 Document(page_content='I love fruit juice.')]

In [46]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights=[0.5,0.5])

In [47]:
docs = ensemble_retriever.get_relevant_documents("green fruit")
docs

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='I love fruit juice.'),
 Document(page_content='Apples and oranges are fruits.'),
 Document(page_content='I like computers by Apple.')]

In [48]:
docs = ensemble_retriever.get_relevant_documents("apple")
docs

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Document(page_content='I like computers by Apple.'),
 Document(page_content='I love fruit juice.'),
 Document(page_content='I like apples.')]