In [5]:
from pathlib import Path

from llama_hub.file.pdf.base import PDFReader
# from llama_hub.file.pymu_pdf.base import PyMuPDFReader

In [9]:
from llama_index.readers import SimpleDirectoryReader

In [11]:
# Load every PDF found under ./data/, searching subdirectories as well.
# Single-file alternatives tried earlier: llama_hub's PDFReader and
# PyMuPDFReader, both pointed at ./data/llama2.pdf.
loader = SimpleDirectoryReader(
    input_dir="./data/",
    recursive=True,
    required_exts=[".pdf"],
)
docs0 = loader.load_data()

In [12]:
from llama_index import Document

# Collapse everything the loader returned into a single Document: the
# individual texts are concatenated in load order, separated by a blank line.
doc_text = "\n\n".join(loaded.get_content() for loaded in docs0)
docs = [Document(text=doc_text)]

In [13]:
from llama_index.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)
# Parser built entirely from library defaults (no chunk sizes are given here).
node_parser = HierarchicalNodeParser.from_defaults()
# Parse the single combined document into nodes; the leaf/root helpers in the
# following cells are applied to this list.
nodes = node_parser.get_nodes_from_documents(docs)
# Show how many nodes were produced in total.
len(nodes)

1015

In [15]:
from llama_index.node_parser import get_leaf_nodes, get_root_nodes
# Leaf subset of the parsed nodes; this is the list the vector index is built
# from later on.
leaf_nodes = get_leaf_nodes(nodes)
# Show the leaf count.
len(leaf_nodes)

793

In [16]:
# Root subset of the parsed nodes; a slice of it later seeds the evaluation
# question generator.
root_nodes = get_root_nodes(nodes)

In [17]:
# define storage context
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage import StorageContext
from llama_index import ServiceContext
from llama_index.llms import OpenAI

from llama_index import VectorStoreIndex

# LLM configuration attached to the index.
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo")
)

# The docstore receives the full node list (not just the leaves).
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

# Storage context wrapping that docstore (a vector store is included by
# default as well).
storage_context = StorageContext.from_defaults(docstore=docstore)

# Only the leaf nodes are handed to the vector index; it shares the storage
# context that holds the complete node set.
base_index = VectorStoreIndex(
    leaf_nodes,
    service_context=service_context,
    storage_context=storage_context,
)

In [18]:
from llama_index.retrievers.auto_merging_retriever import AutoMergingRetriever
# Plain vector retriever over the index, returning the 6 most similar nodes.
base_retriever = base_index.as_retriever(similarity_top_k=6)
# Auto-merging retriever layered on the base retriever; it is given the storage
# context and, with verbose=True, prints a "Merging N nodes into parent node"
# trace.
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)
# Alternative questions to try:
# query_str = "What were some lessons learned from red-teaming?"
# query_str = "Can you tell me about the key concepts for safety finetuning"
query_str = (
    "What could be the potential outcomes of adjusting the amount of safety"
    " data used in the RLHF stage?"
)

# NOTE(review): this rebinds `nodes`, which until now held the full parsed node
# list. Cells that read `nodes` for parsing/indexing will see the retrieval
# results instead if they are re-run after this cell.
nodes = retriever.retrieve(query_str)
# Same question through the base retriever, for comparison.
base_nodes = base_retriever.retrieve(query_str)
# Example verbose trace from an earlier run:
# > Merging 4 nodes into parent node.
# > Parent node id: caf5f81c-842f-46a4-b679-6be584bd6aff.
# > Parent node text: We conduct RLHF by first collecting human preference data for safety similar to Section 3.2.2: an...
# Show how many nodes the auto-merging retriever returned.
len(nodes)

> Merging 5 nodes into parent node.
> Parent node id: e3e542f0-e6ed-4451-aee7-7c01007a3d45.
> Parent node text: Wethenusethehuman
preference data to train a safety reward model (see Section 3.2.2), and also re...



2

In [19]:
# Number of nodes the base retriever returned for the same query.
len(base_nodes)

6

In [20]:
from llama_index.response.notebook_utils import display_source_node

# Render every node returned by the auto-merging retriever.
# source_length=10000 is presumably large enough that the node text is shown
# without truncation (TODO confirm against display_source_node).
for node in nodes:
    display_source_node(node, source_length=10000)

**Node ID:** c13c3ee5-bd54-4121-b2fa-9ac61bb37261<br>**Similarity:** 0.862531014045397<br>**Text:** We also list two
qualitative examples where safety and helpfulness reward models don’t agree with each other in Table 35.
A.4.2 Qualitative Results on Safety Data Scaling
In Section 4.2.3, we study the impact of adding more safety data into model RLHF in a quantitative manner.
Hereweshowcaseafewsamplestoqualitativelyexaminetheevolutionofmodelbehaviorwhenwescale
safetydatainTables36,37,and38.<br>

**Node ID:** e3e542f0-e6ed-4451-aee7-7c01007a3d45<br>**Similarity:** 0.8572897957134463<br>**Text:** Wethenusethehuman
preference data to train a safety reward model (see Section 3.2.2), and also reuse the adversarial prompts to
sample from the model during the RLHF stage.
BetterLong-TailSafetyRobustnesswithoutHurtingHelpfulness Safetyisinherentlyalong-tailproblem,
wherethe challengecomesfrom asmallnumber ofveryspecific cases. Weinvestigatetheimpact ofSafety
RLHFbytakingtwointermediate Llama 2-Chat checkpoints—onewithoutadversarialpromptsintheRLHF
stageandonewiththem—andscoretheirresponsesonourtestsetsusingoursafetyandhelpfulnessreward
models. In Figure 14, we plot the score distribution shift of the safety RM on the safety test set (left) and that
of the helpfulness RM on the helpfulness test set (right). In the left hand side of the figure, we observe that
thedistributionofsafetyRMscoresonthesafetysetshiftstohigherrewardscoresaftersafetytuningwith
RLHF,andthatthelongtailofthedistributionnearzerothinsout. Aclearclusterappearsonthetop-left
corner suggesting the improvements of model safety. On the right side, we do not observe any gathering
patternbelowthe y=xlineontherighthandsideofFigure14,whichindicatesthatthehelpfulnessscore
distributionispreservedaftersafetytuningwithRLHF.Putanotherway,givensufficienthelpfulnesstraining
data, the addition of an additional stage of safety mitigation does not negatively impact model performance
on helpfulness to any notable degradation. A qualitative example is shown in Table 12.
ImpactofSafetyDataScaling. AtensionbetweenhelpfulnessandsafetyofLLMshasbeenobservedin
previous studies (Bai et al., 2022a). To better understand how the addition of safety training data affects
general model performance, especially helpfulness, we investigate the trends in safety data scaling by
adjustingtheamountofsafetydatausedintheRLHFstage. Inthisablationexperiment,wekeeptheamount
of helpfulness training data unchanged ( ∼0.9M samples) and gradually increase the amount of safety data
used in model tuning, ranging from 0% to 100% ( ∼0.1M samples).<br>

In [21]:
from llama_index.query_engine import RetrieverQueryEngine
# Wrap each retriever in its own query engine so both answer the same question.
query_engine = RetrieverQueryEngine.from_args(retriever)
base_query_engine = RetrieverQueryEngine.from_args(base_retriever)

# Answer via the auto-merging retriever first ...
response = query_engine.query(query_str)
print(response)

# ... then via the base retriever.
base_response = base_query_engine.query(query_str)
print(base_response)


> Merging 5 nodes into parent node.
> Parent node id: e3e542f0-e6ed-4451-aee7-7c01007a3d45.
> Parent node text: Wethenusethehuman
preference data to train a safety reward model (see Section 3.2.2), and also re...

Adjusting the amount of safety data used in the RLHF stage could potentially have the following outcomes:
1. Improved safety: Increasing the amount of safety data used in model tuning may lead to a shift in the score distribution of the safety reward model towards higher reward scores. This suggests that the model's safety performance improves, as indicated by a clearer cluster appearing in the top-left corner of the score distribution plot.
2. Preserved helpfulness: Adjusting the amount of safety data does not negatively impact the model's performance on helpfulness. The distribution of helpfulness scores is preserved after safety tuning with RLHF, as indicated by the absence of any gathering pattern below the y=x line in the score distribution plot.
3. Long-tail safety robu

In [None]:
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)
from llama_index import ServiceContext
from llama_index.llms import OpenAI
import nest_asyncio

nest_asyncio.apply()
# NOTE: run this if the dataset isn't already saved
# Note: we only generate from the first 20 nodes, since the rest are references
eval_service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-4"))
dataset_generator = DatasetGenerator(
    root_nodes[:20],
    service_context=eval_service_context,
    show_progress=True,
    num_questions_per_chunk=3,
)
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes(num=60)
eval_dataset.save_json("data/llama2_eval_qr_dataset.json")
# optional
eval_dataset = QueryResponseDataset.from_json(
    "data/llama2_eval_qr_dataset.json"
)
결과 비교
각 검색기에 대해 정확성, 의미론적 유사성, 관련성, 충실도 등을 평가합니다.

import asyncio
import nest_asyncio

nest_asyncio.apply()
from llama_index.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    PairwiseComparisonEvaluator,
)


from collections import defaultdict
import pandas as pd

from llama_index.evaluation.eval_utils import get_responses, get_results_df
from llama_index.evaluation import BatchEvalRunner

# One evaluator per metric, all sharing the evaluation service context.
evaluator_c = CorrectnessEvaluator(service_context=eval_service_context)
evaluator_s = SemanticSimilarityEvaluator(service_context=eval_service_context)
evaluator_r = RelevancyEvaluator(service_context=eval_service_context)
evaluator_f = FaithfulnessEvaluator(service_context=eval_service_context)
# The pairwise comparison cell further down reads `pairwise_evaluator`, so it
# must be defined here; leaving it commented out raises NameError on a fresh
# Restart & Run All.
pairwise_evaluator = PairwiseComparisonEvaluator(
    service_context=eval_service_context
)

# Questions and reference answers from the evaluation dataset; each entry of
# qr_pairs is unpacked as a (question, reference) pair.
eval_qs = eval_dataset.questions
qr_pairs = eval_dataset.qr_pairs
ref_response_strs = [reference for (_, reference) in qr_pairs]

# Run every evaluation question through both query engines.
pred_responses = get_responses(eval_qs, query_engine, show_progress=True)
base_pred_responses = get_responses(
    eval_qs, base_query_engine, show_progress=True
)

In [None]:
import numpy as np

pred_response_strs = [str(p) for p in pred_responses]
base_pred_response_strs = [str(p) for p in base_pred_responses]
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "relevancy": evaluator_r,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)
eval_results = await batch_runner.aevaluate_responses(
    eval_qs, responses=pred_responses, reference=ref_response_strs
)
base_eval_results = await batch_runner.aevaluate_responses(
    eval_qs, responses=base_pred_responses, reference=ref_response_strs
)
results_df = get_results_df(
    [eval_results, base_eval_results],
    ["Auto Merging Retriever", "Base Retriever"],
    ["correctness", "relevancy", "faithfulness", "semantic_similarity"],
)
display(results_df)

In [None]:
batch_runner = BatchEvalRunner(
    {"pairwise": pairwise_evaluator}, workers=10, show_progress=True
)
pairwise_eval_results = await batch_runner.aevaluate_response_strs(
    eval_qs,
    response_strs=pred_response_strs,
    reference=base_pred_response_strs,
)
pairwise_score = np.array(
    [r.score for r in pairwise_eval_results["pairwise"]]
).mean()
pairwise_score
0.525