In [1]:
import json

from llama_index.llms import OpenAI
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from llama_index.evaluation import DatasetGenerator, RelevancyEvaluator, FaithfulnessEvaluator
from llama_index import (
    StorageContext,
    load_index_from_storage,
    ServiceContext
)

from openai import AsyncOpenAI

# NodeSampler - samples the more relevant (informative) nodes
# QAGenerator - generates questions for the sampled nodes
from evaluate_rag_utils import NodeSampler, QAGenerator

In [3]:
# Pick the "informative" nodes: per the sampler's word lists, nodes with
# more hits from relevant_words.txt and fewer from irrelevant_words.txt.
node_sampler = NodeSampler('relevant_words.txt', 'irrelevant_words.txt')
pick_informative = node_sampler.informative_nodes

# Settings shared by both sampling runs.
video_info = '../../data/video_info.json'
keep_fraction = 0.05  # sample is limited to 5% of all nodes
top_k = 10            # only the first 10 sampled nodes are used downstream

# Run the sampler twice, once per chunk size (1024 and 3 * 1024 = 3072).
# NOTE(review): the _1000 / _3000 suffixes presumably abbreviate these chunk
# sizes -- confirm.
informative_nodes_1000 = pick_informative(
    video_info_path=video_info,
    chunk_size=1024,
    fraction=keep_fraction,
)
informative_nodes_3000 = pick_informative(
    video_info_path=video_info,
    chunk_size=3 * 1024,
    fraction=keep_fraction,
)

# Keep a small head slice of each sample for question generation.
nodes_1000 = informative_nodes_1000[:top_k]
nodes_3000 = informative_nodes_3000[:top_k]

In [3]:
# LLM used for question generation: gpt-3.5-turbo-1106 at temperature 0.
llm_settings = {"temperature": 0, "model": "gpt-3.5-turbo-1106"}
llm = OpenAI(**llm_settings)

# The QA generator wraps the LLM configured above.
qa_generator = QAGenerator(llm=llm)

In [6]:
# For each node sample, generate:
#   i)  question-node pairs, saved as JSON at qa_json_path
#   ii) question-document pairs, saved at doc_question_path
build_pairs = qa_generator.generate_document_question_pairs
video_info_json = '../../data/video_info.json'

# Sample taken with chunk_size=1024.
qa_dataset_1000 = build_pairs(
    nodes=nodes_1000,
    doc_json_path=video_info_json,
    doc_question_path='doc_question_dataset_1000.json',
    qa_json_path='question_node_dataset_1000.json',
)

# Sample taken with chunk_size=3072.
qa_dataset_3000 = build_pairs(
    nodes=nodes_3000,
    doc_json_path=video_info_json,
    doc_question_path='doc_question_dataset_3000.json',
    qa_json_path='question_node_dataset_3000.json',
)

100%|██████████| 10/10 [00:16<00:00,  1.69s/it]
100%|██████████| 10/10 [00:18<00:00,  1.88s/it]
