In [None]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
# Find a .env file and load its variables into the process environment. The
# return value is bound to `_` only so it is not left as a bare expression.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Hand the key to the openai module; this is None if OPENAI_API_KEY is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

### Extract Text from PDF Papers

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

# Read the contents of data/PDFs into document objects, then build a vector
# store index from those documents.
pdf_reader = SimpleDirectoryReader('data/PDFs')
documents = pdf_reader.load_data()
index = VectorStoreIndex.from_documents(documents)

### Generate Questions

In [None]:
from llama_index.evaluation import DatasetGenerator

# Build a question generator over the documents loaded earlier and ask it for
# questions derived from their nodes.
# NOTE(review): this presumably issues LLM calls per node and may be slow and
# costly on a large corpus — confirm before re-running.
data_generator = DatasetGenerator.from_documents(documents)
questions = data_generator.generate_questions_from_nodes()
# Bare trailing expression: shows the generated questions as the cell output.
questions

In [None]:
# Persist the generated questions, one per line. The encoding is pinned to
# UTF-8 so a question containing non-ASCII text cannot fail or be mangled
# under a platform-default codec.
with open('data/llama_index_generated_questions.txt', 'w', encoding='utf-8') as wf:
    wf.writelines(f"{q}\n" for q in questions)

### Evaluation

##### Are questions, answers, and context matching?

In [None]:
import random

# Evaluate at most SAMPLE_SIZE questions. random.sample raises ValueError when
# asked for more items than the population holds, so the requested size is
# capped at the number of questions actually available.
SAMPLE_SIZE = 100
# Fixed seed so re-running over the same question list draws the same sample.
SAMPLE_SEED = 42

sampled_questions = random.Random(SAMPLE_SEED).sample(
    questions, min(SAMPLE_SIZE, len(questions))
)

In [None]:
import pandas as pd
import time
from llama_index.evaluation import QueryResponseEvaluator
from llama_index.llm_predictor import LLMPredictor
from langchain.chat_models import ChatOpenAI
from llama_index import ServiceContext

# Evaluator setup: a GPT-4 chat model at temperature 0, wrapped in an
# LLMPredictor and ServiceContext, backs the QueryResponseEvaluator.
judge_llm = ChatOpenAI(temperature=0, model_name="gpt-4")
llm_predictor = LLMPredictor(llm=judge_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
evaluator = QueryResponseEvaluator(service_context=service_context)

query_engine = index.as_query_engine()


def _answer_and_judge(question, engine, judge):
    """Query `engine` with `question` and have `judge` evaluate the response.

    Returns a dict with the keys 'Question', 'Answer' (the `.response`
    attribute of the query result) and 'Match' (whatever `judge.evaluate`
    returned for the question/response pair).
    """
    answer = engine.query(question)
    verdict = judge.evaluate(question, answer)
    return {'Question': question, 'Answer': answer.response, 'Match': verdict}


questions_and_answers = []
for sampled_question in sampled_questions:
    questions_and_answers.append(
        _answer_and_judge(sampled_question, query_engine, evaluator)
    )
    # Fixed five-second pause after every question.
    # NOTE(review): presumably there to stay under API rate limits — confirm.
    time.sleep(5)

# One row per sampled question; the bare trailing expression displays the table.
qa_match_df = pd.DataFrame(questions_and_answers)
qa_match_df


In [None]:
# Write the evaluation table to disk as JSON, one object per row
# (orient='records').
validation_path = 'questions_answers_english_validation.json'
qa_match_df.to_json(validation_path, orient='records')

In [None]:
# Display only the rows whose 'Match' value equals the string 'YES'.
is_match = qa_match_df['Match'] == 'YES'
qa_match_df[is_match]

### Save Preliminary Result

In [None]:
# Give the index an explicit id, then persist its storage context under
# 'preliminary-llama-index'.
index.set_index_id("vector_index")
storage_context = index.storage_context
storage_context.persist('preliminary-llama-index')

### Save List of PDF Papers

In [None]:
import os

# Record the name of every regular file directly inside data/PDFs/
# (no recursion; sub-directories are skipped).
dir_path = 'data/PDFs/'
# os.listdir returns entries in arbitrary order; sorting keeps the saved list
# stable from run to run.
filenames = sorted(
    f for f in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, f))
)

# One filename per line. UTF-8 is pinned so names with non-ASCII characters
# are written the same way on every platform.
with open('extracted_files.txt', 'w', encoding='utf-8') as wf:
    wf.writelines(f + '\n' for f in filenames)

### Sample English Question

In [None]:
# Ask the in-memory index one English question and print the response.
query_engine = index.as_query_engine()
english_question = "What water supply issues exist in Dania Beach?"
response = query_engine.query(english_question)
print(response)

### Sample Vietnamese Question

In [None]:
# Ask the same in-memory index a Vietnamese question and print the response.
query_engine = index.as_query_engine()
vietnamese_question = """Ở đâu trong vùng Mê Công là nơi có mức độ xâm nhập mặn cao nhất trong mùa khô 2019-2020
và gây thiệt hại nghiêm trọng đối với các mô hình trồng cây ăn trái và nuôi trồng thủy sản?"""
response = query_engine.query(vietnamese_question)
print(response)

### Load Index from Local Filesystem

In [None]:
from llama_index import StorageContext, load_index_from_storage

# Rebuild a storage context from the 'preliminary-llama-index' directory and
# load the index stored there under the id 'vector_index'.
sc = StorageContext.from_defaults(persist_dir='preliminary-llama-index')
index2 = load_index_from_storage(storage_context=sc, index_id='vector_index')

In [None]:
# Repeat the sample Vietnamese question, this time against the index that was
# reloaded from disk (index2), and print the response.
query_engine = index2.as_query_engine()
vietnamese_question = """Ở đâu trong vùng Mê Công là nơi có mức độ xâm nhập mặn cao nhất trong mùa khô 2019-2020
và gây thiệt hại nghiêm trọng đối với các mô hình trồng cây ăn trái và nuôi trồng thủy sản?"""
response = query_engine.query(vietnamese_question)
print(response)