In [None]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value is discarded into `_`.
_ = load_dotenv(find_dotenv())

# Take the OpenAI key from the environment instead of hard-coding it in the notebook.
openai.api_key  = os.getenv('OPENAI_API_KEY')

### Extract Text from PDF Papers

In [None]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader

# Load everything under the PDF directory into document objects.
documents = SimpleDirectoryReader('../../data/PDFs').load_data()
# Build the vector index that later cells query through index.as_query_engine().
index = VectorStoreIndex.from_documents(documents)

### Generate Questions

In [None]:
from llama_index.evaluation import DatasetGenerator

# Generate evaluation questions from the loaded documents.
data_generator = DatasetGenerator.from_documents(documents)
questions = data_generator.generate_questions_from_nodes()
# Display the generated questions as the cell output.
questions

In [None]:
# Persist the generated questions, one per line, so the fine-tuning section can reload them.
questions_path = '../../data/llama_index_generated_questions.txt'
with open(questions_path, 'w') as wf:
    wf.writelines(f"{q}\n" for q in questions)

### Evaluation

##### Are questions, answers, and context matching?

In [None]:
import random

# Number of questions to evaluate and the seed that makes the draw repeatable
# across "Restart Kernel -> Run All".
EVAL_SAMPLE_SIZE = 100
EVAL_SAMPLE_SEED = 42

# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of generated questions. A dedicated
# Random instance keeps the global random state untouched.
sampled_questions = random.Random(EVAL_SAMPLE_SEED).sample(
    questions, min(EVAL_SAMPLE_SIZE, len(questions))
)

In [None]:
import pandas as pd
import time
from llama_index.evaluation import ResponseEvaluator
from llama_index.evaluation import QueryResponseEvaluator
from llama_index.llm_predictor import LLMPredictor
from langchain.chat_models import ChatOpenAI
from llama_index import ServiceContext

# GPT-4 at temperature 0 backs both evaluators.
judge_llm = ChatOpenAI(temperature=0, model_name="gpt-4")
llm_predictor = LLMPredictor(llm=judge_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
r_evaluator = ResponseEvaluator(service_context=service_context)
qr_evaluator = QueryResponseEvaluator(service_context=service_context)

query_engine = index.as_query_engine()
questions_and_answers = []
for question in sampled_questions:
    answer = query_engine.query(question)
    answer_context_verdict = r_evaluator.evaluate(answer)
    time.sleep(8)  # pause between model calls (presumably to stay under the API rate limit)
    qa_context_verdict = qr_evaluator.evaluate(question, answer)
    record = {
        'Question': question,
        'Answer': answer.response,
        'Answer-Context Match': answer_context_verdict,
        'Question-Answer-Context Match': qa_context_verdict,
    }
    questions_and_answers.append(record)
    time.sleep(8)

# One row per sampled question, columns in the order of the record keys.
qa_match_df = pd.DataFrame(questions_and_answers)
qa_match_df

In [None]:
# Write the results as one pretty-printed JSON array.
# `lines=True` is intentionally NOT combined with `indent`: JSON Lines requires one record
# per physical line, so using both produces a file that is neither valid JSON nor JSON Lines.
with open('questions_answers_english_validation.json', 'w', encoding='utf-8') as file:
    qa_match_df.to_json(file, force_ascii=False, orient='records', indent=4)

In [None]:
# Rows the response evaluator marked 'YES' for the answer/context check.
answer_context_yes = qa_match_df['Answer-Context Match'] == 'YES'
qa_match_df[answer_context_yes]

In [None]:
# Rows the query-response evaluator marked 'YES' for the question/answer/context check.
qa_context_yes = qa_match_df['Question-Answer-Context Match'] == 'YES'
qa_match_df[qa_context_yes]

### Evaluation Part 2: Questions in Vietnamese

##### Translate sampled questions to Vietnamese

In [None]:
def gpt4_translate_to_vi(question):
    """
    Translate a question to Vietnamese using GPT-4 model.

    :param question: Input question
    :return: Translated question, with surrounding whitespace stripped
    """
    # The instruction tells the model the question is delimited by triple backticks,
    # so the question must actually be wrapped in them.
    messages = [
        {
            "role": "system",
            "content": f"Translate the following question delimited by triple backticks to Vietnamese: ```{question}```"
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=1000,
        temperature=0)
    return response["choices"][0]["message"]["content"].strip()

In [None]:
# A plain loop is used instead of a list comprehension so a pause can be inserted
# between calls; without it the tokens-per-minute rate limit was exceeded.
sampled_vi_questions = []
for en_question in sampled_questions:
    vi_question = gpt4_translate_to_vi(en_question)
    sampled_vi_questions.append(vi_question)
    time.sleep(5)
sampled_vi_questions

In [None]:
# Same evaluation as for the English questions, run on the translated questions.
vi_questions_and_answers = []
for vi_question in sampled_vi_questions:
    vi_answer = query_engine.query(vi_question)
    answer_context_verdict = r_evaluator.evaluate(vi_answer)
    time.sleep(8)  # pause between model calls (presumably to stay under the API rate limit)
    qa_context_verdict = qr_evaluator.evaluate(vi_question, vi_answer)
    record = {
        'Question': vi_question,
        'Answer': vi_answer.response,
        'Answer-Context Match': answer_context_verdict,
        'Question-Answer-Context Match': qa_context_verdict,
    }
    vi_questions_and_answers.append(record)
    time.sleep(8)

vi_qa_match_df = pd.DataFrame(vi_questions_and_answers)
vi_qa_match_df

In [None]:
# Write the results as one pretty-printed JSON array.
# `lines=True` is intentionally NOT combined with `indent`: JSON Lines requires one record
# per physical line, so using both produces a file that is neither valid JSON nor JSON Lines.
with open('questions_answers_vietnamese_validation.json', 'w', encoding='utf-8') as file:
    vi_qa_match_df.to_json(file, force_ascii=False, orient='records', indent=4)

In [None]:
# Vietnamese rows the response evaluator marked 'YES' for the answer/context check.
vi_answer_context_yes = vi_qa_match_df['Answer-Context Match'] == 'YES'
vi_qa_match_df[vi_answer_context_yes]

In [None]:
# Vietnamese rows the query-response evaluator marked 'YES'.
vi_qa_context_yes = vi_qa_match_df['Question-Answer-Context Match'] == 'YES'
vi_qa_match_df[vi_qa_context_yes]

In [None]:
# Vietnamese rows the response evaluator marked 'NO' for the answer/context check.
vi_answer_context_no = vi_qa_match_df['Answer-Context Match'] == 'NO'
vi_qa_match_df[vi_answer_context_no]

### Evaluation Analysis

In [None]:
# Prefix every column with its language so the English and Vietnamese frames
# can later sit side by side without name clashes.
english_column_names = {
    'Question': 'English Question',
    'Answer': 'English Answer',
    'Answer-Context Match': 'English Answer-Context Match',
    'Question-Answer-Context Match': 'English Question-Answer-Context Match',
}
qa_match_df.rename(columns=english_column_names, inplace=True)
qa_match_df

In [None]:
# Prefix every column with its language so the English and Vietnamese frames
# can later sit side by side without name clashes.
vietnamese_column_names = {
    'Question': 'Vietnamese Question',
    'Answer': 'Vietnamese Answer',
    'Answer-Context Match': 'Vietnamese Answer-Context Match',
    'Question-Answer-Context Match': 'Vietnamese Question-Answer-Context Match',
}
vi_qa_match_df.rename(columns=vietnamese_column_names, inplace=True)
vi_qa_match_df

In [None]:
# Place the English and Vietnamese results side by side (axis=1 joins on the row index).
# Both frames were built in the same sampled-question order, so row i of each
# refers to the same underlying question.
agg_qa_match_df = pd.concat([qa_match_df, vi_qa_match_df], axis=1)
agg_qa_match_df

In [None]:
# Keep the rows where the English answer passed the answer/context check
# but the Vietnamese answer to the same question did not.
vi_answer_context_no = agg_qa_match_df['Vietnamese Answer-Context Match'] == 'NO'
en_answer_context_yes = agg_qa_match_df['English Answer-Context Match'] == 'YES'
largest_discrep_eng_vi_qa_df = agg_qa_match_df[vi_answer_context_no & en_answer_context_yes]
largest_discrep_eng_vi_qa_df

In [None]:
# Keep only the two answer columns. Take an explicit copy: later cells add new columns to
# this frame, and assigning into a slice of another DataFrame triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
largest_discrep_eng_vi_ans_df = largest_discrep_eng_vi_qa_df[['English Answer', 'Vietnamese Answer']].copy()
largest_discrep_eng_vi_ans_df

In [None]:
def gpt4_translate_vi_to_en(text):
    """
    Translate from Vietnamese to English using GPT-4 model. Used for translating generated answers in Vietnamese.

    :param text: Input Vietnamese text
    :return: Translated English text, with surrounding whitespace stripped
    """
    # The instruction tells the model the text is delimited by triple backticks,
    # so the text must actually be wrapped in them.
    messages = [
        {
            "role": "system",
            "content": f"Translate the following Vietnamese text delimited by triple backticks to English: ```{text}```"
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=1000,
        temperature=0)
    return response["choices"][0]["message"]["content"].strip()

In [None]:
# Translate each Vietnamese answer back to English so it can be compared with the
# English answer to the same question.
vi_answers_to_translate = largest_discrep_eng_vi_ans_df['Vietnamese Answer'].tolist()
translated_from_vi_to_en = []
for vi_answer in vi_answers_to_translate:
    en_translation = gpt4_translate_vi_to_en(vi_answer)
    translated_from_vi_to_en.append(en_translation)
    time.sleep(5)
largest_discrep_eng_vi_ans_df['English Translation of Vietnamese Answer'] = translated_from_vi_to_en
largest_discrep_eng_vi_ans_df

In [None]:
def gpt4_check_texts_similar_meaning(text_one, text_two):
    """
    Rate how similar in meaning two bodies of text are using GPT-4 model.

    :param text_one: First text to compare
    :param text_two: Second text to compare
    :return: The model's reply as a stripped string; the prompt asks for a bare number from 1 to 10
    """
    # Both texts are fully fenced (the second one previously had no closing backticks),
    # and the model is asked for the number only because the caller converts the reply to a float.
    prompt = (
        "On a scale of 1 to 10, where 1 means completely different and 10 means exactly the same, "
        "how similar in meaning are the following two bodies of text delimited by triple backticks? "
        "Respond with only the number.\n\n"
        f"text_one: ```{text_one}```\n"
        f"text_two: ```{text_two}```"
    )
    messages = [
        {
            "role": "system",
            "content": prompt
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=1000,
        temperature=0)
    return response["choices"][0]["message"]["content"].strip()

In [None]:
import re

# Sentinel stored when no numeric score can be recovered from the model reply.
UNPARSEABLE_SCORE = 99999


def _parse_similarity_score(reply):
    """Return the score contained in `reply` as a float, or UNPARSEABLE_SCORE if none is found."""
    try:
        # Expected case: the reply is just the number.
        return float(reply)
    except ValueError:
        # The reply is free text from a chat model; accept forms such as "8/10" or "8 out of 10"
        # instead of crashing the whole loop on the first non-bare-number reply.
        match = re.search(r'(\d+(?:\.\d+)?)\s*(?:/|out of)\s*10', reply)
        return float(match.group(1)) if match else UNPARSEABLE_SCORE


similar_meaning = []
for en_ans, trans_vi_ans in zip(
    largest_discrep_eng_vi_ans_df['English Answer'].tolist(),
    largest_discrep_eng_vi_ans_df['English Translation of Vietnamese Answer'].tolist(),
):
    sim_score = gpt4_check_texts_similar_meaning(en_ans, trans_vi_ans)
    similar_meaning.append(_parse_similarity_score(sim_score))
    time.sleep(5)
largest_discrep_eng_vi_ans_df['Similar Answer Score'] = similar_meaning
largest_discrep_eng_vi_ans_df

In [None]:
from langdetect import detect, LangDetectException


def _source_languages(question):
    """Query the index with `question` and return the detected language of each source node's text."""
    answer = query_engine.query(question)
    languages = []
    for source_node in answer.source_nodes:
        try:
            languages.append(detect(source_node.node.text))
        except LangDetectException:
            # Language detection can fail on some texts; record a placeholder
            # rather than aborting the whole run part-way through.
            languages.append('unknown')
    return languages


# For every question pair, re-run the query and record which languages the retrieved
# source passages are written in.
en_ans_source_lang, vi_ans_source_lang = [], []
for en_q, vi_q in zip(agg_qa_match_df['English Question'], agg_qa_match_df['Vietnamese Question']):
    en_ans_source_lang.append(_source_languages(en_q))
    time.sleep(4)
    vi_ans_source_lang.append(_source_languages(vi_q))
    time.sleep(4)
agg_qa_match_df['Source Language(s) of English Answer'] = en_ans_source_lang
agg_qa_match_df['Source Language(s) of Vietnamese Answer'] = vi_ans_source_lang
agg_qa_match_df

In [None]:
# (column holding detected source languages, expected language code, message to print)
language_checks = (
    ('Source Language(s) of English Answer', 'en', 'Context for English answer not in English for idx '),
    ('Source Language(s) of Vietnamese Answer', 'vi', 'Context for Vietnamese answer not in Vietnamese for idx '),
)

# Print one line per source passage whose detected language differs from the question's language.
for idx, row in agg_qa_match_df.iterrows():
    for column, expected_lang, message in language_checks:
        for detected_lang in row[column]:
            if detected_lang != expected_lang:
                print(message, idx)

In [None]:
# Write the results as one pretty-printed JSON array.
# `lines=True` is intentionally NOT combined with `indent`: JSON Lines requires one record
# per physical line, so using both produces a file that is neither valid JSON nor JSON Lines.
with open('questions_answers_aggregate_validation.json', 'w', encoding='utf-8') as file:
    agg_qa_match_df.to_json(file, force_ascii=False, orient='records', indent=4)

### Save Vector Index Preliminary Result

In [None]:
# Give the index an explicit id; the same id is passed to load_index_from_storage when reloading.
index.set_index_id("vector_index")
# Persist the index's storage context to the local 'preliminary-llama-index' directory.
index.storage_context.persist('preliminary-llama-index')

### Save List of PDF Papers

In [None]:
import os

# Collect the names of regular files (not sub-directories) in the PDF folder.
dir_path = '../../data/PDFs/'
filenames = [
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
]

# Record the file names, one per line.
with open('extracted_files.txt', 'w') as wf:
    wf.writelines(name + '\n' for name in filenames)

### Sample English Question

In [None]:
# Smoke test: ask the index one English question and print the generated answer.
english_question = "What water supply issues exist in Dania Beach?"
query_engine = index.as_query_engine()
response = query_engine.query(english_question)
print(response)

### Sample Vietnamese Question

In [None]:
# Smoke test: ask the same index one Vietnamese question and print the generated answer.
vietnamese_question = """Ở đâu trong vùng Mê Công là nơi có mức độ xâm nhập mặn cao nhất trong mùa khô 2019-2020
và gây thiệt hại nghiêm trọng đối với các mô hình trồng cây ăn trái và nuôi trồng thủy sản?"""
query_engine = index.as_query_engine()
response = query_engine.query(vietnamese_question)
print(response)

### Load Index from Local Filesystem

In [None]:
from llama_index import StorageContext, load_index_from_storage

# Rebuild a storage context from the directory the index was persisted to.
sc = StorageContext.from_defaults(persist_dir='preliminary-llama-index')
# Load the index back by the id assigned with set_index_id before persisting.
index2 = load_index_from_storage(sc, 'vector_index')

In [None]:
# Test same sample Vietnamese question from above, this time against the index
# reloaded from disk (index2).
vietnamese_question = """Ở đâu trong vùng Mê Công là nơi có mức độ xâm nhập mặn cao nhất trong mùa khô 2019-2020
và gây thiệt hại nghiêm trọng đối với các mô hình trồng cây ăn trái và nuôi trồng thủy sản?"""
# NOTE: this rebinds the notebook-wide `query_engine` to the reloaded index.
query_engine = index2.as_query_engine()
response = query_engine.query(vietnamese_question)
print(response)

## GPT-3.5 Fine-Tuning

### Get Question-Answer Pairs with Question-Answer-Context Match

In [None]:
# Reload the generated questions (one per line). A context manager guarantees the
# file handle is closed; a bare open(...).read() leaves it open until garbage collection.
with open('../../data/llama_index_generated_questions.txt', 'r') as rf:
    questions = rf.read().splitlines()
len(questions)

In [None]:
import random  # imported here so this section also runs without the earlier evaluation cells

# Number of questions to draw for fine-tuning and the seed that makes the draw repeatable.
FINETUNE_SAMPLE_SIZE = 550
FINETUNE_SAMPLE_SEED = 42

# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available questions.
sampled_questions = random.Random(FINETUNE_SAMPLE_SEED).sample(
    questions, min(FINETUNE_SAMPLE_SIZE, len(questions))
)

In [None]:
import pandas as pd
import time
from llama_index.evaluation import QueryResponseEvaluator
from llama_index.llm_predictor import LLMPredictor
from langchain.chat_models import ChatOpenAI
from llama_index import ServiceContext

# GPT-4 at temperature 0 backs the query-response evaluator.
judge_llm = ChatOpenAI(temperature=0, model_name="gpt-4")
llm_predictor = LLMPredictor(llm=judge_llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
qr_evaluator = QueryResponseEvaluator(service_context=service_context)

query_engine = index.as_query_engine()
questions_and_answers = []
for question in sampled_questions:
    answer = query_engine.query(question)
    time.sleep(5)  # pause between model calls (presumably to stay under the API rate limit)
    qa_context_verdict = qr_evaluator.evaluate(question, answer)
    record = {
        'Question': question,
        'Answer': answer.response,
        'Question-Answer-Context Match': qa_context_verdict,
    }
    questions_and_answers.append(record)
    time.sleep(4)

qa_match_df = pd.DataFrame(questions_and_answers)
qa_match_df

In [None]:
# Keep only the pairs the query-response evaluator marked 'YES'.
is_full_match = qa_match_df['Question-Answer-Context Match'] == 'YES'
qa_match_df = qa_match_df[is_full_match]
qa_match_df

### Take Half and Translate to Vietnamese

###### Half of the training examples for fine-tuning will be in English, and the other half will be in Vietnamese

In [None]:
# First half stays in English; second half will be translated to Vietnamese.
mid = len(qa_match_df) // 2
# Take explicit copies: later cells overwrite the 'Question'/'Answer' columns of both halves,
# and assigning into a slice of qa_match_df triggers pandas' SettingWithCopyWarning.
en_qa_match_df = qa_match_df.iloc[:mid].copy()
vi_qa_match_df = qa_match_df.iloc[mid:].copy()
vi_qa_match_df

In [None]:
def gpt4_translate_en_to_vi(text):
    """
    Translate English text to Vietnamese using GPT-4 model.

    :param text: Input English text
    :return: Translated Vietnamese text, with surrounding whitespace stripped
    """
    # The instruction tells the model the text is delimited by triple backticks,
    # so the text must actually be wrapped in them.
    messages = [
        {
            "role": "system",
            "content": f"Translate the following English text delimited by triple backticks to Vietnamese: ```{text}```"
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=1000,
        temperature=0)
    return response["choices"][0]["message"]["content"].strip()

In [None]:
# Translate every question/answer pair of the second half, pausing after each model call.
vi_questions, vi_answers = [], []
npairs = 0
en_pairs = zip(vi_qa_match_df['Question'].tolist(), vi_qa_match_df['Answer'].tolist())
for npairs, (en_question, en_answer) in enumerate(en_pairs, start=1):
    vi_questions.append(gpt4_translate_en_to_vi(en_question))
    time.sleep(4)
    vi_answers.append(gpt4_translate_en_to_vi(en_answer))
    time.sleep(4)
    # Progress message after every tenth translated pair.
    if npairs % 10 == 0:
        print(f"Translated {npairs} QA pairs...")
vi_qa_match_df['Question'] = vi_questions
vi_qa_match_df['Answer'] = vi_answers
vi_qa_match_df

In [None]:
# Display the half of the pairs that stays in English.
en_qa_match_df

In [None]:
# Remove leading/trailing whitespace from every English answer.
stripped_en_answers = []
for en_answer in en_qa_match_df['Answer'].tolist():
    stripped_en_answers.append(en_answer.strip())
stripped_en_answers

In [None]:
# Replace the 'Answer' column with the whitespace-stripped answers.
en_qa_match_df['Answer'] = stripped_en_answers
en_qa_match_df

### Prepare Data

In [None]:
# Stack the English half on top of the Vietnamese half; ignore_index=True
# renumbers the rows from 0 so the row labels are unique again.
agg_qa_match_df = pd.concat([en_qa_match_df, vi_qa_match_df], ignore_index=True)
agg_qa_match_df

In [None]:
# Drop rows whose question and answer are in different languages
# (English question with Vietnamese answer, or the reverse).
from langdetect import detect, LangDetectException

# (question language, answer language) combinations that disqualify a row.
MISMATCHED_LANGUAGE_PAIRS = {('en', 'vi'), ('vi', 'en')}

questions, answers, qac_match = [], [], []
for idx, row in agg_qa_match_df.iterrows():
    q, a, qac = row['Question'], row['Answer'], row['Question-Answer-Context Match']

    try:
        language_pair = (detect(q), detect(a))
    except LangDetectException:
        # Detection failed: report the pair but keep it in the dataset.
        print("Can't detect language but still including anyway...")
        print("Question: ", q)
        print("Answer: ", a)
        print()
        keep_row = True
    else:
        keep_row = language_pair not in MISMATCHED_LANGUAGE_PAIRS

    if keep_row:
        questions.append(q)
        answers.append(a)
        qac_match.append(qac)

# Rebuild the frame from the surviving rows (this also resets the row labels).
agg_qa_match_df = pd.DataFrame({'Question': questions, 'Answer': answers, 'Question-Answer-Context Match': qac_match})
agg_qa_match_df

In [None]:
# NOTE(review): row label 3 is hard-coded, presumably chosen by manual inspection of one
# particular run; the rows differ between runs, so confirm this still targets the intended row.
# errors='ignore' makes the cell safe to re-run: without it a second execution raises KeyError
# because the label is already gone.
agg_qa_match_df.drop(index=3, inplace=True, errors='ignore')
agg_qa_match_df

In [None]:
# NOTE(review): row label 6 is hard-coded, presumably chosen by manual inspection of one
# particular run; the rows differ between runs, so confirm this still targets the intended row.
# errors='ignore' makes the cell safe to re-run: without it a second execution raises KeyError
# because the label is already gone.
agg_qa_match_df.drop(index=6, inplace=True, errors='ignore')
agg_qa_match_df.head(10)

In [None]:
# Every training example is a three-message chat: shared system prompt, user question, assistant answer.
SYSTEM_MESSAGE = {"role": "system", "content": "You are a Q&A chatbot specializing in saltwater intrusion in the Mekong Delta."}
qa_pairs = zip(agg_qa_match_df['Question'].tolist(), agg_qa_match_df['Answer'].tolist())
training_examples = [
    [
        SYSTEM_MESSAGE,
        {"role": "user", "content": q},
        {"role": "assistant", "content": a},
    ]
    for q, a in qa_pairs
]
training_examples_df = pd.DataFrame({"messages": training_examples})
training_examples_df

In [None]:
# Write one JSON record per line (lines=True); this file is read back later
# with pd.read_json(..., lines=True).
with open('gpt_finetuning_dataset.json', 'w', encoding='utf-8') as file:
    training_examples_df.to_json(file, force_ascii=False, orient='records', lines=True)

In [None]:
# Same records and format as the .json file above, saved under a .jsonl extension.
with open('gpt_finetuning_dataset.jsonl', 'w', encoding='utf-8') as file:
    training_examples_df.to_json(file, force_ascii=False, orient='records', lines=True)

In [None]:
# Export only the question and answer columns as a tab-separated file.
finetuning_table = agg_qa_match_df.drop(columns=['Question-Answer-Context Match'])
finetuning_table.to_csv(
    'gpt_finetuning_dataset.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
)

### Get Metadata of Sources for Answers to Questions for GPT-3.5 Fine-tuning Data

##### Note: The source metadata was not saved earlier, so the questions have to be queried again and the answers regenerated.

In [None]:
import pandas as pd

# Reload the fine-tuning dataset; it was written with one JSON record per line, hence lines=True.
training_examples_df = pd.read_json('gpt_finetuning_dataset.json', lines=True)
training_examples_df

In [None]:
# Each example is [system, user, assistant]; the user message at index 1 holds the question.
questions = [
    messages[1]['content']
    for messages in training_examples_df['messages'].tolist()
]
len(questions)

In [None]:
import time

from llama_index import VectorStoreIndex, SimpleDirectoryReader

# Re-read the PDFs and rebuild the vector index from scratch for this section.
documents = SimpleDirectoryReader('../../data/PDFs').load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine used below to regenerate answers together with their source nodes.
query_engine = index.as_query_engine()

In [None]:
# Re-ask every fine-tuning question and keep the metadata of the source nodes behind each answer.
questions_answers_sourcemeta = []
for question in questions:
    answer = query_engine.query(question)
    record = {
        'Question': question,
        'Answer': answer.response,
        'Source Metadata': [source.node.metadata for source in answer.source_nodes],
    }
    questions_answers_sourcemeta.append(record)
    time.sleep(2)

qa_source_df = pd.DataFrame(questions_answers_sourcemeta)
qa_source_df

In [None]:
# Remove leading/trailing whitespace from every regenerated answer.
answers = [answer.strip() for answer in qa_source_df['Answer'].tolist()]
qa_source_df['Answer'] = answers

In [None]:
# Write the results as one pretty-printed JSON array.
# `lines=True` is intentionally NOT combined with `indent`: JSON Lines requires one record
# per physical line, so using both produces a file that is neither valid JSON nor JSON Lines.
with open('gpt_finetuning_dataset_qa_sourcemetadata.json', 'w', encoding='utf-8') as file:
    qa_source_df.to_json(file, force_ascii=False, orient='records', indent=4)