In [None]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its variables into the process environment.
dotenv_file = find_dotenv()
_ = load_dotenv(dotenv_file)

# Hand the key to the openai module; the value is None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# Chat model instance passed as `llm` to every LLMChain built in this notebook.
# NOTE(review): temperature=1 presumably raises sampling randomness so repeated
# calls yield varied questions — confirm this is the intent.
chat = ChatOpenAI(temperature=1)

In [None]:
from langchain.chains import SequentialChain

### Generate Questions for a Given Topic

In [None]:
# Stage 1: ask the model for an English question about {topic}.
# The template is assembled from adjacent string literals rather than a
# backslash continuation inside a single literal, because the continuation
# form embeds the next source line's indentation in the middle of the prompt.
first_prompt = ChatPromptTemplate.from_template(
    "Generate a question related to "
    "the topic of {topic}?"
)

# The generated text is published under "english_question" so that a later
# stage can reference it as {english_question}.
chain_one = LLMChain(llm=chat, prompt=first_prompt, output_key="english_question")

In [None]:
# Stage 2: translate the English question into Vietnamese.
# Adjacent string literals are used instead of a backslash continuation inside
# one literal, because the continuation form embeds the next source line's
# indentation in the middle of the prompt.
second_prompt = ChatPromptTemplate.from_template(
    "Translate the following question to "
    "Vietnamese: {english_question}"
)

# Consumes {english_question}; publishes its result as "vietnamese_question".
chain_two = LLMChain(llm=chat, prompt=second_prompt, output_key="vietnamese_question")

In [None]:
# Stage 3: answer the Vietnamese question, in Vietnamese.
# Adjacent string literals are used instead of a backslash continuation inside
# one literal, because the continuation form embeds the next source line's
# indentation in the middle of the prompt.
third_prompt = ChatPromptTemplate.from_template(
    "Answer the following question in "
    "Vietnamese: {vietnamese_question}"
)

# Consumes {vietnamese_question}; publishes its result as "vietnamese_answer".
chain_three = LLMChain(llm=chat, prompt=third_prompt, output_key="vietnamese_answer")

In [None]:
# End-to-end pipeline for topic-driven generation.
#   input : topic
#   output: english_question, vietnamese_question, vietnamese_answer
# All three stage results are requested as outputs, not just the last one.
overall_chain = SequentialChain(
    verbose=True,
    input_variables=["topic"],
    output_variables=[
        "english_question",
        "vietnamese_question",
        "vietnamese_answer",
    ],
    chains=[chain_one, chain_two, chain_three],
)

In [None]:
import pandas as pd

topics = ["water intrusion", "salinity intrusion"]

# Run the pipeline 100 times per topic, keeping a result only the first time
# its Vietnamese question text is encountered.
seen_qas = set()
qas = []
for topic in topics:
    for attempt in range(100):
        qa = overall_chain(topic)
        qa['topic'] = topic
        question_vi = qa['vietnamese_question']
        if question_vi in seen_qas:
            # Exact duplicate of an earlier Vietnamese question: drop it.
            continue
        seen_qas.add(question_vi)
        qas.append(qa)

# One row per retained result dict.
df_qa_total = pd.DataFrame(qas)

In [None]:
# Export the first-iteration results table to an Excel workbook.
df_qa_total.to_excel('../../data/first_iteration_qa_pairs.xlsx')

In [None]:
# Export only the 'english_question' column to CSV.
df_qa_total['english_question'].to_csv('../../data/init_english_questions.csv')

In [None]:
# Export the first-iteration results as a JSON array of row objects.
# force_ascii=False writes the Vietnamese text as literal UTF-8 characters
# instead of \uXXXX escape sequences; the parsed content is the same, but the
# file stays human-readable and smaller.
df_qa_total.to_json(
    '../../data/first_iteration_qa_pairs.json',
    orient='records',
    force_ascii=False,
)

### Generate Paraphrased Questions

In [None]:
# Read the sample questions, one per line.
# The `with` block closes the file handle as soon as reading finishes.
# Lines that are empty after stripping are skipped so that no empty question
# is later sent through the paraphrasing chain.
with open('../../data/salinity_intrusion_sample_questions.txt') as questions_file:
    sample_questions = [
        line.strip() for line in questions_file if line.strip()
    ]
sample_questions

In [None]:
# Stage 1 of the paraphrasing pipeline: reword the supplied {question}.
# This rebinds first_prompt / chain_one, replacing the topic-based versions
# defined earlier in the notebook.
# NOTE(review): the template appends "?" after {question}; if the sample
# questions already end with "?", the prompt will contain "??" — confirm.
paraphrase_template = "Paraphrase the following question: {question}?"
first_prompt = ChatPromptTemplate.from_template(paraphrase_template)

# The paraphrase is published under "english_question".
chain_one = LLMChain(
    output_key="english_question",
    prompt=first_prompt,
    llm=chat,
)

In [None]:
# Stage 2: translate the paraphrased English question into Vietnamese.
# Adjacent string literals are used instead of a backslash continuation inside
# one literal, because the continuation form embeds the next source line's
# indentation in the middle of the prompt.
second_prompt = ChatPromptTemplate.from_template(
    "Translate the following question to "
    "Vietnamese: {english_question}"
)

# Consumes {english_question}; publishes its result as "vietnamese_question".
chain_two = LLMChain(llm=chat, prompt=second_prompt, output_key="vietnamese_question")

In [None]:
# Stage 3: answer the Vietnamese question, in Vietnamese.
# Adjacent string literals are used instead of a backslash continuation inside
# one literal, because the continuation form embeds the next source line's
# indentation in the middle of the prompt.
third_prompt = ChatPromptTemplate.from_template(
    "Answer the following question in "
    "Vietnamese: {vietnamese_question}"
)

# Consumes {vietnamese_question}; publishes its result as "vietnamese_answer".
chain_three = LLMChain(llm=chat, prompt=third_prompt, output_key="vietnamese_answer")

In [None]:
# End-to-end pipeline for paraphrase-driven generation.
#   input : question
#   output: english_question, vietnamese_question, vietnamese_answer
# This rebinds overall_chain, replacing the topic-based pipeline.
overall_chain = SequentialChain(
    verbose=True,
    input_variables=["question"],
    output_variables=[
        "english_question",
        "vietnamese_question",
        "vietnamese_answer",
    ],
    chains=[chain_one, chain_two, chain_three],
)

In [None]:
# Send each sample question through the pipeline 10 times, keeping a result
# only the first time its Vietnamese question text is encountered.
seen_qas = set()
paraphrased_qas = []
for sq in sample_questions:
    for attempt in range(10):
        qa = overall_chain(sq)
        question_vi = qa['vietnamese_question']
        if question_vi in seen_qas:
            # Exact duplicate of an earlier Vietnamese question: drop it.
            continue
        seen_qas.add(question_vi)
        paraphrased_qas.append(qa)

# One row per retained result dict; shown as the cell output.
df_paraphrased_qas = pd.DataFrame(paraphrased_qas)
df_paraphrased_qas

In [None]:
# Export the second-iteration (paraphrased) results as JSON records and Excel.
# force_ascii=False writes the Vietnamese text as literal UTF-8 characters
# instead of \uXXXX escape sequences; the parsed content is the same, but the
# file stays human-readable and smaller.
df_paraphrased_qas.to_json(
    '../../data/second_iteration_qa_pairs.json',
    orient='records',
    force_ascii=False,
)
df_paraphrased_qas.to_excel('../../data/second_iteration_qa_pairs.xlsx')