#### Set Up

In [None]:
import os
from utils import OpenAIClient
from dotenv import load_dotenv

# Populate the process environment from a local .env file (python-dotenv)
# before the key is looked up; the lookup yields None if the variable is unset.
load_dotenv()
openai_client = OpenAIClient(api_key=os.environ.get('OPENAI_API_KEY'))

In [None]:
import json

# Locations of the raw page texts for the two languages (the `en` / `vi`
# suffixes presumably stand for English and Vietnamese -- confirm).
raw_texts_en_path = '../data/raw_texts_en.json'
raw_texts_vi_path = '../data/raw_texts_vi.json'

# Bind both names up front; they are overwritten by the loads below.
raw_texts_en = []
raw_texts_vi = []

# Decode explicitly as UTF-8. Without `encoding`, open() falls back to the
# locale's preferred encoding, which on non-UTF-8 systems (e.g. Windows
# cp1252) either raises UnicodeDecodeError or silently corrupts non-ASCII
# characters in the JSON text.
with open(raw_texts_en_path, encoding='utf-8') as f:
    raw_texts_en = json.load(f)

with open(raw_texts_vi_path, encoding='utf-8') as f:
    raw_texts_vi = json.load(f)

In [None]:
# Read the prompt template as text. The encoding is pinned to UTF-8 so the
# template decodes the same way on every platform instead of depending on the
# locale's default encoding.
with open('alignment_prompt.txt', 'r', encoding='utf-8') as f:
    alignment_prompt = f.read()

#### Labeling

In [None]:
# Boundary lists, one per language. Presumably these are page ids that delimit
# matching segments of the two documents (entry k in one list pairs with
# entry k in the other) -- confirm against how the boundaries were produced.
en_segmentations = [
    2, 5, 10, 17, 18, 25, 30, 34, 37,
    39, 43, 44, 48, 49, 53, 54, 56, 58,
    63, 64, 66, 73, 77, 81, 86, 87, 87,
]
vi_segmentations = [
    3, 7, 13, 20, 22, 30, 37, 41, 45,
    48, 52, 53, 58, 60, 65, 66, 69, 71,
    78, 80, 81, 90, 95, 98, 105, 106, 106,
]

# Short boundary lists with the same leading values as the full lists above.
en_segmentations_test = [2, 5, 10]
vi_segmentations_test = [3, 7, 13]

# Both languages must define the same number of boundaries.
assert len(vi_segmentations) == len(en_segmentations)

In [None]:
from tqdm import tqdm

def get_texts_in_range(raw_texts, start_page, end_page):
    """Join the texts of all entries whose page id lies in a page range.

    Args:
        raw_texts: Iterable of mappings; each must provide 'page_id', and
            entries inside the range must also provide 'texts' (a string).
        start_page: First page id to include (inclusive).
        end_page: Last page id to include (inclusive).

    Returns:
        The selected 'texts' values in input order, separated by single
        spaces; an empty string when no entry falls inside the range.
    """
    selected = []
    for record in raw_texts:
        # Skip pages outside the inclusive [start_page, end_page] window.
        if not (start_page <= record['page_id'] <= end_page):
            continue
        selected.append(record['texts'])
    return " ".join(selected)

# Build one alignment prompt per pair of adjacent boundaries in the *_test
# lists: segment i covers pages boundary[i] through boundary[i + 1], with both
# ends included by get_texts_in_range.
for i in tqdm(range(len(en_segmentations_test) - 1)):
    nxt = i + 1
    en_range = en_segmentations_test[i], en_segmentations_test[nxt]
    vi_range = vi_segmentations_test[i], vi_segmentations_test[nxt]

    # Collect the page texts of both languages for this segment.
    en_first, en_last = en_range
    vi_first, vi_last = vi_range
    texts_en_curr = get_texts_in_range(raw_texts_en, en_first, en_last)
    texts_vi_curr = get_texts_in_range(raw_texts_vi, vi_first, vi_last)

    # Fill the template; in this cell the prompt is only printed.
    prompt = alignment_prompt.format(vi_text=texts_vi_curr, en_text=texts_en_curr)
    print(prompt)