In [9]:
import json, os
from tqdm import tqdm
import requests

In [10]:
from openai import OpenAI
client = OpenAI()

In [11]:
lit_data = json.load(open("lit_data.json", "r"))

In [12]:
history_lit_data = [x for x in lit_data if x["info"]["subject"] == "History"]

In [13]:
len(history_lit_data)

559

In [14]:
num_answers = [len(x["question"]['choices']) for x in history_lit_data]

In [15]:
set(num_answers)

{4}

In [16]:
history_lit_data[0]

{'id': '0e0fb6d2-7e7b-11ea-9eb1-54bef70b159e',
 'question': {'stem': '1917 m. Sovietų Rusija:',
  'choices': [{'text': 'pradėjo masinę kolektyvizaciją;', 'label': 'A'},
   {'text': 'pasirašė taikos sutartį su Vokietija;', 'label': 'B'},
   {'text': 'vykdė Naująją ekonominę politiką.', 'label': 'C'},
   {'text': 'atsisakė tęsti karą Antantės pusėje;', 'label': 'D'}]},
 'answerKey': 'D',
 'info': {'grade': 12, 'subject': 'History', 'language': 'Lithuanian'}}

In [17]:
def prepare_sample_message(sample):
    a = [x for x in sample["question"]["choices"] if x['label'] == "A"][0]["text"]
    b = [x for x in sample["question"]["choices"] if x['label'] == "B"][0]["text"]
    c = [x for x in sample["question"]["choices"] if x['label'] == "C"][0]["text"]
    d = [x for x in sample["question"]["choices"] if x['label'] == "D"][0]["text"]
    
    
    return f"""Question:\n{sample['question']['stem']}\n\nAnswers:\nA. {a}\nB. {b}\nC. {c}\nD. {d}"""

def get_pred_with_examples(
    sample,
    examples,
    system_text,
    modelname
) -> str:

    # prepare examples + final question we need an answer for
    messages = [
        {"role": "system", "content": system_text}
    ]
    for example_dict in examples:
        messages.append(
            {"role": "user", "content": example_dict["input"]}
        )
        messages.append(
            {"role": "assistant", "content": example_dict["output"]}
        )

    messages.append({"role": "user", "content": prepare_sample_message(sample)})
    
    output = client.chat.completions.create(
        model=modelname,
        messages=messages
    )

    return output.choices[0].message.content

In [18]:
# For this workshop, Nordic countries include
# Åland Islands, Denmark, Faroe Islands, Finland, Greenland, Iceland, Norway and Sweden.
# Baltic countries include Estonia, Latvia, and Lithuania.

In [42]:
target = "English"

SYSTEM = f"""You are translating texts for a history exams from Lithuanian to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


In [43]:
SYSTEM

'You are translating texts for a history exams from Lithuanian to English. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:\n\n{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}\n\nDO NOT MESS UP THE ORDER!'

In [44]:
EN_EXAMPLES = [
    {
        "input": """Question:
Kada buvo įkurta Lietuvos Didžioji Kunigaikštystė?

Answers:
A. XIII a. pradžioje
B. XIV a. viduryje
C. XV a. pradžioje
D. XII a. pabaigoje""",
        "output": json.dumps(
            {
                "question": "When was the Grand Duchy of Lithuania founded?",
                "A": "At the beginning of XIII century",
                "B": "In the middle of XIV century",
                "C": "At the beginning of XV century",
                "D": "At the end of XII century"
            }
        )
    },
    {
        "input": """Question:
Kas yra laikomas pirmuoju Lietuvos valdovu?

Answers:
A. Jogaila
B. Vytautas Didysis
C. Mindaugas
D. Gediminas""",
        "output": json.dumps(
            {
                "question": "Who is considered the first ruler of Lithuania?",
                "A": "Jogaila",
                "B": "Vytautas the Great",
                "C": "Mindaugas",
                "D": "Gediminas"
            }
        )
    },
    {
        "input": """Question:
 Kodėl Lietuva pasirinko Liublino uniją su Lenkija 1569 metais, sujungiant abiejų šalių valdžias į bendrą Abiejų Tautų Respubliką?
 
 Answers:
 A. Lietuvą skatino dažnėjantys Maskvos kunigaikštystės puldinėjimai, todėl buvo ieškoma sąjungininkų stipresnei gynybai.
 B. Abiejų šalių bajorija siekė platesnių prekybos ir ekonomikos ryšių su Vakarų Europa.
 C. Lietuvos bajorai tikėjosi išplėsti savo įtaką Lenkijos karalystėje ir stiprinti savo socialinį statusą.
 D.Lietuvos kunigaikščiai siekė užkirsti kelią Reformacijai ir įtvirtinti katalikybę kaip pagrindinę religiją abiejose šalyse.""",
        "output": json.dumps(
            {
                "question": "Why did Lithuania choose to enter into the Union of Lublin with Poland in 1569, merging both nations into the Polish-Lithuanian Commonwealth?",
                "A": "Lithuania faced increasing attacks from the Grand Duchy of Moscow, prompting a search for allies to strengthen its defense.",
                "B": "Nobility in both countries aimed to expand trade and economic connections with Western Europe.",
                "C": "Lithuanian nobles hoped to extend their influence in the Kingdom of Poland and strengthen their social standing.",
                "D": "Lithuanian dukes sought to counter the Reformation and establish Catholicism as the primary religion in both countries."
            }
        )
    }   
]


In [48]:
SAVE_DIR = f"LT-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

examples = EN_EXAMPLES
system_text = SYSTEM


for i, sample in tqdm(enumerate(history_lit_data)):
    out = get_pred_with_examples(
        sample,
        system_text=system_text,
        examples=examples,
        modelname="gpt-4o-2024-08-06"
    )

    sample["gpt-4o-2024-08-06"] = out
    
    with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
        json.dump(sample, f)

559it [18:30,  1.99s/it]


In [19]:
def parse_samples_lt_to_en(sample):
    loaded_tr = json.loads(sample["gpt-4o-2024-08-06"])

    data_tr = {
        "id": sample["id"],
        "original": sample,
        "translated": {
            "question": {
                "stem":loaded_tr["question"],
                "choices": [
                    {
                        "text": v,
                        "label": k
                    } for k,v in loaded_tr.items() if k!="question"
                ],
                'answerKey': sample["answerKey"],
                'info': {
                    'grade': sample['info']['grade'],
                    'subject': sample['info']['subject'],
                    'language': 'English'}}
            },
        "target_lang": "en",
        "source_lang": "lt"
    }
    return data_tr


parsed_lt_en = []
for filename in tqdm(os.listdir("LT-English")):
    if filename.endswith(".json"):
        parsed_sample = parse_samples_lt_to_en(
            sample=json.load(
                open(os.path.join("LT-English/", filename), "r")
            )
        )
        parsed_lt_en.append(parsed_sample)

100%|███████████████████████████████████████| 559/559 [00:00<00:00, 5278.50it/s]


In [20]:
parsed_lt_en[0]

{'id': '4227b2ea-67b2-11ea-844d-54bef70b159e',
 'original': {'id': '4227b2ea-67b2-11ea-844d-54bef70b159e',
  'question': {'stem': 'Musulmonų religijoje Mahometas yra:',
   'choices': [{'text': 'paskutinysis pranašas.', 'label': 'A'},
    {'text': 'islamo Dievas;', 'label': 'B'},
    {'text': 'arabų valstybes karalius;', 'label': 'C'},
    {'text': 'Alacho sūnus;', 'label': 'D'}]},
  'answerKey': 'A',
  'info': {'grade': 12, 'subject': 'History', 'language': 'Lithuanian'},
  'gpt-4o-2024-08-06': '{"question": "In the Muslim religion, Muhammad is:", "A": "the last prophet.", "B": "the God of Islam;", "C": "the king of Arab states;", "D": "the son of Allah;"}'},
 'translated': {'question': {'stem': 'In the Muslim religion, Muhammad is:',
   'choices': [{'text': 'the last prophet.', 'label': 'A'},
    {'text': 'the God of Islam;', 'label': 'B'},
    {'text': 'the king of Arab states;', 'label': 'C'},
    {'text': 'the son of Allah;', 'label': 'D'}],
   'answerKey': 'A',
   'info': {'grade'

In [20]:
os.makedirs("lt_translated_datasets", exist_ok=True)

with open("lt_translated_datasets/lt_to_en.json", "w") as f:
    json.dump(parsed_lt_en, f)

In [21]:
unicode_replacements = {
    "\\u00xf5": "õ",
    
    # Estonian, Latvian, and Lithuanian
    "\\u00e4": "ä", "\\u00f5": "õ", "\\u00f6": "ö", "\\u00fc": "ü",
    "\\u00d5": "Õ", "\\u00c4": "Ä", "\\u00d6": "Ö", "\\u00dc": "Ü",
    "\\u0161": "š", "\\u0160": "Š", "\\u017e": "ž", "\\u017d": "Ž",
    "\\u0101": "ā", "\\u0100": "Ā", "\\u0113": "ē", "\\u0112": "Ē",
    "\\u012b": "ī", "\\u012a": "Ī", "\\u016b": "ū", "\\u016a": "Ū",
    "\\u010d": "č", "\\u010c": "Č", "\\u0173": "ų", "\\u0172": "Ų",
    "\\u0117": "ė", "\\u0116": "Ė", "\\u0119": "ę", "\\u0118": "Ę",
    "\\u0105": "ą", "\\u0104": "Ą", "\\u0123": "ģ", "\\u0122": "Ģ",
    "\\u0137": "ķ", "\\u0136": "Ķ", "\\u013c": "ļ", "\\u013b": "Ļ",
    "\\u0146": "ņ", "\\u0145": "Ņ", "\\u0157": "ŗ", "\\u0156": "Ŗ",

    # Icelandic
    "\\u00e1": "á", "\\u00c1": "Á", "\\u00ed": "í", "\\u00cd": "Í",
    "\\u00f3": "ó", "\\u00d3": "Ó", "\\u00fa": "ú", "\\u00da": "Ú",
    "\\u00fd": "ý", "\\u00dd": "Ý", "\\u00f0": "ð", "\\u00d0": "Ð",
    "\\u00fe": "þ", "\\u00de": "Þ", "\\u00e6": "æ", "\\u00c6": "Æ",

    # Finnish and Estonian (shared with Swedish)
    "\\u00e5": "å", "\\u00c5": "Å",

    # Norwegian and Danish
    "\\u00e6": "æ", "\\u00c6": "Æ", "\\u00f8": "ø", "\\u00d8": "Ø",

    # Swedish (already includes Å, Ä, and Ö)
    "\\u00e5": "å", "\\u00c5": "Å",
}

In [22]:
def parse_tr_samples(sample, target_lang, source_lang):

    gen_text = sample["gpt-4o-2024-08-06"]
    for repl in unicode_replacements:
        gen_text = gen_text.replace(repl, unicode_replacements[repl])
    
    loaded_tr = json.loads(gen_text)

    data_tr = {
        "id": sample["id"],
        "original": sample,
        "translated": {
            "question": {
                "stem":loaded_tr["question"],
                "choices": [
                    {
                        "text": v,
                        "label": k
                    } for k,v in loaded_tr.items() if k!="question"
                ],
                'answerKey': sample['original']["answerKey"],
                'info': {
                    'grade': sample['original']['info']['grade'],
                    'subject': sample['original']['info']['subject'],
                    'language': target_lang}}
            },
        "target_lang": target_lang,
        "source_lang": source_lang
    }
    return data_tr

In [23]:
def prepare_sample_message(sample):
    a = [x for x in sample["question"]["choices"] if x['label'] == "A"][0]["text"]
    b = [x for x in sample["question"]["choices"] if x['label'] == "B"][0]["text"]
    c = [x for x in sample["question"]["choices"] if x['label'] == "C"][0]["text"]
    d = [x for x in sample["question"]["choices"] if x['label'] == "D"][0]["text"]
    
    
    return f"""Question:\n{sample['question']['stem']}\n\nAnswers:\nA. {a}\nB. {b}\nC. {c}\nD. {d}"""

def get_pred_with_examples(
    sample,
    examples,
    system_text,
    modelname
) -> str:

    # prepare examples + final question we need an answer for
    messages = [
        {"role": "system", "content": system_text}
    ]
    for example_dict in examples:
        messages.append(
            {"role": "user", "content": example_dict["input"]}
        )
        messages.append(
            {"role": "assistant", "content": example_dict["output"]}
        )

    messages.append({"role": "user", "content": prepare_sample_message(sample['translated'])})
    
    output = client.chat.completions.create(
        model=modelname,
        messages=messages
    )

    return output.choices[0].message.content

In [13]:
target = "Estonian"

EN_EST_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_EST_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "Millal taastas Leedu ametlikult iseseisvuse Nõukogude Liidust?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Mis oli Poola Kuningriigi ja Leedu Suurvürstiriigi vahel moodustatud liidu nimi?",
                "A": "Lublini unioon",
                "B": "Varssavi pakt",
                "C": "Jagielloonide liit",
                "D": "Bresti unioon"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "1940. aastal okupeeris Nõukogude Liit Leedu. Milline järgmistest oli selle okupatsiooni peamine mõju Leedu ühiskonnale?",
                "A": "Suurem poliitiline autonoomia",
                "B": "Massilised küüditamised Siberisse",
                "C": "Suurem usuvabadus",
                "D": "Kohalike tööstuste areng"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Mis järgmistest kirjeldab kõige paremini Leedu jagamise peamist tagajärge Venemaa poolt aastal 1795?",
                "A": "Leedu aadlile pakuti Venemaa impeeriumis privilegeeritud staatust, mis võimaldas neil säilitada piiratud kohalikku valitsemist vastutasuks truuduse eest tsaarile, kuigi see vähendas Leedu kultuurilise identiteedi mõju.",
                "B": "Leedu keele kasutamine avalikus hariduses ja usulistes asutustes keelustati, mis viis kirjaoskuse vähenemiseni ja tekitas põlvkondade vahelise lõhe kultuuriliste teadmiste ja pärandi osas.",
                "C": "Paljud väljapaistvad Leedu intellektuaalid ja rahvuslased küüditati või vangistati Siberis, mis tekitas intellektuaalse diasporaa, mis mõjutas rahvusliku sõltumatuse toetamist välismaistel põrandaalustel liikumistel.",
                "D": "Venemaa kehtestas Leedu kaubateedele majanduslikud piirangud, vähendades kohalike kaupade eksporti ja põhjustades piirkondliku majanduse, eriti põllumajanduse ja tekstiilitööstuse, märkimisväärse languse."
            }
        )
    }
]


In [14]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_EST_EXAMPLES
system_text = EN_EST_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    out = get_pred_with_examples(
        sample,
        system_text=system_text,
        examples=examples,
        modelname="gpt-4o-2024-08-06"
    )

    sample["gpt-4o-2024-08-06"] = out
    
    with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
        json.dump(sample, f)

    parsed_outputs.append(
        parse_tr_samples(sample, target_lang=target, source_lang=target)
    )

with open("lt_translated_datasets/en_to_est.json", "w") as f:
    json.dump(parsed_outputs, f)

100%|█████████████████████████████████████████| 559/559 [26:00<00:00,  2.79s/it]


In [15]:
target = "Finnish"

EN_FN_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_FN_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "Minä vuonna Liettua itsenäistyi virallisesti Neuvostoliitosta?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Mikä oli nimi liitolle, joka muodostettiin Puolan kuningaskunnan ja Liettuan suuriruhtinaskunnan välillä?",
                "A": "Lublinin unioni",
                "B": "Varsovan sopimus",
                "C": "Jagellonialainen liitto",
                "D": "Brestin unioni"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "Vuonna 1940 Neuvostoliitto miehitti Liettuan. Mikä seuraavista oli tämän miehityksen keskeinen vaikutus liettualaiseen yhteiskuntaan?",
                "A": "Poliittisen autonomian lisääntyminen",
                "B": "Massakarkotukset Siperiaan",
                "C": "Uskonnonvapauden lisääntyminen",
                "D": "Paikallisten teollisuudenalojen kasvu"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Mikä seuraavista parhaiten kuvaa Venäjän keisarikunnan vuonna 1795 toteuttaman Liettuan jakamisen ensisijaista vaikutusta?",
                "A": "Liettuan aatelistolle tarjottiin etuoikeutettu asema Venäjän keisarikunnassa, mikä salli heidän säilyttää rajoitetun paikallishallinnon uskollisuutta vastaan tsaariin, mutta vähensi liettualaisen kulttuuri-identiteetin vaikutusta.",
                "B": "Liettuan kieltä tukahdutettiin julkisessa opetuksessa ja uskonnollisissa laitoksissa, mikä johti kansallisen lukutaitotason laskuun ja sukupolvien väliseen kuiluun kulttuuritiedossa ja perinnössä.",
                "C": "Monet merkittävät liettualaiset älyköt ja nationalistit karkotettiin tai vangittiin Siperiaan, mikä johti älylliseen diasporaan, joka vaikutti maanalaiseen nationalistiseen liikkeeseen ulkomailla Liettuan itsenäisyyden tukemiseksi.",
                "D": "Venäjän keisarikunta määräsi joukon taloudellisia rajoituksia Liettuan kauppareiteille, mikä vähensi paikallisten tuotteiden vientiä ja aiheutti merkittävää laskusuhdannetta alueen taloudessa, erityisesti maataloudessa ja tekstiilialalla."
            }
        )
    }
]


In [24]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_FN_EXAMPLES
system_text = EN_FN_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_fin.json", "w") as f:
    json.dump(parsed_outputs, f)

  0%|                                                   | 0/559 [00:00<?, ?it/s]

288 rerun
{"question": "Mik\u00e4 oli yksi ensimm\u00e4isen maailmansodan syist\u00e4?", "A": "Bolshevikkien pyrkimys vallata Balkan.", "B": "Tyytym\u00e4tt\u00f6myys Kansainliiton passiiviseen politiikkaan.", "C": "Tyytym\u00e4tt\u00f6myys Versailles’n sopimuksen ehtoihin.", "D": "Tavoitteena jakaa maailma uudelleen suurvaltojen kesken.", "16. Mik\u00e4 arkkitehtoninen tyyli on t\u00e4m\u00e4 linna? Loire Castle, Espanja (1070)"}

{"A": "Gotiikka", "B": "Barokki", "C": "Renessanssi", "D": "Romaaninen"}


100%|█████████████████████████████████████████| 559/559 [12:25<00:00,  1.33s/it]


In [25]:
target = "Swedish"

EN_SW_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_SW_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "Vilket år återfick Litauen officiellt sin självständighet från Sovjetunionen?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Vad hette unionen som bildades mellan kungariket Polen och Storfurstendömet Litauen?",
                "A": "Lublinfördraget",
                "B": "Warszawapakten",
                "C": "Jagellonska alliansen",
                "D": "Unionen i Brest"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "År 1940 ockuperades Litauen av Sovjetunionen. Vilken av följande var en viktig följd av denna ockupation för det litauiska samhället?",                
                "A": "Ökad politisk autonomi",
                "B": "Massdeportationer till Sibirien",
                "C": "Större religiös frihet",
                "D": "En ökning av lokala industrier"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Vilken av följande beskriver bäst en primär konsekvens av den ryska imperiets uppdelning av Litauen år 1795?",
                "A": "Den litauiska adeln erbjöds en privilegierad status inom det ryska imperiet, vilket gav dem möjlighet att behålla begränsat lokalt självstyre i utbyte mot lojalitet till tsaren, även om detta minskade inflytandet av den litauiska kulturella identiteten.",
                "B": "Det litauiska språket undertrycktes i offentlig utbildning och religiösa institutioner, vilket ledde till en minskning i nationella läskunnighetsnivåer och ett generationsglapp i kulturell kunskap och arv.",
                "C": "Många framstående litauiska intellektuella och nationalister blev exilerade eller fängslade i Sibirien, vilket resulterade i en intellektuell diaspora som påverkade underjordiska nationalistiska rörelser utomlands till stöd för litauisk självständighet.",
                "D": "Det ryska imperiet införde en rad ekonomiska restriktioner på litauiska handelsvägar, vilket minskade exporten av lokala varor och orsakade betydande nedgångar i den regionala ekonomin, särskilt inom jordbruk och textilier."
            }
        )
    }
]


In [26]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_SW_EXAMPLES
system_text = EN_SW_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_sw.json", "w") as f:
    json.dump(parsed_outputs, f)

 47%|███████████████████▍                     | 265/559 [11:37<11:45,  2.40s/it]

265 rerun
{"question": "Vilken f\u00f6respr\u00e5kare av litauisk kultur organiserade n\u00fykterhetsr\u00f6relsen?", "A": "Jonas Basanavičius", "B": "Simonas Daukantas", "C": "Motiejus Valančius", "D": "Vincas Kudirka"}


100%|█████████████████████████████████████████| 559/559 [26:25<00:00,  2.84s/it]


In [27]:
target = "Latvian"

EN_LAV_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_LAV_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "Kurā gadā Lietuva oficiāli atguva neatkarību no Padomju Savienības?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Kāds bija nosaukums savienībai, kas tika izveidota starp Polijas Karalisti un Lietuvas Lielkņazisti?",
                "A": "Lublinas līgums",
                "B": "Varšavas pakts",
                "C": "Jagelloniešu alianse",
                "D": "Brestas savienība"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "1940. gadā Lietuvu okupēja Padomju Savienība. Kurš no šiem bija galvenais šīs okupācijas efekts uz Lietuvas sabiedrību?",
                "A": "Politiskās autonomijas palielināšanās",
                "B": "Masveida deportācijas uz Sibīriju",
                "C": "Lielāka reliģiskā brīvība",
                "D": "Vietējo rūpniecību pieaugums"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Kurš no šiem vislabāk raksturo galveno 1795. gada Lietuvas sadalījuma sekas Krievijas impērijas ietvaros?",
                "A": "Lietuvas muižniecībai tika piedāvāts privilēģēts statuss Krievijas impērijā, ļaujot saglabāt ierobežotu vietējo pārvaldi pretī lojalitātei caram, taču tas samazināja Lietuvas kultūras identitātes ietekmi.",
                "B": "Lietuviešu valoda tika apspiesta sabiedriskajā izglītībā un reliģiskajās institūcijās, kas noveda pie nacionālā lasītprasmes samazināšanās un paaudžu plaisas kultūras zināšanās un mantojumā.",
                "C": "Daudzi ievērojami Lietuvas intelektiķi un nacionālisti tika izsūtīti vai ieslodzīti Sibīrijā, izraisot intelektuālo diaspōru, kas ietekmēja zemgriezes nacionālistiskās kustības ārzemēs, atbalstot Lietuvas neatkarību.",
                "D": "Krievijas impērija uzlika virkni ekonomisku ierobežojumu Lietuvas tirdzniecības maršrutiem, samazinot vietējo preču eksportu un radot būtiskus ekonomikas kritumus reģionā, īpaši lauksaimniecībā un tekstilrūpniecībā."
            }
        )
    }
]


In [28]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_LAV_EXAMPLES
system_text = EN_LAV_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_lav.json", "w") as f:
    json.dump(parsed_outputs, f)

100%|█████████████████████████████████████████| 559/559 [24:29<00:00,  2.63s/it]


In [6]:
target = "Danish"

EN_DN_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_DN_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "Hvilket år genvandt Litauen officielt sin uafhængighed fra Sovjetunionen?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Hvad var navnet på unionen dannet mellem Kongeriget Polen og Storhertugdømmet Litauen?",
                "A": "Lublinunionen",
                "B": "Warszawapagten",
                "C": "Den Jagiellonske Alliance",
                "D": "Brestunionen"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "I 1940 blev Litauen besat af Sovjetunionen. Hvilket af følgende var en vigtig effekt af denne besættelse på det litauiske samfund?",
                "A": "Øget politisk autonomi",
                "B": "Massedeportationer til Sibirien",
                "C": "Større religionsfrihed",
                "D": "En stigning i lokale industrier"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Hvilket af følgende beskriver bedst en primær konsekvens af delingen af Litauen i 1795 af det russiske imperium?",
                "A": "Den litauiske adel blev tilbudt en privilegeret status inden for det russiske imperium, som gjorde det muligt for dem at bevare begrænset lokal regering i bytte for loyalitet overfor tsaren, selvom dette reducerede indflydelsen af den litauiske kulturelle identitet.",
                "B": "Det litauiske sprog blev undertrykt i offentlig uddannelse og religiøse institutioner, hvilket førte til et fald i nationale læsefærdigheder og en generationskløft i kulturel viden og arv.",
                "C": "Mange fremtrædende litauiske intellektuelle og nationalister blev eksileret eller fængslet i Sibirien, hvilket resulterede i en intellektuel diaspora, der påvirkede underjordiske nationalistiske bevægelser i udlandet til støtte for Litauens uafhængighed.",
                "D": "Det russiske imperium pålagde en række økonomiske restriktioner på litauiske handelsruter, hvilket reducerede eksporten af lokale varer og forårsagede betydelige nedgange i den regionale økonomi, især inden for landbrug og tekstiler."
            }
        )
    }
]


In [7]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_DN_EXAMPLES
system_text = EN_DN_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_dn.json", "w") as f:
    json.dump(parsed_outputs, f)

 72%|█████████████████████████████▎           | 400/559 [19:30<06:55,  2.61s/it]

400 rerun
{"question": "Hvilken politisk bev\ u00e6gelse\ s repr\ u00e6sentanter p\ u00e5stod, at kirken er s \u00f8jlen i staten og samfundet?", "A": "Konservatisme", "B": "Socialisme", "C": "Liberalisme", "D": "Anarkisme"}


 86%|███████████████████████████████████▏     | 479/559 [23:23<03:18,  2.49s/it]

479 rerun
{"question": "Hvad foranledigede implementeringen af Valakai-reformen i Litauen?", "A": "Nedgangen i godsejernes indflydelse.", "B": "\u00d8gningen i eftersp\u00f8rgslen p\u00e5 korn.", "C": Afskaffelsen af livegenskab.", "D": "Afslutningen af Lublinunionen."}


100%|█████████████████████████████████████████| 559/559 [26:46<00:00,  2.87s/it]


In [24]:
target = "Ukrainian"

EN_UA_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_UA_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "В якому році Литва офіційно здобула незалежність від Радянського Союзу?",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "Як називалося обʼєднання Королівства Польського та Великого князівства Литовського?",
                "A": "Люблінська унія",
                "B": "Варшавський договір",
                "C": "Ягеллонський Альянс",
                "D": "Берестейська унія "
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "У 1940 році Литва була окупована Радянським Союзом. Що з перерахованого було ключовим впливом цієї окупації на литовське суспільство?",
                "A": "Збільшення політичної автономії",
                "B": "Масові депортації до Сибіру",
                "C": "Більша релігійна свобода",
                "D": "Підйом місцевої промисловості"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity.
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage.
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence.
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles.""",
        "output": json.dumps(
            {
                "question": "Що з наведеного нижче найкраще описує основний наслідок поділу Литви Російською імперією у 1795 році?",
                "A": "Литовській знаті було запропоновано привілейований статус у Російській імперії, що дозволяло їй зберігати обмежене місцеве управління в обмін на лояльність до царя, хоча це зменшувало вплив литовської культурної ідентичності.",
                "B": "Литовська мова придушувалася в державній освіті та релігійних установах, що призвело до зниження рівня національної грамотності та розриву між поколіннями в культурних знаннях і спадщині.",
                "C": "Багато видатних литовських представників інтелігенції і націоналістів були заслані або ув’язнені в Сибіру, ​​в результаті чого утворилася інтелектуальна діаспора, яка вплинула на підпільні націоналістичні рухи за кордоном на підтримку незалежності Литви.",
                "D": "Російська імперія наклала низку економічних обмежень на литовські торгові шляхи, скоротивши експорт місцевих товарів і спричинивши значний спад у регіональній економіці, зокрема у сільському господарстві та текстилі."
            }
        )
    }
]


In [27]:
SAVE_DIR = f"EN-{target}-v1"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_UA_EXAMPLES
system_text = EN_UA_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f, ensure_ascii=True)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f, ensure_ascii=False)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_ua_v1.json", "w") as f:
    json.dump(parsed_outputs, f)

  6%|██▌                                       | 34/559 [03:29<53:54,  6.16s/it]

34 rerun
{"question": "\u0425\u0442\u043e \u0437 \u0446\u0438\u0445 \u043e\u0441\u0456\u0431 \u043d\u0435 \u0431\u0443\u0432 \u0432\u044f\u0437\u043d\u0435\u043d\u0438\u0439 \u043f\u0456\u0434 \u0447\u0430\u0441 \u0440\u0430\u0434\u044f\u043d\u0441\u044c\u043a\u043e\u0457 \u043e\u043a\u0443\u043f\u0430\u0446\u0456\u0457?", "A": "\u041d\u0456\u0439\u043e\u043b\u0435 \u0421\u0430\u0434\u0443\u043d\u0430\u0439\u0442\u0435", "B": "\u0421\u0456\u0433\u0456\u0442\u0430\u0441 \u0422\u0430\u043c\u043a\u0435\u0432\u0456\u0447\u044e\u0441", "C": "\u0412\u0456\u043d\u0446\u0435\u043d\u0442\u0430\u0441 \u0421\u043b\u0430\u0434\u043a\u0435\u0432\u0456\u0447\u044e\u0441", "D": "\u0410\u043d\u0442\u0430\u043d\u0430\u0441 \u0422\u0435\u0440\u043b\u0435\u0439\u043a\u0430\u0441"}




KeyboardInterrupt



In [None]:
target = "Arabic"

EN_AR_SYSTEM = f"""You are translating texts for a history exams from English to {target}. You are provided with a question and a list of answers, marked with A,B,C, and D. Make sure to capture exactly what the question and answers are about and do not add anything new to it! Do not answer the question. Output your answer in json format:

{{"question": TRANSALATED QUESTION, "A": TRANSLATED ANSWER A,  "B": TRANSLATED ANSWER B, "C": TRANSLATED ANSWER C, "D": TRANSLATED ANSWER D}}

DO NOT MESS UP THE ORDER!"""


EN_AR_EXAMPLES = [
    {
        "input": """Question:
In which year did Lithuania officially regain independence from the Soviet Union?

Answers:
A. 1988
B. 1990
C. 1991
D. 1993""",
        "output": json.dumps(
            {
                "question": "في أي عام استقلت ليتوانيا رسميًا عن الاتحاد السوفيتي؟",
                "A": "1988",
                "B": "1990",
                "C": "1991",
                "D": "1993"
            }
        )
    },
    {
        "input": """Question:
What was the name of the union formed between the Kingdom of Poland and the Grand Duchy of Lithuania?

Answers:
A. The Treaty of Lublin
B. The Warsaw Pact
C. The Jagiellonian Alliance
D. The Union of Brest""",
        "output": json.dumps(
            {
                "question": "ما هو اسم الاتحاد الذي تم تشكيله بين مملكة بولندا ودوقية ليتوانيا الكبرى؟",
                "A": " معاهدة لوبلين",
                "B": "حلف وارسو",
                "C": "التحالف الجاجيلوني",
                "D": "اتحاد بريست"
            }
        )
    },
    {
        "input": """Question:
In 1940, Lithuania was occupied by the Soviet Union. Which of the following was a key effect of this occupation on Lithuanian society?

Answers:
A. Increased political autonomy
B. Mass deportations to Siberia
C. Greater religious freedom
D. A rise in local industries""",
        "output": json.dumps(
            {
                "question": "في عام 1940، احتل الاتحاد السوفي تي ليتوانيا. أي من الآثار التالية كان تأثيرًا رئيسيًا لهذا الاحتلالعلى المجتمع الليتواني؟",
                "A": "زيادة الاستقلال السياسي",
                "B": "عمليات ترحيل جماعي إلى سيبيريا",
                "C": "حرية دينية أكبر",
                "D": "ازدهار الصناعات المحلية"
            }
        )
    },
    {
        "input": """Question:
Which of the following best describes a primary consequence of the 1795 partitioning of Lithuania by the Russian Empire?

Answers:
A. Lithuanian nobility were offered a privileged status within the Russian Empire, allowing them to retain limited local governance in exchange for loyalty to the Tsar, though this reduced the influence of Lithuanian cultural identity
B. The Lithuanian language was suppressed in public education and religious institutions, which led to a decline in national literacy rates and a generation gap in cultural knowledge and heritage
C. Many prominent Lithuanian intellectuals and nationalists were exiled or imprisoned in Siberia, resulting in an intellectual diaspora that influenced underground nationalist movements abroad in support of Lithuanian independence
D. The Russian Empire imposed a series of economic restrictions on Lithuanian trade routes, reducing the export of local goods and causing significant downturns in the regional economy, particularly in agriculture and textiles""",
        "output": json.dumps(
            {
                "question": "أي من الخيارات التالية يصف بشكل أفضل أحد النتائج الرئيسية لتقسيم ليتوانيا عام 1795 من قبل الإمبراطورية الروسية؟",
                "A": "مُنح النبلاء الليتوانيون مكانة مميزة داخل الإمبراطورية الروسية، مما سمح لهم بالحفاظ على قدر محدود من الحكم المحلي مقابل الولاء للقيصر، لكن ذلك قلل من تأثير الهوية الثقافية الليتوانية",                
                "B": "تم قمع اللغة الليتوانية في التعليم العام والمؤسسات الدينية، مما أدى إلى انخفاض معدلات محو الأمية وظهور فجوة بين الأجيال في المعرفة والثقافة التراثية",
                "C": "نُفي العديد من المثقفين والقوميين الليتوانيين البارزين أو سُجنوا في سيبيريا، مما أدى إلى ظهور شتات فكري ساهم في دعم الحركات القومية السرية في الخارج لصالح استقلال ليتوانيا",
                "D": "فرضت الإمبراطورية الروسية سلسلة من القيود الاقتصادية على طرق التجارة الليتوانية، مما قلل من صادرات البضائع المحلية وتسبب في تدهور كبير في الاقتصاد الإقليمي، خاصة في الزراعة والنسيج"
            }
        )
    }
]


In [None]:
SAVE_DIR = f"EN-{target}"
os.makedirs(SAVE_DIR, exist_ok=True)

parsed_lt_en = json.load(open("lt_translated_datasets/lt_to_en.json", "r"))

examples = EN_AR_EXAMPLES
system_text = EN_AR_SYSTEM


parsed_outputs = []

for i, sample in tqdm(enumerate(parsed_lt_en), total=len(parsed_lt_en)):
    if f"{i}.json" in os.listdir(SAVE_DIR):
        parsed_outputs.append(json.load(open(os.path.join(SAVE_DIR, f"{i}.json"), 'r')))
    else:

        try:
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        except:
            print(i, "rerun")
            print(out)
            
            out = get_pred_with_examples(
                sample,
                system_text=system_text,
                examples=examples,
                modelname="gpt-4o-2024-08-06"
            )
        
            sample["gpt-4o-2024-08-06"] = out
            
            with open(os.path.join(SAVE_DIR, f"{i}.json"), 'w') as f:
                json.dump(sample, f)
    
            parsed_outputs.append(
                parse_tr_samples(sample, target_lang=target, source_lang=target)
            )
        

with open("lt_translated_datasets/en_to_ar.json", "w") as f:
    json.dump(parsed_outputs, f)