In [1]:
import os
import dspy
from datasets import load_dataset

from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# API key comes from the environment (populated from .env by load_dotenv above);
# the value is None when the variable is not set.
key = os.getenv("OPENAI_API_KEY")

In [3]:
# Task model that answers the questions; installed as the default LM for DSPy.
lm = dspy.LM(
    "openai/gpt-4.1-mini",
    api_key=key,
    temperature=1,
    max_tokens=32000,
)
dspy.configure(lm=lm)

In [5]:
# Belebele reading-comprehension benchmark, Yoruba (Latin script) configuration.
dataset = load_dataset(path="facebook/belebele", name="yor_Latn")

Generating test split: 100%|██████████| 900/900 [00:00<00:00, 114884.46 examples/s]


In [7]:
# Inspect the only split this configuration provides ("test") and its columns.
dataset["test"]

Dataset({
    features: ['link', 'question_number', 'flores_passage', 'question', 'mc_answer1', 'mc_answer2', 'mc_answer3', 'mc_answer4', 'correct_answer_num', 'dialect', 'ds'],
    num_rows: 900
})

In [19]:
from datasets import DatasetDict

def split_and_merge_answers(dataset_dict: DatasetDict) -> DatasetDict:
    """
    Collapse the four Belebele answer columns into one list column.

    For every row of the "test" split, `mc_answer1` … `mc_answer4` are gathered,
    in order, into a new `answers` list and the four original columns are
    dropped. A missing (None) option is replaced by the placeholder
    "<NO_ANSWER>", so the list always has four entries and the 1-based
    `correct_answer_num` keeps pointing at the right position.

    Despite its name, the function does not split anything: it only touches
    the "test" split and returns it on its own.

    Args:
        dataset_dict: DatasetDict that contains a "test" split with the
            columns `mc_answer1` … `mc_answer4`.

    Returns:
        A new DatasetDict holding only the transformed "test" split.
    """
    answer_columns = [f"mc_answer{i}" for i in range(1, 5)]

    def transform(example):
        # Called once per row (non-batched map); only the new column is
        # returned, the old ones are dropped through `remove_columns` below.
        return {
            "answers": [
                example[column] if example[column] is not None else "<NO_ANSWER>"
                for column in answer_columns
            ]
        }

    # Apply the transformation to the entire test split.
    ds = dataset_dict["test"].map(transform, remove_columns=answer_columns)

    return DatasetDict({
        "test": ds,
    })


In [20]:
# Build the reshaped dataset (single `answers` list column) used by the rest of the notebook.
new_datasets = split_and_merge_answers(dataset)


In [24]:
# Spot-check one transformed row: `answers` should be a 4-item list and the mc_answer* columns gone.
new_datasets["test"][0]

{'link': 'https://en.wikibooks.org/wiki/Accordion/Right_hand',
 'question_number': 1,
 'flores_passage': 'Rí i dájú pé ọwọ́ rẹ ní ìsinmi bó se yẹ nígbàtí o bá ń kọlu gbogbo àkíyèsí tí ó tó - pẹ̀lú pé ki o má dààmú ọmọ ìka ọwọ́ rẹ jùlọ. Bó ba n ṣe báyí kò ní rẹ̀ ẹ́ jù. Rántí pé o kò níláti fi agbára tẹ ojú rẹ̀ kí ohun rẹ̀ ga síi bí ti piano. Láti mú ohun akodioni tóbi si wàá lò àwọn belò pẹ̀lú agbára tàbí ere sí i.',
 'question': 'Gẹgẹ bi oju ewe naa, kini a o pe ni ojulowo itọni fun imọ titẹ àkọdiọnu daadaa??',
 'correct_answer_num': '1',
 'dialect': 'yor_Latn',
 'ds': datetime.datetime(2023, 6, 1, 0, 0),
 'answers': ['Fun afikun iwọn didun, lo agbara pipọ pẹlu eyiti o lu awọn bọtini naa',
  'Maṣe maa rin ainidi kiri ki o le ni okun si',
  'Ṣe akiyesi lilu awọn nooti lakoko mimu ọwọ rẹ rọ',
  'Mu iyara pọ si pẹlu eyi ti o fi ṣẹ awọn ohun agbara isalẹ lati ṣaṣeyọri iwọn didun afikun']}

In [27]:

def _to_examples(rows):
    """Wrap dataset rows as dspy.Example objects with passage/question/answer_list as inputs."""
    return [
        dspy.Example({
            "passage": row["flores_passage"],
            "question": row["question"],
            "answer_list": row["answers"],
            "correct_answer_number": row["correct_answer_num"],
        }).with_inputs("passage", "question", "answer_list")
        for row in rows
    ]


def init_dataset(dataset=None, seed=42, n_train=50, n_val=50, n_test=400):
    """
    Shuffle the dataset and cut it into train / validation / test example lists.

    The splits are consecutive, non-overlapping slices of the shuffled data:
    the first `n_train` rows, the next `n_val` rows and the next `n_test` rows.
    Rows beyond `n_train + n_val + n_test` are left unused.

    Args:
        dataset: Dataset with the columns `flores_passage`, `question`,
            `answers` and `correct_answer_num`. Defaults to the notebook-level
            `new_datasets["test"]`.
        seed: Seed passed to `shuffle`, so the split is reproducible.
        n_train: Number of training examples.
        n_val: Number of validation examples.
        n_test: Number of test examples.

    Returns:
        Tuple `(train_set, val_set, test_set)` of lists of dspy.Example.

    Raises:
        ValueError: If the dataset has fewer rows than the three splits need.
    """
    if dataset is None:
        dataset = new_datasets["test"]

    required = n_train + n_val + n_test
    if required > len(dataset):
        raise ValueError(
            f"Need {required} rows for the requested splits, "
            f"but the dataset only has {len(dataset)}."
        )

    shuffled = dataset.shuffle(seed=seed)
    val_end = n_train + n_val

    train_set = _to_examples(shuffled.select(range(0, n_train)))
    val_set = _to_examples(shuffled.select(range(n_train, val_end)))
    test_set = _to_examples(shuffled.select(range(val_end, required)))

    return train_set, val_set, test_set

In [28]:
# Materialise the three example lists and confirm their sizes.
train_set, val_set, test_set = init_dataset()

tuple(map(len, (train_set, val_set, test_set)))

(50, 50, 400)

In [29]:
# Show the first training example field by field.
print("Question:")
print(train_set[0]['question'])
# Two newlines, like the other headers; "\P" is an invalid escape sequence
# and would print a literal backslash in front of "Passage:".
print("\n\nPassage:")
print(train_set[0]['passage'])
print("\n\nAnswer List:")
print(train_set[0]['answer_list'])
print("\n\nCorrect Answer:")
print(train_set[0]['correct_answer_number'])

Question:
Gẹgẹ bi oju ewe naa, tani o kọ orin ifarajin ni akoko ipese ounjẹ?

\Passage:
Eto naa bere ni dede ago mejo abo ale ti ile naa (15.00 UTC). Awon ogbontarigi olorin kakiri orileede sagbekale bhajans abi orin isin si abe ese Shyam. Akọrin Sanju Ṣama ló bẹ̀rẹ̀ etò ìrọ̀lẹ́, Jai Ṣanka Ṣudiri ló tẹ́lẹ̀ Jai Sanaka Sandiri naa. Akọrin Raju Kandiwali n kọri tẹ́lẹ̀. Lẹ́yìn náà, Lakkha Singh ni ó gba ìdarí kíkọ orin bhajans. Àwọn Chhappan Bhog méjìdínláàdọ́fà (ni Hinduism, mẹ́rìndílọ́gọ́ta ọ̀tọ̀tọ̀ ohun tó ṣe é jẹ, bíi súùtì èso, ẹ̀pà, ounjẹ abbl. Tí wọ́n gbé fún àwọn òrìṣà) ni wọ́n gbé fún Baba Shyam. Lakkha Singh ti se ìfàkalẹ̀ chhappan bhog bhajan pẹ̀lú. Olórin Raju Khandelwal ló ń kọ́ọ̀wọ́rìn pẹ̀lú rẹ̀.


Answer List:
['Lakkha Singh', 'Jai Shankar Choudhary', 'Shri Shyam', 'Sanju Sharma']


Correct Answer:
1


In [30]:
# Show the first test example field by field.
print("Question:")
print(test_set[0]['question'])
# Two newlines, like the other headers; "\P" is an invalid escape sequence
# and would print a literal backslash in front of "Passage:".
print("\n\nPassage:")
print(test_set[0]['passage'])
print("\n\nAnswer List:")
print(test_set[0]['answer_list'])
print("\n\nCorrect Answer:")
print(test_set[0]['correct_answer_number'])

Question:
Gẹgẹ bi oju ewe naa, ewo ninu wọnyi nipa idẹnukọlẹ Luno ni kii ṣe otitọ?

\Passage:
Ọkọ̀ ojú omi náà gbé ọgọ́fà sí ọgọ́jọ ìwọ̀n mita epo nígbà tí ó dẹnu kọlẹ̀ tí ìjì àti ìgbì tì í sínú ibi ọkọ̀ omi . Oko ofurufu gba awom mejila to je osise inu baalu na la ati pe enikan lo fi imu sese. Ọkọ̀ ojú omi 100 metre tí padà láti lọ gbé àwọn ajílẹ̀ àwọn òṣìṣk sì bẹ̀rù pé ọkọ̀ náà lè dànù lọ́nà.


Answer List:
['Idẹnukọlẹ naa waye ṣaaju gbigbe ẹru', 'Ọkọ naa ni awọn mita 100 ti epo ni ori rẹ lakoko idẹnukọle', 'Ninu awọn ọmọ ẹgbẹ mejila, ọkan ni ifarapa', 'Wọn ti ọkọ naa sinu idina omi']


Correct Answer:
2


In [31]:
# Signature of the multiple-choice task: three inputs (question, passage,
# answer_list) and one output (correct_answer_number). The input names are the
# same ones init_dataset marks via with_inputs(...).
# NOTE(review): the class docstring and the `desc` strings are presumably sent
# to the model as prompt text, so treat them (and the field order) as runtime
# strings rather than documentation — confirm before rewording.
class ReadingComprehension(dspy.Signature):
    """Answer a multiple-choice question using the passage."""
    question = dspy.InputField(desc="The question to answer from the passage.")
    passage = dspy.InputField(desc="The passage that contains the answer.")
    answer_list = dspy.InputField(desc="List of answer options.")
    # No type is declared for the output; both metrics normalise it with str(...).strip().
    correct_answer_number = dspy.OutputField(desc="Return the 1-based index of the correct option.")

# Baseline program; the evaluation tables show a `reasoning` column next to the
# predicted answer number for programs built this way.
program = dspy.ChainOfThought(ReadingComprehension)

In [32]:
def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """
    Exact-match accuracy for one example: 1 if the predicted option number
    equals the gold one, else 0.

    Args:
        example: Mapping-like object exposing `correct_answer_number`
            (the gold 1-based option number) via `example[...]`.
        prediction: Object expected to carry a `correct_answer_number`
            attribute with the model's answer.
        trace, pred_name, pred_trace: Accepted for compatibility with the
            callers of the metric; not used here.

    Returns:
        int: 1 for a correct answer; 0 for a wrong answer, for anything that
        is not a bare option number in "1".."4", or when the prediction has no
        `correct_answer_number` at all.
    """
    correct_answer = str(example["correct_answer_number"]).strip()
    # getattr with a default: a prediction without the field scores 0 instead
    # of raising AttributeError (same handling as metric_with_feedback).
    llm_answer = str(getattr(prediction, "correct_answer_number", "")).strip()

    if llm_answer not in {"1", "2", "3", "4"}:
        return 0

    return int(llm_answer == correct_answer)


In [33]:
# Evaluator bound to the 400 held-out test examples and the exact-match metric;
# it is reused later for the optimized program.
evaluate = dspy.Evaluate(
    devset=test_set, metric=metric,
    num_threads=32,
    display_progress=True, display_table=True,
)

# Baseline score of the un-optimized program.
evaluate(program)

Average Metric: 215.00 / 400 (53.8%): 100%|██████████| 400/400 [01:03<00:00,  6.33it/s]

2025/12/18 05:34:39 INFO dspy.evaluate.evaluate: Average Metric: 215 / 400 (53.8%)





Unnamed: 0,passage,question,answer_list,example_correct_answer_number,reasoning,pred_correct_answer_number,metric
0,Ọkọ̀ ojú omi náà gbé ọgọ́fà sí ọgọ́jọ ìwọ̀n mita epo nígbà tí ó dẹ...,"Gẹgẹ bi oju ewe naa, ewo ninu wọnyi nipa idẹnukọlẹ Luno ni kii ṣe ...","['Idẹnukọlẹ naa waye ṣaaju gbigbe ẹru', 'Ọkọ naa ni awọn mita 100 ...",2,"Lati inu oju ewe ti a fun, o sọ pe ọkọ oju omi naa ti gbe epo rẹ s...",1,✔️ [0]
1,Àwọn ilé ìtura kan ma n ní ohun ìní láti àkókò iyebíye ti relúwé e...,Ewo ninu awọn atẹle yii ni o kere julọ lati rii ni iru awọn ile it...,"['Ibi imuti oriṣiriṣi', 'Ile ounjẹ ayika alarinrin', 'Awọn alejo ẹ...",4,"Ninu passage ti a fun, ile itura ti a ṣapejuwe jẹ́ ti igba atijọ, ...",4,✔️ [1]
2,Jas 39C lọ kọlu ibi tí ènìyàn ma ń gbà ní ago mẹsán àbọ̀ (0230 UTC...,Kini o ṣẹlẹ si ọkọ ina ti o dahun si ijamba ti JAS 39C Gripen?,"[O ni ijamba, O ṣubu, O gbina, Gaasi tan ninu rẹ]",2,"Ninu passage, a sọ pé ọkọ̀ panápaná ilé isé ọkọ̀ òfuurufú subú níg...",2,✔️ [1]
3,Bọ́ọ̀mù ṣiṣẹ́ lóri pé ó gba agbára láti jẹ́ kó wà paps pẹ̀lú nucle...,"Gẹgẹ bi oju ewe naa, kini o n ṣẹlẹ nigba ti nukilọsi ba pinya?","[O gba purọtonu, Agbara jade , Awọn atọmu n ṣe segesege , Ipadanu ...",2,"Ninu oju ewe naa, a sọ pe bọ́ọ̀mù n gba agbára lati jẹ ki o wa pap...",2,✔️ [1]
4,Ọpẹlopé àsopọ̀ okùn fibre lábẹ́ òkun tsi Yuropi ati boolu gbohung...,"Gẹgẹ bi oju iwe naa, kini awọn arinrin-ajo ti o n lọ si Greenland ...","['Awọn agbegbe kan ti a mọ fun awọn oṣuwọn irufin pupọ', 'Mu aṣọ t...",4,"Láti inú ọ̀rọ̀ tó wà nínú ìpínrọ̀ náà, ó dájú pé àwọn arinrin-ajo ...",4,✔️ [1]
...,...,...,...,...,...,...,...
395,"Ètò PBS ní tó àmì ẹ̀yẹ Emmy tó lé ní dọ́sìnì méjì, Ó sì ma ń kéré...",Ọkọọkan iṣẹlẹ Kika Oṣumare ni o fun awọn ọmọde ni awọn iyanju fun ...,"[Awọn afihan Amohunmaworan, Iyara ikawe agbegbe, Awọn akọle, Awọn ...",4,"Nínú àpèjúwe náà, Johanu Granti sọ pé Kika Oṣumare n fi ìmòràn fún...",4,✔️ [1]
396,Awon ara Bailoni ko tempili gbendeke fun awon orisa kookan won ti ...,Ewo ninu wọnyi ni o wa fun awọn alufa?,"[Awọn ajọdun, Ikọkọ awọn ibi mimọ, Awọn ayẹyẹ, Awọn pẹpẹ giga onig...",2,"Ninu passage, a sọ pe tẹmpili kọọkan ni ""ojúde àgbàlà tẹ́ḿpìlì àt...",2,✔️ [1]
397,Àwọn ìyípadà lẹ ní àwọn oríṣi àbéjádẹ pèlú dídálórí irú àyípadà tó...,"Dida lori alaye ti a funni ni oju ewe naa, kini awọn ipa oriṣi awọ...","['Meloo jiini ohun elo naa ni o kan', 'Boya awọn sẹẹli naa jẹ ila-...",1,"Gẹgẹ bi alaye ti a funni ninu oju ewe naa, awọn ipa orisi awọn ayi...",1,✔️ [1]
398,"Wíwọ ọkọ̀ rẹ, pẹ̀lú lílọ sí ọ̀nà jínjìn ní àyò abínibí ní ọ̀nà tó ...","Gẹgẹ bi oju iwe naa, kini awọn ọlọkọ apagọ fẹ lati tẹra fun bi wọn...","['Wiwa ọkọ nla, bii SUV tabi ọkọ ajagbe kekere', 'Rinrin irin-ajo ...",3,"Ninu oju iwe naa, o sọ pe ""Ibudo inuoko seese ti o ba ni oko nla m...",1,✔️ [0]


EvaluationResult(score=53.75, results=<list of 400 results>)

In [34]:

def metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """
    Score one prediction and explain the score in natural language.

    Same 0/1 scoring as exact match on the option number, but the result also
    carries a feedback text (used as the GEPA metric in this notebook) that
    names the correct option, its text and, when available, the passage.

    Args:
        example: Example exposing `answer_list`, `correct_answer_number` and
            optionally `passage`.
        prediction: Object expected to carry `correct_answer_number`; a
            missing attribute is treated as an invalid (empty) answer.
        trace, pred_name, pred_trace: Accepted for compatibility with the
            callers of the metric; not used here.

    Returns:
        dspy.Prediction with `score` (0 or 1) and `feedback` (str).
    """
    answers = example["answer_list"]
    gold = str(example["correct_answer_number"]).strip()
    passage = example.get("passage", "")

    pred = str(getattr(prediction, "correct_answer_number", "")).strip()

    # Valid option numbers as strings, kept in numeric order for display.
    valid = [str(i) for i in range(1, len(answers) + 1)]
    gold_text = answers[int(gold) - 1]

    if pred not in valid:
        score = 0
        feedback_text = (
            # Trailing space so the two sentences do not run together.
            f"You are to return a single option number in {valid}. "
            f"You returned '{pred}' which is not a valid option. "
            f"The correct option is `{gold}` in the answer list, which corresponds to: {gold_text}."
        )
    elif pred == gold:
        score = 1
        feedback_text = (
            f"Your answer is correct! It is option `{gold}` in the answer list, "
            f"which corresponds to: `{gold_text}`."
        )
    else:
        score = 0
        feedback_text = (
            f"Your answer is incorrect! "
            f"The correct option is `{gold}` in the answer list, which corresponds to: `{gold_text}`."
        )

    # The passage hint is appended for every outcome, including correct answers.
    if passage:
        feedback_text += (
            f"\n\n Here is the passage that contains the correct answer:\n{passage}\n\n"
            "Next time: locate the sentence that directly answers the question, then map it to the option list."
        )

    return dspy.Prediction(score=score, feedback=feedback_text)

In [35]:
from dspy import GEPA

# Prompt optimizer driven by the feedback-producing metric; a separate
# (gpt-5) LM is supplied as the reflection model.
optimizer = GEPA(
    metric=metric_with_feedback,
    reflection_lm=dspy.LM(model="gpt-5", api_key=key, temperature=1.0, max_tokens=32000),
    reflection_minibatch_size=3,
    auto="light",
    seed=42,
    num_threads=32,
    track_stats=True,
)

# Long-running: the recorded log for this cell spans roughly an hour and a half.
optimized_program = optimizer.compile(program, trainset=train_set, valset=val_set)

2025/12/18 06:07:27 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 580 metric calls of the program. This amounts to 5.80 full evals on the train+val set.
2025/12/18 06:07:27 INFO dspy.teleprompt.gepa.gepa: Using 50 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/580 [00:00<?, ?rollouts/s]2025/12/18 06:07:41 INFO dspy.evaluate.evaluate: Average Metric: 37.0 / 50 (74.0%)
2025/12/18 06:07:41 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.74
GEPA Optimization:   9%|▊         | 50/580 [00:13<02:27,  3.59rollouts/s]2025/12/18 06:07:41 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.74


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:09<00:00,  3.09s/it]

2025/12/18 06:07:50 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 06:08:36 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: Task: Answer a multiple‑choice question using only the provided passage.

Input format:
- passage: A short text (often in Yorùbá).
- question: A multiple‑choice question about the passage.
- answer_list: A list of options. Indexing is 1-based (the first option is 1).

Your job:
1) Understand the question type.
   - Identify whether it asks for who/what/which, a responsibility/role, or a “not mentioned/except/which is NOT” item.
   - Watch polarity carefully in Yorùbá: phrases like “kí ni a kò mẹ́nuba,” “ẹ̀yà tí kì í ṣe,” “yàtọ̀ sí,” “except,” indicate a negative/NOT question.

2) Locate the exact supporting sentence(s) in the passage.
   - Find the line that directly answers the question.
   - Do not rely on external knowledge or inference beyond what is stated.
   - Attribute correctly: choose the entity that performs the action in the passage (e.g., if the passage says A appointed B, and B

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:02<00:00,  1.05it/s] 

2025/12/18 06:08:51 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/18 06:09:31 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: Task: Answer a multiple‑choice question using only the provided passage.

Input format:
- passage: A short text (often in Yorùbá).
- question: A multiple‑choice question about the passage.
- answer_list: A list of options. Indexing is 1-based (the first option is 1).

Your job:
1) Understand the question type.
   - Identify whether it asks for who/what/which, a responsibility/role, or a NOT/EXCEPT item.
   - Pay close attention to negative polarity in Yorùbá: phrases like “kí ni a kò mẹ́nuba,” “ẹ̀yà tí kì í ṣe,” “yàtọ̀ sí,” “ẹ̀yà tí kò bá mu,” “àfi/except,” “eyiti kii ṣe,” indicate a negative/NOT question.
   - If the question asks “eyi wo ni o ṣeese julọ…,” still select the item explicitly stated in the passage, not an inferred/related concept.

2) Locate the exact supporting sentence(s) in the passage.
   - Find the line that directly answers the question. Do not rely on external knowledge

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:04<00:00,  1.61s/it] 

2025/12/18 06:09:53 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 06:40:47 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: Task: Answer a multiple-choice question using only the given passage (often in Yoruba). Return the index of the correct option from the provided answer_list.

Input format:
- passage: A short text in Yoruba.
- question: A question in Yoruba about the passage.
- answer_list: A list of options (1-based ordering).

Output format:
- Provide two fields:
  - reasoning: A brief explanation of how the passage supports the chosen option. Keep it concise and cite the specific clause(s) that answer the question.
  - correct_answer_number: The 1-based index (integer) of the chosen option.

General approach:
1. Carefully read the passage and the question. Work only with information explicitly stated in the passage; do not use external knowledge.
2. Locate the exact sentence(s) or clause(s) in the passage that directly answer the question. Prefer explicit statements over implied ones.
3. Map that evidence

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:02<00:00,  1.11it/s] 

2025/12/18 06:41:03 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/18 06:42:03 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: Task
- Answer a multiple-choice question using only the given Yoruba passage. Return the index of the correct option from the provided answer_list.

Input format
- passage: A short text in Yoruba.
- question: A Yoruba question about the passage.
- answer_list: A list of options (1-based ordering).

Output format
- Provide exactly two top-level fields:
  - reasoning: A brief explanation of how the passage supports the chosen option. Keep it concise and cite the specific clause(s) from the passage that answer the question.
  - correct_answer_number: The 1-based index (integer) of the chosen option.
- Do not add extra fields or formatting beyond these two fields.

General approach
1. Read the passage and question carefully. Use only information explicitly stated in the passage. Do not use outside knowledge or make inferences not grounded in the text.
2. Identify the exact sentence(s) or clause(

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:08<00:00,  2.84s/it]

2025/12/18 06:42:15 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 06:43:11 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: Task: Answer a multiple-choice question using ONLY the given passage.

Input format:
- passage: a short text (often in Yorùbá)
- question: one specific query about the passage
- answer_list: a list of options (1-indexed) in the same language as the passage/question

Your goal:
- Locate the exact line(s)/phrase(s) in the passage that directly answer the question.
- Map that evidence precisely to one option in answer_list.
- Be careful with logical qualifiers (e.g., “kii ṣe” = not, “ṣẹẹ̀ṣe/kò ṣẹẹ̀ṣe” = may/may not, “sábà” = usually, not always).
- Prefer the option whose wording most closely and explicitly matches the passage, rather than a looser paraphrase or inference.
- Do not use outside knowledge; rely only on the passage for truth/falsehood.

Output format (machine-parseable):
- reasoning: 1–3 concise sentences explaining why the chosen option matches the passage, quoting or paraphrasin

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:02<00:00,  1.15it/s] 

2025/12/18 06:43:28 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 06:44:02 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: Task: Answer a multiple-choice question using only the given passage (often in Yoruba). Return the index of the correct option from the provided answer_list.

Input format:
- passage: A short text in Yoruba.
- question: A question in Yoruba about the passage.
- answer_list: A list of options (1-based ordering).

Output format:
- Provide two fields:
  - reasoning: A brief explanation of how the passage supports the chosen option. Keep it concise and cite the specific clause(s) that answer the question (quote the exact Yoruba phrase(s) when possible).
  - correct_answer_number: The 1-based index (integer) of the chosen option.

General approach:
1. Read the passage and question carefully. Use only what is explicitly stated in the passage; do not use external knowledge or assumptions.
2. Identify the exact sentence(s) or clause(s) in the passage that directly answer the question. Prefer explici

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:01<00:00,  1.56it/s] 

2025/12/18 06:44:26 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 06:55:14 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: Task: Answer a multiple-choice question using only the given passage (often in Yoruba). Return the index of the correct option from the provided answer_list.

Input format:
- passage: A short text in Yoruba (or mostly Yoruba).
- question: A question in Yoruba about the passage.
- answer_list: A list of options (1-based ordering).

Output format:
- Return exactly two fields (no extra text):
  - reasoning: A brief explanation in Yoruba that cites the exact clause(s) from the passage that answer the question. Keep it concise and quote the key phrase(s).
  - correct_answer_number: The 1-based index (integer) of the chosen option.

General approach:
1. Read the passage and question carefully. Use ONLY information explicitly stated in the passage; do not use outside knowledge.
2. Locate the exact sentence(s) or clause(s) that directly answer the question. Prefer explicit statements over implied on

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:03<00:00,  1.14s/it]

2025/12/18 06:55:29 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/12/18 06:56:15 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: Task: Answer a multiple-choice question using ONLY the given passage.

Input format:
- passage: a short text (often in Yorùbá)
- question: one precise query about the passage
- answer_list: a list of options (1-indexed) in the same language as the passage/question

Your goal:
- Locate the exact line(s)/phrase(s) in the passage that directly answer the question.
- Map that evidence precisely to one option in answer_list.
- Prefer the option whose wording most closely and explicitly matches the passage, rather than a looser paraphrase or inference.
- Do not use outside knowledge; rely only on the passage for truth/falsehood.

Output format (machine-parseable):
- reasoning: 1–3 concise sentences in the passage’s language (Yorùbá when possible) explaining why the chosen option matches the passage. Quote or tightly paraphrase the key phrase(s) that justify your choice. If the question asks for th

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:03<00:00,  1.16s/it]

2025/12/18 06:56:27 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/12/18 06:56:27 INFO dspy.teleprompt.gepa.gepa: Iteration 9: All subsample scores perfect. Skipping.
2025/12/18 06:56:27 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Reflective mutation did not propose a new candidate
GEPA Optimization:  78%|███████▊  | 451/580 [49:00<11:38,  5.41s/rollouts]2025/12/18 06:56:27 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Selected program 0 score: 0.74



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:05<00:00,  1.81s/it] 

2025/12/18 06:56:33 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/18 07:07:27 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: Task:
Answer a multiple‑choice question strictly using the provided passage.

Inputs:
- passage: A short text (often in Yorùbá) containing the information needed.
- question: A multiple‑choice question about the passage.
- answer_list: A list of answer options (1‑based order).

Output:
Provide:
- reasoning: 1–3 concise sentences explaining the choice by citing or paraphrasing the specific part of the passage that answers the question.
- correct_answer_number: The 1‑based index of the chosen option from answer_list.

Language:
- Write the reasoning in the same language as the passage/question (often Yorùbá). Keep terminology consistent with the passage.

Method (locate → match → map):
1) Understand the question:
   - Identify exactly what is being asked (e.g., which item had a problem; what information authorities confirmed; what a capacity helps people do vs what the capacity is).
   - Note

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:04<00:00,  1.46s/it]

2025/12/18 07:07:34 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/12/18 07:08:14 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: Task: Answer a multiple-choice question using only the provided passage.

Input format:
- passage: A short text (often in Yoruba; keep diacritics as-is).
- question: A multiple-choice question about the passage (same language as the passage).
- answer_list: An ordered list of answer options. Indexing starts at 1.

Your goal:
- Select the single option from answer_list that is best supported by the passage.

Strict rules and strategy:
1) Read the passage carefully in its original language. Do not translate. Do not use outside knowledge.
2) Locate the exact sentence(s) or phrase(s) in the passage that directly answer the question. Prefer explicit statements over inference.
3) Map the located statement to the option list:
   - Choose the option whose wording most directly matches the passage’s wording or meaning.
   - If multiple options seem plausible, pick the one that is most specific to th

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]

2025/12/18 07:08:24 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/12/18 07:19:24 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: Task
- Answer a multiple-choice question using only the provided passage.

Input format
- passage: A short text (often in Yoruba; keep diacritics as-is).
- question: A multiple-choice question about the passage (same language as the passage).
- answer_list: An ordered list of answer options. Indexing starts at 1.

Your goal
- Select the single option from answer_list that is best supported by the passage.

Strict rules and strategy
1) Read the passage carefully in its original language. Do not translate. Do not use outside knowledge.
2) Locate the exact sentence(s) or phrase(s) in the passage that directly answer the question. Prefer explicit statements over inference.
3) Map the located statement to the option list:
   - Choose the option whose wording most directly matches the passage’s wording or meaning.
   - Prefer exact lexical matches from the passage when available (e.g., words like

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:03<00:00,  1.24s/it]

2025/12/18 07:19:39 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/18 07:20:30 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for predict: Task: Answer a multiple-choice question using only the given passage.

Inputs:
- passage: A short text (may be in Yoruba or other languages, possibly with diacritics/orthographic variants).
- question: A question about the passage (can be factual, “who/what/where/when/how,” or a “which of the following is NOT …” type).
- answer_list: An ordered list of options (1-based indexing).

Output format:
- Provide exactly two top-level fields:
  - reasoning: 1–3 concise sentences explaining how the passage supports the choice. Prefer quoting or paraphrasing the exact clause that answers the question.
  - correct_answer_number: The 1-based index (integer) of the chosen option in answer_list. Do not output the option text.

General guidelines:
1. Use ONLY information from the passage. Do not rely on outside knowledge or speculation.
2. Locate the sentence or clause in the passage that most directly an

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:09<00:00,  3.17s/it]

2025/12/18 07:20:43 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/18 07:21:26 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: Task: Answer a multiple-choice question using the provided passage (often in Yorùbá). Choose exactly one option from the given answer_list.

How to solve:
1) Read the entire passage carefully. Do not use outside knowledge; base your answer only on the passage.
2) Parse the question precisely:
   - Identify the subject, action, and object.
   - Pay special attention to negation and qualifiers in Yorùbá (e.g., “kò”, “kì í”, “kọ̀”, “kò fọwọ́ sí”, “kò gba”), tense, and question focus (who/what/when/where/which).
3) Locate the sentence(s) in the passage that directly answer the question. Prefer explicit statements over general context.
4) Map the evidence to the answer_list:
   - Find the option that best paraphrases or exactly matches the relevant phrase(s) in the passage.
   - Account for Yorùbá orthographic/diacritic variations and synonyms (e.g., “ọ̀pá ààbò” ≈ “bàà ààbò/baa alaboo”; “akókò t

In [36]:
# Re-run the same test-set evaluation on the GEPA-optimized program for comparison with the baseline.
evaluate(optimized_program)

Average Metric: 216.00 / 400 (54.0%): 100%|██████████| 400/400 [01:01<00:00,  6.52it/s]

2025/12/18 07:56:42 INFO dspy.evaluate.evaluate: Average Metric: 216 / 400 (54.0%)





Unnamed: 0,passage,question,answer_list,example_correct_answer_number,reasoning,pred_correct_answer_number,metric
0,Ọkọ̀ ojú omi náà gbé ọgọ́fà sí ọgọ́jọ ìwọ̀n mita epo nígbà tí ó dẹ...,"Gẹgẹ bi oju ewe naa, ewo ninu wọnyi nipa idẹnukọlẹ Luno ni kii ṣe ...","['Idẹnukọlẹ naa waye ṣaaju gbigbe ẹru', 'Ọkọ naa ni awọn mita 100 ...",2,"Idẹnukọlẹ náà ṣẹlẹ̀ nígbà tí ọkọ ojú omi naa ""gbé ọgọ́fà sí ọgọ́jọ...",1,✔️ [0]
1,Àwọn ilé ìtura kan ma n ní ohun ìní láti àkókò iyebíye ti relúwé e...,Ewo ninu awọn atẹle yii ni o kere julọ lati rii ni iru awọn ile it...,"['Ibi imuti oriṣiriṣi', 'Ile ounjẹ ayika alarinrin', 'Awọn alejo ẹ...",4,"Passage describes old-style hotels characterized by wealth, tradit...",4,✔️ [1]
2,Jas 39C lọ kọlu ibi tí ènìyàn ma ń gbà ní ago mẹsán àbọ̀ (0230 UTC...,Kini o ṣẹlẹ si ọkọ ina ti o dahun si ijamba ti JAS 39C Gripen?,"[O ni ijamba, O ṣubu, O gbina, Gaasi tan ninu rẹ]",2,"Passage states ""kọ́ọ̀kọ́ panápaná ilé ìṣé ọkọ̀ òfuurufú ṣubú nígbà...",2,✔️ [1]
3,Bọ́ọ̀mù ṣiṣẹ́ lóri pé ó gba agbára láti jẹ́ kó wà paps pẹ̀lú nucle...,"Gẹgẹ bi oju ewe naa, kini o n ṣẹlẹ nigba ti nukilọsi ba pinya?","[O gba purọtonu, Agbara jade , Awọn atọmu n ṣe segesege , Ipadanu ...",2,Oju ewe naa sọ pé bọ́ọ̀mù ṣiṣẹ́ nitori agbára rẹ lati jẹ́ ki nucle...,2,✔️ [1]
4,Ọpẹlopé àsopọ̀ okùn fibre lábẹ́ òkun tsi Yuropi ati boolu gbohung...,"Gẹgẹ bi oju iwe naa, kini awọn arinrin-ajo ti o n lọ si Greenland ...","['Awọn agbegbe kan ti a mọ fun awọn oṣuwọn irufin pupọ', 'Mu aṣọ t...",4,"Awọn arinrin-ajo gbọdọ yago fun lilo ọrọ ""Eskimo"" nitori o jẹ eebu...",4,✔️ [1]
...,...,...,...,...,...,...,...
395,"Ètò PBS ní tó àmì ẹ̀yẹ Emmy tó lé ní dọ́sìnì méjì, Ó sì ma ń kéré...",Ọkọọkan iṣẹlẹ Kika Oṣumare ni o fun awọn ọmọde ni awọn iyanju fun ...,"[Awọn afihan Amohunmaworan, Iyara ikawe agbegbe, Awọn akọle, Awọn ...",4,"Passage states that Kika Oṣumare encourages children ""lati gbe iwe...",4,✔️ [1]
396,Awon ara Bailoni ko tempili gbendeke fun awon orisa kookan won ti ...,Ewo ninu wọnyi ni o wa fun awọn alufa?,"[Awọn ajọdun, Ikọkọ awọn ibi mimọ, Awọn ayẹyẹ, Awọn pẹpẹ giga onig...",2,"Ìpinnu wa lati inu ìpínrọ̀ náà ni pé ""Tẹ́m̀pìlì kọ̀ọ̀kan ló ní ojú...",2,✔️ [1]
397,Àwọn ìyípadà lẹ ní àwọn oríṣi àbéjádẹ pèlú dídálórí irú àyípadà tó...,"Dida lori alaye ti a funni ni oju ewe naa, kini awọn ipa oriṣi awọ...","['Meloo jiini ohun elo naa ni o kan', 'Boya awọn sẹẹli naa jẹ ila-...",1,Alaye ninu oju èwè náà sọ pé awọn ipa ayipada jẹ̀mọ̀ pẹ̀lú irú a...,4,✔️ [0]
398,"Wíwọ ọkọ̀ rẹ, pẹ̀lú lílọ sí ọ̀nà jínjìn ní àyò abínibí ní ọ̀nà tó ...","Gẹgẹ bi oju iwe naa, kini awọn ọlọkọ apagọ fẹ lati tẹra fun bi wọn...","['Wiwa ọkọ nla, bii SUV tabi ọkọ ajagbe kekere', 'Rinrin irin-ajo ...",3,"Ìwé náà sọ pé ""Ibudo inuoko seese ti o ba ni oko nla mnifaani, SUV...",1,✔️ [0]


EvaluationResult(score=54.0, results=<list of 400 results>)

In [38]:
from pathlib import Path

# Root folder for this benchmark's artefacts; file names are keyed by model and language.
output_dir = Path("results/belebele")

model_name = "openai-gpt-4-1-mini"
lang = "yor"

base_dir = output_dir.joinpath("base")
optimized_dir = output_dir.joinpath("optimized")
programs_dir = output_dir.joinpath("programs")

# Only the programs folder is created and written: dumping the evaluation
# results of the base/optimized programs as JSON into base_dir / optimized_dir
# is currently switched off.
programs_dir.mkdir(parents=True, exist_ok=True)

program_path = programs_dir.joinpath(f"{model_name.replace('/', '-')}_{lang}.json")

# NOTE(review): save_program=False presumably stores only the program state
# as JSON rather than the whole program — confirm against the DSPy docs.
optimized_program.save(program_path, save_program=False)

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/ajayi/Documents/Atinuda/.venv/lib/python3.11/site-packages/zmq/eventloop/zmqstream.py", line 565, in _log_error
    f.result()
  File "/Users/ajayi/Documents/Atinuda/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 341, in dispatch_control
    await self.process_control(msg)
  File "/Users/ajayi/Documents/Atinuda/.venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 347, in process_control
    idents, msg = self.session.feed_identities(msg, copy=False)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ajayi/Documents/Atinuda/.venv/lib/python3.11/site-packages/jupyter_client/session.py", line 994, in feed_identities
    raise ValueError(msg)
ValueError: DELIM not in msg_list
ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/ajayi/Documents/Atinuda/.venv/lib