In [1]:
import os
import dspy
from datasets import load_dataset

from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
key = os.environ.get("OPENAI_API_KEY")

In [3]:
lm = dspy.LM("openai/gpt-4.1-mini", temperature=1, api_key=key, max_tokens=32000)
dspy.configure(lm=lm)

In [4]:
from tasks.baselines.afriqa import _parse_str_list

In [5]:

def init_dataset():
    train_split = load_dataset("masakhane/afriqa-gold-passages", "yor")["train"]
    train_split = [
        dspy.Example({
            "question_lang": x['question_lang'],
            'context': x['context'],
            'answer_lang': _parse_str_list(x['answer_lang']),
        }).with_inputs("question_lang", "context")
        for x in train_split
    ]
    import random
    random.Random(0).shuffle(train_split)
    train_split = train_split[:100]
    tot_num = len(train_split)

    test_split = load_dataset("masakhane/afriqa-gold-passages", "yor")['test']
    test_split = [
        dspy.Example({
            "question_lang": x['question_lang'],
            'context': x['context'],
            'answer_lang': _parse_str_list(x['answer_lang']),
        }).with_inputs("question_lang", "context")
        for x in test_split
    ]

    train_set = train_split[:int(0.5 * tot_num)]
    val_set = train_split[int(0.5 * tot_num):]
    test_set = test_split

    return train_set, val_set, test_set

In [6]:
train_set, val_set, test_set = init_dataset()

len(train_set), len(val_set), len(test_set)

(50, 50, 253)

In [7]:
print("Question:")
print(train_set[0]['question_lang'])
print("\n\Context:")
print(train_set[0]['context'])
print("\n\nAnswer:")
print(train_set[0]['answer_lang'])

Question:
Ọdun wo s'ọdun wo ni Olúṣẹ́gun Ọ̀ṣọbà jẹ  Gómìnà Ìpínlẹ̀ Ògùn n'ilẹ Naijiria?

\Context:
Osoba was elected on two occasions as Governor of Ogun State first from January 1992 until November 1993 with the Social Democratic Party (SDP). He was removed from office by Sani Abacha's administration on 17 November 1993. In the 1999 Ogun State gubernatorial election, he was elected again as governor with the Alliance for Democracy party (AD), holding office between May 1999 and May 2003.


Answer:
['oṣu kini ọdun 1992 titi di oṣu kọkanla ọdun 1993']


In [8]:
print("Question:")
print(val_set[0]['question_lang'])
print("\n\Context:")
print(val_set[0]['context'])
print("\n\nAnswer:")
print(val_set[0]['answer_lang'])

Question:
Ki l'orukọ aja bulu (bull dog) to wa ninu ere 'Tom and Jerry'?

\Context:
Spike, occasionally referred to as Butch or Killer, is a stern though occasionally dim-witted grey bulldog who is particularly disapproving of cats, but is gentle towards mice (though in his debut appearance, Dog Trouble (1942), Spike goes after both Tom and Jerry), and later, his son Tyke.


Answer:
['Spike']


In [9]:
print("Question:")
print(test_set[0]['question_lang'])
print("\n\Context:")
print(test_set[0]['context'])
print("\n\nAnswer:")
print(test_set[0]['answer_lang'])

Question:
Ọmọ ọdun melo ni Cosmas Maduka nigbati o fẹ iyawo rẹ Charity?

\Context:
The startup also failed sooner than expected and Maduka went on to found Coscharis Motor with the sum of three hundred nairas (N300) which focused on sales of automobile spare parts in 1977. The name of the company according to him is a combination of his first name, Cosmos, and his wife, Charity, whom he married at age 21. Maduka's business breakthrough started in 1982 when the Nigerian government granted ten (10) motor companies import licenses, for which Coscharis was selected.


Answer:
['21']


In [10]:
class GenerateResponse(dspy.Signature):
    """Answer the question using the context provided."""
    question_lang = dspy.InputField()
    context  = dspy.InputField()
    answer_lang   = dspy.OutputField(desc="Short final answer only")

program = dspy.ChainOfThought(GenerateResponse)

In [15]:
import ast
import re
import unicodedata
from collections import Counter

In [11]:
def _strip_accents(s: str) -> str:
    return "".join(ch for ch in unicodedata.normalize("NFD", s)
                   if unicodedata.category(ch) != "Mn")

def _normalize(s: str) -> str:
    s = (s or "").strip().lower()
    s = _strip_accents(s)
    s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _token_f1(pred: str, gold: str) -> float:
    p = _normalize(pred).split()
    g = _normalize(gold).split()
    if not p and not g: return 1.0
    if not p or not g: return 0.0
    common = Counter(p) & Counter(g)
    num_same = sum(common.values())
    if num_same == 0: return 0.0
    prec = num_same / len(p)
    rec  = num_same / len(g)
    return (2 * prec * rec) / (prec + rec)

In [12]:
def metric_em(example, prediction, **_):
    golds = example.get("answer_lang")
    pred  = getattr(prediction, "answer_lang", "")
    pn = _normalize(pred)
    return float(any(pn == _normalize(g) for g in golds))

def metric_f1(example, prediction, **_):
    golds = example.get("answer_lang")
    pred  = getattr(prediction, "answer_lang", "")
    return max((_token_f1(pred, g) for g in golds), default=0.0)

In [13]:
def metrics(example, prediction, trace=None, **kwargs):
    em = metric_em(example, prediction, trace=trace)
    f1 = metric_f1(example, prediction, trace=trace)
    return {"em": em, "f1": f1}

In [16]:
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=metric_em,
    num_threads=32,
    display_table=True,
    display_progress=True
)

evaluate(program)

Average Metric: 64.00 / 253 (25.3%): 100%|██████████| 253/253 [00:00<00:00, 524.52it/s]

2025/12/07 00:08:31 INFO dspy.evaluate.evaluate: Average Metric: 64.0 / 253 (25.3%)





Unnamed: 0,question_lang,context,example_answer_lang,reasoning,pred_answer_lang,metric_em
0,Ọmọ ọdun melo ni Cosmas Maduka nigbati o fẹ iyawo rẹ Charity?,The startup also failed sooner than expected and Maduka went on to...,[21],"The context clearly states that Cosmas Maduka married his wife, Ch...",21 ọdún,✔️ [0.000]
1,Igbawo ni Popu John Paul II ku ?,Pope John Paul II (Latin: Ioannes Paulus II; Italian: Giovanni Pao...,[2005],Popu John Paul II jẹ olori Ile ijọ Katoliki Romu ati olori ipinle ...,2 Oṣù Kẹrin ọdun 2005,✔️ [0.000]
2,Ta lẹni to jẹ gomina ti Eko nigba ti wọn pari kikọ National Arts T...,The building of the National Theatre started when General Yakubu G...,[Oluṣẹgun ọbasanjọ],"Nigbati a pari kikọ National Arts Theatre ni Eko, Olusegun Obasanj...",Lateef Jakande,✔️ [0.000]
3,Ipinlẹ wo l'Amẹrika ni Wikimedia Foundation kọ olu ile iṣẹ wọn si?,"Wikimedia Foundation, Inc. (WMF, also colloquially referred to as ...",[California],Niwọn igba ti Wikimedia Foundation jẹ ile-iṣẹ ti kii ṣe èrè ti o w...,California,✔️ [1.000]
4,Eroja pataki wo ni awọn ara Igbo fi maa n se ounjẹ abacha ?,Abacha is popular in the Eastern part of Nigeria. It is made using...,[Gbaguda],"Lati inu ọrọ tí a fúnni, à ń rí i pé abacha jẹ oúnjẹ tí a ṣe pẹ̀lú...","Cassava tí wọn rẹ̀, epo pupa (palm oil), crayfish, àti ugba.",✔️ [0.000]
...,...,...,...,...,...,...
248,Toonu cashu melo ni wọn gbejade lagbaye lọdun 2019?,"In 2019, four million tonnes of cashew nuts were produced globally...",[four million],"Ninu ọrọ ti a fi silẹ, o sọ pe ni ọdun 2019, a gbe cashew nuts mej...",4 million tonnes,✔️ [0.000]
249,Ọmọ ọdun melo ni Maryam Abacha nigbati Sani abacha ku ?,As of 2000 Maryam Abacha remained in Nigeria and continued to proc...,[ọmọbinrin mẹta ati ọmọkunrin meje],The context provided does not include the birth year of Maryam Aba...,A ko le sọ ọdun Maryam Abacha nigbati Sani Abacha ku nitori a ko f...,✔️ [0.000]
250,Ami ẹyẹ goolu Olympic melo ni Usain Bolt ni lapapo?,"An eight-time Olympic gold medallist, Bolt is the only sprinter to...",[Mẹ́jọ],"Usain Bolt won gold medals in three consecutive Olympics: 2008, 20...",8,✔️ [0.000]
251,Ọdun wo ni Orlando Owoh bẹrẹ si n kọrin ?,As a young man Owoh initially entered into the carpentry trade unt...,[1958],Orlando Owoh started his musical career when he was hired by Niger...,1958,✔️ [1.000]


EvaluationResult(score=25.3, results=<list of 253 results>)

In [17]:
evaluate_with_f1 = dspy.Evaluate(
    devset=test_set,
    metric=metric_f1,
    num_threads=32,
    display_table=True,
    display_progress=True
)

evaluate_with_f1(program)

Average Metric: 109.68 / 253 (43.4%): 100%|██████████| 253/253 [00:00<00:00, 738.54it/s]

2025/12/07 00:08:40 INFO dspy.evaluate.evaluate: Average Metric: 109.68127179997552 / 253 (43.4%)





Unnamed: 0,question_lang,context,example_answer_lang,reasoning,pred_answer_lang,metric_f1
0,Ọmọ ọdun melo ni Cosmas Maduka nigbati o fẹ iyawo rẹ Charity?,The startup also failed sooner than expected and Maduka went on to...,[21],"The context clearly states that Cosmas Maduka married his wife, Ch...",21 ọdún,✔️ [0.667]
1,Igbawo ni Popu John Paul II ku ?,Pope John Paul II (Latin: Ioannes Paulus II; Italian: Giovanni Pao...,[2005],Popu John Paul II jẹ olori Ile ijọ Katoliki Romu ati olori ipinle ...,2 Oṣù Kẹrin ọdun 2005,✔️ [0.333]
2,Ta lẹni to jẹ gomina ti Eko nigba ti wọn pari kikọ National Arts T...,The building of the National Theatre started when General Yakubu G...,[Oluṣẹgun ọbasanjọ],"Nigbati a pari kikọ National Arts Theatre ni Eko, Olusegun Obasanj...",Lateef Jakande,✔️ [0.000]
3,Ipinlẹ wo l'Amẹrika ni Wikimedia Foundation kọ olu ile iṣẹ wọn si?,"Wikimedia Foundation, Inc. (WMF, also colloquially referred to as ...",[California],Niwọn igba ti Wikimedia Foundation jẹ ile-iṣẹ ti kii ṣe èrè ti o w...,California,✔️ [1.000]
4,Eroja pataki wo ni awọn ara Igbo fi maa n se ounjẹ abacha ?,Abacha is popular in the Eastern part of Nigeria. It is made using...,[Gbaguda],"Lati inu ọrọ tí a fúnni, à ń rí i pé abacha jẹ oúnjẹ tí a ṣe pẹ̀lú...","Cassava tí wọn rẹ̀, epo pupa (palm oil), crayfish, àti ugba.",✔️ [0.000]
...,...,...,...,...,...,...
248,Toonu cashu melo ni wọn gbejade lagbaye lọdun 2019?,"In 2019, four million tonnes of cashew nuts were produced globally...",[four million],"Ninu ọrọ ti a fi silẹ, o sọ pe ni ọdun 2019, a gbe cashew nuts mej...",4 million tonnes,✔️ [0.400]
249,Ọmọ ọdun melo ni Maryam Abacha nigbati Sani abacha ku ?,As of 2000 Maryam Abacha remained in Nigeria and continued to proc...,[ọmọbinrin mẹta ati ọmọkunrin meje],The context provided does not include the birth year of Maryam Aba...,A ko le sọ ọdun Maryam Abacha nigbati Sani Abacha ku nitori a ko f...,✔️ [0.000]
250,Ami ẹyẹ goolu Olympic melo ni Usain Bolt ni lapapo?,"An eight-time Olympic gold medallist, Bolt is the only sprinter to...",[Mẹ́jọ],"Usain Bolt won gold medals in three consecutive Olympics: 2008, 20...",8,✔️ [0.000]
251,Ọdun wo ni Orlando Owoh bẹrẹ si n kọrin ?,As a young man Owoh initially entered into the carpentry trade unt...,[1958],Orlando Owoh started his musical career when he was hired by Niger...,1958,✔️ [1.000]


EvaluationResult(score=43.35, results=<list of 253 results>)

In [19]:

def metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    golds = example.get("answer_lang", [])
    if isinstance(golds, str):
        golds = [golds]
    golds = [g.strip() for g in golds if isinstance(g, str) and g.strip()]

    context = example.get("context", "")

    pred = getattr(prediction, "answer_lang", "")
    pred = (pred or "").strip()

    if not pred:
        feedback_text = (
            "The final answer must be a non-empty short string and nothing else. "
            f"You responded with {pred!r}."
        )
        if golds:
            feedback_text += f" The correct exact-match answer is one of: {golds}."
        if context:
            feedback_text += (
                f"\n\nHere's some context useful in answering the question:\n{context}\n\n"
                "Think about what takeaways you can learn from this context to improve your future answers "
                "and approach to similar question."
            )
        return dspy.Prediction(score=0.0, feedback=feedback_text)

    score = float(any(pred == g for g in golds))

    if score == 1.0:
        feedback_text = f"Your answer is correct. The correct answer is '{pred}'."
    else:
        feedback_text = (
            "Your answer is incorrect. The correct and exact-match answer is "
            f"one of: {golds}."
        )

    if context:
        feedback_text += (
            f"\n\nHere's some context useful in answering the question:\n{context}\n\n"
            "Think about what takeaways you can learn from this context to improve your future answers "
            "and approach to similar question."
        )

    return dspy.Prediction(score=score, feedback=feedback_text)


In [20]:
from dspy import GEPA

optimizer = GEPA(
    metric=metric_with_feedback,
    auto="light",
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=3,
    seed=42,
    reflection_lm=dspy.LM(model="gpt-5", temperature=1.0, max_tokens=32000, api_key=key)
)

optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

2025/12/07 00:10:19 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 580 metric calls of the program. This amounts to 5.80 full evals on the train+val set.
2025/12/07 00:10:19 INFO dspy.teleprompt.gepa.gepa: Using 50 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget. GEPA requires you to provide the smallest valset that is just large enough to match your downstream task distribution, while providing as large trainset as possible.
GEPA Optimization:   0%|          | 0/580 [00:00<?, ?rollouts/s]2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 9.0 / 50 (18.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.18
GEPA Optimization:   9%|▊         | 50/580 [00:00<00:01, 390.23rollouts/s]2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.18


Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:00<00:00, 653.59it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

General rules:
- Use only facts stated in the context. Do not rely on outside knowledge unless explicitly encoded in the special cases below.
- Return exactly one short answer string. Do not include reasoning, punctuation beyond what appears in the target string, quotes, parentheses, acronyms, or extra words.
- Prefer the most specific form present in the context (e.g., a full date over just a year) if the question asks “when.”
- For named entities, output the base name exactly as it appears in the context, but:
  - Remove a leading “


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 474.15it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

General rules:
- Use only facts stated in the context. Do not rely on outside knowledge unless explicitly covered by the special cases below.
- Return exactly one short answer string. No reasoning, no extra words, no quotes, no leading/trailing spaces, and no trailing punctuation not present in the target string.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Prefer the most specific form present in the context (e.g




2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 50 (34.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New program is on the linear pareto front
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full valset score for new program: 0.34
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Full train_val score for new program: 0.34
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Individual valset scores for new program: [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 2: New valset pareto front scores: [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 481.22it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output:
- Return exactly one short answer string.
- No reasoning, no extra words, no quotes, no leading/trailing spaces, and no trailing punctuation not present in the target string.

Core rules:
- Use only facts explicitly stated in the context. Do not rely on outside knowledge.
- Identify the required fact type (yes/no, year, date, name, place, etc.).
- Match the requested granularity:
  - If the question asks for a year, return only the year (apply the Yorùbá “Ọdun” special case below when applicable).
  - If the question asks “wh


Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:00<00:00, 610.14it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the




2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 18.0 / 50 (36.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New program is on the linear pareto front
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full valset score for new program: 0.36
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Full train_val score for new program: 0.36
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Individual valset scores for new program: [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 4: New valset pareto front scores: [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 510.92it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- When the question is in English, preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as the




2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 50 (34.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full valset score for new program: 0.34
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Full train_val score for new program: 0.34
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Individual valset scores for new program: [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 5: New valset pareto front scores: [1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:00<00:00, 484.44it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)
2025/12/07 00:10:20 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: Task: Given a question (often in Yorùbá) and an English context passage, answer using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often Yorùbá)
- context: a short passage (English) containing the needed fact

Output:
- Return exactly one short answer string.
- No reasoning, no extra words, no quotes, no leading/trailing spaces, and no trailing punctuation not present in the target string.

Core rules:
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below applies.
- Choose the smallest text span that directly answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear i


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 581.95it/s]

2025/12/07 00:10:20 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/07 00:41:24 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Prefer the most specific form present

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:00<00:00, 372.62it/s]

2025/12/07 00:41:27 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/07 00:42:17 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output
- Return exactly one short answer string.
- No explanations, no reasoning text, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation that isn’t in the target string.

Core rules
- Use only facts stated or directly implied in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest text span that directly answers the question.
- Match the question language:
  - If the question is in English, preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the 

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]

2025/12/07 00:42:22 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/12/07 00:43:10 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Match the type of answer requested (e

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:01<00:00,  1.66it/s] 

2025/12/07 00:43:19 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/07 00:44:10 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output:
- Exactly one short answer string.
- No reasoning, no extra words, no quotes, no emojis.
- No leading/trailing spaces.
- No trailing punctuation that isn’t part of the answer itself.

Core rules:
- Use only facts stated in the context. Do not rely on outside knowledge unless a special-case canonical answer below applies.
- Choose the smallest text span that directly answers the question.
- Prefer the most specific form provided in the context (e.g., a full date over just a year if the question asks “when” and the context provides it).
- Preserve the exact wording, spelling, diacritics, capitalization, an

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:01<00:00,  2.82it/s] 

2025/12/07 00:44:16 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/07 00:55:43 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Core output rules:
- Output exactly one short answer string. No reasoning, no explanations, no quotes, no leading/trailing spaces, and no trailing punctuation not present in the target string.
- Use only facts stated in the context. Do not rely on outside knowledge unless a special case below explicitly instructs it.
- Preserve the exact wording, spelling, capitalization, diacritics, hyphenation, and spacing as they appear in the context (or in the special-case canonical answers below).
- Prefer the most specific form present in the context (e.g., a full date over just a year) if the question asks “when,” subjec

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]

2025/12/07 00:55:50 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/12/07 01:06:47 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: Task: Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format:
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output:
- Return exactly one short answer string.
- No reasoning, no extra words, no quotes, no leading/trailing spaces, and no trailing punctuation not present in the target string.

Core rules:
- Use only facts stated in the context. Do not rely on outside knowledge unless explicitly covered by the special cases below.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Prefer the most specific form present in the context (e.g., a full date over just a year) if the question asks “when.”
- Choose t

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:02<00:00,  1.34it/s] 

2025/12/07 01:06:55 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/12/07 01:07:44 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly and uniquely answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Match the type of answe

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:01<00:00,  2.63it/s]

2025/12/07 01:07:47 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/12/07 01:08:26 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- When multiple equivalent numeric/measurement formats are present (e.g., metric and imperial), prefer the shorter, most concise span. If the desired value appears in parentheses, output only the content

In [21]:
print(optimized_program.predict.signature.instructions)


Task
Answer the question using ONLY the information in the provided context, and return a single, exact-match answer string with no explanations.

Input format
- question_lang: the question (often in Yorùbá)
- context: a short passage (often in English) containing the needed fact

Output format
- Return exactly one short answer string.
- No explanations, no reasoning, no quotes.
- No leading/trailing spaces.
- Do not add trailing punctuation not present in the target string.

Core rules
- Use only facts stated in the context. Do not rely on outside knowledge, except where a special-case canonical answer below explicitly applies.
- Choose the smallest span that directly answers the question.
- Preserve the exact wording, spelling, capitalization, diacritics, and hyphenation as they appear in the context (or in the listed special-case canonical answers).
- Prefer the most specific form present in the context when the question asks “when” (e.g., a full date over just a year), unless a spe

In [22]:
evaluate(optimized_program)


Average Metric: 105.00 / 253 (41.5%): 100%|██████████| 253/253 [00:14<00:00, 17.96it/s]

2025/12/07 02:18:12 INFO dspy.evaluate.evaluate: Average Metric: 105.0 / 253 (41.5%)





Unnamed: 0,question_lang,context,example_answer_lang,reasoning,pred_answer_lang,metric_em
0,Ọmọ ọdun melo ni Cosmas Maduka nigbati o fẹ iyawo rẹ Charity?,The startup also failed sooner than expected and Maduka went on to...,[21],The context states that Maduka married Charity at age 21.,21,✔️ [1.000]
1,Igbawo ni Popu John Paul II ku ?,Pope John Paul II (Latin: Ioannes Paulus II; Italian: Giovanni Pao...,[2005],The passage states that Pope John Paul II was born on 18 May 1920 ...,2 April 2005,✔️ [0.000]
2,Ta lẹni to jẹ gomina ti Eko nigba ti wọn pari kikọ National Arts T...,The building of the National Theatre started when General Yakubu G...,[Oluṣẹgun ọbasanjọ],The National Arts Theatre was completed when Olusegun Obasanjo was...,Olusegun Obasanjo,✔️ [1.000]
3,Ipinlẹ wo l'Amẹrika ni Wikimedia Foundation kọ olu ile iṣẹ wọn si?,"Wikimedia Foundation, Inc. (WMF, also colloquially referred to as ...",[California],The context states that the Wikimedia Foundation is headquartered ...,California,✔️ [1.000]
4,Eroja pataki wo ni awọn ara Igbo fi maa n se ounjẹ abacha ?,Abacha is popular in the Eastern part of Nigeria. It is made using...,[Gbaguda],"The context specifies that abacha is prepared with dried, shredded...","dried, shredded cassava, palm oil, crayfish, ugba",✔️ [0.000]
...,...,...,...,...,...,...
248,Toonu cashu melo ni wọn gbejade lagbaye lọdun 2019?,"In 2019, four million tonnes of cashew nuts were produced globally...",[four million],"The context states that in 2019, four million tonnes of cashew nut...",four million tonnes,✔️ [0.000]
249,Ọmọ ọdun melo ni Maryam Abacha nigbati Sani abacha ku ?,As of 2000 Maryam Abacha remained in Nigeria and continued to proc...,[ọmọbinrin mẹta ati ọmọkunrin meje],The context does not specify the birth year of Maryam Abacha or th...,unknown,✔️ [0.000]
250,Ami ẹyẹ goolu Olympic melo ni Usain Bolt ni lapapo?,"An eight-time Olympic gold medallist, Bolt is the only sprinter to...",[Mẹ́jọ],The context states that Usain Bolt is an eight-time Olympic gold m...,eight,✔️ [0.000]
251,Ọdun wo ni Orlando Owoh bẹrẹ si n kọrin ?,As a young man Owoh initially entered into the carpentry trade unt...,[1958],Orlando Owoh began his music career in 1958 when he was hired by N...,Ọdun 1958,✔️ [0.000]


EvaluationResult(score=41.5, results=<list of 253 results>)