# Task 4.4.0
Task instruction:
Implement the DSPy Module: Create a DSPy module that uses the strategy you devise to generate a cooperative answer.

# Step 1: configure the LLM API

In [17]:
# Configure the LLM Model
# This cell selects the language model used by every DSPy module below.
# Alternative local backend, kept for reference:
# lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
import dspy
import os
from dotenv import load_dotenv
# Load variables from the key file; override=True lets values from the file
# replace variables that already exist in the process environment.
load_dotenv("../grok_key.ini",override=True)
# os.environ[...] raises KeyError when XAI_API_KEY is still undefined here.
lm = dspy.LM('xai/grok-3-mini', api_key=os.environ['XAI_API_KEY'])
# Make `lm` the LM that DSPy is configured with for the rest of the notebook.
dspy.configure(lm=lm)

# Step 2: Build the DSPY signatures and pipeline

IMPORTANT Note for Michael

Why we chose to enrich the model with the intermediary fields “student_goal” and “pragmatic_need”:

1) Capture the core idea: Cooperative answers rely on Theory of Mind (ToM) elements—understanding the user’s goals, intent, and needs. These fields allow the model to explicitly represent and reason about them.

2) Cost efficiency: If we instead generated “cooperative” questions and re-queried the context, the number of LLM calls and input tokens would increase significantly, raising costs. By focusing directly on the core idea, we can anticipate such questions more efficiently.

In [18]:
class ToMQuery(dspy.Signature):
    """
    Signature that enriches a user query with Theory of Mind insights - the asker's goals, intent and need
    """
    # NOTE(review): the docstring above and the `description` strings below are
    # presumably rendered into the LLM prompt by DSPy, so rewording them changes
    # model behavior — confirm against the DSPy Signature documentation.

    # Inputs: conversation so far, the question being asked, retrieved passages.
    history = dspy.InputField(description="Previous conversation history of the user")
    current_question = dspy.InputField(description="The user's current question")
    context = dspy.InputField(description="Relevant context passages retrieved from the knowledge base")

    # Outputs: the two Theory-of-Mind fields that CooperativeQAPipeline.forward
    # copies into the inputs of the CooperativeAnswer signature.
    student_goal = dspy.OutputField(description="1-2 lines Summary of the user's learning goal or intent")
    pragmatic_need = dspy.OutputField(description="1-2 lines about the Underlying cooperative/pragmatic need inferred from the question")



class CooperativeAnswer(dspy.Signature):
    """
    Signature for generating a cooperative answer to the user's question.
    This module uses the enriched Theory of Mind information along with the
    retrieved context to produce a pragmatic and anticipatory answer that could also handle potential follow up questions.
    """
    # NOTE(review): the docstring above and the `description` strings below are
    # presumably rendered into the LLM prompt by DSPy, so rewording them changes
    # model behavior — confirm against the DSPy Signature documentation.

    # Inputs shared with ToMQuery: history, current question, retrieved context.
    history = dspy.InputField(
        description="Previous conversation history of the user, used for context and continuity"
    )
    current_question = dspy.InputField(
        description="The current question posed by the user"
    )

    context = dspy.InputField(
        description="Relevant context passages retrieved from the knowledge base or search module"
    )
    # Inputs filled from the ToMQuery outputs by CooperativeQAPipeline.forward.
    student_goal = dspy.InputField(
        description="Summary of the user's goal or interests, inferred from the conversation history"
    )
    pragmatic_need = dspy.InputField(
        description="Underlying cooperative or pragmatic need inferred from the user's question"
    )

    # Single output; CooperativeQAPipeline.forward also exposes it as `response`.
    cooperative_answer = dspy.OutputField(
        description="Based on the input fields, generates a pragmatic and cooperative answer that anticipates potential follow-up questions"
    )


In [19]:
class CooperativeQAPipeline(dspy.Module):
    """
    Three-stage cooperative question-answering pipeline.

    For a batch of examples it:
      1) looks up the retriever for each example's topic and stores the
         retrieved passages on the example as ``context``,
      2) runs the ToMQuery predictor to infer ``student_goal`` and
         ``pragmatic_need``,
      3) runs the CooperativeAnswer predictor on the enriched examples.
    """

    def __init__(self):
        super().__init__()
        self.enrich_query = dspy.ChainOfThought(ToMQuery)
        self.generate_answer = dspy.ChainOfThought(CooperativeAnswer)

    def forward(self, examples: list[dspy.Example]):
        """
        Run retrieval, ToM enrichment and answer generation on ``examples``.

        Each example must expose ``topic``, ``current_question`` and
        ``history``. The examples are modified in place: their ``context``
        attribute is overwritten with the retrieved passages.

        Returns:
            The list produced by the answer predictor's ``batch`` call, where
            every item additionally carries ``context`` (the passages used)
            and ``response`` (a copy of ``cooperative_answer``). An empty
            input yields an empty list.
        """
        if not examples:
            return []

        n_examples = len(examples)

        # Stage 1: per-topic retrieval.
        print(f"Step 1: Retrieving top-k passages for {n_examples} examples...")
        for example in examples:
            retriever = get_retriever(example.topic)  # defined later in the notebook
            # Only the current question is used as the search query (cost efficiency).
            example.context = retriever(example.current_question).passages
        print("Step 1 complete ✅")

        # Stage 2: infer the Theory-of-Mind fields.
        print("Step 2: Running ToM enrichment...")
        tom_predictions = self.enrich_query.batch(examples)

        # Build fresh examples whose inputs match the CooperativeAnswer signature.
        answer_inputs = (
            "history",
            "current_question",
            "context",
            "student_goal",
            "pragmatic_need",
        )
        enriched_examples = []
        for position, (example, tom) in enumerate(zip(examples, tom_predictions), start=1):
            enriched = dspy.Example(
                history=example.history,
                current_question=example.current_question,
                context=example.context,
                student_goal=tom.student_goal,
                pragmatic_need=tom.pragmatic_need,
            ).with_inputs(*answer_inputs)
            enriched_examples.append(enriched)
            print(f"  → Enriched example {position}/{n_examples}")
        print("Step 2 complete ✅")

        # Stage 3: generate the cooperative answers.
        print("Step 3: Generating cooperative answers...")
        answers = self.generate_answer.batch(enriched_examples)

        n_enriched = len(enriched_examples)
        for position, (answer, enriched) in enumerate(zip(answers, enriched_examples), start=1):
            # Keep the passages next to the answer they produced.
            answer.context = enriched.context
            print(f"  → Answer generated for example {position}/{n_enriched}")
        print("Step 3 complete ✅")

        # Expose the answer under `response` as well; presumably so metrics
        # that read a `response` field can score the prediction.
        for answer in answers:
            answer.response = answer.cooperative_answer

        return answers


# STEP 3: set up the context retriever

In [20]:
# utility function from "pragmaticqa.ipynb" with cleanup
# Traverse a directory, read its HTML files, extract their text, and remove empty lines to reduce token usage later

import os
from bs4 import BeautifulSoup
import re

def read_html_files_clean(directory: str) -> list[str]:
    """
    Extract whitespace-normalized text from the HTML files in a directory.

    Only files directly inside ``directory`` whose name ends with ``.html``
    are read (no recursion). Files are processed in sorted name order so the
    returned list is reproducible across runs and machines.

    Args:
        directory: Path of the folder to scan.

    Returns:
        One string per HTML file: the text extracted by BeautifulSoup with
        every run of whitespace (including blank lines) collapsed to a single
        space and no leading/trailing whitespace.
    """
    texts = []
    # os.listdir() returns names in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')
        # split() + join collapses all whitespace runs in one pass, which
        # also removes empty lines and trims both ends.
        texts.append(" ".join(soup.get_text().split()))
    return texts


In [5]:
# if the below code fails to run , run this:
# %pip install -U faiss-cpu  # or faiss-gpu if you have a GPU

In [None]:
# load the corpus , which is all the PragmatiCQA-sources html files

# import os
# folder_names = os.listdir("../PragmatiCQA-sources")
# corpus = []
# for folder in folder_names:
#     texts = read_html_files_clean("../PragmatiCQA-sources/" + folder)
#     corpus.extend(texts)
#     print(f"added {len(texts)} files from {folder}")
#     print("the total number of loaded files is: ", len(corpus))

added 449 files from 'Cats' Musical
the total number of loaded files is:  449
added 250 files from A Nightmare on Elm Street
the total number of loaded files is:  699
added 499 files from Arrowverse
the total number of loaded files is:  1198
added 492 files from Barney
the total number of loaded files is:  1690
added 498 files from Baseball
the total number of loaded files is:  2188
added 496 files from Batman
the total number of loaded files is:  2684
added 265 files from Big Nate
the total number of loaded files is:  2949
added 470 files from Bleach
the total number of loaded files is:  3419
added 310 files from Britney Spears
the total number of loaded files is:  3729
added 282 files from Detective Conan
the total number of loaded files is:  4011
added 499 files from Dinosaur
the total number of loaded files is:  4510
added 36 files from Doctor Who
the total number of loaded files is:  4546
added 322 files from Doom Patrol
the total number of loaded files is:  4868
added 61 files fr

In [None]:
import os
from sentence_transformers import SentenceTransformer
import dspy

# configure retriever settings
# Documents longer than this many characters are truncated before indexing
# (roughly the 99th percentile of document length).
max_characters = 10000
topk_docs_to_retrieve = 5  # number of passages returned per search query

# load all folder corpora: one list[str] of document texts per source folder
base_path = "../PragmatiCQA-sources"
folder_names = os.listdir(base_path)

corpus_per_folder = []
for i, folder in enumerate(folder_names):
    texts = read_html_files_clean(os.path.join(base_path, folder))
    # Apply the configured length cap; previously max_characters was defined
    # but never used, so documents were indexed and prompted at full length.
    texts = [text[:max_characters] for text in texts]
    # Keep one list[str] per folder, aligned by position with folder_names.
    corpus_per_folder.append(texts)

    print(f"loading files to the corpus. progress: {(i+1) / len(folder_names) * 100:.2f}%")


# Local embedding model, run on CPU, wrapped so DSPy retrievers can call it.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")
embedder = dspy.Embedder(model.encode)

# # create the mapping
# topic_to_retriever = {
#     folder: dspy.retrievers.Embeddings(
#         embedder=embedder,
#         corpus=texts,   # list[str], NOT list[list[str]]
#         k=topk_docs_to_retrieve
#     )
#     for folder, texts in zip(folder_names, corpus_per_folder)
# }


# def normalize_topic(topic: str) -> str:
#     # strip extra parentheses text
#     topic = topic.split("(")[0].strip()

#     topic_to_folder_map = {
#         "Popeye": "Popeye the Sailor",
#         "A Nightmare on Elm Street": "A Nightmare on Elm Street",
#         "The Wonderful Wizard of Oz": "Wizard of Oz",
#         "Alexander Hamilton": "Hamilton the Musical",
#         "Snoopy": "Peanuts Comics",
#         "Po": "Kung Fu Panda",
#         "Spirited Away": "Studio Ghibli",
#     }

#     return topic_to_folder_map.get(topic, topic)

# def get_retriever(topic: str):
#     folder_name = normalize_topic(topic)
#     return topic_to_retriever[folder_name]


loading files to the corpus. progress: 1.37%
loading files to the corpus. progress: 2.74%
loading files to the corpus. progress: 4.11%
loading files to the corpus. progress: 5.48%
loading files to the corpus. progress: 6.85%
loading files to the corpus. progress: 8.22%
loading files to the corpus. progress: 9.59%
loading files to the corpus. progress: 10.96%
loading files to the corpus. progress: 12.33%
loading files to the corpus. progress: 13.70%
loading files to the corpus. progress: 15.07%
loading files to the corpus. progress: 16.44%
loading files to the corpus. progress: 17.81%
loading files to the corpus. progress: 19.18%
loading files to the corpus. progress: 20.55%
loading files to the corpus. progress: 21.92%
loading files to the corpus. progress: 23.29%
loading files to the corpus. progress: 24.66%
loading files to the corpus. progress: 26.03%
loading files to the corpus. progress: 27.40%
loading files to the corpus. progress: 28.77%
loading files to the corpus. progress: 30

In [46]:
# create the mapping
# One embeddings retriever per source folder, keyed by the folder name.
# get_retriever() looks retrievers up in this dict after normalizing a topic.
# folder_names and corpus_per_folder are paired by position.
topic_to_retriever = {
    folder: dspy.retrievers.Embeddings(
        embedder=embedder,
        corpus=texts,   # list[str], NOT list[list[str]]
        k=topk_docs_to_retrieve
    )
    for folder, texts in zip(folder_names, corpus_per_folder)
}


# Dataset topic names (after removing any parenthesised suffix) whose source
# folder carries a different name.
_TOPIC_FOLDER_OVERRIDES = {
    "Popeye": "Popeye the Sailor",
    "A Nightmare on Elm Street": "A Nightmare on Elm Street",
    "The Wonderful Wizard of Oz": "Wizard of Oz",
    "Alexander Hamilton": "Hamilton the Musical",
    "Snoopy": "Peanuts Comics",
    "Po": "Kung Fu Panda",
    "Spirited Away": "Studio Ghibli",
    "Cats Musical Wiki": "'Cats' Musical",
}


def normalize_topic(topic: str) -> str:
    """
    Map a dataset topic name to the name of its source folder.

    Everything from the first "(" onwards is dropped and surrounding
    whitespace is stripped; the result is then translated through the
    override table. Topics without an override are returned as the
    stripped name.
    """
    base_topic, _, _ = topic.partition("(")
    base_topic = base_topic.strip()
    return _TOPIC_FOLDER_OVERRIDES.get(base_topic, base_topic)

def get_retriever(topic: str):
    """
    Return the retriever built for the source folder that matches ``topic``.

    Args:
        topic: Topic name as it appears in the dataset.

    Raises:
        KeyError: If no retriever exists for the normalized topic. The message
            names both the original topic and the folder that was looked up.
    """
    folder_name = normalize_topic(topic)
    try:
        return topic_to_retriever[folder_name]
    except KeyError:
        # Same exception type as the plain dict lookup, but with enough
        # context to fix the topic-to-folder mapping.
        raise KeyError(
            f"No retriever for topic {topic!r} (looked up folder {folder_name!r}); "
            "add the topic to the mapping in normalize_topic or check the source folder names."
        ) from None

In [None]:
# # load all folder corpora
# base_path = "../PragmatiCQA-sources"
# folder_names = os.listdir(base_path)


# import json
# import os  
# def read_data(filename, dataset_dir="../PragmatiCQA/data"):
#     corpus = []
#     with open(os.path.join(dataset_dir, filename), 'r') as f:
#         for line in f:
#             corpus.append(json.loads(line))
#     return corpus

# dataset_filename = "val.jsonl"
# pcqa_test = read_data("val.jsonl")
# pcqa_add = read_data("test.jsonl")

# # unpack the questions into a single object
# all_topics_val = [pcqa_test[i]['topic'] for i in range(len(pcqa_test))]
# all_topics_test = [pcqa_add[i]['topic'] for i in range(len(pcqa_test))]
# print("all topics below:")
# print(set(all_topics_val))
# print(set(all_topics_test))
# print("all folder names below: ")
# print(set(folder_names))
# print(len(set(all_topics_test + all_topics_val)))

# # corpus_per_folder = []
# # for i, folder in enumerate(folder_names):
# #     texts = read_html_files_clean(os.path.join(base_path, folder))
# #     corpus_per_folder.append(texts)
# #     print(f"added {len(texts)} files from {folder}")
# #     print(f"progress: {(i+1) / len(folder_names) * 100:.2f}%")

all topics below:
{'Batman', 'Popeye', 'Dinosaur', 'Game of Thrones', 'Enter the Gungeon', 'The Karate Kid', 'Jujutsu Kaisen', 'A Nightmare on Elm Street (2010 film)', 'The Wonderful Wizard of Oz (book)', 'Alexander Hamilton', 'Supernanny'}
{'Snoopy', 'Britney Spears', 'Fallout', 'LEGO', 'Po', 'Throne of Glass', 'Spirited Away', 'Mystery Science Theater 3000', 'The Legend of Zelda'}
all folder names below: 
{'Bleach', 'pragmaticqa_corpus', 'Big Nate', 'H. P. Lovecraft', 'Detective Conan', 'Barney', 'Edens Zero', 'Fallout', 'Six the Musical', 'Lemony Snicket', 'Rap', 'Shaman King', 'Madagascar', 'The Formula 1', 'Supernanny', 'Studio Ghibli', 'Batman', 'Dr. Stone', 'Hamilton the Musical', 'The BIONICLE', 'The Wheel of Time', 'Fullmetal Alchemist', 'Half-Life series', 'LEGO', 'Sonic the Hedgehog', 'Pixar', 'Throne of Glass', 'Inazuma Eleven', 'Non-alien Creatures', 'The Matrix', 'Wings of Fire', 'Lady Gaga', 'Popeye the Sailor', 'A Nightmare on Elm Street', 'Peanuts Comics', 'ShowBiz Piz

In [None]:
# # define the embedder and retriever , in similar manner to "rag.ipynb"
# max_characters = 10000  # for truncating >99th percentile of documents
# topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# from sentence_transformers import SentenceTransformer

# # Load an extremely efficient local model for retrieval
# model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1", device="cpu")

# # Create an embedder using the model's encode method
# embedder = dspy.Embedder(model.encode)

# search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve) # set up the retriever function on the corpus

Training a 32-byte FAISS index with 338 partitions, based on 28573 x 1024-dim embeddings


# STEP 4: load the dataset which will be used by the pipeline

In [None]:
## "FOR THE FIRST QUESTION OF EACH CONVERSATION ONLY"

# load the dataset

# Load jsonl from dataset directory
import json
import os  
def read_data(filename, dataset_dir="../PragmatiCQA/data"):
    """
    Load a JSON Lines file into a list of Python objects.

    Args:
        filename: Name of the ``.jsonl`` file inside ``dataset_dir``.
        dataset_dir: Directory that contains the dataset files.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(os.path.join(dataset_dir, filename), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            records.append(json.loads(line))
    return records

dataset_filename = "val.jsonl"
print("loading the dataset from: ", dataset_filename)
# Load the file that was just announced instead of re-typing its name.
pcqa_test = read_data(dataset_filename)
print("Done loading the dataset.")

import dspy

# Task 4.4.1 only needs the first question of every conversation in the "val"
# set, so there is no history yet. `context` starts empty and is filled in by
# the pipeline's retrieval step. `topic` is stored on the example (the pipeline
# reads it to pick a retriever) but is not marked as a model input.
first_questions_examples = [
    dspy.Example(
        current_question=conversation['qas'][0]['q'],
        topic=conversation["topic"],
        history="",
        context="",
    ).with_inputs('current_question', 'history', 'context')
    for conversation in pcqa_test
]


loading the dataset from:  val.jsonl
Done loading the dataset.
179


# Step 5: run the pipeline on the dataset

In [None]:
rag = CooperativeQAPipeline()  # create once
# Run the whole list through the pipeline in one call. The pipeline's forward()
# overwrites `context` on each passed example with the retrieved passages.
# NOTE(review): the recorded output below reports 10 examples although 179
# conversations were loaded — presumably this cell was last run on a slice.
preds = rag(first_questions_examples)
print(preds)

Step 1: Retrieving top-k passages for 10 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 2 / 10 examples:  10%|█         | 1/10 [00:00<00:00, 878.39it/s]



Processed 10 / 10 examples: 100%|██████████| 10/10 [00:00<00:00, 493.05it/s]
  → Enriched example 1/10
  → Enriched example 2/10
  → Enriched example 3/10
  → Enriched example 4/10
  → Enriched example 5/10
  → Enriched example 6/10
  → Enriched example 7/10
  → Enriched example 8/10
  → Enriched example 9/10
  → Enriched example 10/10
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 5 / 10 examples:  40%|████      | 4/10 [00:00<00:00, 1264.58it/s]



Processed 6 / 10 examples:  50%|█████     | 5/10 [00:00<00:00, 377.56it/s] 



Processed 7 / 10 examples:  60%|██████    | 6/10 [00:00<00:00, 191.40it/s]



Processed 10 / 10 examples: 100%|██████████| 10/10 [00:00<00:00, 132.43it/s]
  → Answer generated for example 1/10
  → Answer generated for example 2/10
  → Answer generated for example 3/10
  → Answer generated for example 4/10
  → Answer generated for example 5/10
  → Answer generated for example 6/10
  → Answer generated for example 7/10
  → Answer generated for example 8/10
  → Answer generated for example 9/10
  → Answer generated for example 10/10
Step 3 complete ✅
[Prediction(
    reasoning="Based on the provided context, which details Freddy Krueger's origins, characteristics, and appearances in the A Nightmare on Elm Street series, I will craft a response that aligns with the user's goal of learning about his identity, origins, and key traits in a non-spoilery manner. The history is empty, so this is a standalone query, but I'll use the inferred pragmatic need to keep the answer concise, accurate, and engaging to avoid overwhelming the user. This ensures cooperation by providi

# TASK 4.4.1
Task Instruction: Perform the same evaluation as in 4.3 on the first questions in each conversation and compare the results of your model with the one in 4.3 based on the traditional text-to-text transformer.

# Note for Michael:
In task 4.3, we were asked to evaluate 3 different configurations because "the model depends on the accuracy of the retriever module".
For the LLM this is not the case: it uses a combination of artifacts, including the chat history and its pretraining knowledge. So we will compare only to the "retrieved answer".


# Another Note for Michael:

The class `dspy.evaluate.SemanticF1` directly computes the F1 score, which is the harmonic mean of precision and recall.
As you can see in the source code here - https://dspy.ai/api/evaluation/SemanticF1/#dspy.evaluate.SemanticF1.forward -
the score is calculated internally, and we as users cannot make it return just the precision or the recall without changing the "forward" source code.
So we will use just the F1 score.

In [28]:
from dspy.evaluate import SemanticF1

metric = SemanticF1(decompositional=True)

# Gold references: first question/answer of each conversation. This assumes
# preds[i] was generated from pcqa_test[i], i.e. that the pipeline was run on
# the first len(preds) conversations in order.
examples = [
    dspy.Example(
        question=conversation['qas'][0]['q'],
        response=conversation['qas'][0]['a'],
    )
    for conversation in pcqa_test[:len(preds)]
]


# Compute the metric score for each prediction.
scores = []
for done, (example, pred) in enumerate(zip(examples, preds), start=1):
    print(example)
    print(pred)
    score = metric(example, pred)
    # `done` counts from 1 so the last message reads N/N, not N-1/N.
    print(f"completed {done}/{len(preds)} scorings")
    print(score)
    scores.append(score)


# calculate the average F1 score and standard deviation, and provide a report
import numpy as np

np_scores = np.array(scores)
sd = np.std(np_scores)  # numpy's default (population) standard deviation
mean = np.mean(np_scores)

Example({'question': 'who is freddy krueger?', 'response': "Freddy Kruger is the nightmare in nighmare on Elm street. Please note, and to be very clear, the system that loads up wiki is not allowing access to Adam Prag, to the page... so I'll have to go from memory.  Normally you can paste things and back up what you are saying, but today that's not happening. alas."}) (input_keys=None)
Prediction(
    reasoning="Based on the provided context, which details Freddy Krueger's origins, characteristics, and appearances in the A Nightmare on Elm Street series, I will craft a response that aligns with the user's goal of learning about his identity, origins, and key traits in a non-spoilery manner. The history is empty, so this is a standalone query, but I'll use the inferred pragmatic need to keep the answer concise, accurate, and engaging to avoid overwhelming the user. This ensures cooperation by providing essential facts while anticipating follow-ups, such as questions about the movies, h

# THE ANSWER TO 4.4.1 IS HERE BELOW

In [29]:
#############################################################
########## THE ANSWER TO 4.4.1 IS HERE BELOW !!! ############
#############################################################

# Print a summary of the per-example F1 scores computed in the previous cell
# (np_scores, mean and sd are defined there).
print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n")
print("the total number of samples is: ", len(np_scores))
print("the average F1 score is: ", mean)
print("the F1 score standard deviation is: ", sd)
# max()/min() here are the Python builtins applied to the numpy array.
print("the max score is: ", max(np_scores))
print("the min score is: ", min(np_scores))
print("\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

the total number of samples is:  10
the average F1 score is:  0.34662079097304865
the F1 score standard deviation is:  0.1813301525490811
the max score is:  0.6667
the min score is:  0.125

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


# Insights:

1) The model achieves higher scores when the question is specific and contains named entities as well as unambiguous nouns and verbs. For example: “What is Batmans real name?” (score = 0.75)

2) This specificity provides a double advantage: it allows the retriever to fetch more relevant context, and it helps the pipeline rely effectively on the context in situations where there is no prior conversation history, which is the case in this experiment.

3) Conversely, the model performs poorly and can even score zero when the question is ambiguous or amorphous, for example: “What is the movie about?” (score = 0.0)
In such cases, the retriever often returns weak or unrelated context, and without previous chat history, the generated answer is largely irrelevant and is based on knowledge from the LLM's pretraining or on the irrelevant context.

4) These findings highlight the significant importance of the context, including both relevant documents and chat history, for cooperative answering.

# Comparing the results of 4.3 and 4.4.1
Double mistake - false enrichments first, then false answering.
Difference in the max size - there 4000 tokens, here 10000 tokens.
Topic indexing in the traditional approach - does it help? Amorphous questions with no context do better with a global search, but specific questions do better with folder-specific retrieval, which fetches the correct context.
Am I correct in saying that the traditional approach has access to the pragmatic and literal answers from the dataset? This could improve the answers dramatically, because it is a very strong context.


# Task 4.4.2

Now consider all questions in the conversations and take into account the conversational context.
Compile and Evaluate: Compile your DSPy program (you can use a small training set from the PRAGMATICQA data for this) and evaluate it on the validation set using the same metrics as in 4.3. Explain which metric you use to drive the optimization.

# Note for Michael:
Following our mail conversation , it was agreed that using 400 examples from the "val" dataset is enough.

# Explanations
Our task involves a two-step pipeline. Given relevant source files (the context), the user’s conversational history (the history), and a question, the model first enriches the input by inferring theory of mind (ToM) elements such as the user’s goal and pragmatic need. In the second step, the model answers the question cooperatively based on this enriched representation.

In this section, we chose MIPROv2 as the optimizer for the two-step pipeline and SemanticF1 as the optimization metric, for the following reasons:

**SemanticF1 as the metric**

First, it is the metric we already used in previous sections to evaluate answer quality. Since this section aims to optimize performance relative to earlier ones, it is most consistent to optimize with respect to the same metric.

Second, more generally, SemanticF1 is well-suited for open-ended questions where answers can be long, complex, and non-uniform. It evaluates semantic similarity between system outputs and ground-truth answers, capturing partial matches and conceptual overlap rather than requiring exact wording.

**MIPROv2 as the optimizer**
MIPROv2 generates instructions and few-shot demonstrations for each step. Its instruction generation is both data-aware and demonstration-aware. It uses Bayesian Optimization to efficiently search over the space of possible generation instructions and demonstrations across modules, making it well-suited for optimizing multi-step pipelines like ours.

**Step 1:** define utility function to extract the conversational history

In [32]:
def get_history(qas, current_index):
    """
    Build the transcript of the conversation turns that precede a given turn.

    Args:
        qas (list): conversation turns; each is a dict that may hold 'q' and 'a'.
        current_index (int): index of the current turn; only turns with a
            smaller index are included.

    Returns:
        str: one "Q: ..." line followed by one "A: ..." line per earlier turn,
        all joined by newlines. A missing 'q' or 'a' is rendered as an empty
        string; the result is empty when there are no earlier turns.
    """
    def render(turn):
        return f"Q: {turn.get('q', '')}\nA: {turn.get('a', '')}"

    return "\n".join(render(qas[i]) for i in range(current_index))

**Step 2:** load the trainset (n=10) and the evaluation set (n=400) with conversational history

In [50]:
import random
import pprint
from dspy import Example


def sample_questions(dataset, n=10, seed=None, with_answers=False):
    """
    Sample question turns from the dataset and return them as DSPy Examples.

    Each Example holds:
        - topic: topic of the conversation the turn belongs to
        - current_question / question: the sampled question (stored twice)
        - history: transcript of the earlier turns, built by get_history
        - context: empty string placeholder, replaced by the pipeline's retrieval
        - response: the reference answer of the sampled turn
    topic, current_question, history and context are marked as inputs.

    Args:
        dataset (list): conversations, each a dict with "topic" and "qas".
        n (int): number of turns to sample; capped at the number available.
        seed: seed for a private random generator, so sampling with a seed is
            reproducible without reseeding the global ``random`` state. With
            ``None`` the global generator is used.
        with_answers (bool): accepted for backward compatibility; it has no
            effect — the reference answer is always stored under ``response``.

    Returns:
        list: the sampled Examples, in sampling order.
    """
    # Collect all (item_index, qa_index) pairs
    all_indices = [
        (item_idx, qa_idx)
        for item_idx, item in enumerate(dataset)
        for qa_idx, _ in enumerate(item["qas"])
    ]

    # Safe-guard if n > total number of QA pairs
    n = min(n, len(all_indices))

    # A dedicated generator yields the same sample as seeding the module-level
    # one, but leaves the global random state untouched.
    rng = random.Random(seed) if seed is not None else random
    sampled = rng.sample(all_indices, n)

    # Build Examples
    examples = []
    for item_idx, qa_idx in sampled:
        conversation = dataset[item_idx]
        qas = conversation["qas"]
        turn = qas[qa_idx]

        example = Example(
            topic=conversation["topic"],
            current_question=turn["q"],
            question=turn["q"],
            history=get_history(qas, qa_idx),
            context="",  # will be replaced by pipeline search()
            response=turn["a"],
        ).with_inputs("topic", "current_question", "history", "context")

        examples.append(example)

    return examples


# --------------------------
# Load datasets
# --------------------------
# Load the raw conversations of the train and validation splits.
trainset_filename = "train.jsonl"
print("Loading train dataset from:", trainset_filename)
trainset = read_data(trainset_filename)
print(f"Done. Train dataset has {len(trainset)} conversations.")

valset_filename = "val.jsonl"
print("Loading validation dataset from:", valset_filename)
valset = read_data(valset_filename)
print(f"Done. Validation dataset has {len(valset)} conversations.")


# --------------------------
# Sample Examples
# --------------------------
# 10 training turns and 400 validation turns, both sampled with seed 42.
# NOTE(review): sample_questions never reads `with_answers`, so both calls
# produce examples that include the reference answer under `response`.
trainset_examples = sample_questions(trainset, n=10, seed=42, with_answers=True)
valset_examples = sample_questions(valset, n=400, seed=42, with_answers=False)


# Show the first five examples of each set as a sanity check.
pprint.pprint(trainset_examples[:5])
print("Total train examples:", len(trainset_examples))

pprint.pprint(valset_examples[:5])
print("Total validation examples:", len(valset_examples))



Loading train dataset from: train.jsonl
Done. Train dataset has 476 conversations.
Loading validation dataset from: val.jsonl
Done. Validation dataset has 179 conversations.
[Example({'topic': 'Cats Musical Wiki', 'current_question': 'Yes that sounds a good place to go.', 'question': 'Yes that sounds a good place to go.', 'history': 'Q: What is Cats the musical?\nA: Cats is a musical based on Old Possum\'s book of Practical Cats by T.S. Eliot, a poet.\nQ: When was it performed?\nA: Wow, all over, first in West End in London (1981, then next year on Broadway). Also various revivals and openings all over the world.\nQ: is it still going now?\nA: It closed on Broadway, but if you want tickets and live near any sort of urban population anywhere in the world, you\'re likely to be able to find it, local companies and troupes perform it all the time, all over the world. Tickets often are under 100.00\nQ: cool, who is the main character?\nA: There are several intetral characters, Grizzabella t

In [48]:
class CooperativeQAPipelineWrapper(dspy.Module):
    """
    Adapter that lets the list-based pipeline be called with either a list of
    examples or the keyword arguments of a single example.

    Whatever form the call takes, the wrapped pipeline's ``forward`` always
    receives a list, and only the first element of its result is returned.
    """

    def __init__(self, pipeline: CooperativeQAPipeline):
        """Store the pipeline that every call is delegated to."""
        super().__init__()
        self.pipeline = pipeline

    def forward(self, *args, **kwargs):
        """
        Normalize the call arguments to a list of examples and run the pipeline.

        Args:
            *args: If the first positional argument is a list, it is passed to
                the pipeline as-is and takes precedence over any kwargs.
            **kwargs: Otherwise, the fields of one example; a ``dspy.Example``
                is built from them with every supplied key marked as an input.

        Returns:
            The first element of the pipeline's result. When a list holding
            several examples is supplied, the remaining results are not
            returned.

        Raises:
            ValueError: If neither a list nor any keyword argument is given.
        """
        received_list = bool(args) and isinstance(args[0], list)

        # Guard first: nothing usable was passed in.
        if not received_list and not kwargs:
            raise ValueError("Expected list[dspy.Example] or kwargs for one example")

        if received_list:
            batch = args[0]
        else:
            # Single-example call style: wrap the kwargs in a one-element list.
            field_names = kwargs.keys()
            single = dspy.Example(**kwargs).with_inputs(*field_names)
            batch = [single]

        # The wrapped pipeline only understands lists of examples.
        pipeline_outputs = self.pipeline.forward(batch)

        # Unwrap: hand back only the first result.
        return pipeline_outputs[0]


# Wrap a fresh CooperativeQAPipeline so it can be invoked either with a list of
# examples or with the keyword arguments of a single example.
wrapped_pipeline_new = CooperativeQAPipelineWrapper(CooperativeQAPipeline())

**Step 3:** Set up the optimization program, then compile and optimize

In [51]:
# Number of sampled train examples handed to the optimizer.
TRAINSET_SIZE = 2

# MIPROv2 in "light" auto mode, scored with decompositional SemanticF1.
# NOTE(review): "light" appears to have been chosen to keep the run cheap --
# confirm this is the intended cost/quality trade-off.
tp = dspy.MIPROv2(
    metric=dspy.evaluate.SemanticF1(decompositional=True),
    auto="light",
    num_threads=8,
)

# Optimize the wrapped pipeline on the first TRAINSET_SIZE train examples,
# without the interactive confirmation prompt.
optimized_rag = tp.compile(
    wrapped_pipeline_new,
    trainset=trainset_examples[:TRAINSET_SIZE],
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    requires_permission_to_run=False,
)

2025/08/25 12:31:45 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 20
minibatch: False
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 1

2025/08/25 12:31:45 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/08/25 12:31:45 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/08/25 12:31:45 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...


Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


  0%|          | 0/1 [00:00<?, ?it/s]

Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:12<00:00, 12.55s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:11<00:00, 11.26s/it]
  → Answer generated for example 1/1
Step 3 complete ✅


100%|██████████| 1/1 [00:39<00:00, 39.71s/it]


Bootstrapped 0 full traces after 0 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 4/6


  0%|          | 0/1 [00:00<?, ?it/s]

Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 844.26it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1431.50it/s]

100%|██████████| 1/1 [00:00<00:00,  7.11it/s]



  → Answer generated for example 1/1
Step 3 complete ✅
Bootstrapped 0 full traces after 0 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/6


  0%|          | 0/1 [00:00<?, ?it/s]

Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 311.31it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 882.45it/s]

100%|██████████| 1/1 [00:00<00:00,  7.03it/s]



  → Answer generated for example 1/1
Step 3 complete ✅
Bootstrapped 0 full traces after 0 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/6


  0%|          | 0/1 [00:00<?, ?it/s]

Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 633.39it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1175.53it/s]

100%|██████████| 1/1 [00:00<00:00,  7.69it/s]
2025/08/25 12:32:25 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/08/25 12:32:25 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.



  → Answer generated for example 1/1
Step 3 complete ✅
Bootstrapped 0 full traces after 0 examples for up to 1 rounds, amounting to 1 attempts.


2025/08/25 12:32:37 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/08/25 12:35:31 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/08/25 12:35:31 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Signature that enriches a user query with Theory of Mind insights - the asker's goals, intent and need

2025/08/25 12:35:31 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are an AI assistant specializing in Theory of Mind (ToM) analysis for conversational queries. Your task is to enrich a user's query by inferring their underlying goals, intents, and needs based on the conversation history, the current question, and relevant context passages. Analyze the inputs carefully to generate:

- A step-by-step reasoning process that breaks down how you infer the user's intentions from the provided information.
- A concise 1-2 line summary of the user's learning goal or intent, capturing what they aim to achieve or learn.
- A 1-2 line descri

Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.33s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.07s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.25 / 1 (25.0%): 100%|██████████| 1/1 [00:27<00:00, 27.31s/it]

2025/08/25 12:35:58 INFO dspy.evaluate.evaluate: Average Metric: 0.25003749812509374 / 1 (25.0%)
2025/08/25 12:35:58 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 25.0

2025/08/25 12:35:58 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.87s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00, 10.00s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.22 / 1 (22.2%): 100%|██████████| 1/1 [00:34<00:00, 34.34s/it]

2025/08/25 12:36:33 INFO dspy.evaluate.evaluate: Average Metric: 0.22222204938267764 / 1 (22.2%)
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.22 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22]
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 25.0


2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1715.46it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1632.02it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.22 / 1 (22.2%): 100%|██████████| 1/1 [00:00<00:00,  7.80it/s]

2025/08/25 12:36:33 INFO dspy.evaluate.evaluate: Average Metric: 0.22222204938267764 / 1 (22.2%)
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.22 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22]
2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 25.0


2025/08/25 12:36:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1483.13it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.03s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:20<00:00, 20.77s/it]

2025/08/25 12:36:54 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:36:54 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 28.57
2025/08/25 12:36:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:36:54 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57]
2025/08/25 12:36:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:36:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:39<00:00, 39.34s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.52s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [01:03<00:00, 63.62s/it]

2025/08/25 12:37:57 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:37:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/08/25 12:37:57 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57]
2025/08/25 12:37:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:37:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]



Step 1 complete ✅
Step 2: Running ToM enrichment...




Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 673.57it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.42s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:19<00:00, 19.97s/it]

2025/08/25 12:38:17 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57]
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1203.53it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1589.96it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.25 / 1 (25.0%): 100%|██████████| 1/1 [00:00<00:00,  7.52it/s]

2025/08/25 12:38:17 INFO dspy.evaluate.evaluate: Average Metric: 0.25003749812509374 / 1 (25.0%)
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 25.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0]
2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:38:17 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1608.86it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1198.72it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.22 / 1 (22.2%): 100%|██████████| 1/1 [00:00<00:00,  7.64it/s]

2025/08/25 12:38:18 INFO dspy.evaluate.evaluate: Average Metric: 0.22222204938267764 / 1 (22.2%)
2025/08/25 12:38:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.22 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 1'].
2025/08/25 12:38:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22]
2025/08/25 12:38:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:38:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1218.92it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.88s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.22 / 1 (22.2%): 100%|██████████| 1/1 [00:19<00:00, 19.57s/it]

2025/08/25 12:38:37 INFO dspy.evaluate.evaluate: Average Metric: 0.22222204938267764 / 1 (22.2%)
2025/08/25 12:38:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 22.22 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:38:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22]
2025/08/25 12:38:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:38:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1230.72it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.28s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:28<00:00, 28.81s/it]

2025/08/25 12:39:06 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57]
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 676.28it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1407.48it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  7.01it/s]

2025/08/25 12:39:06 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57]
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]



Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1452.82it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1627.59it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  7.50it/s]

2025/08/25 12:39:06 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57]
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]



Step 1 complete ✅
Step 2: Running ToM enrichment...




Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 536.91it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1199.06it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  7.71it/s]

2025/08/25 12:39:06 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 5'].
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57]
2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1460.41it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1666.39it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  6.85it/s]

2025/08/25 12:39:07 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 5'].
2025/08/25 12:39:07 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57]
2025/08/25 12:39:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:07 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]



Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1893.59it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.16s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.25 / 1 (25.0%): 100%|██████████| 1/1 [00:22<00:00, 22.85s/it]

2025/08/25 12:39:29 INFO dspy.evaluate.evaluate: Average Metric: 0.25003749812509374 / 1 (25.0%)
2025/08/25 12:39:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 25.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 4'].
2025/08/25 12:39:29 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0]
2025/08/25 12:39:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:29 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]



Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1503.33it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 2356.35it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  7.77it/s]

2025/08/25 12:39:30 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 0'].
2025/08/25 12:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57]
2025/08/25 12:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 541.13it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.16s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.25 / 1 (25.0%): 100%|██████████| 1/1 [00:22<00:00, 22.42s/it]

2025/08/25 12:39:52 INFO dspy.evaluate.evaluate: Average Metric: 0.25 / 1 (25.0%)
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 25.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57, 25.0]
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1437.88it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1119.68it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  7.85it/s]

2025/08/25 12:39:52 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 3'].
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57, 25.0, 28.57]
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 472.12it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1413.65it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  6.26it/s]

2025/08/25 12:39:52 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].





2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57, 25.0, 28.57, 28.57]
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 20 / 20 =====


Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 1616.93it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 2194.82it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  8.70it/s]

2025/08/25 12:39:52 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 1'].
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57, 25.0, 28.57, 28.57, 28.57]
2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 20 =====



Step 1: Retrieving top-k passages for 1 examples...
  0%|          | 0/1 [00:00<?, ?it/s]Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 950.44it/s]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:00<00:00, 795.73it/s]
  → Answer generated for example 1/1
Step 3 complete ✅
Average Metric: 0.29 / 1 (28.6%): 100%|██████████| 1/1 [00:00<00:00,  8.33it/s]

2025/08/25 12:39:53 INFO dspy.evaluate.evaluate: Average Metric: 0.28571428571428575 / 1 (28.6%)
2025/08/25 12:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 28.57 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 1'].
2025/08/25 12:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 25.0, 22.22, 22.22, 28.57, 28.57, 28.57, 28.57, 28.57, 25.0, 28.57, 25.0, 28.57, 28.57, 28.57, 28.57]
2025/08/25 12:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 28.57


2025/08/25 12:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 28.57!





In [52]:
from concurrent.futures import ThreadPoolExecutor
import dspy
from tqdm import tqdm

class ParallelBatchCompiledPipeline(dspy.Module):
    """Fan a compiled DSPy program out over examples with a thread pool.

    Each example is submitted on its own, wrapped in a one-element list,
    and the per-example outputs are collected in input order.
    """

    def __init__(self, compiled_module, max_workers=4):
        """Store the wrapped program and the thread-pool size.

        Args:
            compiled_module: The program that is invoked once per example.
            max_workers: Maximum number of worker threads used by ``batch``.
        """
        super().__init__()
        self.compiled = compiled_module
        self.max_workers = max_workers

    def batch(self, examples: list[dspy.Example]):
        """Run the wrapped program on every example concurrently.

        Args:
            examples: Examples to process; each one is passed to the
                wrapped program as ``[example]``.

        Returns:
            A list holding one program output per example, in the same
            order as ``examples``.
        """
        print("total example length is: ", len(examples))

        def process(ex):
            # Invoke the module itself instead of `.forward(...)`: calling
            # the instance is DSPy's documented entry point, and it still
            # dispatches to `forward` with the same arguments.
            return self.compiled([ex])

        # executor.map preserves the order of its input, so results line up
        # with `examples` even though the calls finish out of order.
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(
                tqdm(
                    executor.map(process, examples),
                    total=len(examples),
                    desc="--- OVERALL PROCESSING PROGRESS ---",
                )
            )
        return results

In [53]:
# Number of validation examples scored in this evaluation run.
EVAL_SET_SIZE = 15

# Wrap the optimized program so that every example is submitted individually
# (original note: MIPROv2 accepts single examples).
parallel_optimized_rag = ParallelBatchCompiledPipeline(optimized_rag)

# Evaluate only the leading slice of the validation set.
eval_subset = valset_examples[:EVAL_SET_SIZE]
new_preds = parallel_optimized_rag.batch(eval_subset)

total example length is:  15
Step 1: Retrieving top-k passages for 1 examples...
Step 1: Retrieving top-k passages for 1 examples...
Step 1: Retrieving top-k passages for 1 examples...
Step 1: Retrieving top-k passages for 1 examples...


--- OVERALL PROCESSING PROGRESS ---:   0%|          | 0/15 [00:00<?, ?it/s]

Step 1 complete ✅
Step 2: Running ToM enrichment...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Step 1 complete ✅
Step 2: Running ToM enrichment...
  0%|          | 0/1 [00:00<?, ?it/s]

[A[A
[A
[A
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.26s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.03s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.28s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.64s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:39<00:00, 39.93s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:40<00:00, 40.88s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.19s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
  0%|          | 0/1 [00:00<?, ?it/s]

[A[A

Processed 1 / 1 examples: 100%|██████████| 1/1 [00:43<00:00, 43.61s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.25s/it]
  → Answer generated f

--- OVERALL PROCESSING PROGRESS ---:   7%|▋         | 1/15 [00:55<12:53, 55.26s/it]


  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.24s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.43s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.55s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:11<00:00, 11.75s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.02s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running

--- OVERALL PROCESSING PROGRESS ---:  47%|████▋     | 7/15 [01:12<01:06,  8.34s/it]


  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.07s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.46s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:07<00:00,  7.39s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:10<00:00, 10.67s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:08<00:00,  8.40s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:41<00:00, 41.06s/it]

--- OVERALL PROCESSING PROGRESS ---:  60%|██████    | 9/15 [01:47<01:06, 11.02s/it]


  → Answer generated for example 1/1
Step 3 complete ✅
Step 1: Retrieving top-k passages for 1 examples...
Step 1 complete ✅
Step 2: Running ToM enrichment...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:39<00:00, 39.04s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:09<00:00,  9.70s/it]
  → Answer generated for example 1/1
Step 3 complete ✅
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:42<00:00, 42.42s/it]

--- OVERALL PROCESSING PROGRESS ---:  73%|███████▎  | 11/15 [02:01<00:39,  9.94s/it]


  → Answer generated for example 1/1
Step 3 complete ✅




Processed 1 / 1 examples: 100%|██████████| 1/1 [00:44<00:00, 44.90s/it]

--- OVERALL PROCESSING PROGRESS ---:  93%|█████████▎| 14/15 [02:24<00:08,  8.94s/it]


  → Answer generated for example 1/1
Step 3 complete ✅
Processed 1 / 1 examples: 100%|██████████| 1/1 [00:44<00:00, 44.34s/it]
  → Enriched example 1/1
Step 2 complete ✅
Step 3: Generating cooperative answers...
  0%|          | 0/1 [00:00<?, ?it/s]



Processed 1 / 1 examples: 100%|██████████| 1/1 [00:43<00:00, 43.47s/it]

--- OVERALL PROCESSING PROGRESS ---: 100%|██████████| 15/15 [03:15<00:00, 13.00s/it]


  → Answer generated for example 1/1
Step 3 complete ✅





In [54]:
from dspy.evaluate import SemanticF1

# Decompositional SemanticF1 is the metric used to score every prediction.
metric = SemanticF1(decompositional=True)

# Gold references, one per prediction. Only `question` and `response` are
# copied over. `new_preds` is assumed to be in the same order as the leading
# entries of `valset_examples` (it was produced from that slice).
examples = [
    dspy.Example(question=gold.question, response=gold.response)
    for gold in valset_examples[:len(new_preds)]
]

# Score each (reference, prediction) pair, echoing both for manual inspection.
scores_new = []
for done, (gold_ex, new_pred) in enumerate(zip(examples, new_preds), start=1):
    print(gold_ex)
    print(new_pred)
    score = metric(gold_ex, new_pred)
    print(f"completed {done}/{len(new_preds)} scorings")
    print(score)
    scores_new.append(score)


Example({'question': 'In the House of Dragons, are new episodes released weekly or are full seasons released at a time for streaming?', 'response': 'They are released each week, HBO does this thing, like "appointment viewing" or must see TV type event viewings, they try to build buzz and maximize cash. Smart right? For instance say you get an HBO trial month or something? You get midway through, you are likely to renew, whereas if you dump it all, you\'re not getting that money.'}) (input_keys=None)
Prediction(
    reasoning="Reasoning: Let's think step by step in order to analyze the inputs and craft a response that is accurate, pragmatic, and anticipatory. First, the conversation history shows the user has been deeply engaged in questions about Game of Thrones, including its characters, episodes, seasons, and the spinoff House of the Dragon. This indicates a pattern of interest in the broader universe, so I need to maintain continuity by linking my response back to this context while

# THE ANSWER TO 4.4.2 IS HERE BELOW

In [55]:
# Summarize the per-example F1 scores: sample count, mean, standard
# deviation, and the extreme values.
import numpy as np

np_scores_new = np.array(scores_new)
mean_new = np_scores_new.mean()
# NumPy's default standard deviation is the population form (ddof=0).
sd_new = np_scores_new.std()

# Framed report; the banner lines make it easy to spot in the cell output.
print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n")
print("the total number of samples is: ", len(np_scores_new))
print("the average F1 score is: ", mean_new)
print("the F1 score standard deviation is: ", sd_new)
print("the max score is: ", max(np_scores_new))
print("the min score is: ", min(np_scores_new))
print("\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

the total number of samples is:  15
the average F1 score is:  0.23240378400995088
the F1 score standard deviation is:  0.17557047362664144
the max score is:  0.6666666666666666
the min score is:  0.0

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


# Still need to:

---------
1) fix the retriever to return more relevant context, if possible, by indexing according to topics or something similar
2) re-run 4.4.1, compare the results against 4.3, and write up the results
3) change the name of "rag.ipynb" to "part1.ipynb" for clarity
---------



---------
1) make sure 4.4.2 works with the new retriever
2) set the train set size to 10 or 15 and the eval set size to 400
3) re-run the whole 4.4.2
4) report the results + insights