In [1]:
import dspy
from dspy.teleprompt import MIPROv2
import sys
import os
import json

In [2]:
lm = dspy.LM('openai/gpt-4o-mini', api_key=os.environ["OPENAI_API_KEY"])
dspy.configure(lm=lm)

In [3]:
# Define a signature for the task
class DescriptorGeneration(dspy.Signature):
    """Given a document, generate both general and specific descriptors 
    that facilitate rewriting the original document."""
    document: str = dspy.InputField()
    general_descriptors: list[str] = dspy.OutputField()
    specific_descriptors: list[str] = dspy.OutputField()

# Create a module using the signature
description_generator = dspy.Predict(DescriptorGeneration)

def label_f1_score(true_labels, pred_labels):
    true_set = set(true_labels)
    pred_set = set(pred_labels)
    
    # Calculate true positives, precision, and recall
    true_positives = len(true_set & pred_set)
    precision = true_positives / len(pred_set) if pred_set else 0
    recall = true_positives / len(true_set) if true_set else 0
    
    # Calculate F1 score
    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


class ModelAsJudge(dspy.Signature):
    """Judge how well the descriptors together summarize all possible aspects of the document."""

    document = dspy.InputField(desc="The original document")
    general_descriptors = dspy.InputField(desc="General descriptors should describe the document on a general level")
    specific_descriptors = dspy.InputField(desc="Specific descriptors should describe more minute details of the document.")
    quality = dspy.OutputField(desc="Do the descriptors cover all possible aspects of the given document? Answer on a scale from 1-5, where 5 is best", prefix="Rating[1-5]:")

judge = dspy.ChainOfThought(ModelAsJudge)

def quality_metric(example, pred):
    quality = judge(document=example.document,
                    general_descriptors=pred.general_descriptors,
                    specific_descriptors=pred.specific_descriptors
                   )
    return int(quality)


# Initialize the optimizer
optimizer = MIPROv2(
    metric=quality_metric,
    auto="light",  # Choose between light, medium, and heavy optimization runs
    num_threads=4,
    max_bootstrapped_demos=0, # ZERO FEW-SHOT EXAMPLES
    max_labeled_demos=0 # ZERO FEW-SHOT EXAMPLES
)

# Define your training data
with open('../results/descriptors_test2.jsonl', 'r') as f:
    file = []
    lines = f.readlines()
    for line in lines:
        file.append(json.loads(line))

train_data = []
for doc in file:
    train_data.append(
        {'document': doc['document'],
         'general_descriptors': doc['general_descriptors'],
         'specific_descriptors': doc['specific_descriptors'],
        }
    )

# Convert the training data into a list of dspy.Example objects
trainset = [dspy.Example(**data).with_inputs('document') for data in train_data]

# Compile the optimized program
optimized_description_generator = optimizer.compile(
    description_generator,
    trainset=train_data,
    max_bootstrapped_demos=3,
    max_labeled_demos=4
)

# Example usage
document = """
Tammy Faye, the musical created by Elton John, James Graham and Scissor Sisters’ Jake Shears, is to close on Broadway less than a month after its opening night.

The show about the eponymous TV evangelist sold out at London’s Almeida theatre in 2022, received rave reviews and was nominated for best new musical at the Olivier awards the following year. But on Tuesday it was announced that it will have its final curtain at the Palace theatre in New York on 8 December. That will be its 29th performance; the musical also had 24 previews before opening.

This month, the show has failed to fill more than two-thirds of its audience capacity. In the week ending 17 November, it drew a gross of $374,371 with an overall attendance of 5,732 (63% of capacity). New York’s critics were less impressed than London’s by the Rupert Goold production. In her review, the New York Times’s Elisabeth Vincentelli called Tammy Faye “strangely bland” considering its larger-than-life subject matter. Variety’s Frank Rizzo said it was a “misguided West End import”.
Tammy Faye: A New Musical review – Elton John’s hymn to biblical kitsch
Read more

Katie Brayben has made her Broadway debut reprising her Olivier award-winning role as Faye, this time opposite Christian Borle as Faye’s first husband and TV co-star, Jim Bakker. The married evangelists found fame in the 1970s through their television network, PTL (Praise the Lord). Faye was known for her singing voice, abundant emotion and extravagant makeup; a 2021 biopic starring Jessica Chastain was called The Eyes of Tammy Faye.

The musical charts her rise to fame, a series of scandals and the backlash against her advocacy for gay rights. In 1985, Faye raised awareness of HIV and Aids through a famous interview with Steve Pieters, a gay church pastor living with HIV. “She won me over when she did that,” Elton John told ABC News last month. Her allyship was “pretty remarkable for someone in the religious community,” John added.

John’s musical version of The Devil Wears Prada, starring Vanessa Williams, is now at the Dominion theatre in London. Graham’s new play, Punch, transfers from Nottingham Playhouse to the Young Vic in London in March.
"""
response = optimized_description_generator(document=document)

print(response.keywords)


2024/11/28 12:21:10 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 16



[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m5[0m[93m * [94m[1m1[0m[93m lm calls in program + ([94m[1m2[0m[93m) lm calls in program-aware proposer = [94m[1m17[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m16[0m[93m examples in val set * [94m[1m7[0m[93m batches = [94m[1m112[0m[93m LM program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) 
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).[0m

For a preliminary estimate of potential costs, we

2024/11/28 12:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2024/11/28 12:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2024/11/28 12:21:35 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


  0%|          | 0/4 [00:00<?, ?it/s]2024/11/28 12:21:35 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example {'document': '|Viewing Single Post From: Spoilers for the Week of February 11th|\n|Lil||Feb 1 2013, 09:58 AM|\nDon\'t care about Chloe/Taniel/Jen-Jen. Don\'t care about Sami, really, but hoping that we get some good "SAMANTHA GENE!!" Marlena Death-Stares out of it. And "newfound" feelings. Please. If only.\nSTEFANO!! STEFANO, STEFANO, STEFANO!!!! :cheer:\n|Spoilers for the Week of February 11th · DAYS: News, Spoilers & Discussion|', 'general_descriptors': ['Informal tone', 'Fan discussion', 'Weekly spoilers', 'Soap opera genre', 'Concise', 'Conversational language', 'Colloquial expressions', 'Cheerful enthusiasm', 'Emphasis on character names', 'Casual commentary', 'Speculative content'], 'specific_descriptors': ['Use of exclamation marks', 'Casual references to characters', 'Appeal for dialogue content', 'Indirect expressions of dislike', 'Mentions of specific

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/5


  0%|          | 0/4 [00:00<?, ?it/s]2024/11/28 12:21:35 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example {'document': '|Viewing Single Post From: Spoilers for the Week of February 11th|\n|Lil||Feb 1 2013, 09:58 AM|\nDon\'t care about Chloe/Taniel/Jen-Jen. Don\'t care about Sami, really, but hoping that we get some good "SAMANTHA GENE!!" Marlena Death-Stares out of it. And "newfound" feelings. Please. If only.\nSTEFANO!! STEFANO, STEFANO, STEFANO!!!! :cheer:\n|Spoilers for the Week of February 11th · DAYS: News, Spoilers & Discussion|', 'general_descriptors': ['Informal tone', 'Fan discussion', 'Weekly spoilers', 'Soap opera genre', 'Concise', 'Conversational language', 'Colloquial expressions', 'Cheerful enthusiasm', 'Emphasis on character names', 'Casual commentary', 'Speculative content'], 'specific_descriptors': ['Use of exclamation marks', 'Casual references to characters', 'Appeal for dialogue content', 'Indirect expressions of dislike', 'Mentions of specific

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/5


  0%|          | 0/4 [00:00<?, ?it/s]2024/11/28 12:21:35 ERROR dspy.teleprompt.bootstrap: Failed to run or to evaluate example {'document': 'Free the Cans! Working Together to Reduce Waste\nIn a blog about how people share, it’s worth the occasional reference to the bizarre ways that people DON’T SHARE. Is it safe to say we live in a society that places great value on independence, private property, personal space, and privacy? Even sometimes extreme value? Is that why people at an 8-unit apartment building in Oakland, CA have separate caged stalls for eight separate trash cans? I know it’s not nice to stare, but I walked by these incarcerated cans and could not help myself. I returned with my camera, so that I could share my question with the world: Why can’t people share trash cans or a single dumpster? Or, at the very least, why can’t the cans share driveway space?\nThe Zero Waste Movement has come to the Bay Area and it calls for a new use for these eight cages. Here are my suggest

Bootstrapped 0 full traces after 3 examples for up to 1 rounds, amounting to 4 attempts.


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given a document, generate both general and specific descriptors 
that facilitate rewriting the original document.

2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you are a content editor tasked with improving a community health article that has gone viral but needs refinement for clarity and engagement. Given the article as a document, generate both general and specific descriptors that will guide the rewriting process to enhance its emotional appeal and practical relevance for the audience.

2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Imagine you are a content editor tasked with improving a community health article that has gone viral but needs refinement for clarity and engagement. Given the article as a document, generate both general and specific descriptors that will guide 

Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 2141.04it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 0.0

2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 2664.74it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3075.85it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3177.80it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3178.86it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3183.23it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3183.53it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 0.00 / 16 (0.0%): 100%|██████████| 16/16 [00:00<00:00, 3183.38it/s]

2024/11/28 12:21:41 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 16 (0.0%)
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 0.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 0.0


2024/11/28 12:21:41 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 0.0!





AttributeError: 'Prediction' object has no attribute 'keywords'

In [12]:
# Define your training data
with open('../results/descriptors_test2.jsonl', 'r') as f:
    file = []
    lines = f.readlines()
    for line in lines:
        file.append(json.loads(line))

train_data = []
for doc in file:
    train_data.append(
        {'document': doc['document'],
         'general_descriptors': doc['general_descriptors'],
         'specific_descriptors': doc['specific_descriptors'],
        }
    )

# Convert the training data into a list of dspy.Example objects
trainset = [dspy.Example(**data).with_inputs('document') for data in train_data]

trainset[0]

Example({'document': '|Viewing Single Post From: Spoilers for the Week of February 11th|\n|Lil||Feb 1 2013, 09:58 AM|\nDon\'t care about Chloe/Taniel/Jen-Jen. Don\'t care about Sami, really, but hoping that we get some good "SAMANTHA GENE!!" Marlena Death-Stares out of it. And "newfound" feelings. Please. If only.\nSTEFANO!! STEFANO, STEFANO, STEFANO!!!! :cheer:\n|Spoilers for the Week of February 11th · DAYS: News, Spoilers & Discussion|', 'general_descriptors': ['Informal tone', 'Fan discussion', 'Weekly spoilers', 'Soap opera genre', 'Concise', 'Conversational language', 'Colloquial expressions', 'Cheerful enthusiasm', 'Emphasis on character names', 'Casual commentary', 'Speculative content'], 'specific_descriptors': ['Use of exclamation marks', 'Casual references to characters', 'Appeal for dialogue content', 'Indirect expressions of dislike', 'Mentions of specific character arcs', 'Choppy sentence structure', 'Use of emojis', "Emphasis on Marlena's 'Death-Stares'", "Use of repetit

In [9]:
response

Prediction(
    general_descriptors=['theatrical production', 'musical', 'Broadway show', 'cultural commentary', 'entertainment news'],
    specific_descriptors=['Tammy Faye musical', 'Elton John', 'James Graham', 'Scissor Sisters', 'closing announcement', 'Palace theatre', 'audience capacity', 'New York critics', 'Katie Brayben', 'Christian Borle', 'PTL network', 'HIV awareness', 'gay rights advocacy', 'biopic', 'Olivier awards', 'Rupert Goold production']
)

In [10]:
optimized_description_generator

Predict(DescriptorGenaration(document -> general_descriptors, specific_descriptors
    instructions='Generate genreal and specific descriptors for a given document.'
    document = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Document:', 'desc': '${document}'})
    general_descriptors = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'General Descriptors:', 'desc': '${general_descriptors}'})
    specific_descriptors = Field(annotation=list[str] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Specific Descriptors:', 'desc': '${specific_descriptors}'})
))

In [11]:
optimized_description_generator.prompt

AttributeError: 'Predict' object has no attribute 'prompt'