In [1]:
from collections import Counter
import dspy
from dspy import OpenAI, settings
from dspy.teleprompt import MIPROv2, BootstrapFewShot
from dspy.evaluate.evaluate import Evaluate
from dspy import ColBERTv2
from dsp.utils import print_message, normalize_text
from dotenv import load_dotenv
import os
# Load environment variables from a .env file (presumably the source of
# OPENAI_API_KEY, which a later cell reads from os.environ).
# Set DOTENV_PATH to point at a different file; the original machine-specific
# path stays as the default so existing runs behave the same.
load_dotenv(os.environ.get("DOTENV_PATH", "/media/uberdev/ddrv/gitFolders/python_de_learners_data/.env"))

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed
  from .autonotebook import tqdm as notebook_tqdm


True

In [40]:
def f1_score_01(prediction, ground_truth):
    """Compute an F1 score between two "|"-separated label strings.

    Both strings are split on "|" and every piece is passed through
    `normalize_text`. Pieces that normalize to an empty string are dropped, so
    an empty string or a stray leading/trailing "|" does not count as a label.
    The remaining labels are compared as multisets.

    Args:
        prediction: "|"-joined predicted labels, e.g. "fake|real".
        ground_truth: "|"-joined reference labels.

    Returns:
        0 when the two sides share no label (including when either side has no
        labels at all); otherwise the harmonic mean of precision and recall.
    """
    prediction_tokens = [
        token for token in (normalize_text(elem) for elem in prediction.split("|")) if token
    ]
    ground_truth_tokens = [
        token for token in (normalize_text(elem) for elem in ground_truth.split("|")) if token
    ]

    # str.split always returns at least one element, so without the filtering
    # above this branch could never be reached.
    if not prediction_tokens and not ground_truth_tokens:
        print_message("\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n")

    # Multiset intersection: a label repeated k times only matches k times.
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    # Also guards the divisions below: no overlap covers every empty-side case.
    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    # num_same > 0 guarantees precision + recall > 0 here.
    f1 = (2 * precision * recall) / (precision + recall)
    print(f"precision: {precision}, recall: {recall}, f1: {f1}") 
    return f1

The key idea is to ping the LLM multiple times for the same prediction
and then compute the F1 score of that prediction.

Keep track of the LLM calls inside Phoenix using the instrumentation:

    - Need to create a separate project inside the Phoenix server for this exploration

    - Log the traces to that Project

    - Use the logs to further understand Metric functions

    - Move further into OpenTelemetry

In [3]:
from openinference.instrumentation.dspy import DSPyInstrumentor
# instruments the internal calls in DSPy library
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
# help to get the span of http requests to the APIs
from opentelemetry.sdk import trace as trace_sdk
#processes the data collected from the spans
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from openinference.semconv.resource import ResourceAttributes
import phoenix as px

INFO:phoenix.config:📋 Ensuring phoenix working directory: /home/uberdev/.phoenix
INFO:phoenix.inferences.inferences:Dataset: phoenix_inferences_02bbca05-adbd-41a6-84d9-ab6f6dbbbb58 initialized


In [4]:
# Trace endpoint on the local server; the cells below pass it to px.Client
# and to OTLPSpanExporter.
endpoint = "http://127.0.0.1:6006/v1/traces"
# resource = Resource(attributes={})

In [5]:
# `endpoint` is the OTLP trace-collector path. Passing it straight to px.Client
# makes the client build its own routes underneath it (the recorded log shows a
# request to .../v1/traces/arize_phoenix_version), so strip the collector
# suffix and give the client the server's base URL instead.
client = px.Client(endpoint=endpoint.rsplit("/v1/traces", 1)[0])

INFO:httpx:HTTP Request: GET http://127.0.0.1:6006/v1/traces/arize_phoenix_version "HTTP/1.1 200 OK"


In [6]:
# https://docs.arize.com/phoenix/tracing/how-to-tracing/trace-a-deployed-app
# Resource attributes attached to the tracer provider; the project name is the
# dedicated Phoenix project used for this exploration.
resource_attributes = {ResourceAttributes.PROJECT_NAME: 'bswfs-f1-score'}
resource = Resource(attributes=resource_attributes)

In [7]:
# Tracer provider carrying the project resource, plus an OTLP/HTTP exporter
# pointed at the trace endpoint defined above.
tracer_provider = trace_sdk.TracerProvider(resource=resource)
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)

In [8]:
# Wire the exporter into the provider through a SimpleSpanProcessor.
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

# Register the provider globally with the OpenTelemetry API.
trace_api.set_tracer_provider(tracer_provider=tracer_provider)

DSPyInstrumentor().instrument(skip_dep_check=True) # this is where DSPy gets instrumented

In [9]:
# Need to check if the dspy is writing logs
import os
# gpt-4o-mini client. The API key is read from the environment, so this raises
# KeyError when OPENAI_API_KEY is not set.
llm = OpenAI(model='gpt-4o-mini',
             api_key=os.environ['OPENAI_API_KEY'],
             max_tokens=2000)

In [10]:
# Register the language model and a ColBERTv2 retrieval endpoint as DSPy defaults.
# NOTE(review): no cell visible in this notebook appears to call the retriever,
# and its URL is a hard-coded remote IP — confirm whether `rm` is still needed.
colbertv2_wiki17_abstracts = ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
settings.configure(lm=llm, rm=colbertv2_wiki17_abstracts)

In [11]:
# Smoke test: one direct LM call, to check that a trace shows up in the
# bswfs-f1-score project.
llm("Let's see if you are logging to bswfs-f1-score")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


['It seems like you\'re referring to a specific logging or scoring system, possibly related to a project or application. However, I don\'t have the capability to access external systems or databases, including any logging systems like "bswfs-f1-score." If you have questions about logging, scoring, or any related topics, feel free to ask!']

In [12]:
def F1(prediction, answers_list):
    """Return the best F1 that `prediction` achieves against any reference answer.

    Args:
        prediction: "|"-joined predicted labels for a single example.
        answers_list: list of acceptable "|"-joined reference strings.

    Raises:
        AssertionError: if `answers_list` is not a list.
        ValueError: if `answers_list` is empty (raised by `max`).
    """
    assert isinstance(answers_list, list)
    # One prediction is scored against every acceptable answer.
    scores = [f1_score_01(prediction, candidate) for candidate in answers_list]
    return max(scores)

def answer_match(prediction, answers, frac=1.0):
    """Tell whether the best F1 of `prediction` over `answers` reaches `frac`.

    Args:
        prediction: "|"-joined predicted labels.
        answers: list of acceptable reference strings.
        frac: minimum F1 (inclusive) required to count as a match.
    """
    best_f1 = F1(prediction, answers)
    return best_f1 >= frac

def answer_f1_match_01(example, pred, trace=None, frac=0.95):
    """Metric: does `pred.answer` match `example.answer` with F1 >= `frac`?

    Args:
        example: object whose `answer` is a reference string or a list of
            acceptable reference strings.
        pred: object whose `answer` is the "|"-joined predicted labels.
        trace: optional trace passed in by the caller; it is only printed.
            Defaults to None rather than a shared mutable list. An omitted
            trace is still reported as `[]`, as before.
        frac: minimum F1 (inclusive) required for a match.

    Returns:
        The boolean result of `answer_match`.

    Raises:
        AssertionError: if `example.answer` is neither a str nor a list.
    """
    assert isinstance(example.answer, (str, list))
    if trace is None:
        trace = []
    print(f"Looking at the traces, {trace}")
    # A single reference string is wrapped so both cases share one code path.
    answers = [example.answer] if isinstance(example.answer, str) else example.answer
    return answer_match(pred.answer, answers, frac=frac)


In [13]:
# Signature for the fake/real classifier: one input field (`news_body`) and one
# output field (`answer`).
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a docstring would presumably replace the default instructions that
# dump_state() reports for this signature; confirm before adding one.
class NewsCategorization(dspy.Signature):
    news_body = dspy.InputField(desc="The body of the news to be categorized")
    answer = dspy.OutputField(desc="Should be 'fake' or 'real'")

class CoTCombined(dspy.Module):
    """Classify several "|"-separated news items with one chain-of-thought predictor.

    Each item is sent to the predictor separately and the individual answers
    are joined back with "|", so the output has one label per input item.
    """

    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought(NewsCategorization)
        # Human-readable log with one entry per classified item.
        # NOTE(review): entries are appended from forward(); if the module is
        # called from several threads, entries from different examples may
        # interleave — confirm whether ordering matters.
        self.history = []

    def forward(self, news_body):
        """Predict a label for every "|"-separated item in `news_body`.

        Returns:
            A `dspy.Prediction` whose `answer` is the "|"-joined labels, in
            input order.
        """
        pred_list = []
        for news in news_body.split("|"):
            pred_one = self.prog(news_body=news)
            pred_list.append(pred_one.answer)
            self.history.append(f"Processed news: {news}, Prediction: {pred_one.answer}")
        return dspy.Prediction(answer="|".join(pred_list))

    def inspect_history(self, n=None):
        """
        Return the last n entries of the history. If n is None, return the entire history.

        A non-positive n returns an empty list. Slicing with `-0` would
        otherwise return the whole history, because `history[-0:]` is the same
        as `history[0:]`.
        """
        if n is None:
            return self.history
        if n <= 0:
            return []
        return self.history[-n:]

In [17]:
# Fresh CoTCombined instance, inspected with dump_state() in the next cell.
prgm_under_test = CoTCombined()

In [18]:
# Show the predictor's current state (demos, traces, signature instructions).
prgm_under_test.dump_state()

[('prog', Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)))]


{'prog': {'lm': None,
  'traces': [],
  'train': [],
  'demos': [],
  'signature_instructions': 'Given the fields `news_body`, produce the fields `answer`.',
  'signature_prefix': 'Answer:',
  'extended_signature_instructions': 'Given the fields `news_body`, produce the fields `answer`.',
  'extended_signature_prefix': 'Answer:'}}

In [14]:
class CustomExample:
    """Small dict-like example holding a `news_body` and its `answer`.

    Values are reachable both as attributes (`ex.answer`) and through the
    mapping-style methods (`ex["answer"]`, `items()`, `keys()`, ...). Both
    views read and write the same underlying dict, so assigning to an
    attribute can never leave the mapping view stale.
    """

    def __init__(self, news_body, answer):
        # Single source of truth for both the attribute and the mapping view.
        self._dict = {"news_body": news_body,
                      "answer": answer}

    @property
    def news_body(self):
        """The news text to classify."""
        return self._dict["news_body"]

    @news_body.setter
    def news_body(self, value):
        self._dict["news_body"] = value

    @property
    def answer(self):
        """The reference label for this example."""
        return self._dict["answer"]

    @answer.setter
    def answer(self, value):
        self._dict["answer"] = value

    def with_inputs(self, input_key):
        """Return this example unchanged; `input_key` is accepted but ignored."""
        return self

    def inputs(self):
        """Return the input fields as a dict (always just `news_body`)."""
        return {"news_body": self.news_body}

    def items(self):
        return self._dict.items()

    def copy(self):
        """Return a new, independent example with the same field values."""
        return type(self)(self.news_body, self.answer)

    def get(self, key, default=None):
        return self._dict.get(key, default)

    def __iter__(self):
        return iter(self._dict)

    def __contains__(self, key):
        return key in self._dict

    def __getitem__(self, key):
        return self._dict[key]

    def __len__(self):
        return len(self._dict)

    def __repr__(self):
        return f"{type(self).__name__}(news_body={self.news_body!r}, answer={self.answer!r})"

    def keys(self):
        return self._dict.keys()

    def values(self):
        return self._dict.values()


In [28]:
# Prepare the train/dev example lists from train_fake_real_news.tsv.
# Every data row is "<news text>\t<label>"; label "0" means fake and any other
# label is treated as real.
TRAIN_ROWS = slice(1, 21)  # row 0 is skipped (presumably the header row)
# NOTE(review): row 21 falls in neither split. The slices are kept exactly as
# they were so recorded results stay comparable — confirm whether the gap is
# intentional.
DEV_ROWS = slice(22, 43)


def _rows_to_examples(rows):
    """Turn raw TSV rows into CustomExample objects labelled 'fake' or 'real'."""
    examples = []
    for row in rows:
        if not row.strip():
            continue  # ignore blank rows instead of failing to unpack them
        # Split on the LAST tab only, so a tab inside the news text cannot
        # break the two-way unpacking.
        news, truth = row.rsplit("\t", 1)
        examples.append(CustomExample(news, 'fake' if truth.strip() == '0' else 'real'))
    return examples


# The file contains non-ASCII punctuation, so decode it explicitly as UTF-8
# rather than relying on the platform default.
with open('train_fake_real_news.tsv', 'r', encoding='utf-8') as tsv:
    lines = tsv.readlines()

custom_trainset = _rows_to_examples(lines[TRAIN_ROWS])
custom_devset = _rows_to_examples(lines[DEV_ROWS])

In [36]:
# Peek at the first ten training examples as (field, value) pairs.
for example in custom_trainset[:10]:
    print(example.items())

dict_items([('news_body', ' Courts Decide Conspiracy Nut Alex Jones Is Too Crazy To Raise His Own Kids (DETAILS)'), ('answer', 'fake')])
dict_items([('news_body', "U.S. Senator Menendez's corruption trial to proceed: judge"), ('answer', 'real')])
dict_items([('news_body', 'Search ends for bodies in Mexico City after earthquake'), ('answer', 'real')])
dict_items([('news_body', 'BUH-BYE! GLENN BECK Places Final Nail In His Coffin…And His Former Fans Won’t Miss Him [VIDEO]'), ('answer', 'fake')])
dict_items([('news_body', 'BREAKING: Michigan Native KID ROCK Announces He’s Running For US Senate'), ('answer', 'fake')])
dict_items([('news_body', 'Medicaid cuts coming in Trump budget: Washington Post'), ('answer', 'real')])
dict_items([('news_body', 'GOP MAJORITY SENATE FINALLY GETS IT RIGHT: Votes To Gut Obamacare And Defund Planned Parenthood'), ('answer', 'fake')])
dict_items([('news_body', 'Tillerson seeks to reassure worried Europe over Trump'), ('answer', 'real')])
dict_items([('news_bo

In [37]:
print("Starting to build the optimizer")
# Names shared by the evaluation and optimization cells.
model_to_generate_prompts = llm
model_that_solves_task = CoTCombined()  # fresh program instance
your_defined_metric = answer_f1_match_01
# NOTE(review): model_to_generate_prompts, num_new_prompts_generated and
# prompt_generation_temperature are not read by any cell visible here —
# presumably leftovers from an earlier optimizer setup; confirm before removing.
num_new_prompts_generated = 10
prompt_generation_temperature = 0.7

Starting to build the optimizer


In [42]:
from dspy.evaluate import Evaluate
# Score model_that_solves_task on the first five dev examples, on one thread,
# using the F1-threshold metric.
test_with_evaluator = Evaluate(devset=custom_devset[:5], display_progress=True, num_threads=1)
test_with_evaluator(model_that_solves_task, metric=your_defined_metric)

  0%|          | 0/5 [00:00<?, ?it/s]Looking at the traces, []
precision: 1.0, recall: 1.0, f1: 1.0
Average Metric: 1 / 1  (100.0):   0%|          | 0/5 [00:00<?, ?it/s]Looking at the traces, []
precision: 1.0, recall: 1.0, f1: 1.0
Average Metric: 2 / 2  (100.0):  20%|██        | 1/5 [00:00<00:00, 11.35it/s]Looking at the traces, []
precision: 1.0, recall: 1.0, f1: 1.0
Average Metric: 3 / 3  (100.0):  60%|██████    | 3/5 [00:00<00:00, 18.41it/s]Looking at the traces, []
precision: 1.0, recall: 1.0, f1: 1.0
Average Metric: 4 / 4  (100.0):  60%|██████    | 3/5 [00:00<00:00, 18.41it/s]Looking at the traces, []
precision: 1.0, recall: 1.0, f1: 1.0
Average Metric: 5 / 5  (100.0): 100%|██████████| 5/5 [00:00<00:00, 18.64it/s]

INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:54:26.728863Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 5 / 5 (100.0%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m





100.0

In [38]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Keyword settings forwarded to the optimizer constructor.
config = {
    "max_bootstrapped_demos": 2,
    "max_labeled_demos": 4,
    "num_candidate_programs": 2,
    "num_threads": 6,
}

teleprompter = BootstrapFewShotWithRandomSearch(metric=your_defined_metric, **config)

Going to sample between 1 and 2 traces per predictor.
Will attempt to bootstrap 2 candidate sets.


In [19]:
# teleprompter = BootstrapFewShot(metric=your_defined_metric)
# print("Teleprompter assembled")

Teleprompter assembled


In [39]:
# Compile the program with the random-search teleprompter.
# NOTE(review): valset is custom_devset, the same list the Evaluate cell draws
# its examples from — scores on it are presumably optimistic for the compiled
# program; confirm whether a separate held-out split is wanted.
optimized_cotcomb = teleprompter.compile(model_that_solves_task,
                                    trainset=custom_trainset,
                                    valset=custom_devset)

  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 1 / 1  (100.0):   5%|▍         | 1/21 [00:01<00:25,  1.25s/it]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 2 / 2  (100.0):  10%|▉         | 2/21 [00:01<00:14,  1.34it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 3 / 3  (100.0):  10%|▉         | 2/21 [00:01<00:14,  1.34it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 5 / 5  (100.0):  19%|█▉        | 4/21 [00:02<00:06,  2.45it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 7 / 7  (100.0):  29%|██▊       | 6/21 [00:02<00:05,  2.53it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 8 / 8  (100.0):  38%|███▊      | 8/21 [00:03<00:05,  2.37it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 9 / 9  (100.0):  38%|███▊      | 8/21 [00:03<00:05,  2.37it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 10 / 10  (100.0):  48%|████▊     | 10/21 [00:03<00:03,  3.45it/s]

Looking at the traces, []
Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 11 / 11  (100.0):  52%|█████▏    | 11/21 [00:04<00:03,  2.55it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 12 / 12  (100.0):  52%|█████▏    | 11/21 [00:04<00:03,  2.55it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 13 / 13  (100.0):  62%|██████▏   | 13/21 [00:05<00:02,  2.98it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 14 / 14  (100.0):  67%|██████▋   | 14/21 [00:05<00:02,  2.99it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 15 / 15  (100.0):  67%|██████▋   | 14/21 [00:05<00:02,  2.99it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 16 / 16  (100.0):  76%|███████▌  | 16/21 [00:06<00:01,  2.80it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 16 / 17  (94.1):  81%|████████  | 17/21 [00:06<00:01,  3.06it/s] INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 17 / 18  (94.4):  81%|████████  | 17/21 [00:06<00:01,  3.06it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 18 / 19  (94.7):  90%|█████████ | 19/21 [00:06<00:00,  3.33it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 19 / 20  (95.0):  95%|█████████▌| 20/21 [00:07<00:00,  2.58it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 20 / 21  (95.2):  95%|█████████▌| 20/21 [00:07<00:00,  2.58it/s]

Looking at the traces, []
Looking at the traces, []


Average Metric: 20 / 21  (95.2): 100%|██████████| 21/21 [00:07<00:00,  2.66it/s]
INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:37:44.186435Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 20 / 21 (95.2%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Score: 95.24 for set: [0]
New best sscore: 95.24 for seed -3
Scores so far: [95.24]
Best score: 95.24


  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 1 / 1  (100.0):   5%|▍         | 1/21 [00:01<00:26,  1.35s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []
Looking at the traces, []


Average Metric: 1 / 2  (50.0):  10%|▉         | 2/21 [00:01<00:12,  1.49it/s] INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 3 / 4  (75.0):  14%|█▍        | 3/21 [00:01<00:09,  1.91it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 4 / 5  (80.0):  24%|██▍       | 5/21 [00:02<00:04,  3.29it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 5 / 6  (83.3):  29%|██▊       | 6/21 [00:02<00:06,  2.32it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 7 / 8  (87.5):  33%|███▎      | 7/21 [00:03<00:05,  2.37it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 8 / 9  (88.9):  43%|████▎     | 9/21 [00:03<00:03,  3.37it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 9 / 10  (90.0):  48%|████▊     | 10/21 [00:03<00:03,  3.37it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 10 / 11  (90.9):  52%|█████▏    | 11/21 [00:04<00:03,  2.74it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 11 / 12  (91.7):  57%|█████▋    | 12/21 [00:04<00:03,  2.91it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 13 / 14  (92.9):  67%|██████▋   | 14/21 [00:04<00:01,  4.64it/s]

Looking at the traces, []
Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 14 / 15  (93.3):  71%|███████▏  | 15/21 [00:05<00:01,  4.67it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 15 / 16  (93.8):  76%|███████▌  | 16/21 [00:05<00:01,  3.79it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 15 / 17  (88.2):  81%|████████  | 17/21 [00:05<00:01,  3.09it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 16 / 18  (88.9):  86%|████████▌ | 18/21 [00:06<00:00,  3.42it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 17 / 19  (89.5):  90%|█████████ | 19/21 [00:06<00:00,  3.42it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 18 / 21  (85.7): 100%|██████████| 21/21 [00:06<00:00,  3.17it/s]
INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:37:50.911619Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 18 / 21 (85.7%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Looking at the traces, []
Looking at the traces, []
Looking at the traces, []
Score: 85.71 for set: [4]
Scores so far: [95.24, 85.71]
Best score: 95.24


  0%|          | 0/20 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▌         | 1/20 [00:02<00:46,  2.47s/it]

Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': ' Courts Decide Conspiracy Nut Alex Jones Is Too Crazy To Raise His Own Kids (DETAILS)'}, Prediction(
    rationale='produce the answer. We analyze the news body which discusses a legal decision regarding Alex Jones, a controversial figure known for promoting conspiracy theories. The ph

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 10%|█         | 2/20 [00:04<00:43,  2.41s/it]

Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': "U.S. Senator Menendez's corruption trial to proceed: judge"}, Prediction(
    rationale='determine the credibility of this news. The news mentions a specific U.S. Senator, Bob Menendez, and refers to a legal proceeding, which is a verifiable event. The source of the information is a ju

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 15%|█▌        | 3/20 [00:06<00:35,  2.09s/it]


Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': 'Search ends for bodies in Mexico City after earthquake'}, Prediction(
    rationale='determine the authenticity of this news. The news mentions a search for bodies following an earthquake, which is a plausible and serious event that could occur in a city like Mexico City, known for its

  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 4 / 4  (100.0):  14%|█▍        | 3/21 [00:01<00:29,  1.66s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 5 / 5  (100.0):  24%|██▍       | 5/21 [00:01<00:04,  3.62it/s]

Looking at the traces, []
Looking at the traces, []
Looking at the traces, []
Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 6 / 6  (100.0):  24%|██▍       | 5/21 [00:02<00:04,  3.62it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 7 / 7  (100.0):  33%|███▎      | 7/21 [00:02<00:05,  2.60it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 11 / 11  (100.0):  48%|████▊     | 10/21 [00:03<00:04,  2.54it/s]

Looking at the traces, []Looking at the traces, []

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 12 / 12  (100.0):  57%|█████▋    | 12/21 [00:04<00:02,  3.26it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 13 / 13  (100.0):  62%|██████▏   | 13/21 [00:04<00:02,  3.55it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 17 / 18  (94.4):  81%|████████  | 17/21 [00:05<00:01,  2.05it/s] 

Looking at the traces, []
Looking at the traces, []
Looking at the traces, []
Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 18 / 19  (94.7):  90%|█████████ | 19/21 [00:06<00:00,  3.60it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 19 / 20  (95.0):  95%|█████████▌| 20/21 [00:07<00:00,  2.55it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 20 / 21  (95.2): 100%|██████████| 21/21 [00:07<00:00,  2.71it/s]
INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:38:04.981232Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 20 / 21 (95.2%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Looking at the traces, []
Score: 95.24 for set: [4]
Scores so far: [95.24, 85.71, 95.24]
Best score: 95.24
Average of max per entry across top 1 scores: 0.9523809523809523
Average of max per entry across top 2 scores: 0.9523809523809523
Average of max per entry across top 3 scores: 0.9523809523809523
Average of max per entry across top 5 scores: 0.9523809523809523
Average of max per entry across top 8 scores: 0.9523809523809523
Average of max per entry across top 9999 scores: 0.9523809523809523


  0%|          | 0/20 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▌         | 1/20 [00:01<00:28,  1.50s/it]

Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': 'New York Welfare Programs More Generous Than Sweden Or France'}, Prediction(
    rationale="determine the credibility of this news. First, we need to assess the claim that New York's welfare programs are more generous than those of Sweden or France. This statement seems exaggerated and

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 10%|█         | 2/20 [00:03<00:32,  1.79s/it]


Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': "Britain, Germany committed to Iran nuclear deal: May's office"}, Prediction(
    rationale="produce the answer. We need to evaluate the credibility of the statement regarding Britain and Germany's commitment to the Iran nuclear deal. This type of news typically comes from official gove

  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 0 / 1  (0.0):   5%|▍         | 1/21 [00:01<00:37,  1.87s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 1 / 3  (33.3):  10%|▉         | 2/21 [00:02<00:16,  1.12it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 2 / 4  (50.0):  19%|█▉        | 4/21 [00:02<00:07,  2.21it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 2 / 5  (40.0):  24%|██▍       | 5/21 [00:02<00:05,  2.82it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 3 / 6  (50.0):  29%|██▊       | 6/21 [00:03<00:06,  2.23it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 4 / 7  (57.1):  33%|███▎      | 7/21 [00:03<00:05,  2.48it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 5 / 8  (62.5):  38%|███▊      | 8/21 [00:04<00:05,  2.24it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 6 / 9  (66.7):  43%|████▎     | 9/21 [00:04<00:04,  2.81it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []
Looking at the traces, []


Average Metric: 6 / 10  (60.0):  43%|████▎     | 9/21 [00:04<00:04,  2.81it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 7 / 11  (63.6):  52%|█████▏    | 11/21 [00:04<00:02,  3.88it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 8 / 12  (66.7):  52%|█████▏    | 11/21 [00:04<00:02,  3.88it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 9 / 13  (69.2):  62%|██████▏   | 13/21 [00:05<00:02,  2.93it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 10 / 14  (71.4):  67%|██████▋   | 14/21 [00:05<00:02,  3.31it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []
Looking at the traces, []


Average Metric: 12 / 16  (75.0):  71%|███████▏  | 15/21 [00:06<00:02,  2.55it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 13 / 17  (76.5):  81%|████████  | 17/21 [00:06<00:01,  3.55it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 14 / 18  (77.8):  86%|████████▌ | 18/21 [00:07<00:01,  2.65it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 15 / 19  (78.9):  90%|█████████ | 19/21 [00:07<00:00,  2.96it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 16 / 20  (80.0):  95%|█████████▌| 20/21 [00:08<00:00,  2.27it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 17 / 21  (81.0): 100%|██████████| 21/21 [00:08<00:00,  2.46it/s]
INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:38:17.201970Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 17 / 21 (81.0%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Looking at the traces, []
Score: 80.95 for set: [4]
Scores so far: [95.24, 85.71, 95.24, 80.95]
Best score: 95.24
Average of max per entry across top 1 scores: 0.9523809523809523
Average of max per entry across top 2 scores: 0.9523809523809523
Average of max per entry across top 3 scores: 0.9523809523809523
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  0%|          | 0/20 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
  5%|▌         | 1/20 [00:02<00:55,  2.90s/it]


Looking at the traces, [(Predict(StringSignature(news_body -> rationale, answer
    instructions='Given the fields `news_body`, produce the fields `answer`.'
    news_body = Field(annotation=str required=True json_schema_extra={'desc': 'The body of the news to be categorized', '__dspy_field_type': 'input', 'prefix': 'News Body:'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': "Should be 'fake' or 'real'", '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)), {'news_body': 'PLAYBOY “REPORTER” WHINES About Getting No Respect From Trump’s Female Deputy Press Secretary [VIDEO]'}, Prediction(
    rationale='produce the answer. We need to analyze the content of the news body. The phrase "PLAYBOY \'REPORTER\' WHINES" suggests a sensationalized or biased portray

  0%|          | 0/21 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 1 / 1  (100.0):   5%|▍         | 1/21 [00:01<00:33,  1.67s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 2 / 2  (100.0):   5%|▍         | 1/21 [00:01<00:33,  1.67s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []
Looking at the traces, []


Average Metric: 3 / 3  (100.0):  10%|▉         | 2/21 [00:01<00:31,  1.67s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 5 / 5  (100.0):  19%|█▉        | 4/21 [00:02<00:08,  1.89it/s]

Looking at the traces, []
Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 6 / 6  (100.0):  29%|██▊       | 6/21 [00:02<00:05,  2.66it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 7 / 7  (100.0):  33%|███▎      | 7/21 [00:03<00:05,  2.68it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 8 / 8  (100.0):  38%|███▊      | 8/21 [00:03<00:05,  2.50it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 10 / 10  (100.0):  43%|████▎     | 9/21 [00:03<00:04,  2.56it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 11 / 11  (100.0):  52%|█████▏    | 11/21 [00:04<00:03,  2.79it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 12 / 12  (100.0):  57%|█████▋    | 12/21 [00:04<00:03,  2.76it/s]

Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 13 / 13  (100.0):  62%|██████▏   | 13/21 [00:05<00:03,  2.56it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 14 / 14  (100.0):  67%|██████▋   | 14/21 [00:05<00:02,  3.16it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 15 / 15  (100.0):  67%|██████▋   | 14/21 [00:05<00:02,  3.16it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 15 / 15  (100.0):  71%|███████▏  | 15/21 [00:06<00:02,  2.86it/s]

Looking at the traces, []


Average Metric: 16 / 16  (100.0):  76%|███████▌  | 16/21 [00:06<00:01,  2.57it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Looking at the traces, []


Average Metric: 18 / 18  (100.0):  86%|████████▌ | 18/21 [00:07<00:00,  3.20it/s]

Looking at the traces, []
Looking at the traces, []


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 19 / 19  (100.0):  90%|█████████ | 19/21 [00:07<00:00,  2.79it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Average Metric: 20 / 20  (100.0):  90%|█████████ | 19/21 [00:07<00:00,  2.79it/s]

Looking at the traces, []
Looking at the traces, []


Average Metric: 21 / 21  (100.0): 100%|██████████| 21/21 [00:07<00:00,  2.71it/s]
INFO:dspy.evaluate.evaluate:[2m2024-08-17T12:38:27.949350Z[0m [[32m[1minfo     [0m] [1mAverage Metric: 21 / 21 (100.0%)[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m200[0m


Looking at the traces, []
Score: 100.0 for set: [4]
New best sscore: 100.0 for seed 1
Scores so far: [95.24, 85.71, 95.24, 80.95, 100.0]
Best score: 100.0
Average of max per entry across top 1 scores: 1.0
Average of max per entry across top 2 scores: 1.0
Average of max per entry across top 3 scores: 1.0
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
5 candidate programs found.
