# Creating our first Pipeline

We're going to use only the non-biomedical SRs from the SYNERGY dataset. 

And we'll start with gemma3 and see where it goes!

## Dataset Preparations

In [1]:
# pkg imports
import pandas as pd

In [2]:
# our synergy datasets' paths
synergy_dataset_path = "./data/synergy_dataset/SYNERGY_combined_data.pkl"
synergy_titles_path = "./data/synergy_dataset/SYNERGY_review_titles.csv"

In [3]:
# load our datasets
syn_df = pd.read_pickle(synergy_dataset_path)
syn_titles_df = pd.read_csv(synergy_titles_path, delimiter='#', index_col='SR_id')
# a dict to map a citation_id to their title
citation2title = syn_titles_df.to_dict(orient='dict')['title']

In [4]:
# list of all the non-biomedical systematic reviews (SRs)
non_biomed_SRs = {
    "Hall_2012",
    "Radjenovic_2013",
    "Sep_2021",
    "Smid_2020"
}

In [5]:
non_biomeds_df = syn_df[syn_df['SR_id'].isin(non_biomed_SRs)]
# insert the ttitle of the systematic review in the DataFrame
non_biomeds_df.insert(2, 'SR_title', syn_df['SR_id'].apply(lambda id: citation2title[id]))
# get indexes of rows with an empty title or abstract
na_idxs = non_biomeds_df[non_biomeds_df[['title','abstract']].isna().any(axis=1)].index
# remove the rows with empty titles or abstracts
non_biomeds_df = non_biomeds_df.drop(na_idxs, axis='index')
# create a column that maps 'label_included' to a boolean 'relevant' column
non_biomeds_df['relevant'] = non_biomeds_df['label_included'].apply(bool) 
non_biomeds_df

Unnamed: 0,doi,SR_id,SR_title,title,abstract,label_included,relevant
0,https://doi.org/10.1109/indcon.2010.5712716,Hall_2012,A Systematic Literature Review on Fault Predic...,Computer vision based offset error computation...,The use of computer vision based approach has ...,0,False
1,https://doi.org/10.1109/induscon.2010.5740045,Hall_2012,A Systematic Literature Review on Fault Predic...,Design and development of a software for fault...,This paper presents an on-line fault diagnosis...,0,False
2,https://doi.org/10.1109/tpwrd.2005.848672,Hall_2012,A Systematic Literature Review on Fault Predic...,Analytical Approach to Internal Fault Simulati...,A new method for simulating faulted transforme...,0,False
3,https://doi.org/10.1109/icelmach.2008.4799852,Hall_2012,A Systematic Literature Review on Fault Predic...,Nonlinear equivalent circuit model of a tracti...,The paper presents the development of an equiv...,0,False
4,https://doi.org/10.1109/ipdps.2006.1639408,Hall_2012,A Systematic Literature Review on Fault Predic...,Fault tolerance with real-time Java,After having drawn up a state of the art on th...,0,False
...,...,...,...,...,...,...,...
66482,https://doi.org/10.1109/ictai.2010.27,Radjenovic_2013,Software fault prediction metrics: A systemati...,Attribute Selection and Imbalanced Data: Probl...,The data mining and machine learning community...,0,False
66483,https://doi.org/10.1109/acc.2001.945656,Radjenovic_2013,Software fault prediction metrics: A systemati...,Benchmarking of advanced technologies for proc...,Global competition is forcing industrial plant...,0,False
66484,https://doi.org/10.1109/icsess.2010.5552438,Radjenovic_2013,Software fault prediction metrics: A systemati...,Queueing models based performance evaluation a...,Since queueing is a common behavior in compute...,0,False
66485,https://doi.org/10.1109/wicom.2011.6040617,Radjenovic_2013,Software fault prediction metrics: A systemati...,A New Face Detection Method with GA-BP Neural ...,"In this paper, the BP neural network improved ...",0,False


## Getting DSPY going
We have our data, let's test it with DSPy!

In [6]:
# pkg imports
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# configuring our local gemma3 model
lm = dspy.LM('ollama_chat/gemma3', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [8]:
# testing out the LM
lm("Say 'Hello world!'", temperature=0.7) 

['Hello world!\n']

## It works: basic pipeline time!
Let's create the most basic pipeline.

It just takes in the SRs titlea and a candidate citation's title and abstract.

I think this starts with a basic signature class.

In [9]:
class Relevance(dspy.Signature):
    """Classify a citation's relevance to a systematic review."""

    sr_title: str = dspy.InputField()
    citation_title: str = dspy.InputField()
    citation_abstract: str = dspy.InputField()
    relevant: bool = dspy.OutputField()
    confidence: float = dspy.OutputField()

In [10]:
first_row = non_biomeds_df.iloc[0,:]
classify = dspy.ChainOfThought(Relevance)
classify(
    sr_title=first_row['SR_title'],
    citation_title=first_row['title'],
    citation_abstract=first_row['abstract']
)

Prediction(
    reasoning='The systematic review focuses on fault prediction performance in software engineering. The citation discusses an FPGA-based computer vision system for offset error computation in *web printing machines*. While both relate to error detection and control, the citation is specifically about hardware-based image processing for a mechanical manufacturing process (printing), not software fault prediction. Therefore, the connection is weak.',
    relevant=False,
    confidence=0.3
)

## Building out the Development Set
17000 citations is still too many to run evaluations in decent time.
So, let's aim for a max of 300, that's 75 from each SR. 

In [11]:
# pkg improts
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from dspy.evaluate import Evaluate, metrics

In [12]:
# initialise our random under sampler
rus = RandomUnderSampler(random_state=42, sampling_strategy={0: 67, 1: 8})

In [13]:
def create_rus_dataset(df: pd.DataFrame, rus: RandomUnderSampler = rus) -> tuple[np.array, np.array]:
    """
    Given a group/DataFrame of a particular sytematic review.
    Returns a randomly undersampled dataset of inputs and labels.
    """
    Xs = np.array(group[['SR_title','title','abstract']])
    Ys = np.array(group['relevant'])
    X_resampled, Y_resampled = rus.fit_resample(Xs, Ys)
    return  X_resampled, Y_resampled

In [14]:
# create a development set of inputs and labels by randomly undersampling
# each systematic review so that it contains 8 positive labels and 67 negative
# ones
Xs, Ys = [], [] 
for _, group in non_biomeds_df.groupby(by='SR_id'):
    grp_Xs, grp_Ys = create_rus_dataset(group)
    Xs.append(grp_Xs)
    Ys.append(grp_Ys)
Xs = np.concatenate(Xs)
Ys = np.concatenate(Ys)

In [15]:
# create our development set of DSPy Example objects
devset = [dspy.Example(sr_title=x[0], citation_title=x[1], citation_abstract=x[2], relevant=y)\
          .with_inputs('sr_title', 'citation_title', 'citation_abstract')
          for x,y in zip(Xs, Ys)]

In [16]:
# evaluate setup
evaluator = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)

In [17]:
# basic exact metric for our Relevance signature
def validate_answer(example, pred, trace=None):
    return example.relevant == pred.relevant

In [18]:
# launch evaluation with a basic 'exact match' metric
evaluator(classify, metric=validate_answer)

Average Metric: 93.00 / 300 (31.0%): 100%|████████████████████████████████████████████| 300/300 [00:06<00:00, 46.06it/s]

2025/04/18 23:52:40 INFO dspy.evaluate.evaluate: Average Metric: 93 / 300 (31.0%)





Unnamed: 0,sr_title,citation_title,citation_abstract,example_relevant,reasoning,pred_relevant,confidence,validate_answer
0,A Systematic Literature Review on Fault Prediction Performance in ...,Towards a Bayesian Approach in Modeling the Disclosure of Unique S...,Software security has both an objective and a subjective component...,False,The systematic review focuses on fault prediction performance in s...,True,0.95,
1,A Systematic Literature Review on Fault Prediction Performance in ...,Networked vehicles for automated fault detection,Creating fault detection software for complex mechatronic systems ...,False,The systematic review focuses on fault prediction performance in s...,True,0.95,
2,A Systematic Literature Review on Fault Prediction Performance in ...,Advanced Fault-Tolerant Control of Induction-Motor Drives for EV/H...,This paper describes active fault-tolerant control systems for a h...,False,The systematic review focuses on fault prediction performance in s...,False,0.6,✔️ [True]
3,A Systematic Literature Review on Fault Prediction Performance in ...,Fault ride through of DFIG wind turbines during symmetrical voltag...,Low Voltage Ride Through is an important feature for wind turbine ...,False,The systematic review focuses on fault prediction performance in s...,False,0.85,✔️ [True]
4,A Systematic Literature Review on Fault Prediction Performance in ...,Integration of Multivariate Control Charts and Neural Networks to ...,"Because of advanced technology, there are many aspects of quality ...",False,The systematic review focuses on fault prediction performance in s...,True,0.95,


np.float64(31.0)

In [19]:
# see how well basic Prediction does compared to ChainOfThought
simple_predict = dspy.Predict(Relevance)
result = evaluator(simple_predict, metric=validate_answer)

Average Metric: 36.00 / 300 (12.0%): 100%|████████████████████████████████████████████| 300/300 [00:08<00:00, 36.08it/s]


2025/04/18 23:52:48 INFO dspy.evaluate.evaluate: Average Metric: 36 / 300 (12.0%)


Unnamed: 0,sr_title,citation_title,citation_abstract,example_relevant,pred_relevant,confidence,validate_answer
0,A Systematic Literature Review on Fault Prediction Performance in ...,Towards a Bayesian Approach in Modeling the Disclosure of Unique S...,Software security has both an objective and a subjective component...,False,True,0.95,
1,A Systematic Literature Review on Fault Prediction Performance in ...,Networked vehicles for automated fault detection,Creating fault detection software for complex mechatronic systems ...,False,True,0.95,
2,A Systematic Literature Review on Fault Prediction Performance in ...,Advanced Fault-Tolerant Control of Induction-Motor Drives for EV/H...,This paper describes active fault-tolerant control systems for a h...,False,True,0.95,
3,A Systematic Literature Review on Fault Prediction Performance in ...,Fault ride through of DFIG wind turbines during symmetrical voltag...,Low Voltage Ride Through is an important feature for wind turbine ...,False,True,0.95,
4,A Systematic Literature Review on Fault Prediction Performance in ...,Integration of Multivariate Control Charts and Neural Networks to ...,"Because of advanced technology, there are many aspects of quality ...",False,True,0.95,


## New metrics
We've got a basic pipeline and evaluator going.

It's time to create the metrics we actually want to use.

We can then iterate on development and construct more sophisticated pipelines.

### F1 Score
Let's implmenet an F1 score metric.

In [20]:
# std lib imports
from collections import Counter
from typing import Callable
from math import sqrt

# pkg imports
from tqdm import tqdm

In [21]:
def precision(c: Counter) -> float:
    pos_total = c['TP'] + c['FP']
    if pos_total:
        return c['TP']/pos_total
    return float('nan')
def recall(c: Counter) -> float:
    divisor = c['TP'] + c['FN']
    if divisor:
        return c['TP']/divisor
    return float('nan')
def f1score(c: Counter) -> float:
    prec, rec = precision(c), recall(c)
    if prec and rec:
        return 2*(prec*rec)/(prec+rec)
    return float('nan')
def specificity(c: Counter) -> float:
    n = sum(c.values())
    if n:
        return 1-(c['FP']/n)
    return float('nan')

In [22]:
def validate_f1answer(example, pred, trace=None):
    if trace is None:
        if example.relevant and pred.relevant:
            return "TP" # return True Positive
        elif not example.relevant and not pred.relevant:
            return "TN" # return True Negative
        elif not example.relevant and pred.relevant:
            return "FP" # return False Positive
        else:
            return "FN" # return False Negative
    else:
        return example.relevant == pred.relevant

In [23]:
def f1evaluate(program: dspy.Program,
               devset: list[dspy.Example],
               metric: Callable = validate_f1answer) -> None:
    c = Counter()
    with tqdm(total=len(devset),
              bar_format="{postfix[0]} {postfix[1][value]:.3f} {l_bar}{bar}'| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ' '{rate_fmt}]'",
              postfix=["F1 Score:", {'value': float('nan')}]) as t:
        for x in devset:
            pred = program(**x.inputs())
            score = metric(x, pred)
            c[score] += 1
        
            # scores
            prec = precision(c)
            rec = recall(c)
            f1 = f1score(c)
            
            # update progress bar
            t.postfix[1]['value'] = f1
            t.update()

    print(c)
    mcc = mccscore(c)
    print(f"Precision: {prec:.3f}")
    print(f"Recall: {rec:.3f}")
    print(f"F1: {f1:.3f}")
    print(f"MCC: {mcc:.3f}")
    print(f"Specificity: {specificity(c):.3f}")

In [26]:
f1evaluate(simple_predict, devset)

F1 Score: 0.195 100%|██████████████████████████████████████████████████████████'| 300/300 [00:00<00:00, ' '1696.08it/s]'

Counter({'FP': 264, 'TP': 32, 'TN': 4})
Precision: 0.108
Recall: 1.000
F1: 0.195
MCC: 0.040
Specificity: 0.120





In [27]:
f1evaluate(classify, devset)

F1 Score: 0.230 100%|██████████████████████████████████████████████████████████'| 300/300 [00:00<00:00, ' '1599.33it/s]'

Counter({'FP': 206, 'TN': 62, 'TP': 31, 'FN': 1})
Precision: 0.131
Recall: 0.969
F1: 0.230
MCC: 0.152
Specificity: 0.313





### MCC
[Pretty easy to implement](https://en.wikipedia.org/wiki/Phi_coefficient)

In [25]:
def mccscore(c: Counter) -> float:
    tp, tn, fp, fn = c['TP'], c['TN'], c['FP'], c['FN']
    dividend = tp*tn - fp*fn
    divisor = sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    if divisor:
        return dividend/divisor
    return float('nan')

## Other Pipelines
Let's try something a bit different before optimising. 

Let's see what the other built-in modules have to offer, like Refine and BestOf.

Starting with the latter, we can use the confidence parameter and see what happens.

In [234]:
def high_confidence(args, pred):
    return pred.confidence

In [241]:
best_of_3 = dspy.BestOfN(module=classify, N=3, reward_fn=high_confidence, threshold=.5)
best_of_5 = dspy.BestOfN(module=classify, N=5, reward_fn=high_confidence, threshold=.7)

In [239]:
f1evaluate(best_of_3, devset)

F1 Score: nan   1%|▍                                                               '| 2/300 [00:00<01:17, ' ' 3.87it/s]'

nan
nan


F1 Score: nan   1%|▊                                                               '| 4/300 [00:00<01:03, ' ' 4.64it/s]'

nan
nan


F1 Score: nan   2%|█▎                                                              '| 6/300 [00:01<00:59, ' ' 4.92it/s]'

nan
nan


F1 Score: nan   3%|█▋                                                              '| 8/300 [00:01<01:08, ' ' 4.27it/s]'

nan
nan


F1 Score: nan   3%|██                                                             '| 10/300 [00:02<01:02, ' ' 4.64it/s]'

nan
nan


F1 Score: nan   4%|██▌                                                            '| 12/300 [00:02<00:59, ' ' 4.83it/s]'

nan
nan


F1 Score: nan   5%|██▉                                                            '| 14/300 [00:03<01:30, ' ' 3.17it/s]'

nan
nan


F1 Score: nan   5%|███▎                                                           '| 16/300 [00:04<01:23, ' ' 3.38it/s]'

nan
nan


F1 Score: nan   6%|███▊                                                           '| 18/300 [00:04<01:18, ' ' 3.59it/s]'

nan
nan


F1 Score: nan   7%|████▏                                                          '| 20/300 [00:05<01:06, ' ' 4.22it/s]'

nan
nan


F1 Score: nan   7%|████▌                                                          '| 22/300 [00:05<01:00, ' ' 4.62it/s]'

nan
nan


F1 Score: nan   8%|█████                                                          '| 24/300 [00:06<01:04, ' ' 4.25it/s]'

nan
nan


F1 Score: nan   9%|█████▍                                                         '| 26/300 [00:06<00:58, ' ' 4.65it/s]'

nan
nan


F1 Score: nan   9%|█████▉                                                         '| 28/300 [00:06<00:56, ' ' 4.85it/s]'

nan
nan


F1 Score: nan  10%|██████▎                                                        '| 30/300 [00:07<01:01, ' ' 4.36it/s]'

nan
nan


F1 Score: nan  11%|██████▋                                                        '| 32/300 [00:07<00:57, ' ' 4.68it/s]'

nan
nan


F1 Score: nan  11%|███████▏                                                       '| 34/300 [00:08<00:54, ' ' 4.84it/s]'

nan
nan


F1 Score: nan  12%|███████▎                                                       '| 35/300 [00:08<01:04, ' ' 4.10it/s]'

nan
nan


F1 Score: nan  13%|███████▉                                                       '| 38/300 [00:09<01:08, ' ' 3.84it/s]'

nan
nan


F1 Score: nan  13%|████████▏                                                      '| 39/300 [00:09<01:03, ' ' 4.13it/s]'

nan


F1 Score: nan  13%|████████▍                                                      '| 40/300 [00:09<01:09, ' ' 3.72it/s]'

nan


F1 Score: nan  14%|████████▊                                                      '| 42/300 [00:10<01:10, ' ' 3.65it/s]'

nan
nan


F1 Score: nan  14%|█████████                                                      '| 43/300 [00:11<01:34, ' ' 2.72it/s]'

nan


F1 Score: nan  15%|█████████▍                                                     '| 45/300 [00:11<01:18, ' ' 3.23it/s]'

nan
nan


F1 Score: nan  16%|█████████▊                                                     '| 47/300 [00:11<01:04, ' ' 3.95it/s]'

nan
nan


F1 Score: nan  16%|██████████▎                                                    '| 49/300 [00:12<00:56, ' ' 4.43it/s]'

nan
nan


F1 Score: nan  17%|██████████▋                                                    '| 51/300 [00:12<00:59, ' ' 4.22it/s]'

nan
nan


F1 Score: nan  18%|███████████▏                                                   '| 53/300 [00:13<00:53, ' ' 4.62it/s]'

nan
nan


F1 Score: nan  18%|███████████▌                                                   '| 55/300 [00:13<00:50, ' ' 4.82it/s]'

nan
nan


F1 Score: nan  19%|███████████▊                                                   '| 56/300 [00:13<00:59, ' ' 4.10it/s]'

nan


F1 Score: nan  19%|███████████▉                                                   '| 57/300 [00:14<01:24, ' ' 2.87it/s]'

nan
nan


F1 Score: nan  20%|████████████▍                                                  '| 59/300 [00:14<01:05, ' ' 3.66it/s]'

nan


F1 Score: nan  20%|████████████▌                                                  '| 60/300 [00:15<01:09, ' ' 3.46it/s]'

nan
nan


F1 Score: nan  21%|█████████████▏                                                 '| 63/300 [00:15<00:54, ' ' 4.32it/s]'

nan
nan


F1 Score: nan  21%|█████████████▍                                                 '| 64/300 [00:16<00:52, ' ' 4.48it/s]'

nan


F1 Score: nan  22%|█████████████▋                                                 '| 65/300 [00:16<00:51, ' ' 4.54it/s]'

nan


F1 Score: nan  22%|█████████████▊                                                 '| 66/300 [00:16<01:00, ' ' 3.89it/s]'

nan
nan


F1 Score: 0.118  23%|██████████████                                               '| 69/300 [00:17<00:51, ' ' 4.48it/s]'

0.0625
0.11764705882352941


F1 Score: 0.162  24%|██████████████▍                                              '| 71/300 [00:17<00:48, ' ' 4.69it/s]'

0.11428571428571428
0.16216216216216214


F1 Score: 0.205  24%|██████████████▋                                              '| 72/300 [00:17<00:47, ' ' 4.79it/s]'

0.20512820512820512


F1 Score: 0.279  25%|███████████████                                              '| 74/300 [00:18<00:51, ' ' 4.38it/s]'

0.2439024390243902
0.27906976744186046


F1 Score: 0.304  25%|███████████████▍                                             '| 76/300 [00:18<00:47, ' ' 4.69it/s]'

0.3111111111111111
0.30434782608695654


F1 Score: 0.298  26%|███████████████▋                                             '| 77/300 [00:19<00:46, ' ' 4.78it/s]'

0.2978723404255319


F1 Score: 0.292  26%|████████████████                                             '| 79/300 [00:19<00:51, ' ' 4.29it/s]'

0.29166666666666663
0.29166666666666663


F1 Score: 0.280  27%|████████████████▍                                            '| 81/300 [00:19<00:47, ' ' 4.63it/s]'

0.28571428571428575
0.27999999999999997


F1 Score: 0.275  27%|████████████████▋                                            '| 82/300 [00:20<00:46, ' ' 4.74it/s]'

0.2745098039215686
0.26923076923076916


F1 Score: 0.264  28%|█████████████████                                            '| 84/300 [00:20<00:44, ' ' 4.85it/s]'

0.26415094339622647


F1 Score: 0.255  29%|█████████████████▍                                           '| 86/300 [00:21<00:49, ' ' 4.30it/s]'

0.2592592592592593
0.25454545454545446


F1 Score: 0.250  29%|█████████████████▉                                           '| 88/300 [00:21<00:46, ' ' 4.61it/s]'

0.25454545454545446
0.25000000000000006


F1 Score: 0.246  30%|██████████████████▎                                          '| 90/300 [00:21<00:43, ' ' 4.84it/s]'

0.25000000000000006
0.2456140350877193


F1 Score: 0.241  31%|██████████████████▋                                          '| 92/300 [00:22<00:48, ' ' 4.31it/s]'

0.2456140350877193
0.24137931034482757


F1 Score: 0.237  31%|███████████████████                                          '| 94/300 [00:22<00:44, ' ' 4.65it/s]'

0.23728813559322037
0.23728813559322037


F1 Score: 0.237  32%|███████████████████▎                                         '| 95/300 [00:23<00:42, ' ' 4.78it/s]'

0.23728813559322037


F1 Score: 0.233  32%|███████████████████▌                                         '| 96/300 [00:23<00:42, ' ' 4.81it/s]'

0.23333333333333334


F1 Score: 0.226  33%|███████████████████▉                                         '| 98/300 [00:23<00:47, ' ' 4.30it/s]'

0.22950819672131148
0.22580645161290322


F1 Score: 0.219  33%|████████████████████                                        '| 100/300 [00:24<00:42, ' ' 4.68it/s]'

0.22222222222222224
0.21875


F1 Score: 0.215  34%|████████████████████▍                                       '| 102/300 [00:24<00:40, ' ' 4.90it/s]'

0.2153846153846154
0.2153846153846154


F1 Score: 0.212  34%|████████████████████▌                                       '| 103/300 [00:24<00:47, ' ' 4.14it/s]'

0.21212121212121213
0.21212121212121213


F1 Score: 0.209  35%|█████████████████████                                       '| 105/300 [00:25<00:43, ' ' 4.48it/s]'

0.208955223880597


F1 Score: 0.209  36%|█████████████████████▍                                      '| 107/300 [00:26<00:56, ' ' 3.43it/s]'

0.208955223880597
0.208955223880597


F1 Score: 0.206  36%|█████████████████████▊                                      '| 109/300 [00:26<00:51, ' ' 3.69it/s]'

0.208955223880597
0.20588235294117646


F1 Score: 0.200  37%|██████████████████████▏                                     '| 111/300 [00:27<00:44, ' ' 4.29it/s]'

0.20289855072463767
0.19999999999999998


F1 Score: 0.194  38%|██████████████████████▌                                     '| 113/300 [00:27<00:40, ' ' 4.67it/s]'

0.19718309859154926
0.19444444444444445


F1 Score: 0.189  38%|███████████████████████                                     '| 115/300 [00:27<00:43, ' ' 4.26it/s]'

0.1917808219178082
0.1891891891891892


F1 Score: 0.187  39%|███████████████████████▍                                    '| 117/300 [00:28<00:39, ' ' 4.65it/s]'

0.18666666666666668
0.18666666666666668


F1 Score: 0.184  39%|███████████████████████▌                                    '| 118/300 [00:28<00:38, ' ' 4.77it/s]'

0.1842105263157895


F1 Score: 0.182  40%|████████████████████████                                    '| 120/300 [00:29<00:49, ' ' 3.66it/s]'

0.1842105263157895
0.18181818181818182


F1 Score: 0.177  41%|████████████████████████▍                                   '| 122/300 [00:29<00:41, ' ' 4.29it/s]'

0.17948717948717952
0.17721518987341772


F1 Score: 0.175  41%|████████████████████████▊                                   '| 124/300 [00:30<00:37, ' ' 4.70it/s]'

0.17500000000000002
0.17500000000000002


F1 Score: 0.171  42%|█████████████████████████▏                                  '| 126/300 [00:30<00:40, ' ' 4.25it/s]'

0.17283950617283952
0.17073170731707316


F1 Score: 0.169  43%|█████████████████████████▌                                  '| 128/300 [00:30<00:36, ' ' 4.66it/s]'

0.1686746987951807
0.1686746987951807


F1 Score: 0.165  43%|██████████████████████████                                  '| 130/300 [00:31<00:34, ' ' 4.91it/s]'

0.16666666666666669
0.16470588235294117


F1 Score: 0.163  44%|██████████████████████████▏                                 '| 131/300 [00:31<00:33, ' ' 4.97it/s]'

0.1627906976744186


F1 Score: 0.159  44%|██████████████████████████▌                                 '| 133/300 [00:32<00:38, ' ' 4.38it/s]'

0.16091954022988506
0.15909090909090906


F1 Score: 0.159  45%|██████████████████████████▊                                 '| 134/300 [00:32<00:36, ' ' 4.56it/s]'

0.15909090909090906
0.15730337078651688


F1 Score: 0.154  46%|███████████████████████████▍                                '| 137/300 [00:32<00:33, ' ' 4.90it/s]'

0.15555555555555559
0.15384615384615385


F1 Score: 0.151  46%|███████████████████████████▊                                '| 139/300 [00:33<00:36, ' ' 4.36it/s]'

0.15217391304347824
0.15053763440860213


F1 Score: 0.149  47%|████████████████████████████▏                               '| 141/300 [00:33<00:33, ' ' 4.69it/s]'

0.14893617021276595
0.14893617021276595


F1 Score: 0.147  47%|████████████████████████████▍                               '| 142/300 [00:33<00:32, ' ' 4.80it/s]'

0.14736842105263157
0.16494845360824742


F1 Score: 0.198  48%|█████████████████████████████                               '| 145/300 [00:34<00:35, ' ' 4.34it/s]'

0.18181818181818182
0.198019801980198


F1 Score: 0.229  49%|█████████████████████████████▍                              '| 147/300 [00:35<00:32, ' ' 4.73it/s]'

0.21359223300970875
0.2285714285714286


F1 Score: 0.257  50%|█████████████████████████████▊                              '| 149/300 [00:35<00:31, ' ' 4.87it/s]'

0.24299065420560742
0.25688073394495414


F1 Score: 0.270  50%|██████████████████████████████                              '| 150/300 [00:35<00:30, ' ' 4.92it/s]'

0.27027027027027023


F1 Score: 0.265  51%|██████████████████████████████▍                             '| 152/300 [00:36<00:33, ' ' 4.44it/s]'

0.26785714285714285
0.26548672566371684


F1 Score: 0.263  51%|██████████████████████████████▊                             '| 154/300 [00:36<00:30, ' ' 4.75it/s]'

0.2631578947368421
0.2631578947368421


F1 Score: 0.259  52%|███████████████████████████████▏                            '| 156/300 [00:36<00:29, ' ' 4.91it/s]'

0.2608695652173913
0.25862068965517243


F1 Score: 0.254  53%|███████████████████████████████▌                            '| 158/300 [00:37<00:32, ' ' 4.35it/s]'

0.2564102564102564
0.25423728813559326


F1 Score: 0.250  53%|████████████████████████████████                            '| 160/300 [00:37<00:29, ' ' 4.70it/s]'

0.25210084033613445
0.25


F1 Score: 0.246  54%|████████████████████████████████▍                           '| 162/300 [00:38<00:28, ' ' 4.89it/s]'

0.24793388429752067
0.2459016393442623


F1 Score: 0.242  55%|████████████████████████████████▊                           '| 164/300 [00:38<00:31, ' ' 4.38it/s]'

0.24390243902439027
0.24193548387096778


F1 Score: 0.238  55%|█████████████████████████████████▏                          '| 166/300 [00:39<00:28, ' ' 4.72it/s]'

0.24000000000000002
0.2380952380952381


F1 Score: 0.234  56%|█████████████████████████████████▌                          '| 168/300 [00:39<00:27, ' ' 4.88it/s]'

0.2362204724409449
0.23437499999999997


F1 Score: 0.231  57%|██████████████████████████████████                          '| 170/300 [00:40<00:29, ' ' 4.35it/s]'

0.23255813953488375
0.23076923076923075


F1 Score: 0.227  57%|██████████████████████████████████▍                         '| 172/300 [00:40<00:27, ' ' 4.72it/s]'

0.22900763358778628
0.22727272727272727


F1 Score: 0.224  58%|██████████████████████████████████▊                         '| 174/300 [00:40<00:25, ' ' 4.88it/s]'

0.22556390977443608
0.22388059701492533


F1 Score: 0.221  59%|███████████████████████████████████▏                        '| 176/300 [00:41<00:28, ' ' 4.34it/s]'

0.2222222222222222
0.22058823529411764


F1 Score: 0.217  59%|███████████████████████████████████▌                        '| 178/300 [00:41<00:26, ' ' 4.66it/s]'

0.21897810218978103
0.21739130434782608


F1 Score: 0.214  60%|████████████████████████████████████                        '| 180/300 [00:42<00:24, ' ' 4.87it/s]'

0.21582733812949637
0.21428571428571425


F1 Score: 0.211  61%|████████████████████████████████████▍                       '| 182/300 [00:42<00:27, ' ' 4.30it/s]'

0.21276595744680846
0.21126760563380279


F1 Score: 0.210  61%|████████████████████████████████████▌                       '| 183/300 [00:43<00:25, ' ' 4.51it/s]'

0.2097902097902098


F1 Score: 0.208  61%|████████████████████████████████████▊                       '| 184/300 [00:43<00:25, ' ' 4.55it/s]'

0.20833333333333334


F1 Score: 0.207  62%|█████████████████████████████████████                       '| 185/300 [00:43<00:24, ' ' 4.65it/s]'

0.20689655172413793


F1 Score: 0.205  62%|█████████████████████████████████████▏                      '| 186/300 [00:43<00:24, ' ' 4.58it/s]'

0.2054794520547945


F1 Score: 0.203  63%|█████████████████████████████████████▌                      '| 188/300 [00:44<00:26, ' ' 4.25it/s]'

0.2040816326530612
0.20270270270270271


F1 Score: 0.200  63%|██████████████████████████████████████                      '| 190/300 [00:44<00:23, ' ' 4.61it/s]'

0.20134228187919465
0.2


F1 Score: 0.197  64%|██████████████████████████████████████▍                     '| 192/300 [00:44<00:22, ' ' 4.83it/s]'

0.19867549668874168
0.19736842105263155


F1 Score: 0.196  64%|██████████████████████████████████████▌                     '| 193/300 [00:45<00:21, ' ' 4.92it/s]'

0.19607843137254904


F1 Score: 0.194  65%|███████████████████████████████████████                     '| 195/300 [00:45<00:23, ' ' 4.41it/s]'

0.1948051948051948
0.1935483870967742


F1 Score: 0.191  66%|███████████████████████████████████████▍                    '| 197/300 [00:46<00:21, ' ' 4.73it/s]'

0.1923076923076923
0.19108280254777069


F1 Score: 0.190  66%|███████████████████████████████████████▌                    '| 198/300 [00:46<00:21, ' ' 4.79it/s]'

0.18987341772151897


F1 Score: 0.189  66%|███████████████████████████████████████▊                    '| 199/300 [00:46<00:20, ' ' 4.84it/s]'

0.18867924528301883


F1 Score: 0.186  67%|████████████████████████████████████████▏                   '| 201/300 [00:47<00:22, ' ' 4.35it/s]'

0.1875
0.18633540372670807


F1 Score: 0.184  68%|████████████████████████████████████████▌                   '| 203/300 [00:47<00:20, ' ' 4.72it/s]'

0.18518518518518517
0.18404907975460127


F1 Score: 0.183  68%|████████████████████████████████████████▊                   '| 204/300 [00:47<00:19, ' ' 4.82it/s]'

0.1829268292682927


F1 Score: 0.182  68%|█████████████████████████████████████████                   '| 205/300 [00:47<00:19, ' ' 4.78it/s]'

0.18181818181818182


F1 Score: 0.181  69%|█████████████████████████████████████████▏                  '| 206/300 [00:48<00:20, ' ' 4.61it/s]'

0.18072289156626503


F1 Score: 0.179  69%|█████████████████████████████████████████▌                  '| 208/300 [00:48<00:21, ' ' 4.21it/s]'

0.17964071856287422
0.1785714285714286


F1 Score: 0.176  70%|██████████████████████████████████████████                  '| 210/300 [00:49<00:19, ' ' 4.54it/s]'

0.17751479289940825
0.17647058823529413


F1 Score: 0.174  71%|██████████████████████████████████████████▍                 '| 212/300 [00:49<00:18, ' ' 4.80it/s]'

0.1754385964912281
0.17441860465116277


F1 Score: 0.172  71%|██████████████████████████████████████████▊                 '| 214/300 [00:49<00:19, ' ' 4.31it/s]'

0.17341040462427748
0.17241379310344826


F1 Score: 0.170  72%|███████████████████████████████████████████▏                '| 216/300 [00:50<00:18, ' ' 4.63it/s]'

0.17142857142857146
0.17045454545454544


F1 Score: 0.179  73%|███████████████████████████████████████████▌                '| 218/300 [00:50<00:17, ' ' 4.79it/s]'

0.16949152542372878
0.1787709497206704


F1 Score: 0.197  73%|████████████████████████████████████████████                '| 220/300 [00:51<00:18, ' ' 4.31it/s]'

0.1878453038674033
0.19672131147540986


F1 Score: 0.205  74%|████████████████████████████████████████████▏               '| 221/300 [00:51<00:17, ' ' 4.49it/s]'

0.2054054054054054
0.21390374331550802


F1 Score: 0.230  75%|████████████████████████████████████████████▊               '| 224/300 [00:52<00:15, ' ' 4.79it/s]'

0.2222222222222222
0.23036649214659688


F1 Score: 0.238  75%|█████████████████████████████████████████████               '| 225/300 [00:52<00:18, ' ' 4.01it/s]'

0.2383419689119171


F1 Score: 0.236  76%|█████████████████████████████████████████████▍              '| 227/300 [00:52<00:16, ' ' 4.44it/s]'

0.2371134020618556
0.2358974358974359


F1 Score: 0.234  76%|█████████████████████████████████████████████▊              '| 229/300 [00:53<00:14, ' ' 4.74it/s]'

0.23469387755102042
0.23350253807106597


F1 Score: 0.232  77%|██████████████████████████████████████████████              '| 230/300 [00:53<00:14, ' ' 4.84it/s]'

0.23232323232323232


F1 Score: 0.230  77%|██████████████████████████████████████████████▍             '| 232/300 [00:53<00:15, ' ' 4.31it/s]'

0.23115577889447236
0.22999999999999998


F1 Score: 0.230  78%|██████████████████████████████████████████████▊             '| 234/300 [00:54<00:14, ' ' 4.68it/s]'

0.22999999999999998
0.22999999999999998


F1 Score: 0.228  79%|███████████████████████████████████████████████▏            '| 236/300 [00:54<00:13, ' ' 4.88it/s]'

0.22885572139303484
0.2277227722772277


F1 Score: 0.227  79%|███████████████████████████████████████████████▍            '| 237/300 [00:54<00:12, ' ' 4.95it/s]'

0.22660098522167488


F1 Score: 0.225  80%|███████████████████████████████████████████████▊            '| 239/300 [00:55<00:13, ' ' 4.36it/s]'

0.22549019607843135
0.22549019607843135


F1 Score: 0.224  80%|████████████████████████████████████████████████▏           '| 241/300 [00:55<00:12, ' ' 4.70it/s]'

0.22439024390243906
0.22439024390243906


F1 Score: 0.222  81%|████████████████████████████████████████████████▌           '| 243/300 [00:56<00:11, ' ' 4.93it/s]'

0.2233009708737864
0.2222222222222222


F1 Score: 0.220  82%|█████████████████████████████████████████████████           '| 245/300 [00:56<00:12, ' ' 4.30it/s]'

0.22115384615384615
0.22009569377990432


F1 Score: 0.218  82%|█████████████████████████████████████████████████▍          '| 247/300 [00:57<00:11, ' ' 4.68it/s]'

0.21904761904761905
0.21800947867298578


F1 Score: 0.217  83%|█████████████████████████████████████████████████▌          '| 248/300 [00:57<00:10, ' ' 4.80it/s]'

0.2169811320754717


F1 Score: 0.216  83%|█████████████████████████████████████████████████▊          '| 249/300 [00:57<00:10, ' ' 4.84it/s]'

0.21596244131455397


F1 Score: 0.214  84%|██████████████████████████████████████████████████▏         '| 251/300 [00:58<00:11, ' ' 4.23it/s]'

0.21495327102803738
0.21395348837209302


F1 Score: 0.213  84%|██████████████████████████████████████████████████▌         '| 253/300 [00:58<00:10, ' ' 4.59it/s]'

0.212962962962963
0.212962962962963


F1 Score: 0.211  85%|███████████████████████████████████████████████████         '| 255/300 [00:58<00:09, ' ' 4.85it/s]'

0.21198156682027647
0.21100917431192662


F1 Score: 0.209  86%|███████████████████████████████████████████████████▍        '| 257/300 [00:59<00:09, ' ' 4.33it/s]'

0.21004566210045664
0.20909090909090908


F1 Score: 0.207  86%|███████████████████████████████████████████████████▊        '| 259/300 [00:59<00:08, ' ' 4.70it/s]'

0.20814479638009054
0.20720720720720717


F1 Score: 0.205  87%|████████████████████████████████████████████████████▏       '| 261/300 [01:00<00:07, ' ' 4.91it/s]'

0.20627802690582958
0.20535714285714285


F1 Score: 0.204  87%|████████████████████████████████████████████████████▍       '| 262/300 [01:00<00:07, ' ' 4.94it/s]'

0.20444444444444448


F1 Score: 0.203  88%|████████████████████████████████████████████████████▊       '| 264/300 [01:00<00:08, ' ' 4.40it/s]'

0.20353982300884957
0.2026431718061674


F1 Score: 0.201  89%|█████████████████████████████████████████████████████▏      '| 266/300 [01:01<00:07, ' ' 4.75it/s]'

0.2017543859649123
0.20087336244541484


F1 Score: 0.199  89%|█████████████████████████████████████████████████████▌      '| 268/300 [01:01<00:06, ' ' 4.92it/s]'

0.2
0.19913419913419914


F1 Score: 0.197  90%|██████████████████████████████████████████████████████      '| 270/300 [01:02<00:06, ' ' 4.38it/s]'

0.19827586206896552
0.1974248927038627


F1 Score: 0.196  91%|██████████████████████████████████████████████████████▍     '| 272/300 [01:02<00:05, ' ' 4.68it/s]'

0.19658119658119658
0.19574468085106383


F1 Score: 0.194  91%|██████████████████████████████████████████████████████▊     '| 274/300 [01:03<00:05, ' ' 4.85it/s]'

0.19491525423728814
0.1940928270042194


F1 Score: 0.192  92%|███████████████████████████████████████████████████████▏    '| 276/300 [01:03<00:05, ' ' 4.37it/s]'

0.19327731092436973
0.19246861924686195


F1 Score: 0.191  93%|███████████████████████████████████████████████████████▌    '| 278/300 [01:04<00:04, ' ' 4.67it/s]'

0.19166666666666668
0.19087136929460582


F1 Score: 0.189  93%|████████████████████████████████████████████████████████    '| 280/300 [01:04<00:04, ' ' 4.86it/s]'

0.19008264462809918
0.18930041152263374


F1 Score: 0.188  94%|████████████████████████████████████████████████████████▍   '| 282/300 [01:04<00:04, ' ' 4.35it/s]'

0.1885245901639344
0.18775510204081633


F1 Score: 0.186  95%|████████████████████████████████████████████████████████▊   '| 284/300 [01:05<00:03, ' ' 4.72it/s]'

0.1869918699186992
0.1862348178137652


F1 Score: 0.185  95%|█████████████████████████████████████████████████████████▏  '| 286/300 [01:05<00:02, ' ' 4.89it/s]'

0.1854838709677419
0.18473895582329317


F1 Score: 0.183  96%|█████████████████████████████████████████████████████████▌  '| 288/300 [01:06<00:02, ' ' 4.39it/s]'

0.18400000000000002
0.18326693227091634


F1 Score: 0.183  97%|██████████████████████████████████████████████████████████  '| 290/300 [01:06<00:02, ' ' 4.69it/s]'

0.18253968253968253
0.18253968253968253


F1 Score: 0.181  97%|██████████████████████████████████████████████████████████▍ '| 292/300 [01:07<00:01, ' ' 4.83it/s]'

0.18181818181818182
0.18110236220472442


F1 Score: 0.188  98%|██████████████████████████████████████████████████████████▌ '| 293/300 [01:07<00:01, ' ' 4.10it/s]'

0.1875
0.19379844961240308


F1 Score: 0.206  99%|███████████████████████████████████████████████████████████▏'| 296/300 [01:07<00:00, ' ' 4.65it/s]'

0.2
0.20610687022900767


F1 Score: 0.212  99%|███████████████████████████████████████████████████████████▍'| 297/300 [01:08<00:00, ' ' 4.71it/s]'

0.21212121212121213


F1 Score: 0.218  99%|███████████████████████████████████████████████████████████▌'| 298/300 [01:08<00:00, ' ' 4.75it/s]'

0.2180451127819549


F1 Score: 0.230 100%|████████████████████████████████████████████████████████████'| 300/300 [01:08<00:00, ' ' 4.35it/s]'

0.22388059701492538
0.22962962962962963
Counter({'FP': 207, 'TN': 61, 'TP': 31, 'FN': 1})
Precision: 0.130
Recall: 0.969
F1: 0.230
MCC: 0.150





In [1]:
f1evaluate(best_of_5, devset)

NameError: name 'f1evaluate' is not defined

In [247]:
# TODO looking at implemmentation 'classify might have to be defined slightly differently
dspy.Refine(module=classify, N=3, reward_fn=high_confidence, threshold=.5)

OSError: could not get source code

In [248]:
best_of_3r = dspy.Refine(module=dspy.ChainOfThought(Relevance), N=3, reward_fn=high_confidence, threshold=.5)

OSError: could not get source code

In [None]:
f1evaluate(best_of_3r, devset)

In [None]:
type(f