In [None]:
from collections import defaultdict
from itertools import product
import re



def recogs_exact_match(gold, pred, flag="000000"):
    gold = normalize_formula(gold.logical_form)
    pred = normalize_formula(pred.logical_form)
    gold_conj_set = get_conj_set(gold)
    # Loop over all viable mappings from pred_vars to gold_vars:
    for this_map in _candidate_variable_maps(gold, pred):
        phi = pred
        for sourcevar, targetvar in this_map.items():
            # The flag makes sure we don't accidentally do a chain
            # of replacements via successive changes in situations
            # where the domain and range of `this_map` share vars.
            phi = variable_change(phi, sourcevar, targetvar, flag=flag)
        print ('now here!')
        phi = phi.replace(flag, "")
        phi_conj_set = get_conj_set(phi)
        # This step assumes that we have no conjuncts that are
        # tautologies, contradictions, or equality predications. If
        # such are introduced, they need to be identified ahead of
        # time and treated separately -- tautologies would be removed,
        # contradictions would reduce to comparisons of only those
        # conjuncts, and equality statements would call for special
        # handling related to variables mapping.
        if phi_conj_set == gold_conj_set:
            return True
    return False


def normalize_formula(phi):
    return phi.replace(" ", "").replace("AND" , " AND ")


binary_pred_re = re.compile(r"""
    (\w+)
    \s*
    \(
    \s*
    (\d+)
    \s*
    ,
    \s*
    (\d+)
    \s*
    \)""", re.VERBOSE)


unary_pred_re = re.compile(r"""
    (\w+)
    \s*
    \(
    \s*
    (\d+)
    \s*
    \)""", re.VERBOSE)


def _candidate_variable_maps(gold, pred):
    # This creates a mapping from tuples of predicates into their
    # associated variables. These serve as equivalence classes over
    # variables that could possibly be translations of each other.
    gold_map = _map_get_preds_to_vars(gold)
    pred_map = _map_get_preds_to_vars(pred)

    # For each prediction variable, get the set of potential
    # translations for it:
    pred2gold = defaultdict(list)
    for preds, pvars in pred_map.items():
        gvars = gold_map[preds]
        for pvar in pvars:
            pred2gold[pvar] = gold_map[preds]

    # Variable sets:
    gold_vars = set(get_variables(gold))
    pred_vars = set(get_variables(pred))

    # Now generate potentially viable mappings:
    for vals in list(product(*list(pred2gold.values()))):
        d = dict(zip(pred2gold.keys(), vals))
        if set(d.keys()) == pred_vars and set(d.values()) == gold_vars:
            yield d


def _map_get_preds_to_vars(phi):
    var2pred = defaultdict(list)
    for pred, var in unary_pred_re.findall(phi):
        var2pred[var].append(pred)
    # We could do somewhat less search by specializing to first and
    # second position for these predicates, but I think it's fine
    # as-is.
    for pred, var1, var2 in binary_pred_re.findall(phi):
        var2pred[var1].append(pred)
        var2pred[var2].append(pred)
    pred2var = defaultdict(list)
    for var, preds in var2pred.items():
        pred2var[tuple(sorted(preds))].append(var)
    return pred2var


def get_variables(phi):
    variable_re = re.compile(r"(\d+)")
    return variable_re.findall(phi)


def get_conj_set(phi):
    conj_splitter_re  = re.compile(r"\s*(?:AND|;)\s*")
    return set(conj_splitter_re.split(phi))


def variable_change(phi, sourcevar, targetvar, flag="000000"):
    replace_re = re.compile(rf"\b{sourcevar}\b")
    return replace_re.sub(f"{flag}{targetvar}", phi)


In [236]:
import pandas as pd
SRC_DIRNAME = "data"

def load_split(filename):
    return pd.read_csv(
        filename,
        delimiter="\t",
        names=['input', 'output', 'category'])

dataset = {}

for splitname in ("RECOGStrain", "RECOGSdev", "RECOGSgen"):
    dataset[splitname] = load_split(f"{SRC_DIRNAME}/{splitname}.tsv")

In [237]:
test_sentence = dataset["RECOGSgen"]["input"].tolist()[1]
print (test_sentence)

gold_Label = dataset["RECOGSgen"]["output"].tolist()[1]
print (gold_Label)

Zoe thought that a hippo cleaned .
Zoe ( 10 ) ; hippo ( 36 ) ; think ( 3 ) AND agent ( 3 , 10 ) AND ccomp ( 3 , 44 ) AND clean ( 44 ) AND agent ( 44 , 36 )


In [238]:
dspy_recogs_train = [
    dspy.Example(
        sentence=row['input'], logical_form=row['output']
    ).with_inputs("sentence")
    for _, row in dataset['RECOGStrain'].iterrows()]

dspy_recogs_dev = [
    dspy.Example(
        sentence=row['input'], logical_form=row['output']
    ).with_inputs("sentence")
    for _, row in dataset['RECOGSdev'].iterrows()]
    

In [239]:
from datasets import load_dataset
import openai
import os
import dspy
from dotenv import load_dotenv
import spacy

root_path = '.'
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(root_path, 'cache')
# keep the API keys in a `.env` file in the local root directory
load_dotenv()
openai_key = os.getenv('OPENAI_API_KEY')  # use the .env file as it is a good practice to keep keys outside of one's code

In [240]:
lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key)
dspy.settings.configure(lm=lm)

In [241]:
class HierarchyParse(dspy.Signature):
    __doc__ = """Parse a sentence into a hierarchy of phrases."""
    sentence = dspy.InputField(desc="a sentence to parse")
    hierarchy = dspy.OutputField(desc="propose a hierarchy of phrases in the sentence; the hierarchy is marked by opening and closing brackets for each chunk and can be nested; the output should be a string")

class ReCOGS(dspy.Signature):
    __doc__ = """Given a sentence hierarchy, convert it to a ReCOGS logical form"""
    hierarchy = dspy.InputField(desc="a hierarchy of phrases in the sentence")
    logical_form = dspy.OutputField(desc="the ReCOGS logical form of the sentence hierarchy. the output should be a string")

class BasicParse(dspy.Module):
    def __init__(self):
        super().__init__()
        self.parse = dspy.Predict(HierarchyParse)
        self.generaterecogs = dspy.Predict(ReCOGS)

    def forward(self, sentence):
        hierarchy = self.parse(sentence=sentence).hierarchy
        return self.generaterecogs(hierarchy=hierarchy)

In [242]:
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BootstrapFewShotWithRandomSearch

fewshot_optimizer = BootstrapFewShot(metric=recogs_exact_match)
compiled = fewshot_optimizer.compile(student = BasicParse(), trainset=dspy_recogs_train)

  0%|          | 0/24155 [00:00<?, ?it/s]ERROR:dspy.teleprompt.bootstrap:[2m2025-05-02T04:53:30.595439Z[0m [[31m[1merror    [0m] [1mFailed to run or to evaluate example Example({'sentence': 'A rose was helped by a dog .', 'logical_form': 'rose ( 1 ) ; dog ( 36 ) ; help ( 40 ) AND theme ( 40 , 1 ) AND agent ( 40 , 36 )'}) (input_keys={'sentence'}) with <function recogs_exact_match at 0x16a1b3160> due to replace() argument 1 must be str, not list.[0m [[0m[1m[34mdspy.teleprompt.bootstrap[0m][0m [36mfilename[0m=[35mbootstrap.py[0m [36mlineno[0m=[35m211[0m
ERROR:dspy.teleprompt.bootstrap:[2m2025-05-02T04:53:30.599886Z[0m [[31m[1merror    [0m] [1mFailed to run or to evaluate example Example({'sentence': 'The sailor dusted a boy .', 'logical_form': '* sailor ( 0 ) ; boy ( 53 ) ; dust ( 4 ) AND agent ( 4 , 0 ) AND theme ( 4 , 53 )'}) (input_keys={'sentence'}) with <function recogs_exact_match at 0x16a1b3160> due to replace() argument 1 must be str, not list.[0m [[0m

rose ( 1 ) ; dog ( 36 ) ; help ( 40 ) AND theme ( 40 , 1 ) AND agent ( 40 , 36 )
rose ( 1 ) ; dog ( 2 ) ; help ( 3 ) AND theme ( 3 , 1 ) AND agent ( 3 , 2 )
here!
now here!
* sailor ( 0 ) ; boy ( 53 ) ; dust ( 4 ) AND agent ( 4 , 0 ) AND theme ( 4 , 53 )
* sailor ( 12 ) ; dust ( 23 ) ; boy ( 45 ) AND agent ( 23 , 12 ) AND theme ( 23 , 45 )
here!
now here!
Emma ( 27 ) ; teacher ( 6 ) ; roll ( 26 ) AND agent ( 26 , 27 ) AND theme ( 26 , 6 )
Emma ( 21 ) ; roll ( 34 ) AND agent ( 34 , 21 ) AND theme ( 34 , 40 ) AND recipient ( 34 , 40 )
here!
Evelyn ( 54 ) ; * girl ( 14 ) ; roll ( 31 ) AND agent ( 31 , 54 ) AND theme ( 31 , 14 )
Evelyn ( 1 ) ; girl ( 2 ) ; roll ( 3 ) AND agent ( 3 , 1 ) AND theme ( 3 , 2 )
here!
now here!
cake ( 49 ) ; Levi ( 7 ) ; Charlotte ( 15 ) ; forward ( 11 ) AND theme ( 11 , 49 ) AND recipient ( 11 , 7 ) AND agent ( 11 , 15 )
* cake ( 3 ) ; forward ( 1 ) ; Levi ( 5 ) ; Charlotte ( 7 ) AND theme ( 1 , 3 ) AND recipient ( 1 , 5 ) AND agent ( 1 , 7 )
here!
now here!
* 




TypeError: replace() argument 1 must be str, not list

In [246]:
ssamp = dataset['RECOGSgen'].head(10)

In [247]:
ssamp['prediction'] = ssamp.input.apply(
    lambda x: compiled(sentence=x))
ssamp['correct'] = ssamp.apply(
    lambda row: recogs_exact_match(row['output'], row['prediction']), axis=1)  
ssamp['correct'].sum() / ssamp.shape[0]      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ssamp['prediction'] = ssamp.input.apply(


AttributeError: 'str' object has no attribute 'logical_form'