# OpenAI Baselines

This notebook runs the OpenAI model baselines on the Only Connect dataset.

In [None]:
# %pip install -r requirements.txt
# %pip install guidance
import getpass
import json
import os
import random
import re
import warnings
from pathlib import Path

import guidance
from datasets import load_dataset

## Setup

First, you will need to provide your OpenAI API key. Never commit a real key to version control.

In [None]:
# Never store a real key in the notebook. Reuse the key from the environment
# when it is already set; otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

Next, load a copy of the Only Connect dataset using the [HuggingFace Datasets Library](https://huggingface.co/docs/datasets/index)

In [None]:
# Each split lives in dataset/<split>.json, with its records under the
# top-level "dataset" field of the JSON file.
dataset = load_dataset(
    "json",
    data_files={split_name: f"dataset/{split_name}.json" for split_name in ("train", "validation", "test")},
    field="dataset",
)

Finally, load the helper function which will make the calls to the OpenAI API.

In [None]:
def _parse_groups(raw_text):
    """Split a task 1 completion into groups: one group per non-blank line, clues separated by commas."""
    return [[clue.strip() for clue in line.split(",")] for line in raw_text.splitlines() if line.strip()]


def _parse_connections(raw_text):
    """Return the text after "Connection:" for every non-blank line ("" when a line has no such marker)."""
    connections = []
    for line in raw_text.splitlines():
        if not line.strip():
            # Blank separator lines carry no group and would shift later connections out of position
            continue
        match = re.search(r"Connection:\s*(.*)", line)
        connections.append(match.group(1).strip() if match else "")
    return connections


def run_openai(
    dataset,
    task: str = "task1",
    model: str = "gpt-3.5-turbo",
    split: str = "test",
    num_in_context_examples: int = 3,
    dry_run: bool = False,
    seed: int = 42,
) -> list:
    """Prompt an OpenAI chat model on every wall of a dataset split and collect its predictions.

    Args:
        dataset: Mapping of split name to walls. In-context examples are sampled from
            ``dataset["train"]["groups"]``; the walls of ``dataset[split]`` are evaluated.
        task: ``"task1"`` (group the 16 clues) or ``"task2"`` (name the connection of each solved group).
        model: Model name passed to ``guidance.llms.OpenAI``.
        split: Name of the split whose walls are sent to the model.
        num_in_context_examples: Number of solved training walls included in the prompt.
        dry_run: If True, stop after the first wall.
        seed: Seed for the RNG that samples the in-context examples.

    Returns:
        One dict per processed wall with the keys ``wall_id``, ``predicted_groups`` and
        ``predicted_connections``; the entry that does not belong to ``task`` is None.

    Raises:
        ValueError: If ``task`` is neither ``"task1"`` nor ``"task2"``.
    """
    # Fail fast: an unknown task would otherwise leave `prompt` unbound and fail later with an opaque error
    if task not in ("task1", "task2"):
        raise ValueError(f"Unknown task {task!r}; expected 'task1' or 'task2'.")

    guidance.llm = guidance.llms.OpenAI(model)

    if task == "task1":
        prompt = guidance(
            """{{#system~}}You are currently competing in Round 3: Connecting Wall on the quiz show Only Connect. Your task: given 16 "clues" (words or phrases), solve the wall by grouping the clues into four groups of four. You will be given the clues as a list. You are also given examples of solved walls, which include the connections. Provide your answer as a list of four groups of four clues; separate groups by newlines and clues by commas. Do not try to guess the connection; only use the clues given and don't make up your own.

Be careful! Connecting Wall is deliberately difficult. The puzzles are designed to include red herrings and to suggest more connections than actually exist. Some clues appear to fit into more than one category. Still, there is only one perfect solution for each wall.
{{~/system}}

{{#user~}}
{{examples}}

Clues: {{#each clues}} {{this}}{{#unless @last}},{{/unless}}{{/each}}
{{~/user}}

Solved wall: 

{{#assistant~}}{{gen 'predicted_groups' temperature=0.0 max_tokens=64}}{{~/assistant}}
    """
        )
    else:
        prompt = guidance(
            """{{#system~}}You are currently competing in Round 3: Connecting Wall on the quiz show Only Connect. Your task: given 4 groups of 4 "clues" (words or phrases), determine the connection for each group. You will be given the groups as four lists of four. You are also given examples of solved walls, which include the connections. Provide your answer by repeating the four groups and adding "Connection: "{{~/system}}

{{#user~}}
{{examples}}

Groups:
{{#each groups}}{{this}}{{#unless @last}}\n{{/unless}}{{/each}}
{{~/user}}

Solved wall: 

{{#assistant~}}{{gen 'predicted_connections' temperature=0.0 max_tokens=128}}{{~/assistant}}
    """
        )

    # Set the RNG here so repeated calls to this function will return the same results
    rng = random.Random(seed)

    predictions = []

    # Create the in-context examples: one "Example N" header per sampled wall,
    # followed by one "<clues>. Connection: <connection>" line per group
    ic_examples = ""
    random_examples = rng.sample(dataset["train"]["groups"], k=num_in_context_examples)
    for i, example in enumerate(random_examples):
        ic_examples += f"Example {i+1}\n"
        for group in example.values():
            ic_examples += ", ".join(group["gt_words"]) + f". Connection: {group['gt_connection']}\n"
        ic_examples += "\n"
    ic_examples = ic_examples.strip()

    # Run the model on each wall
    for wall in dataset[split]:
        # Clues have already been shuffled, so we can take them as is
        wall_id, clues = wall["wall_id"], wall["words"]
        predicted_groups, predicted_connections = None, None
        if task == "task1":
            response = prompt(examples=ic_examples, clues=clues)
            # Try to parse the model response, but if it fails, just use a random guess
            raw_groups = None
            try:
                raw_groups = response["predicted_groups"]
                predicted_groups = _parse_groups(raw_groups)
            except Exception as error:
                # The handler must not index `response` again: the lookup itself may be what failed
                warnings.warn(
                    f"Failed to parse model response ({error!r}):\n\n{raw_groups}\n\nUsing random guess instead."
                )
                predicted_groups = [clues[i : i + 4] for i in range(0, len(clues), 4)]
        else:
            groups = [", ".join(group["gt_words"]) for group in wall["groups"].values()]
            response = prompt(examples=ic_examples, groups=groups)
            predicted_connections = _parse_connections(response["predicted_connections"])

        predictions.append(
            {
                "wall_id": wall_id,
                "predicted_groups": predicted_groups,
                "predicted_connections": predicted_connections,
            }
        )
        if dry_run:
            print("--dry-run flag passed. Exiting after one example.")
            break

    return predictions

## Task 1: Solving Walls

To run task 1 (solving the wall), run the following:

In [None]:
# dry_run=True stops after a single wall; drop it to run the full dataset
predictions = run_openai(
    dataset,
    task="task1",
    num_in_context_examples=5,
    dry_run=True,
)

To evaluate the predictions, save them to disk and run the evaluation script:

In [None]:
# ensure_ascii=False keeps non-ASCII clues verbatim, so the file encoding must be
# pinned to UTF-8 instead of relying on the platform's locale default.
Path("predictions_task1.json").write_text(
    json.dumps(predictions, ensure_ascii=False, indent=2),
    encoding="utf-8",
);

In [None]:
!python evaluate.py \
    --prediction_file "./predictions_task1.json" \
    --dataset_path "./dataset/" \
    --results_path "./results_task1.json" 

## Task 2: Making Connections

To run task 2 (predicting the connections between solved groups), run the following:

In [None]:
# dry_run=True stops after a single wall; drop it to run the full dataset
predictions = run_openai(
    dataset,
    task="task2",
    num_in_context_examples=5,
    dry_run=True,
)

We can evaluate the predictions similarly to task 1:

In [None]:
# ensure_ascii=False keeps non-ASCII text verbatim, so the file encoding must be
# pinned to UTF-8 instead of relying on the platform's locale default.
Path("predictions_task2.json").write_text(
    json.dumps(predictions, ensure_ascii=False, indent=2),
    encoding="utf-8",
);

In [None]:
!python evaluate.py \
    --prediction_file "./predictions_task2.json" \
    --dataset_path "./dataset/" \
    --results_path "./results_task2.json"