In [47]:
import datasets
import bounded_subprocess # Available with pip3 install bounded_subprocess
import pandas as pd
import tempfile
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Prompt Ambiguity

This notebook tests the hypothesis that some prompts in StudentEval are ambiguous: they
generate not just the wrong function, but *several plausible functions*.
In general, determining whether two functions are different is undecidable. However,
we can approximate this by running each function on the problem's test inputs and comparing
what it prints. If two functions print different output, they are definitely different. However,
if they print the same output, they are not necessarily the same. Thus this method underestimates
the ambiguity of a prompt.

In [76]:
# Development knobs for faster iteration. Set either one to None to disable it.
# COMPLETION_LIMIT: maximum number of completions that are run for each prompt.
COMPLETION_LIMIT = None
# PROBLEM_LIMIT: maximum number of prompts whose results are collected.
PROBLEM_LIMIT = None

In [30]:
# Load the StudentEval test split and keep only the prompt, its `prints`
# column, and the four subset-membership flags.
studenteval = (
    datasets.load_dataset("nuprl/StudentEval", split="test")
    .to_pandas()
    .loc[:, [
        "prompt",
        "prints",
        "is_first_success",
        "is_first_failure",
        "is_last_success",
        "is_last_failure",
    ]]
)
studenteval.head()

Found cached dataset parquet (/home/arjun/.cache/huggingface/datasets/nuprl___parquet/nuprl--StudentEval-d3c968fb04673613/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Unnamed: 0,prompt,prints,is_first_success,is_first_failure,is_last_success,is_last_failure
0,"def add_up(arr):\n """"""\n takes a list of...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",False,True,False,False
1,"def add_up(arr):\n """"""\n takes a list of...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",False,False,False,False
2,"def add_up(arr):\n """"""\n takes a list of...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",False,False,False,True
3,"def add_up(arr):\n """"""\n Input: list of ...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",False,True,False,False
4,"def add_up(arr):\n """"""\n Input: list of ...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",False,False,False,False


To do this analysis, we need to pick a model to use for completions. We choose StarCoderBase,
which is the best-performing pretrained model.

In [33]:
# Read the JSON-lines file of completions, then gather the values that share a
# (prompt, problem) pair into a single list-valued row.
completions = (
    pd.read_json("../computed_data/completions_starcoderbase.jsonl", lines=True)
    .groupby(["prompt", "problem"])
    .agg(list)
    .reset_index()
)
print(len(completions))
completions.head()

1897


Unnamed: 0,prompt,problem,completion
0,"def add_int(lst, num):\n """"""\n Append th...",add_int,"[\n if isinstance(lst, list):\n for ..."
1,"def add_int(lst, num):\n """"""\n Create a ...",add_int,[\n A = []\n for i in lst:\n if t...
2,"def add_int(lst, num):\n """"""\n For each ...",add_int,[\n for i in range(len(lst)):\n if i...
3,"def add_int(lst, num):\n """"""\n Given a l...",add_int,[\n for i in range(len(lst)):\n if i...
4,"def add_int(lst, num):\n """"""\n Given a l...",add_int,"[\n for i in lst:\n if isinstance(i,..."


Select the completions for prompts that are in the four main subsets of StudentEval.
Moreover, merge them with the StudentEval dataset so that we can get their test cases.

In [50]:
# Attach the StudentEval columns to each prompt's completions, then keep only
# the rows that belong to at least one of the four subsets.
completions_with_prints = completions.merge(studenteval, on="prompt")
in_main_subset = completions_with_prints[
    ["is_first_failure", "is_last_failure", "is_first_success", "is_last_success"]
].any(axis=1)
completions_with_prints = completions_with_prints[in_main_subset]
print(len(completions_with_prints))
completions_with_prints.head()

1027


Unnamed: 0,prompt,problem,completion,prints,is_first_success,is_first_failure,is_last_success,is_last_failure
0,"def add_int(lst, num):\n """"""\n Append th...",add_int,"[\n if isinstance(lst, list):\n for ...","print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False
1,"def add_int(lst, num):\n """"""\n Create a ...",add_int,[\n A = []\n for i in lst:\n if t...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False
2,"def add_int(lst, num):\n """"""\n For each ...",add_int,[\n for i in range(len(lst)):\n if i...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False
3,"def add_int(lst, num):\n """"""\n Given a l...",add_int,[\n for i in range(len(lst)):\n if i...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",False,True,False,False
4,"def add_int(lst, num):\n """"""\n Given a l...",add_int,"[\n for i in lst:\n if isinstance(i,...","print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",False,False,True,False


For each prompt, run the list of completions and count the number of different results
that they produce.

In [77]:
def run_program(code):
    """Run ``code`` as a standalone Python script and summarize the outcome.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` via ``bounded_subprocess.run`` with ``timeout_seconds=5``.

    Args:
        code: Full text of the Python program to execute.

    Returns:
        ``"timeout"`` if the run is reported as timed out, ``"error"`` if the
        exit code is non-zero, and otherwise the captured standard output.
    """
    # Python reads source files as UTF-8 by default, so write the file as UTF-8
    # instead of the locale's encoding. Otherwise a completion containing
    # non-ASCII text can fail to be written or be decoded incorrectly.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", encoding="utf-8") as f:
        f.write(code)
        # Flush so the child process sees the whole program. The file has to
        # stay open here because it is removed when the `with` block exits.
        f.flush()
        result = bounded_subprocess.run(["python3", f.name], timeout_seconds=5)
    if result.timeout:
        return "timeout"
    if result.exit_code != 0:
        return "error"
    return result.stdout

def run_completions(row):
    """Count the distinct outcomes produced by one prompt's completions.

    Each completion is appended to the prompt, followed by a blank line and the
    row's ``prints`` text, and the resulting program is passed to
    ``run_program``. At most ``COMPLETION_LIMIT`` completions are run when that
    limit is not ``None``.

    Returns:
        A dict with the row's ``"prompt"`` and ``"num_functions"``, the number
        of distinct values returned by ``run_program``.
    """
    distinct_outcomes = set()
    for index, body in enumerate(row["completion"]):
        limit_reached = COMPLETION_LIMIT is not None and index == COMPLETION_LIMIT
        if limit_reached:
            break
        program = row["prompt"] + body + "\n\n" + row["prints"]
        outcome = run_program(program)
        distinct_outcomes.add(outcome)
    return {"prompt": row["prompt"], "num_functions": len(distinct_outcomes)}

# Apply PROBLEM_LIMIT *before* submitting any work. `executor.map` schedules
# every task up front, so breaking out of the result loop would not stop the
# remaining prompts from running: the `with` block still waits for all of them.
if PROBLEM_LIMIT is None:
    problems_to_run = completions_with_prints
else:
    problems_to_run = completions_with_prints.iloc[:PROBLEM_LIMIT]

with ThreadPoolExecutor(max_workers=50) as executor:
    # `executor.map` yields results in input order, so `rows` lines up with
    # the rows of `problems_to_run`.
    rows = list(tqdm(
        executor.map(run_completions, (row for _, row in problems_to_run.iterrows())),
        total=len(problems_to_run)))

100%|██████████| 1027/1027 [00:46<00:00, 22.32it/s]


Merge the prompts with the counts produced in the previous step.

In [66]:
def factor(row):
    """Return the label of the first subset flag that is set on ``row``.

    The flags are checked in a fixed order: first success, first failure,
    last success, last failure. ``"unknown"`` is returned when none is set.
    """
    labels_in_priority_order = (
        ("is_first_success", "success (first attempt)"),
        ("is_first_failure", "failure (first attempt)"),
        ("is_last_success", "success (last attempt)"),
        ("is_last_failure", "failure (last attempt)"),
    )
    for flag, label in labels_in_priority_order:
        if row[flag]:
            return label
    return "unknown"


# `rows` holds one record per input row. If a prompt occurs more than once, a
# plain merge on "prompt" becomes many-to-many and multiplies those rows.
# Keeping a single count per prompt makes the merge many-to-one instead; this
# changes nothing when every prompt is already unique.
function_counts = pd.DataFrame(rows).drop_duplicates(subset="prompt")
num_functions = completions_with_prints.merge(function_counts, on="prompt")
# Label each row with the subset it belongs to.
num_functions["factor"] = num_functions.apply(factor, axis=1)
num_functions.head()

Unnamed: 0,prompt,problem,completion,prints,is_first_success,is_first_failure,is_last_success,is_last_failure,num_functions,factor
0,"def add_int(lst, num):\n """"""\n Append th...",add_int,"[\n if isinstance(lst, list):\n for ...","print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False,3,success (first attempt)
1,"def add_int(lst, num):\n """"""\n Create a ...",add_int,[\n A = []\n for i in lst:\n if t...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False,2,success (first attempt)
2,"def add_int(lst, num):\n """"""\n For each ...",add_int,[\n for i in range(len(lst)):\n if i...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",True,False,False,False,3,success (first attempt)
3,"def add_int(lst, num):\n """"""\n Given a l...",add_int,[\n for i in range(len(lst)):\n if i...,"print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",False,True,False,False,2,failure (first attempt)
4,"def add_int(lst, num):\n """"""\n Given a l...",add_int,"[\n for i in lst:\n if isinstance(i,...","print(add_int(['hello', 'aaa', 'bbb', 'ccc', 1...",False,False,True,False,5,success (last attempt)


Summarize the results for the paper.

In [97]:
def _describe_counts(x):
    """Format one row of statistics as "mean (median) ± stddev"."""
    return f'{x["mean"]:.2f} ({x["median"]:.2f}) ± {x["stddev"]:.2f}'


# Mean, median and standard deviation of `num_functions` per subset, reduced
# to a two-column table: the subset name and the formatted statistics.
num_functions_agg = (
    num_functions
    .groupby("factor")
    .agg(
        mean=("num_functions", "mean"),
        median=("num_functions", "median"),
        stddev=("num_functions", "std"),
    )
    .reset_index()
    .assign(**{"#Functions": lambda stats: stats.apply(_describe_counts, axis=1)})
    .rename(columns={"factor": "Subset"})
    .loc[:, ["Subset", "#Functions"]]
)
num_functions_agg

Unnamed: 0,Subset,#Functions
0,failure (first attempt),2.21 (2.00) ± 1.55
1,failure (last attempt),2.41 (2.00) ± 1.62
2,success (first attempt),1.85 (1.00) ± 1.32
3,success (last attempt),2.19 (2.00) ± 1.32


In [98]:
# Escape LaTeX special characters. The "#Functions" header contains "#", which
# LaTeX rejects in running text unless it is written as "\#".
print(num_functions_agg.to_latex(index=False, escape=True))

\begin{tabular}{ll}
\toprule
Subset & #Functions \\
\midrule
failure (first attempt) & 2.21 (2.00) ± 1.55 \\
failure (last attempt) & 2.41 (2.00) ± 1.62 \\
success (first attempt) & 1.85 (1.00) ± 1.32 \\
success (last attempt) & 2.19 (2.00) ± 1.32 \\
\bottomrule
\end{tabular}



This is an example of a prompt that was successful but very ambiguous.

In [95]:
# Take the first first-attempt-success row whose completions produced seven
# distinct results, and print each completion appended to the prompt.
has_seven_results = num_functions["num_functions"] == 7
examples = num_functions[has_seven_results & num_functions["is_first_success"]].iloc[0]
prompt_text = examples["prompt"].rstrip()
for body in examples["completion"]:
    print(prompt_text + body, "*" * 10, sep="\n")

def altText(s):
    """
    The function takes a string of text as an input. For words in the string with an odd number of letters, every other letter is capitalized starting with the first letter. For words in the string with an even number of letters, every other letter is capitalized starting with the second letter. 
    """
    # split the string into a list of words
    words = s.split()
    # create a new list of words with the alternate letters capitalized
    new_words = []
    for word in words:
        # create a new string with the first letter capitalized
        new_word = word[0].upper()
        # loop through the letters in the word
        for letter in word[1:]:
            # if the letter is an odd number, capitalize it
            if len(letter) % 2!= 0:
                new_word += letter.upper()
            # if the letter is an even number, leave it lowercase
            else:
                new_word += letter
        # add the new word to the new list of words
  