# Dataset

This notebook contains the basic analysis that we report in the dataset section.

In [1]:
import pandas as pd
import datasets

In [5]:
# Display labels for the factor values, used when building report tables.
FRIENDLY_FACTORS = dict(
    first_success="First Success",
    first_failure="First Failure",
    last_success="Last Success",
    last_failure="Last Failure",
)

In [3]:
# Load the "test" split of nuprl/StudentEval through the `datasets` library
# and convert it to a pandas DataFrame for the analysis below.
df = datasets.load_dataset(
    path="nuprl/StudentEval",
    split="test",
).to_pandas()

# There are boolean-valued columns is_first_success, is_first_failure,
# is_last_success, and is_last_failure that are mutually exclusive. Add a
# factor column that indicates which of these is true.
def add_factor(row):
    """Return the factor label for one row.

    The ``is_<label>`` flags are checked in a fixed priority order and the
    label of the first truthy flag is returned; ``"other"`` is returned when
    none of them is set.
    """
    priority = ("first_success", "first_failure", "last_success", "last_failure")
    for label in priority:
        if row[f"is_{label}"]:
            return label
    return "other"

# Classify every row (row-wise apply), then store the labels as a new column.
row_factors = df.apply(add_factor, axis="columns")
df["factor"] = row_factors

# Preview the first five rows, including the new factor column.
df.head(5)

Found cached dataset parquet (/home/arjun/.cache/huggingface/datasets/nuprl___parquet/nuprl--StudentEval-d3c968fb04673613/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Unnamed: 0,problem,entrypoint,assertions,prints,username,submitted_text,tests_passed,total_tests,prompt,completion,first_attempt,last_attempt,is_success,is_first_success,is_last_success,is_first_failure,is_last_failure,factor
0,add_up,add_up,"assert add_up([ [ 'one', 2, 3, 4 ] ]) == 9\nas...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",student1,"takes a list of strings, integers, and floats ...",1,3,"def add_up(arr):\n """"""\n takes a list of...","return sum([x for x in arr if isinstance(x, (i...",True,False,False,False,False,True,False,first_failure
1,add_up,add_up,"assert add_up([ [ 'one', 2, 3, 4 ] ]) == 9\nas...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",student1,"takes a list of strings, integers, and floats ...",0,3,"def add_up(arr):\n """"""\n takes a list of...",return sum(arr)\n,False,False,False,False,False,False,False,other
2,add_up,add_up,"assert add_up([ [ 'one', 2, 3, 4 ] ]) == 9\nas...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",student1,"takes a list of strings, integers, and floats ...",1,3,"def add_up(arr):\n """"""\n takes a list of...",sum = 0\n for i in arr:\n if type(i)...,False,True,False,False,False,False,True,last_failure
3,add_up,add_up,"assert add_up([ [ 'one', 2, 3, 4 ] ]) == 9\nas...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",student6,"Input: list of items, that may be a combinatio...",1,3,"def add_up(arr):\n """"""\n Input: list of ...",total = 0\n for item in arr:\n if ty...,True,False,False,False,False,True,False,first_failure
4,add_up,add_up,"assert add_up([ [ 'one', 2, 3, 4 ] ]) == 9\nas...","print(add_up([ [ 'one', 2, 3, 4 ] ]))\nprint(a...",student6,"Input: list of items, that may be a combinatio...",1,3,"def add_up(arr):\n """"""\n Input: list of ...",new_list = []\n for item in arr:\n i...,False,False,False,False,False,False,False,other


In [6]:
def word_count(df, factor_labels=None):
    """Summarize prompt length, in words, for each factor subset.

    Rows whose ``factor`` is ``"other"`` are dropped. The remaining rows are
    grouped by their display label, and the number of whitespace-separated
    words in ``submitted_text`` is aggregated per group. The input frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``factor`` and ``submitted_text``.
    factor_labels : mapping, optional
        Maps raw factor values to display labels. Defaults to the
        notebook-level ``FRIENDLY_FACTORS``. Factor values missing from the
        mapping keep their raw name.

    Returns
    -------
    pandas.DataFrame
        One row per subset, ordered by subset label, with the columns
        ``Subset``, ``mean``, ``median``, ``stddev``, ``Items`` and
        ``Word Count``. The last column is the string
        ``"mean (median) ± stddev"`` with two decimals each.
    """
    if factor_labels is None:
        # Looked up at call time so the notebook-level mapping is the default.
        factor_labels = FRIENDLY_FACTORS

    # Keep only the labelled attempts and the two columns this summary needs.
    labelled = df.loc[df["factor"] != "other", ["factor", "submitted_text"]]

    summary = (
        labelled.assign(
            Subset=labelled["factor"].replace(factor_labels),
            words=labelled["submitted_text"].str.split().str.len(),
        )
        .groupby("Subset")
        # Named aggregation yields the final column names directly, so no
        # positional renaming of the columns is needed afterwards.
        .agg(
            mean=("words", "mean"),
            median=("words", "median"),
            stddev=("words", "std"),
            Items=("words", "count"),
        )
        .reset_index()
    )

    # Human-readable cell for the report table: mean (median) ± std.
    summary["Word Count"] = [
        f"{mean:.2f} ({median:.2f}) ± {stddev:.2f}"
        for mean, median, stddev in zip(
            summary["mean"], summary["median"], summary["stddev"]
        )
    ]
    return summary

word_count_df = word_count(df)

# Emit the report table as LaTeX, restricted to the columns it shows.
latex_columns = ["Subset", "Items", "Word Count"]
print(word_count_df.loc[:, latex_columns].to_latex(index=False))

# Show the full summary, including the raw statistics, as the cell output.
word_count_df

\begin{tabular}{lrl}
\toprule
Subset & Items & Word Count \\
\midrule
First Failure & 450 & 28.75 (25.50) ± 16.67 \\
First Success & 187 & 28.79 (25.00) ± 17.38 \\
Last Failure & 205 & 35.94 (30.00) ± 22.64 \\
Last Success & 185 & 37.84 (35.00) ± 18.38 \\
\bottomrule
\end{tabular}



Unnamed: 0,Subset,mean,median,stddev,Items,Word Count
0,First Failure,28.753333,25.5,16.674375,450,28.75 (25.50) ± 16.67
1,First Success,28.786096,25.0,17.376205,187,28.79 (25.00) ± 17.38
2,Last Failure,35.936585,30.0,22.637183,205,35.94 (30.00) ± 22.64
3,Last Success,37.843243,35.0,18.376564,185,37.84 (35.00) ± 18.38


In [7]:
# Headline dataset sizes: distinct problems and total rows (one per prompt).
problem_names = df["problem"].unique()
print("Number of unique problems:", len(problem_names))
print("Total number of prompts:", df.shape[0])

Number of unique problems: 48
Total number of prompts: 1749
