In [19]:
import numpy as np
import pandas as pd

In [20]:
# ------------------------------------------------------------------
# 0.  Load the data
# ------------------------------------------------------------------
df = pd.read_csv("data/full_results_llm_gpt4o_new_exo.csv", sep=";", encoding="utf-8-sig")

# Data obtained from the streamlit app are in the xlsx format
# df = pd.read_excel("data/complete_analysis_results.xlsx", engine="openpyxl")

In [21]:
# -------------------------------------------------
# ) COUNT THE NUMBER OF RECORDS BY EXERCISE, MODEL, AND PROMPT
# -------------------------------------------------
# Group by 'Code', 'model', and 'prompt', and then count the occurrences
counts_df = df.groupby(['Code', 'model', 'prompt']).size().reset_index(name='count')

# Print out the counts
print(counts_df.to_string())

    Code        model     prompt  count
0      1  GPT-4o mini    correct     10
1      1  GPT-4o mini  incorrect     10
2      2  GPT-4o mini    correct     10
3      2  GPT-4o mini  incorrect     10
4      3  GPT-4o mini    correct     10
5      3  GPT-4o mini  incorrect     10
6      4  GPT-4o mini    correct     10
7      4  GPT-4o mini  incorrect     10
8      5  GPT-4o mini    correct     10
9      5  GPT-4o mini  incorrect     10
10     6  GPT-4o mini    correct     10
11     6  GPT-4o mini  incorrect     10
12     7  GPT-4o mini    correct     10
13     7  GPT-4o mini  incorrect     10
14     8  GPT-4o mini    correct     10
15     8  GPT-4o mini  incorrect     10
16     9  GPT-4o mini    correct     10
17     9  GPT-4o mini  incorrect     10
18    10  GPT-4o mini    correct     10
19    10  GPT-4o mini  incorrect     10
20    11  GPT-4o mini    correct     10
21    11  GPT-4o mini  incorrect     10
22    12  GPT-4o mini    correct     10
23    12  GPT-4o mini  incorrect     10


In [28]:
# ------------------------------------------------------------------
# 1.  Map the 0/1/2 labels to points
#     (→ change the dict if we want to treat “1” differently)
# ------------------------------------------------------------------
label_to_points = {0: 0, 1: 1, 2: 2}
df["points"] = df["ModelPrediction"].map(label_to_points)

# ------------------------------------------------------------------
# 2.  Raw grade on 20 for every (model × exercise × prompt‑type)
# ------------------------------------------------------------------
agg = (
    df
      .groupby(["model", "Code", "prompt"], as_index=False)
      .agg(raw_grade=("points", "sum"))         # 10 gens × max 2 = 0…20
)

# ------------------------------------------------------------------
# 3.  “Good‑ised” grade — so that *higher = better* for both prompts
#     Correct prompt  ➜  keep the raw score
#     Incorrect prompt➜  invert: goodised = 20 – raw
# ------------------------------------------------------------------
agg["goodised_grade"] = np.where(
    agg["prompt"] == "incorrect",
    20 - agg["raw_grade"],
    agg["raw_grade"]
)

# ------------------------------------------------------------------
# 4.  Pivot to get both prompt types side‑by‑side, then compute
#     a Balanced‑Prompt Grade (BPG) that fuses them
# ------------------------------------------------------------------
pivot = (
    agg
      .pivot_table(index=["model", "Code"],
                   columns="prompt",
                   values="raw_grade")
      .reset_index()
)

pivot["Overall"] = 0.5 * (pivot["correct"] + (20 - pivot["incorrect"]))

# ------------------------------------------------------------------
# 5.  Model‑level summary across all exercises
# ------------------------------------------------------------------
model_summary = (
    pivot
      .groupby("model", as_index=False)
      .agg(mean_Overall=("Overall", "mean"),
           good_prompt_mean=("correct", "mean"),
           bad_prompt_mean=("incorrect", "mean"))
      .sort_values("mean_Overall", ascending=False)
)

print("===== Grade per model × exercise =====")
print(pivot.head(50))  

===== Grade per model × exercise =====
prompt        model  Code  correct  incorrect  Overall
0       GPT-4o mini     1     19.0        0.0     19.5
1       GPT-4o mini     2     20.0        0.0     20.0
2       GPT-4o mini     3     20.0        0.0     20.0
3       GPT-4o mini     4     20.0        0.0     20.0
4       GPT-4o mini     5     20.0       10.0     15.0
5       GPT-4o mini     6     20.0        0.0     20.0
6       GPT-4o mini     7     19.0        3.0     18.0
7       GPT-4o mini     8     20.0        0.0     20.0
8       GPT-4o mini     9     20.0        8.0     16.0
9       GPT-4o mini    10     20.0        0.0     20.0
10      GPT-4o mini    11     20.0        0.0     20.0
11      GPT-4o mini    12     20.0        0.0     20.0
12      GPT-4o mini    13     20.0        0.0     20.0
13      GPT-4o mini    14     20.0        0.0     20.0


In [29]:
print("\n===== Model‑level summary =====")
print(model_summary)


===== Model‑level summary =====
         model  mean_Overall  good_prompt_mean  bad_prompt_mean
0  GPT-4o mini     19.178571         19.857143              1.5


In [30]:
# def label_counts(
#     data: pd.DataFrame,
#     model: str | None = None,
#     exo: int | None = None,
#     prompt: str | None = None,
# ) -> pd.Series:
#     """
#     Return the number of 0-, 1- and 2-labels in ModelPrediction
#     after optionally filtering by model, exercise (Code) and prompt type.

#     Parameters
#     ----------
#     data   : your full_results_llm DataFrame
#     model  : exact value from the 'model' column     (e.g. "GPT-4o mini")
#     exo    : exercise number from the 'Code' column  (e.g. 7)
#     prompt : "correct" or "incorrect"

#     Returns
#     -------
#     pd.Series with the counts, indexed by [0, 1, 2]
#     """
#     subset = data
#     if model is not None:
#         subset = subset[subset["model"] == model]
#     if exo is not None:
#         subset = subset[subset["Code"] == exo]
#     if prompt is not None:
#         subset = subset[subset["prompt"] == prompt]

#     # ensure all three labels appear even if their count is zero
#     return (
#         subset["ModelPrediction"]
#         .value_counts()
#         .reindex([0, 1, 2], fill_value=0)
#         .astype(int)
#         .rename("count")
#     )

# print(label_counts(df, model="GPT-4o mini", exo=1, prompt="incorrect"))

In [31]:
def rank_exercises_overall(
    pivot_df: pd.DataFrame,
    overall_col: str = "Overall",
    ascending: bool = False
) -> pd.DataFrame:
    """
    Ranks exercises (exo) by the average 'Overall' score across all models,
    and includes the per-model values for comparison.

    Parameters
    ----------
    pivot_df     : DataFrame from step 4 (must include 'model', 'Code', and 'overall_col')
    overall_col  : Name of the column representing the overall score (default = 'BPG')
    ascending    : Whether to sort from worst to best (default = False)

    Returns
    -------
    pd.DataFrame with:
    - 'exo' (exercise ID)
    - 'mean_overall'
    - one column per model (showing that model’s score on each exercise)
    """
    # Pivot: rows = Code (exo), columns = model
    wide = pivot_df.pivot(index="Code", columns="model", values=overall_col)

    # Drop the name "model" from columns if present
    wide.columns.name = None

    # Compute the mean and reformat
    wide["mean_overall"] = wide.mean(axis=1)
    wide = wide.reset_index().rename(columns={"Code": "exo"})

    # Reorder columns: exo, mean, then each model
    cols = ["exo", "mean_overall"] + [col for col in wide.columns if col not in ["exo", "mean_overall"]]
    result = wide[cols].sort_values("mean_overall", ascending=ascending).reset_index(drop=True)

    return result

exercise_ranking = rank_exercises_overall(pivot)
print(exercise_ranking)

    exo  mean_overall  GPT-4o mini
0     2          20.0         20.0
1     3          20.0         20.0
2     6          20.0         20.0
3     4          20.0         20.0
4    10          20.0         20.0
5     8          20.0         20.0
6    13          20.0         20.0
7    14          20.0         20.0
8    12          20.0         20.0
9    11          20.0         20.0
10    1          19.5         19.5
11    7          18.0         18.0
12    9          16.0         16.0
13    5          15.0         15.0


In [32]:
def rank_exercises_by_bpg(
    pivot_df: pd.DataFrame,
    model: str,
    ascending: bool = False    # False = best→worst, True = worst→best
) -> pd.DataFrame:
    """
    Return the pivot rows for one model, sorted by the Balanced-Prompt Grade.

    Parameters
    ----------
    pivot_df  : the dataframe produced in step 4 (one row = model × exercise)
    model     : exact model name as it appears in the 'model' column
    ascending : set to True if you want worst → best instead

    Returns
    -------
    pd.DataFrame with columns ['Code', 'correct', 'incorrect', 'BPG'], sorted.
    """
    df = pivot_df.copy()
    df.columns.name = None  # clear the annoying "prompt" column label if it exists
    return (
        df
        .loc[df["model"] == model, ["Code", "correct", "incorrect", "Overall"]]
        .rename(columns={"Code": "exo"})
        .sort_values("Overall", ascending=ascending)
        .reset_index(drop=True)
    )

In [33]:
gpt4o_ranking = rank_exercises_by_bpg(pivot, model="GPT-4o mini")
print("\n===== GPT-4o mini ranking =====")
print(gpt4o_ranking)


===== GPT-4o mini ranking =====
    exo  correct  incorrect  Overall
0     2     20.0        0.0     20.0
1     3     20.0        0.0     20.0
2     6     20.0        0.0     20.0
3     4     20.0        0.0     20.0
4    10     20.0        0.0     20.0
5     8     20.0        0.0     20.0
6    13     20.0        0.0     20.0
7    14     20.0        0.0     20.0
8    12     20.0        0.0     20.0
9    11     20.0        0.0     20.0
10    1     19.0        0.0     19.5
11    7     19.0        3.0     18.0
12    9     20.0        8.0     16.0
13    5     20.0       10.0     15.0


In [10]:
mistrals3_ranking = rank_exercises_by_bpg(pivot, model="Mistral Small 3")
print("\n===== Mistral Small 3 ranking =====")
print(mistrals3_ranking)


===== Mistral Small 3 ranking =====
Empty DataFrame
Columns: [exo, correct, incorrect, Overall]
Index: []


In [11]:
llama3_ranking = rank_exercises_by_bpg(pivot, model="Llama 3.3 70B")
print("\n===== Llama 3.3 70B ranking =====")
print(llama3_ranking)


===== Llama 3.3 70B ranking =====
Empty DataFrame
Columns: [exo, correct, incorrect, Overall]
Index: []


In [20]:
# -------------------------------------------------
# 6) COMPUTE THE LENGTH OF 'réponse_llm' AND GROUP STATS
# -------------------------------------------------
# Add a new column 'llm_length' that is the character count of 'réponse_llm'
df['llm_length'] = df['réponse_llm'].apply(len)

# Group the data by model, prompt type, and exercise code, then compute mean and std deviation 
grouped_stats = df.groupby(['model', 'prompt'])['llm_length'].agg(['mean', 'std']).reset_index()

# Print out the grouped statistics
print(grouped_stats)

             model     prompt         mean         std
0      GPT-4o mini    correct   868.341667  370.700098
1      GPT-4o mini  incorrect  1382.825000  415.676803
2    Llama 3.3 70B    correct  1227.375000  313.801791
3    Llama 3.3 70B  incorrect  1386.008333  329.533373
4  Mistral Small 3    correct  1408.558333  717.093817
5  Mistral Small 3  incorrect  1885.200000  407.792079


In [None]:
# # Function to compute similarity ratio between two strings.
# def similarity_score(a, b):
#     if pd.isna(a) or pd.isna(b):
#         return 0.0
#     return SequenceMatcher(None, a.strip().lower(), b.strip().lower()).ratio()

# # Function to compute mean and standard deviation for a list of responses.
# def compute_similarity_stats(responses):
#     # Return NaNs if fewer than two responses exist.
#     if len(responses) < 2:
#         return {'mean_similarity': np.nan, 'std_similarity': np.nan}
    
#     # Calculate similarity scores for every unique pair.
#     scores = [similarity_score(a, b) for a, b in itertools.combinations(responses, 2)]
    
#     return {'mean_similarity': np.mean(scores), 'std_similarity': np.std(scores)}

# # Group the DataFrame by Code, model, and prompt.
# similarity_stats = df.groupby(['Code', 'model', 'prompt'])['réponse_llm'].agg(
#     mean_similarity=lambda x: compute_similarity_stats(list(x))['mean_similarity'],
#     std_similarity=lambda x: compute_similarity_stats(list(x))['std_similarity']
# ).reset_index()

# # Print the similarity statistics DataFrame.
# print(similarity_stats.to_string())

    Code            model     prompt  mean_similarity  std_similarity
0      1      GPT-4o mini    correct         0.489160        0.223464
1      1      GPT-4o mini  incorrect         0.705769        0.192325
2      1    Llama 3.3 70B    correct         0.337619        0.246238
3      1    Llama 3.3 70B  incorrect         0.560682        0.330597
4      1  Mistral Small 3    correct         0.913635        0.112648
5      1  Mistral Small 3  incorrect         0.921882        0.157887
6      2      GPT-4o mini    correct         0.954432        0.063967
7      2      GPT-4o mini  incorrect         0.639566        0.224334
8      2    Llama 3.3 70B    correct         0.279527        0.180718
9      2    Llama 3.3 70B  incorrect         0.416526        0.280307
10     2  Mistral Small 3    correct         0.725395        0.351781
11     2  Mistral Small 3  incorrect         0.743388        0.320780
12     3      GPT-4o mini    correct         0.341372        0.229097
13     3      GPT-4o

In [None]:
# # ------------------------------------------------------------
# # Compute the overall mean (mean of means) by model.
# # ------------------------------------------------------------
# model_mean_similarity = similarity_stats.groupby(['model', 'prompt'])['mean_similarity'].mean().reset_index(name='consistency')

# # Print the DataFrame with the model-level mean similarity.
# print(model_mean_similarity.to_string())

             model     prompt  consistency
0      GPT-4o mini    correct     0.552707
1      GPT-4o mini  incorrect     0.492043
2    Llama 3.3 70B    correct     0.487499
3    Llama 3.3 70B  incorrect     0.468484
4  Mistral Small 3    correct     0.919027
5  Mistral Small 3  incorrect     0.804506
