# Create Combined Judgements

The purpose of this notebook is simply to combine an LM Judge judgement file with the matching Human Annotation judgement file (one dataset at a time).

Note that the human annotations only cover 20 of the 50 questions in each dataset.
So we restrict the LM Judge judgements to those same questions when combining the two, since it wouldn't make sense to compare LMJ and humans on different distributions of data.


I could see arguments that this or its outputs shouldn't be in the /judgements directory. It's a little messy, I admit 😄

In [2]:
import pandas as pd

In [10]:
# Input selection: exactly one (LM Judge, human annotation) pair is active at a time.
# Alternative pair (ELI5 / question answering) -- uncomment these and comment out the reads below:
# jdf = pd.read_csv("judgements_eli5_HQ_20250311_213806.csv")
# hdf = pd.read_csv("../annotation/HumanAnnotationQuestionAnswering_corrected.csv")
lm_judge_csv = "judgements_writing_prompts_HQ_20250312_113801.csv"
human_csv = "../annotation/HumanAnnotationCreativeWriting_corrected.csv"  # always use the CORRECTED export

jdf = pd.read_csv(lm_judge_csv)  # (LM) "Judge df"
hdf = pd.read_csv(human_csv)  # "Human df"

In [11]:
# INTEGRITY CHECKS
# Print-only sanity checks comparing the LM Judge frame (jdf) with the human
# annotation frame (hdf) before they are combined. Neither frame is modified here.

print(f"Length of jdf: {len(jdf)}")
print(f"Length of hdf: {len(hdf)}")

# Do the columns match (same names, in the same order)?
# Index.equals yields a single bool and copes with frames that have a different
# number of columns; `jdf.columns == hdf.columns` compares element-wise, so it
# prints an array and raises ValueError when the column counts differ.
columns_match = jdf.columns.equals(hdf.columns)
print(f"Column match: {columns_match}")
if not columns_match:
    # Show what differs so a mismatch can be diagnosed straight from the output.
    print(f"  Columns only in LM Judge df: {jdf.columns.difference(hdf.columns).tolist()}")
    print(f"  Columns only in Human df: {hdf.columns.difference(jdf.columns).tolist()}")

# What's the range of questions for each?
print("\nQuestion ID ranges:")
print(f"LM Judge df: {jdf['question_id'].min()} - {jdf['question_id'].max()}")
print(f"Human df: {hdf['question_id'].min()} - {hdf['question_id'].max()}")

# Are there any judgement_ids that appear anything other than 6 times, in either?
# (A single "6" row in each value_counts output means every judgement_id has exactly 6 rows.)
print("\nJudgement ID counts in LM Judge df:")
print(jdf.groupby('judgement_id').size().value_counts())
print("\nJudgement ID counts in Human df:")
print(hdf.groupby('judgement_id').size().value_counts())

# Is every judgement_criteria_id unique?
# dropna=False counts a missing ID as a value, matching len(Series.unique()).
print("\nJudgement criteria ID uniqueness check:")
jdf_unique_ids = jdf['judgement_criteria_id'].nunique(dropna=False)
hdf_unique_ids = hdf['judgement_criteria_id'].nunique(dropna=False)
print(f"LM Judge df unique criteria IDs: {jdf_unique_ids} ... Good? {jdf_unique_ids == len(jdf)}")
print(f"Human df unique criteria IDs: {hdf_unique_ids} ... Good? {hdf_unique_ids == len(hdf)}")

# Do both have the same criteria?
print("\nCriteria comparison:")
jdf_criteria = set(jdf['criteria'].unique())
hdf_criteria = set(hdf['criteria'].unique())
print(f"LM Judge criteria: {jdf_criteria}")
print(f"Human criteria: {hdf_criteria}")
print(f"Criteria match: {jdf_criteria == hdf_criteria}")



Length of jdf: 24300
Length of hdf: 1080
Column match: [ True  True  True  True  True  True  True  True  True  True  True]

Question ID ranges:
LM Judge df: 0 - 49
Human df: 0 - 19

Judgement ID counts in LM Judge df:
6    4050
Name: count, dtype: int64

Judgement ID counts in Human df:
6    180
Name: count, dtype: int64

Judgement criteria ID uniqueness check:
LM Judge df unique criteria IDs: 24300 ... Good? True
Human df unique criteria IDs: 1080 ... Good? True

Criteria comparison:
LM Judge criteria: {'Flow', 'Style', 'Emotion', 'Plot', 'Dialogue', 'Character'}
Human criteria: {'Flow', 'Style', 'Plot', 'Emotion', 'Dialogue', 'Character'}
Criteria match: True


In [12]:
# Stack the LM Judge rows and the human rows into one frame with a fresh 0..n-1 index.
new_df = pd.concat([jdf, hdf], ignore_index=True)
print(f"Length of new_df: {len(new_df)}")

# Keep only the rows whose question_id also occurs in the human annotations.
human_question_ids = hdf['question_id']
annotated_mask = new_df['question_id'].isin(human_question_ids)
new_df = new_df.loc[annotated_mask]
print(f"Length of new_df: {len(new_df)}")

# Report the question_id span that survived the filter.
print("\nQuestion ID range in filtered df:")
lowest_qid = new_df['question_id'].min()
highest_qid = new_df['question_id'].max()
print(f"new_df: {lowest_qid} - {highest_qid}")


Length of new_df: 25380
Length of new_df: 10800

Question ID range in filtered df:
new_df: 0 - 19


In [13]:
# Write the combined, filtered judgements to disk; index=False keeps the row index out of the CSV.
combined_csv = "judgements_writing_prompts_combined_with_humans.csv"
new_df.to_csv(combined_csv, index=False)