In [7]:
import pandas as pd
import numpy as np

# Load the per-model submission files that take part in the majority vote.
# NOTE(review): the variable names and the "gpt" / "qwen_7b" column labels are
# legacy labels that the rest of this cell relies on. Judging only by the file
# names, the "gpt" slot holds a Qwen 14B run ("300-samples") and the "qwen_7b"
# slot holds another Qwen 14B run ("4k") — confirm which models produced them.
gpt_df = pd.read_csv("output/submission-qwen-14b-300-samples.csv")
qwen_32b = pd.read_csv("output/submission-qwen-coder-32B.csv")
qwen_7b = pd.read_csv("output/submission-qwen-14b-4k.csv")


def _align_to_reference(frame, reference_ids):
    """Return `frame` with its rows ordered like `reference_ids`.

    The votes are combined row by row, so every submission must list the tasks
    in the same order. A frame whose "task_id" column already equals
    `reference_ids` is returned unchanged; otherwise it is re-ordered by
    task id. Tasks missing from `frame` come back as NaN rows, and duplicate
    task ids in `frame` make `reindex` raise instead of silently mixing votes.
    """
    if frame["task_id"].equals(reference_ids):
        return frame
    return (
        frame.set_index("task_id")
        .reindex(pd.Index(reference_ids, name="task_id"))
        .reset_index()
    )


# The first file defines the task order used for the ensemble output.
qwen_32b = _align_to_reference(qwen_32b, gpt_df["task_id"])
qwen_7b = _align_to_reference(qwen_7b, gpt_df["task_id"])

# Create a dataframe for ensemble results
ensemble_results = pd.DataFrame()
ensemble_results["task_id"] = gpt_df["task_id"]

# Combine answers from all models: one column per model, one row per task
answers = pd.DataFrame(
    {
        "gpt": gpt_df["answer"],
        "qwen_32b": qwen_32b["answer"],
        "qwen_7b": qwen_7b["answer"],
    }
)


# Get the most common answer for each row
def get_majority_vote(row, fallback="qwen_7b"):
    """Return the answer that most models agree on for a single task.

    Args:
        row: pandas Series mapping model label -> answer for one task, as
            passed by ``DataFrame.apply(..., axis=1)``.
        fallback: label of the model whose answer breaks ties. Defaults to
            "qwen_7b", the column this function has always returned when all
            three models disagree.

    Returns:
        The answer with strictly the most votes. On a tie, the fallback
        model's answer if it is one of the tied answers, otherwise the first
        tied answer in column order. NaN when every model's answer is missing.
    """
    # Missing answers are not votes; np.unique would also fail to sort a row
    # that mixes strings with NaN.
    votes = row.dropna()
    if votes.empty:
        return np.nan

    tally = votes.value_counts()
    leaders = tally[tally == tally.max()].index
    if len(leaders) == 1:  # A single answer has the most votes
        return leaders[0]

    # Tie (e.g. all three models disagree): prefer the fallback model's answer
    if fallback in votes.index and votes[fallback] in leaders:
        return votes[fallback]
    return votes[votes.isin(leaders)].iloc[0]


# Vote row by row: each row of `answers` holds one task's per-model answers
ensemble_results["answer"] = answers.apply(get_majority_vote, axis="columns")

# Side-by-side view of every model's answer next to the ensemble's pick.
# Maps the comparison column name -> source column in `answers`.
model_columns = {
    "gpt_answer": "gpt",
    "qwen_32b_answer": "qwen_32b",
    "qwen_7b_answer": "qwen_7b",
}
comparison_data = {"task_id": ensemble_results["task_id"]}
for output_name, source_name in model_columns.items():
    comparison_data[output_name] = answers[source_name]
comparison_data["ensemble_answer"] = ensemble_results["answer"]
# True wherever the ensemble's answer is not the qwen_32b answer
comparison_data["changed"] = ensemble_results["answer"] != qwen_32b["answer"]
comparison = pd.DataFrame(comparison_data)

# Report how often the ensemble departs from the qwen_32b answers
changed_rows = comparison.loc[comparison["changed"]]
print(
    f"Ensemble changed {len(changed_rows)} out of {len(comparison)} answers ({len(changed_rows) / len(comparison):.2%}) from qwen_32b baseline"
)
print("\nRows where ensemble differs from qwen_32b baseline:")
display(changed_rows)

# Persist the ensemble submission (task_id + answer, no index column)
ensemble_results.to_csv("output/submission-ensemble.csv", index=False)

Ensemble changed 249 out of 1253 answers (19.87%) from qwen_32b baseline

Rows where ensemble differs from qwen_32b baseline:


Unnamed: 0,task_id,gpt_answer,qwen_32b_answer,qwen_7b_answer,ensemble_answer,changed
3,k10206,C,D,B,B,True
4,k10215,D,A,D,D,True
8,k10248,A,D,A,A,True
11,k10264,A,E,A,A,True
13,k10272,D,C,D,D,True
...,...,...,...,...,...,...
1131,k00116,C,B,C,C,True
1176,k00351,A,D,A,A,True
1205,k00485,B,A,B,B,True
1251,k00699,C,D,C,C,True
