# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data 1: Model Comparison
# One entry per model, in the same order across all four lists: the two
# rates plotted in Plot 1 and the raw outcome counts plotted in Plot 2.
models_data = {
    "models": [
        "deepseek-v3.1",
        "gpt-5.1",
        "claude-opus-4.5",
        "qwen3",
    ],
    "equivalent_rates": [
        0.24923076923076923,
        0.15692307692307692,
        0.24307692307692308,
        0.10153846153846154,
    ],
    "any_implication_rates": [
        0.9538461538461539,
        0.9292307692307692,
        0.9692307692307692,
        0.6615384615384615,
    ],
    "result_count": [
        {
            "equivalent_count": 81,
            "generated_implies_reference_count": 22,
            "reference_implies_generated_count": 207,
            "no_relationship_count": 13,
            "error_count": 2,
        },
        {
            "equivalent_count": 51,
            "generated_implies_reference_count": 48,
            "reference_implies_generated_count": 203,
            "no_relationship_count": 19,
            "error_count": 4,
        },
        {
            "equivalent_count": 79,
            "generated_implies_reference_count": 29,
            "reference_implies_generated_count": 207,
            "no_relationship_count": 9,
            "error_count": 1,
        },
        {
            "equivalent_count": 33,
            "generated_implies_reference_count": 25,
            "reference_implies_generated_count": 157,
            "no_relationship_count": 12,
            "error_count": 98,
        },
    ],
}

# Expand the list of per-model count dicts into one column per outcome.
counts_df_models = pd.DataFrame(models_data["result_count"])

# Replace the nested 'result_count' column with the flat count columns.
# Both frames carry the default 0..n-1 index, so the join is row-for-row.
df_models = (
    pd.DataFrame(models_data)
    .drop(columns=['result_count'])
    .join(counts_df_models)
)

# Data 2: Deepseek Prompt Variants
# One entry per prompt variant, in the same order across all lists. Each
# row's counts sum to 325, and the rates are the corresponding counts / 325:
#   equivalent_rate      = equivalent_count / 325
#   any_implication_rate = (equivalent + generated_implies_reference
#                           + reference_implies_generated) / 325
deepseek_data = {
    "Prompt": ["iterative_basic", "iterative_critique", "iterative_refinement"],
    "equivalent_rates": [0.8276923076923077, 0.8153846153846154, 0.7353846153846154],
    # NOTE(review): the first value was previously 0.9538461538461539
    # (= 310/325), which matches no combination of this row's counts and
    # looks like it was copied from the model-comparison table. It is now
    # (269 + 24 + 24) / 325, consistent with the counts below -- confirm
    # against the raw evaluation output.
    "any_implication_rates": [0.9753846153846154, 0.9661538461538461, 0.963076923076923],
    "result_count": [
        {"equivalent_count": 269, "generated_implies_reference_count": 24, "reference_implies_generated_count": 24, "no_relationship_count": 6, "error_count": 2},
        {"equivalent_count": 265, "generated_implies_reference_count": 26, "reference_implies_generated_count": 23, "no_relationship_count": 10, "error_count": 1},
        {"equivalent_count": 239, "generated_implies_reference_count": 27, "reference_implies_generated_count": 47, "no_relationship_count": 11, "error_count": 1}
    ]
}
df_deepseek = pd.DataFrame(deepseek_data)
# Expand the per-prompt count dicts into one column per outcome category.
counts_df_deepseek = pd.DataFrame(deepseek_data["result_count"])
# Put the flat count columns next to the rates and drop the nested dict column.
df_deepseek = pd.concat([df_deepseek, counts_df_deepseek], axis=1).drop(columns=['result_count'])
# Tag every row with its model so the frame can be stacked with other models.
df_deepseek['Model'] = 'deepseek-v3-1-250821'

# Data 3: GPT-5.1 Prompt Variants
# One entry per prompt variant, in the same order across all lists: the two
# rates and the raw outcome counts for that variant.
gpt51_data = {
    "Prompt": [
        "iterative_basic",
        "iterative_critique",
        "iterative_refinement",
    ],
    "equivalent_rates": [
        0.3476923076923077,
        0.3230769230769231,
        0.24,
    ],
    "any_implication_rates": [
        0.9723076923076923,
        0.9784615384615385,
        0.9661538461538461,
    ],
    "result_count": [
        {
            "equivalent_count": 113,
            "generated_implies_reference_count": 44,
            "reference_implies_generated_count": 159,
            "no_relationship_count": 7,
            "error_count": 2,
        },
        {
            "equivalent_count": 105,
            "generated_implies_reference_count": 61,
            "reference_implies_generated_count": 152,
            "no_relationship_count": 6,
            "error_count": 1,
        },
        {
            "equivalent_count": 78,
            "generated_implies_reference_count": 43,
            "reference_implies_generated_count": 193,
            "no_relationship_count": 9,
            "error_count": 2,
        },
    ],
}

# One column per outcome category, one row per prompt variant.
counts_df_gpt51 = pd.DataFrame(gpt51_data["result_count"])

# Rates without the nested dict column ...
df_gpt51 = pd.DataFrame(gpt51_data).drop(columns=['result_count'])
# ... then the flat counts (row-aligned on the default index) and a constant
# 'Model' tag appended as the last column.
df_gpt51 = df_gpt51.join(counts_df_gpt51).assign(Model='gpt-5.1')

# Stack the per-model prompt-variant tables (Deepseek rows first, then
# GPT-5.1) into one frame with a fresh 0..n-1 index for the combined plot.
prompt_variant_frames = [df_deepseek, df_gpt51]
df_combined_prompts = pd.concat(prompt_variant_frames, ignore_index=True)

# -----------------
# Plot 1: Model Comparison - Rates
# -----------------
# Grouped bar chart with one pair of bars per model (equivalent rate on the
# left, any-implication rate on the right), saved as
# 'model_comparison_rates.png'.
models = df_models['models']
x = np.arange(len(models))  # centre of each model's pair of bars
width = 0.35  # width of a single bar
half_width = width/2  # each bar is shifted half a width off the centre

fig, ax1 = plt.subplots(figsize=(10, 6))

rects1 = ax1.bar(
    x - half_width,
    df_models['equivalent_rates'],
    width,
    label='Equivalent Rate (Dual Implication)',
)
rects2 = ax1.bar(
    x + half_width,
    df_models['any_implication_rates'],
    width,
    label='Any Implication Rate',
)

# Title and y axis; rates are fractions, so the axis is pinned to [0, 1].
ax1.set_title('Model Comparison: Equivalence and Implication Rates', fontsize=20)
ax1.set_ylabel('Rate', fontsize=18)
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', linestyle='--')

# One tick per model, labelled with the model name.
ax1.set_xticks(x)
ax1.set_xticklabels(models, rotation=45, ha="right", fontsize=16)

ax1.legend(loc='upper left', fontsize=14)

fig.tight_layout()
fig.savefig('model_comparison_rates.png')
plt.close(fig)


# -----------------
# Plot 2: Model Comparison - Counts (Stacked Bar Chart)
# -----------------
# Outcome categories in stacking order (bottom to top) with the legend label
# and colour for each. The three lists are parallel: entry i of each one
# describes the same category. Later count plots reuse these lists.
categories = ['equivalent_count', 'generated_implies_reference_count', 'reference_implies_generated_count', 'no_relationship_count', 'error_count']
labels = ['Equivalent', 'Generated $\\Rightarrow$ Reference (Pulling Back)', 'Reference $\\Rightarrow$ Generated (Pushing Out)', 'No Relationship', 'Error']
colors = ['#4daf4a', '#377eb8', '#e41a1c', '#ff7f00', '#984ea3']

fig, ax = plt.subplots(figsize=(12, 7))

# Tick positions come from this plot's own data instead of an `x` left over
# from an earlier section, so the result does not depend on what ran before.
# Bars drawn against string labels are placed at 0..n-1 (assuming the labels
# are unique), which is what arange reproduces.
tick_positions = np.arange(len(df_models))

# Running top edge of each model's stack; every layer starts where the
# previous one ended.
bottom = np.zeros(len(df_models))

for cat, label, color in zip(categories, labels, colors):
    # Plain ndarray so the in-place accumulation below stays array-on-array.
    counts = df_models[cat].to_numpy()
    ax.bar(df_models['models'], counts, bottom=bottom, label=label, color=color)
    bottom += counts

ax.set_ylabel('Count (Total $N=325$)', fontsize=18)
ax.set_title('Model Comparison: Relationship Counts', fontsize=20)
ax.set_xticks(tick_positions)
ax.set_xticklabels(df_models['models'], rotation=45, ha="right", fontsize=16)
# The legend is anchored outside the axes, to the right of the plot area.
ax.legend(title="Relationship Type", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14, title_fontsize=16)
plt.tight_layout()
plt.savefig('model_comparison_counts.png')
plt.close()

# -----------------
# Plot 3: Deepseek Prompt Variants - Rates
# -----------------
# Grouped bar chart with one pair of bars per prompt variant (equivalent rate
# on the left, any-implication rate on the right), saved as
# 'deepseek_prompts_rates.png'.
prompts = df_deepseek['Prompt']
x = np.arange(len(prompts))  # centre of each prompt's pair of bars
width = 0.35  # width of a single bar
half_width = width/2

fig, ax1 = plt.subplots(figsize=(8, 6))

rects1 = ax1.bar(
    x - half_width,
    df_deepseek['equivalent_rates'],
    width,
    label='Equivalent Rate (Dual Implication)',
)
rects2 = ax1.bar(
    x + half_width,
    df_deepseek['any_implication_rates'],
    width,
    label='Any Implication Rate',
)

# Title and y axis; rates are fractions, so the axis is pinned to [0, 1].
ax1.set_title('Deepseek Prompt Variants: Equivalence and Implication Rates', fontsize=20)
ax1.set_ylabel('Rate', fontsize=18)
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', linestyle='--')

# One tick per prompt variant, labelled horizontally.
ax1.set_xticks(x)
ax1.set_xticklabels(prompts, rotation=0, fontsize=16)

ax1.legend(loc='upper right', fontsize=14)

fig.tight_layout()
fig.savefig('deepseek_prompts_rates.png')
plt.close(fig)

# -----------------
# Plot 4: Deepseek Prompt Variants - Counts (Stacked Bar)
# -----------------
# Stacked bar per prompt variant, one layer per outcome category, saved as
# 'deepseek_prompts_counts.png'. `categories`, `labels` and `colors` are the
# parallel lists defined for the model-comparison count plot.
fig, ax = plt.subplots(figsize=(10, 7))

# Tick positions come from this plot's own data instead of an `x` left over
# from an earlier section, so the result does not depend on what ran before.
# Bars drawn against string labels are placed at 0..n-1 (assuming the labels
# are unique), which is what arange reproduces.
tick_positions = np.arange(len(df_deepseek))

# Running top edge of each prompt's stack.
bottom = np.zeros(len(df_deepseek))

for cat, label, color in zip(categories, labels, colors):
    # Plain ndarray so the in-place accumulation below stays array-on-array.
    counts = df_deepseek[cat].to_numpy()
    ax.bar(df_deepseek['Prompt'], counts, bottom=bottom, label=label, color=color)
    bottom += counts

ax.set_ylabel('Count (Total $N=325$)', fontsize=18)
ax.set_title('Deepseek Prompt Variants: Relationship Counts', fontsize=20)
ax.set_xticks(tick_positions)
ax.set_xticklabels(df_deepseek['Prompt'], rotation=45, ha="right", fontsize=16)
# The legend is anchored outside the axes, to the right of the plot area.
ax.legend(title="Relationship Type", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14, title_fontsize=16)
plt.tight_layout()
plt.savefig('deepseek_prompts_counts.png')
plt.close()

# -----------------
# Plot 5: GPT-5.1 Prompt Variants - Rates
# -----------------
# Grouped bar chart with one pair of bars per prompt variant (equivalent rate
# on the left, any-implication rate on the right), saved as
# 'gpt51_prompts_rates.png'.
prompts = df_gpt51['Prompt']
x = np.arange(len(prompts))  # centre of each prompt's pair of bars
width = 0.35  # width of a single bar
half_width = width/2

fig, ax1 = plt.subplots(figsize=(8, 6))

rects1 = ax1.bar(
    x - half_width,
    df_gpt51['equivalent_rates'],
    width,
    label='Equivalent Rate (Dual Implication)',
)
rects2 = ax1.bar(
    x + half_width,
    df_gpt51['any_implication_rates'],
    width,
    label='Any Implication Rate',
)

# Title and y axis; rates are fractions, so the axis is pinned to [0, 1].
ax1.set_title('GPT-5.1 Prompt Variants: Equivalence and Implication Rates', fontsize=20)
ax1.set_ylabel('Rate', fontsize=18)
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', linestyle='--')

# One tick per prompt variant, labelled horizontally.
ax1.set_xticks(x)
ax1.set_xticklabels(prompts, rotation=0, fontsize=16)

ax1.legend(loc='upper right', fontsize=14)

fig.tight_layout()
fig.savefig('gpt51_prompts_rates.png')
plt.close(fig)

# -----------------
# Plot 6: GPT-5.1 Prompt Variants - Counts (Stacked Bar)
# -----------------
# Stacked bar per prompt variant, one layer per outcome category, saved as
# 'gpt51_prompts_counts.png'. `categories`, `labels` and `colors` are the
# parallel lists defined for the model-comparison count plot.
fig, ax = plt.subplots(figsize=(10, 7))

# Tick positions come from this plot's own data instead of an `x` left over
# from an earlier section, so the result does not depend on what ran before.
# Bars drawn against string labels are placed at 0..n-1 (assuming the labels
# are unique), which is what arange reproduces.
tick_positions = np.arange(len(df_gpt51))

# Running top edge of each prompt's stack.
bottom = np.zeros(len(df_gpt51))

for cat, label, color in zip(categories, labels, colors):
    # Plain ndarray so the in-place accumulation below stays array-on-array.
    counts = df_gpt51[cat].to_numpy()
    ax.bar(df_gpt51['Prompt'], counts, bottom=bottom, label=label, color=color)
    bottom += counts

ax.set_ylabel('Count (Total $N=325$)', fontsize=18)
ax.set_title('GPT-5.1 Prompt Variants: Relationship Counts', fontsize=20)
ax.set_xticks(tick_positions)
ax.set_xticklabels(df_gpt51['Prompt'], rotation=45, ha="right", fontsize=16)
# The legend is anchored outside the axes, to the right of the plot area.
ax.legend(title="Relationship Type", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14, title_fontsize=16)
plt.tight_layout()
plt.savefig('gpt51_prompts_counts.png')
plt.close()

# -----------------
# Plot 7: Specific Visualization - Equivalent, Pushing Out, Pulling Back
# -----------------
# Grouped bar chart over every (model, prompt variant) row of the combined
# table, showing only the three implication counts side by side. Saved as
# 'prompt_variant_key_counts.png'.

# Parallel lists: count column, legend label and bar colour for each series.
key_counts = ['equivalent_count', 'reference_implies_generated_count', 'generated_implies_reference_count']
key_labels = ['Equivalent', 'Pushing Out (Ref $\\Rightarrow$ Gen)', 'Pulling Back (Gen $\\Rightarrow$ Ref)']
key_colors = ['#4daf4a', '#e41a1c', '#377eb8']

# Work on a copy carrying a "<model> - <prompt>" label for each row.
df_plot7 = df_combined_prompts.assign(
    Group=lambda frame: frame['Model'] + ' - ' + frame['Prompt']
)

groups = df_plot7['Group'].unique()
x = np.arange(len(groups))  # centre of each group's cluster of bars
width = 0.25  # Width of each bar

fig, ax = plt.subplots(figsize=(14, 8))

# The three series sit one bar-width left of, on, and one bar-width right of
# the group centre, hence the slot values -1, 0 and +1.
series_specs = zip(key_counts, key_labels, key_colors)
for slot, (count_col, series_label, series_color) in enumerate(series_specs, start=-1):
    ax.bar(x + slot * width, df_plot7[count_col], width, label=series_label, color=series_color)

# Title, axes and legend.
ax.set_title('Comparison of Implication Counts Across Models and Prompt Variants', fontsize=20)
ax.set_ylabel('Count (Total $N=325$)', fontsize=18)
ax.grid(axis='y', linestyle='--')
ax.set_xticks(x)
ax.set_xticklabels(groups, rotation=45, ha="right", fontsize=16)
ax.legend(title="Relationship Type", loc='upper left', fontsize=14, title_fontsize=16)

fig.tight_layout()
fig.savefig('prompt_variant_key_counts.png')
plt.close(fig)