In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Best score per model family and grid size.
# Values are stored as fractions and multiplied by 100 when plotted.
# 'WCR' feeds the "Correctly Filled Words (%)" panel and 'ICR' feeds the
# "Consistency with Crossing Letters (%)" panel.

# Reasoning LLMs
best_reasoning_LLM_7x7 = dict(WCR=0.646, ICR=0.891)
best_reasoning_LLM_14x14 = dict(WCR=0.492, ICR=0.512)

# Non-reasoning LLMs
best_non_reasoning_LLM_7x7 = dict(WCR=0.482, ICR=0.472)
best_non_reasoning_LLM_14x14 = dict(WCR=0.446, ICR=0.321)

# LVLMs
best_LVLM_7x7 = dict(WCR=0.479, ICR=0.336)
best_LVLM_14x14 = dict(WCR=0.416, ICR=0.272)

# Label lists for the figure.
metrics = ["WCR", "ICR"]  # score keys, one per subplot
models = ["Reasoning LLM", "Non-reasoning LLM", "LVLM"]  # model families
grid_sizes = ["7x7", "14x14"]  # x-axis categories shared by every bar trace

# One row with two side-by-side panels: WCR on the left, ICR on the right.
# The titles use inline <b> tags so they render in bold.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        '<b>Correctly Filled Words (%)</b>',
        '<b>Consistency with Crossing Letters (%)</b>',
    ),
    horizontal_spacing=0.1,
)

# Bar colour for each model family; the same colour is used in both panels.
model_colors = {
    "Reasoning LLM": "#f9bdb6",      # light salmon pink
    "Non-reasoning LLM": "#b7daf5",  # light blue
    "LVLM": "#74a892",               # muted green
}

def _add_metric_bars(metric, col, in_legend):
    """Add one bar trace per model family for *metric* to subplot column *col*.

    Each trace has two bars (one per entry of ``grid_sizes``) showing the
    7x7 and 14x14 scores converted from fractions to percentages.

    Args:
        metric: Key into the score dicts ('WCR' or 'ICR').
        col: 1-based subplot column that receives the traces.
        in_legend: When False the traces are added with ``showlegend=False``
            so each model family appears only once in the shared legend.
    """
    # (family label, 7x7 scores, 14x14 scores); order fixes the bar order.
    series = (
        ('Reasoning LLM', best_reasoning_LLM_7x7, best_reasoning_LLM_14x14),
        ('Non-reasoning LLM', best_non_reasoning_LLM_7x7, best_non_reasoning_LLM_14x14),
        ('LVLM', best_LVLM_7x7, best_LVLM_14x14),
    )
    for family, small_grid, large_grid in series:
        bar_kwargs = {
            'x': grid_sizes,
            'y': [small_grid[metric]*100, large_grid[metric]*100],
            'name': f'Best {family}',
            'marker_color': model_colors[family],
        }
        if not in_legend:
            # Only set the flag when hiding, leaving it unset otherwise.
            bar_kwargs['showlegend'] = False
        fig.add_trace(go.Bar(**bar_kwargs), row=1, col=col)


# Left panel: WCR bars, which also provide the legend entries.
_add_metric_bars('WCR', col=1, in_legend=True)
# Right panel: ICR bars, hidden from the legend to avoid duplicate entries.
_add_metric_bars('ICR', col=2, in_legend=False)

# Figure-wide layout: grouped bars on a white canvas, a horizontal legend
# anchored below the plotting area, and Palatino as the base font.
fig.update_layout(
    barmode="group",
    bargap=0.15,
    bargroupgap=0.1,
    plot_bgcolor="white",
    paper_bgcolor="white",
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.4,  # negative value places the legend under the panels
        "xanchor": "center",
        "x": 0.5,
        "bgcolor": "rgba(0,0,0,0)",  # fully transparent background
        "borderwidth": 0,
        "font": {
            "family": "Palatino, serif",
            "size": 16,
            "color": "black",
        },
    },
    margin={"l": 20, "r": 20, "t": 30, "b": 0},
    height=500,
    width=800,
    font={"family": "Palatino, serif"},
    # NOTE: title_font applies to the figure-level title, not to the subplot
    # titles. The subplot titles are layout annotations and are styled in the
    # loop over fig.layout.annotations further down.
    title_font={"family": "Palatino, serif"},
)

# x axes (both panels): no axis title, and the grid-size categories are
# relabelled with <b> tags so the tick labels render in bold.
fig.update_xaxes(
    title_text="",
    showticklabels=True,
    tickvals=grid_sizes,
    ticktext=["<b>7x7</b>", "<b>14x14</b>"],
    tickfont={
        "family": "Palatino, serif",
        "size": 18,
        "color": "black",
    },
)

# y axes (both panels): no axis title, fixed 0-100 range so the two
# percentage panels share the same scale.
fig.update_yaxes(
    title_text="",
    showticklabels=True,
    range=[0, 100],
    tickfont={"family": "Palatino, serif"},
)

# Restyle every layout annotation (the subplot titles created by
# make_subplots) and nudge each one upwards by 0.05.
for annotation in fig.layout.annotations:
    annotation.font.update(family="Palatino, serif", size=18, color="black")
    annotation.y += 0.05

# Uncomment to preview the figure interactively:
# fig.show()

# Export the figure as SVG. The width/height given here apply to the exported
# image, so the file is 800x200 even though the layout height is 500.
fig.write_image(
    "teaser.svg",
    format='svg',
    scale=1,
    width=800,
    height=200,
)
# PDF variant of the same export:
# fig.write_image("teaser.pdf", format='pdf', scale=1, width=800, height=200)


def _relative_gain_pct(score, baseline):
    """Return how far *score* exceeds *baseline*, as a percentage of *baseline*."""
    return ((score - baseline) / baseline) * 100


# Scores being compared; these repeat the 7x7 values of the reasoning and
# non-reasoning LLM dicts defined at the top of this script.
reasoning_WCR = 0.646
non_reasoning_WCR = 0.482
reasoning_ICR = 0.891
non_reasoning_ICR = 0.472

# Relative improvement of the reasoning LLM over the non-reasoning LLM.
percentage_WCR = _relative_gain_pct(reasoning_WCR, non_reasoning_WCR)
percentage_ICR = _relative_gain_pct(reasoning_ICR, non_reasoning_ICR)

print(f"WCR: Reasoning LLM is {percentage_WCR:.2f}% higher than Non-reasoning LLM.")
print(f"ICR: Reasoning LLM is {percentage_ICR:.2f}% higher than Non-reasoning LLM.")


WCR: Reasoning LLM is 34.02% higher than Non-reasoning LLM.
ICR: Reasoning LLM is 88.77% higher than Non-reasoning LLM.
