In [141]:
# Check variance: average model variance per task, per task category, and average variance per model

In [142]:
import json
import os

# Load every leaderboard result file into a dict keyed by the file name with its
# "results_hf_" / "results_" prefix and ".json" suffix removed.
all_result_files = {}
RESPATH = "../benczechmark_leaderboard"
JSON_SUFFIX = ".json"
# Longer prefix first: every "results_hf_*" name also starts with "results_".
RESULT_PREFIXES = ("results_hf_", "results_")

# os.listdir() returns entries in arbitrary order; sorting keeps the dict order
# (and everything derived from it, e.g. "the first result file") stable between runs.
for fn in sorted(os.listdir(RESPATH)):
    if not fn.endswith(JSON_SUFFIX):
        continue
    prefix = next((p for p in RESULT_PREFIXES if fn.startswith(p)), None)
    if prefix is None:
        continue
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(os.path.join(RESPATH, fn), encoding="utf-8") as rf:
        all_result_files[fn[len(prefix):-len(JSON_SUFFIX)]] = json.load(rf)
        

In [143]:
# Peek at the model name stored in the metadata of the first loaded result file.
first_tag, first_result = list(all_result_files.items())[0]
first_result['metadata']['model_name']

'ibm-granite/granite-3.0-8b-instruct'

In [144]:
# Collect the 'max_centered_variance' of every task for every model:
# max_c_var[model_name][task_name] -> variance.
max_c_var = {}
for result_file in all_result_files.values():
    modelname = result_file['metadata']['model_name']
    per_task = max_c_var[modelname] = {}
    for full_taskname, r in result_file['results'].items():
        taskname = full_taskname.replace("benczechmark_", "")
        # Skip results that carry a 'word_perplexity' entry, and the hellaswag task.
        if 'word_perplexity' in r or taskname in ['hellaswag']:
            continue
        try:
            per_task[taskname] = r['max_centered_variance']
        except KeyError:
            # Say which task/model lacks the field before propagating the error.
            print(taskname)
            print(modelname)
            raise
            

In [145]:
import pandas as pd
from bokeh.plotting import figure
from bokeh.layouts import row, column
from bokeh.models import ColumnDataSource
from bokeh.transform import dodge
from bokeh.io import output_notebook, show
from bokeh.palettes import Category20, Spectral6
from bokeh.models import ColumnDataSource, HoverTool

# Render Bokeh output inline in the notebook
output_notebook()

# Models as rows, tasks as columns, with the contaminated tasks dropped
df = pd.DataFrame(max_c_var).T.drop(columns=["czechnews", "summarization"])

# Mean variance of each task over all models, largest first
task_avg_variance = df.mean(axis="index").sort_values(ascending=False)

# Mean variance of each model over all tasks, largest first
model_avg_variance = df.mean(axis="columns").sort_values(ascending=False)

# Plain lists of names, in the sorted order above, for Bokeh
task_names = task_avg_variance.index.to_list()
model_names = model_avg_variance.index.to_list()


In [146]:
# Task names that remain as columns of the variance table.
df.columns

Index(['umimeto_biology', 'umimeto_chemistry', 'umimeto_czech',
       'umimeto_history', 'umimeto_informatics', 'umimeto_math',
       'umimeto_physics', 'subjectivity', 'propaganda_demonizace',
       'cermat_czech_mc', 'cermat_czech_tf', 'cermat_czmath_mc',
       'propaganda_zamereni', 'propaganda_nazor', 'propaganda_nalepkovani',
       'ctkfacts_nli', 'propaganda_zanr', 'agree', 'cs_ner', 'propaganda_vina',
       'cermat_czmath_open', 'cermat_czech_open', 'sentiment_mall',
       'sentiment_fb', 'klokan_qa', 'cs_court_decisions_ner', 'history_ir',
       'cs_sqad32', 'propaganda_relativizace', 'propaganda_strach',
       'propaganda_fabulace', 'propaganda_argumentace', 'cs_triviaQA',
       'cs_naturalquestions', 'csfever_nli', 'propaganda_rusko',
       'sentiment_csfd', 'propaganda_lokace', 'belebele', 'snli',
       'propaganda_emoce', 'grammarerrorcorrection'],
      dtype='object')

In [147]:
# Number of models in the per-model variance table.
len(model_names)

44

In [148]:
# Mean variance per model (averaged over tasks), sorted from highest to lowest.
model_avg_variance

meta-llama/Meta-Llama-3.1-8B-Instruct        94.218068
microsoft/Phi-3-mini-4k-instruct             86.654965
CohereForAI/aya-23-35B                       73.911834
CohereForAI/aya-23-8B                        65.814698
mistralai/Mixtral-8x22B-Instruct-v0.1        63.124259
Qwen/Qwen2-7B                                59.885159
mistralai/Mistral-7B-Instruct-v0.3           59.007630
google/gemma-2-2b-it                         56.368006
Qwen/Qwen2-7B-Instruct                       56.164539
NousResearch/Hermes-3-Llama-3.1-8B           51.256306
Qwen/Qwen2-72B-Instruct                      50.912755
meta-llama/Meta-Llama-3-8B-Instruct          48.013916
meta-llama/Llama-3.2-3B-Instruct             47.459816
mistralai/Mixtral-8x7B-Instruct-v0.1         46.080988
meta-llama/Meta-Llama-3.1-8B                 45.043057
meta-llama/Meta-Llama-3.1-70B-Instruct       44.992968
internlm/internlm2_5-7b-chat                 43.526633
mistralai/Mistral-Nemo-Instruct-2407         42.167763
Qwen/Qwen2

In [149]:
# Mean variance per task (averaged over models), sorted from highest to lowest.
task_avg_variance

subjectivity               336.357699
propaganda_demonizace      244.044287
propaganda_nazor            86.289014
propaganda_argumentace      86.285668
propaganda_relativizace     81.072688
propaganda_zanr             75.858158
propaganda_nalepkovani      72.188339
snli                        65.932930
history_ir                  63.866611
ctkfacts_nli                59.992868
propaganda_strach           46.131683
cs_court_decisions_ner      41.584397
csfever_nli                 34.132504
cs_ner                      33.993958
propaganda_vina             27.831672
belebele                    27.297893
propaganda_zamereni         23.532265
propaganda_fabulace         23.010315
propaganda_lokace           22.571459
umimeto_czech               16.312500
cermat_czmath_mc            15.671883
grammarerrorcorrection      14.197823
umimeto_math                14.005682
propaganda_rusko            13.693199
umimeto_history             13.164773
umimeto_biology             12.414773
propaganda_e

In [150]:
import pickle
# Persist the per-model average variance to a pickle file.
with open("max_centered_variances_modelavg.pkl", "wb") as pkl_out:
    pickle.dump(model_avg_variance, pkl_out)

In [151]:
# 1. Variance across tasks: one bar per task (the figure is built here but not shown)
p1 = figure(
    x_range=task_names,
    height=800,
    width=1400,
    title="Max-centered Variance Across Tasks",
    tools="pan,wheel_zoom,box_zoom,reset",
)

# Bar data: task name and its average variance
source_task_variance = ColumnDataSource(
    data={"tasks": task_names, "variances": task_avg_variance}
)
p1.vbar(
    x='tasks',
    top='variances',
    width=0.4,
    source=source_task_variance,
    color="firebrick",
)

# Tooltip with the task name and variance under the cursor
hover_task = HoverTool(tooltips=[("Task", "@tasks"), ("Variance", "@variances")])
p1.add_tools(hover_task)

# Axis styling
p1.xaxis.axis_label = "Tasks"
p1.yaxis.axis_label = "Variance"
p1.xaxis.major_label_orientation = 3.14/2  # Perpendicular labels
p1.xgrid.grid_line_color = None


#show(p1)

In [152]:
# Range1d and Label are imported here so the cell runs on a fresh kernel
# (Range1d was previously only imported by a later cell).
from bokeh.io import export_svgs
from bokeh.models import ColumnDataSource, HoverTool, Label, Range1d
from bokeh.plotting import figure, show

# Define alternating colors
light_color = "#d3d3d3"
dark_color = "#6a5acd"

# Upper end of the y-axis; taller bars are clipped and annotated with their value instead.
Y_AXIS_CAP = 100

# Task names with everything up to and including the first "/" removed
# (names without a "/" are kept unchanged); computed once for both the data and the x-range.
task_labels = [t[t.find("/") + 1:] for t in task_names]
task_variances = [task_avg_variance[t] for t in task_names]

# Assign alternating colors based on the index of the tasks
task_colors = [light_color if i % 2 == 0 else dark_color for i in range(len(task_names))]

# Prepare data source with colors
source_task_variance = ColumnDataSource(data=dict(
    tasks=task_labels,
    variances=task_variances,
    color=task_colors,  # Per-bar color column
))

# Create the figure
p1 = figure(
    x_range=task_labels,
    height=350, width=800,
    #title="Max-centered Variance Across Tasks",  # Title removed for consistency
    tools="pan,wheel_zoom,box_zoom,reset",
    y_axis_type="linear",  # Linear y-axis; tall bars are handled by capping the range below
)

# Add hover tool for task variance
hover_task = HoverTool(tooltips=[("Task", "@tasks"), ("Variance", "@variances")])
p1.add_tools(hover_task)

# Add bars with dynamic colors
p1.vbar(x='tasks', top='variances', width=0.9, source=source_task_variance, color='color')

# Style the plot
p1.xgrid.grid_line_color = None
#p1.xaxis.axis_label = "Tasks"  # No x-axis label for consistency
p1.yaxis.axis_label = "Avg. Max-c Variance"
p1.xaxis.major_label_orientation = 3.14 / 2  # Perpendicular labels
p1.xaxis.axis_label_text_font_style = "normal"
p1.yaxis.axis_label_text_font_style = "normal"
p1.y_range = Range1d(start=0, end=Y_AXIS_CAP)

# Write the real value vertically inside every bar that exceeds the y-axis cap.
# The i-th categorical bar is centred at i + 0.5; the label sits slightly left of centre
# and uses the opposite color of its bar so it stays readable.
for i, variance in enumerate(task_variances):
    if variance > Y_AXIS_CAP:
        p1.add_layout(Label(
            x=i + 0.45, y=Y_AXIS_CAP / 2, text=f"y={variance:.0f}", angle=3.14 / 2,
            text_font_size="11pt", text_align="center", text_baseline="middle",
            text_color=dark_color if i % 2 == 0 else light_color,
        ))

# Show the plot
show(p1)

# Export the same figure as SVG
p1.output_backend = "svg"
export_svgs(p1, filename="max-c-variance-task.svg")


['max-c-variance-task.svg']

In [153]:
# plot variance vs performance
# load model sizes


# model_mapping = {
#     "BUT-FIT/CSTinyLlama-1.2B": "cstllama",
#     "BUT-FIT/csmpt7b": "csmpt",
#     "CohereForAI/aya-23-35B": "aya23_35b_instruct",
#     "CohereForAI/aya-23-8B": "aya23_instruct",
#     "NousResearch/Hermes-3-Llama-3.1-8B": "hermes_llama31",
#     "Qwen/Qwen2-7B": "qwen2_lm",
#     "Qwen/Qwen2-7B-Instruct": "qwen2_instruct",
#     "Qwen/Qwen2.5-7B": "qwen25_lm",
#     "Qwen/Qwen2.5-7B-Instruct": "qwen25_instruct",
#     "Qwen/Qwen2.5-72B": "qwen2_70b_lm",
#     "Qwen/Qwen2.5-72B-Instruct": "qwen2_70b_instruct",
    
#     "Qwen/Qwen2-72B-Instruct": "qwen2_70b_instruct",
#     "allenai/OLMo-7B-Instruct-hf": "olmo7b_instruct",
#     "google/gemma-2-9b-it": "gemma2_instruct",
#     "google/gemma-2-9b": "gemma2_lm",
#     "google/gemma-2-2b": "gemma2-2b",
#     "google/gemma-2-2b-it": "gemma2-2b_instruct",
#     "internlm/internlm2_5-7b-chat": "internlm_instruct",
#     "meta-llama/Meta-Llama-3.1-405B-Instruct": "llama31_405b_instruct",
#     "meta-llama/Meta-Llama-3.1-70B-Instruct": "llama31_70b_instruct",
#     "meta-llama/Meta-Llama-3.1-8B": "llama31_lm",
#     "meta-llama/Meta-Llama-3-8B-Instruct": "llama3_instruct",
#     "meta-llama/Meta-Llama-3.1-70B": "llama31_70b_lm",
#     "meta-llama/Meta-Llama-3.1-8B-Instruct": "llama31_instruct",
#     "microsoft/Phi-3-mini-4k-instruct": "phi3mini_instruct",
#     "mistralai/Mistral-Nemo-Instruct-2407": "mistral_nemo_instruct",
#     "mistralai/Mistral-7B-Instruct-v0.3": "mistral03_instruct",
#     "mistralai/Mixtral-8x7B-Instruct-v0.1": "mixtral8x7_instruct",
#     "mistralai/Mixtral-8x22B-Instruct-v0.1": "mixtral8x22_instruct",
#     "mosaicml/mpt-7b": "mpt7",
#     'meta-llama/Llama-3.2-3B-Instruct': "llama323_instruct",
#     'meta-llama/Llama-3.2-3B': "llama323",
#     'meta-llama/Llama-3.2-1B-Instruct': "llama321_instruct",
#     'meta-llama/Llama-3.2-1B': "llama321",
#     'speakleash/Bielik-11B-v2.3-Instruct': "bielikv239(pl)",
#     'AMead10/c4ai-command-r-08-2024-awq':'command-r-0824-awq_instruct',
#     "ibm-granite/granite-3.0-8b-base": "granite3_8b_base",
#     "ibm-granite/granite-3.0-8b-instruct": "granite3_8b_instruct",
#     'ibm-granite/granite-3.0-3b-a800m-instruct':'granite3_3b_moe_instruct',
#     'ibm-granite/granite-3.0-3b-a800m-base':'granite3_3b_moe_base',
#     'utter-project/EuroLLM-1.7B-Instruct':'eurollm1.7b_instruct',
#     'utter-project/EuroLLM-1.7B':'eurollm1.7b_base',
#     'ibm-granite/granite-3.0-3b-a800m-base':'granite3_3b_moe_base',
# }
import csv

csv_file_path = "leaderboard_data/Leaderboard - Overall.csv"

# Leaderboard rows keyed by the value of their 'Model' column.
ldb_records = {}
# The header holds non-ASCII column names ("# θ (B)", "Average ⬆️"), so decode explicitly as
# UTF-8 instead of the platform default; newline="" is what the csv module expects for file objects.
with open(csv_file_path, mode='r', encoding="utf-8", newline="") as file:
    # Loop variable is `record`, not `row`: `row` is already bound to bokeh.layouts.row
    # by an earlier import in this notebook and would be silently overwritten.
    for record in csv.DictReader(file):
        # Qwen2.5 models are left out.
        if record['Model'].startswith("Qwen/Qwen2.5-"):
            continue
        sanitized_name = record['Model']  # model_mapping[record['Model']]
        ldb_records[sanitized_name] = record

        #print(f'{record["Model"]} -> {sanitized_name}')

# Per-model parameter count and average score, parsed from the leaderboard columns.
model_sizes = {k: float(v['# θ (B)']) for k, v in ldb_records.items()}
performance = {k: float(v['Average ⬆️']) for k, v in ldb_records.items()}

In [154]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

# Two colors that alternate from bar to bar
light_color = "#d3d3d3"
dark_color = "#6a5acd"
model_colors = [dark_color if i % 2 else light_color for i in range(len(model_names))]

# Model names with everything up to and including the first "/" removed
short_model_names = [m[m.find("/")+1:] for m in model_names]

# Bar data: short name, average variance and bar color for each model
source_model_variance = ColumnDataSource(data={
    "models": short_model_names,
    "variances": [model_avg_variance[m] for m in model_names],
    "color": model_colors,
})

# Figure with one categorical slot per model
p2 = figure(
    x_range=short_model_names,
    height=350,
    width=800,
    #title="Average Variance Per Model",
    tools="pan,wheel_zoom,box_zoom,reset",
)

# Bars colored from the 'color' column
p2.vbar(x='models', top='variances', width=0.9, source=source_model_variance, color='color')

# Tooltip with the model name and variance under the cursor
hover_model = HoverTool(tooltips=[("Model", "@models"), ("Variance", "@variances")])
p2.add_tools(hover_model)

# Axis styling
#p2.xaxis.axis_label = "Models"
p2.yaxis.axis_label = "Avg. Max-c Variance"
p2.xaxis.axis_label_text_font_style = "normal"
p2.yaxis.axis_label_text_font_style = "normal"
p2.xaxis.major_label_orientation = 3.14 / 2  # Perpendicular labels
p2.xgrid.grid_line_color = None

# Show the plot
show(p2)

# Export the same figure as SVG
from bokeh.io import export_svgs
p2.output_backend = "svg"
export_svgs(p2, filename="max-c-variance-model.svg")


['max-c-variance-model.svg']

In [155]:
#performance
#model_avg_variance

In [156]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category20
from sklearn.linear_model import LinearRegression

# Build parallel lists in leaderboard order: entry i of each list belongs to the same model
model_names = list(performance)
performance_values = [performance[name] for name in model_names]
variance_values = [model_avg_variance[name] for name in model_names]

def fit_curve(x, y, degree=1):
    """
    Least-squares polynomial fit, sampled on an even grid for plotting.

    Parameters:
    - x: sequence or np.array of x-axis values
    - y: sequence or np.array of y-axis values, same length as x
    - degree: int, polynomial degree (1, the default, gives a straight line)

    Returns:
    - x_fit: np.array of 100 evenly spaced points from min(x) to max(x)
    - y_fit: np.array with the fitted polynomial evaluated at x_fit
    """
    coefficients = np.polyfit(x, y, degree)

    # Evaluate the fit only over the range covered by the data
    x_fit = np.linspace(min(x), max(x), 100)
    return x_fit, np.polyval(coefficients, x_fit)


# Split paired data into inliers and outliers with the 1.5 * IQR rule
def remove_outliers(x, y):
    """
    Separate (x, y) points into inliers and outliers.

    A point is an inlier only if both its x and its y value lie within
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of their respective series.

    Returns a 4-tuple of np.arrays: (x inliers, y inliers, x outliers, y outliers),
    each keeping the original order of the points.
    """
    x = np.array(x)
    y = np.array(y)

    def within_fences(values):
        # Boolean mask of the values inside the 1.5 * IQR fences of `values`
        q1, q3 = np.percentile(values, [25, 75])
        reach = 1.5 * (q3 - q1)
        return (values >= q1 - reach) & (values <= q3 + reach)

    keep = within_fences(x) & within_fences(y)
    return x[keep], y[keep], x[~keep], y[~keep]

# Split the models into IQR inliers and outliers (x = variance, y = performance)
x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(variance_values, performance_values)

# remove_outliers() keeps or drops a point purely from its (variance, performance) values and
# preserves order, so a model is an inlier exactly when its pair is among the returned inlier
# pairs. Matching on the pair (instead of np.in1d on performance alone) keeps names and colors
# aligned with the points even when two models share the same performance value.
inlier_pairs = set(zip(x_filtered.tolist(), y_filtered.tolist()))
inlier_mask = np.array(
    [(float(v), float(p)) in inlier_pairs for v, p in zip(variance_values, performance_values)],
    dtype=bool,
)

# Define the color palette: Red for "instruct" models (matched by name substring), Blue for others
colors = ["red" if any(s in model for s in ["instruct","Hermes", "Instruct","-it", "aya", "chat", "command-r"]) else "blue" for model in model_names]
names_arr = np.array(model_names)
colors_arr = np.array(colors)

# Inlier points: performance on the x-axis, variance on the y-axis
source_filtered = ColumnDataSource(data={
    'x': y_filtered,
    'y': x_filtered,
    'model_names': names_arr[inlier_mask],
    'color': colors_arr[inlier_mask],
})

# Outlier points, same layout
source_outliers = ColumnDataSource(data={
    'x': y_outliers,
    'y': x_outliers,
    'model_names': names_arr[~inlier_mask],
    'color': colors_arr[~inlier_mask],
})

# Fit a line to the inliers only (performance on x-axis, variance on y-axis)
x_fit, y_fit = fit_curve(y_filtered, x_filtered, degree=1)

# Create the scatter plot
p = figure(width=800, height=800, title="Variance vs Performance",
           tools="pan,wheel_zoom,box_zoom,reset,save",
           tooltips=[("Model", "@model_names"), ("Performance", "@x"), ("Variance", "@y")])

# Plot inliers
p.scatter('x', 'y', size=10, source=source_filtered, fill_alpha=0.6, color='color')

# Plot outliers
p.scatter('x', 'y', size=10, source=source_outliers, fill_alpha=0.6, color='color')

# Plot the fitted line
p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

# Set axis labels
p.xaxis.axis_label = 'Performance'
p.yaxis.axis_label = 'Model Variance'

# Display the plot
show(p)



In [162]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category20
from sklearn.linear_model import LinearRegression
from bokeh.models import LabelSet
from bokeh.models import Range1d


# Build parallel lists in leaderboard order: entry i of each list belongs to the same model
model_names_bkp = list(performance)  # full model names, kept as a backup
performance_values = [performance[name] for name in model_names_bkp]
variance_values = [model_avg_variance[name] for name in model_names_bkp]

# For display, drop everything up to and including the first "/" of each name
model_names = [name[name.find("/")+1:] for name in model_names_bkp]
# Split paired data into inliers and outliers with the 1.5 * IQR rule
def remove_outliers(x, y):
    """
    Separate (x, y) points into inliers and outliers.

    A point is an inlier only if both its x and its y value lie within
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of their respective series.

    Returns a 4-tuple of np.arrays: (x inliers, y inliers, x outliers, y outliers),
    each keeping the original order of the points.
    """
    x = np.array(x)
    y = np.array(y)

    def within_fences(values):
        # Boolean mask of the values inside the 1.5 * IQR fences of `values`
        q1, q3 = np.percentile(values, [25, 75])
        reach = 1.5 * (q3 - q1)
        return (values >= q1 - reach) & (values <= q3 + reach)

    keep = within_fences(x) & within_fences(y)
    return x[keep], y[keep], x[~keep], y[~keep]

# Split the models into IQR inliers and outliers (x = variance, y = performance)
x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(variance_values, performance_values)

# remove_outliers() keeps or drops a point purely from its (variance, performance) values and
# preserves order, so a model is an inlier exactly when its pair is among the returned inlier
# pairs. Matching on the pair (instead of np.in1d on performance alone) keeps names and colors
# aligned with the points even when two models share the same performance value.
inlier_pairs = set(zip(x_filtered.tolist(), y_filtered.tolist()))
inlier_mask = np.array(
    [(float(v), float(p)) in inlier_pairs for v, p in zip(variance_values, performance_values)],
    dtype=bool,
)

# Define the color palette: Red for "instruct" models (matched by name substring), Blue for others
colors = ["red" if any(s in model for s in ["instruct","Hermes", "Instruct","-it", "aya", "chat", "command-r"]) else "blue" for model in model_names]
names_arr = np.array(model_names)
colors_arr = np.array(colors)

# Inlier points: performance on the x-axis, variance on the y-axis
source_filtered = ColumnDataSource(data={
    'x': y_filtered,
    'y': x_filtered,
    'model_names': names_arr[inlier_mask],
    'color': colors_arr[inlier_mask],
})

# Outlier points, same layout
source_outliers = ColumnDataSource(data={
    'x': y_outliers,
    'y': x_outliers,
    'model_names': names_arr[~inlier_mask],
    'color': colors_arr[~inlier_mask],
})

# Fit a line to the inliers only (performance on x-axis, variance on y-axis)
x_fit, y_fit = fit_curve(y_filtered, x_filtered, degree=1)

# Create the scatter plot
p = figure(width=800, height=800, title="Variance vs Performance",
           tools="pan,wheel_zoom,box_zoom,reset,save",
           tooltips=[("Model", "@model_names"), ("Performance", "@x"), ("Variance", "@y")])

# Plot inliers
p.scatter('x', 'y', size=10, source=source_filtered, fill_alpha=0.6, color='color')

# Plot outliers
p.scatter('x', 'y', size=10, source=source_outliers, fill_alpha=0.6, color='color')

# Plot the fitted line
p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

# Add the model name under each data point, for inliers and outliers alike
for labelled_source in (source_filtered, source_outliers):
    labels = LabelSet(x='x', y='y', text='model_names', level='glyph', x_offset=-40, y_offset=-20, source=labelled_source,  text_font_size="9pt")
    p.add_layout(labels)

# Set axis labels
p.xaxis.axis_label = 'Performance'
p.yaxis.axis_label = 'Model Variance'

p.x_range = Range1d(start=0, end=110)

p.xaxis.axis_label_text_font_style = "normal"
p.yaxis.axis_label_text_font_style = "normal"


# Display the plot
show(p)

# Export the same figure as SVG
from bokeh.io import export_svgs
p.output_backend = "svg"
export_svgs(p, filename="perf_modelvar.svg")

['perf_modelvar.svg']