In [1]:
# Make chromedriver discoverable for Bokeh's image export (export_svgs).
# This is done in Python because the `%env` magic performs no shell expansion:
# the previous `%env PATH=$PATH:`chromedriver-path`` stored that text literally
# and thereby replaced the real PATH instead of extending it.
import os
import shutil
import subprocess

_chromedriver_cli = shutil.which("chromedriver-path")
if _chromedriver_cli is None:
    print("chromedriver-path not found on PATH; PATH left unchanged")
else:
    # Presumably the CLI prints the directory that contains the chromedriver
    # binary (that is what the original shell idiom relied on).
    _chromedriver_dir = subprocess.check_output([_chromedriver_cli], text=True).strip()
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + _chromedriver_dir

import json
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, FactorRange
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.layouts import column
import seaborn as sns

# Render Bokeh output inline in the notebook
output_notebook()

# Root directory of the leaderboard data.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
PREFIX="/home/ifajcik/code/lm-evaluation-harness/benczechmark_leaderboard"

# Input files
OUTFNAME = PREFIX+"/hf_comparison.csv"
METADATA_PATH = PREFIX+"/leaderboard/metadata.json"

# Load the metadata; metadata['tasks'][<task>]['category'] is used further down
# to group tasks. JSON is read as UTF-8 explicitly so the result does not depend
# on the platform's default encoding.
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)


env: PATH=$PATH:`chromedriver-path`


In [2]:
# Read the comparison CSV into a DataFrame
df = pd.read_csv(OUTFNAME)

# Drop the aggregate "MicroWinScore" column; only per-task metric columns are used below
df = df.drop(columns=["MicroWinScore"])

# Strip file-name decorations from the model names in the first column.
# regex=False makes each pattern a literal string: on pandas versions where
# str.replace defaults to regex matching, the '.' in '.json' would otherwise
# match any character.
df[df.columns[0]] = (
    df[df.columns[0]]
    .str.replace('results_hf_', '', regex=False)
    .str.replace('results_', '', regex=False)
    .str.replace('.json', '', regex=False)
)


In [3]:
# Show the cleaned results table: one row per model, one column per "<task>-<metric>"
df

Unnamed: 0.1,Unnamed: 0,benczechmark_agree-avg_mcauroc,benczechmark_belebele-avg_mcauroc,benczechmark_capek-word_perplexity,benczechmark_cermat_czech_mc-acc,benczechmark_cermat_czech_open-exact_match,benczechmark_cermat_czech_tf-avg_mcauroc,benczechmark_cermat_czmath_mc-acc,benczechmark_cermat_czmath_open-exact_match,benczechmark_correspondence-word_perplexity,...,benczechmark_spoken-word_perplexity,benczechmark_subjectivity-avg_mcauroc,benczechmark_summarization-rouge_raw_r2_mid_f,benczechmark_umimeto_biology-acc,benczechmark_umimeto_chemistry-acc,benczechmark_umimeto_czech-acc,benczechmark_umimeto_history-acc,benczechmark_umimeto_informatics-acc,benczechmark_umimeto_math-acc,benczechmark_umimeto_physics-acc
0,aya23_35b_instruct,90.45,97.47,209.23,48.38,15.11,76.95,19.05,4.4,458.25,...,120.16,92.11,5.21,87.0,75.0,63.0,84.0,88.0,75.0,75.0
1,aya23_instruct,80.34,93.77,355.23,34.51,4.5,66.56,26.98,1.89,812.77,...,177.03,79.75,3.46,80.0,69.0,51.0,77.0,80.0,65.0,74.0
2,csmpt,68.32,49.94,115.16,25.73,0.96,52.0,22.22,0.31,361.32,...,122.84,54.5,1.29,59.0,63.0,50.0,65.0,52.0,52.0,52.0
3,cstllama,54.16,48.5,154.14,27.43,0.0,49.33,27.78,0.31,328.22,...,135.14,53.63,1.7,47.0,59.0,62.0,61.0,42.0,49.0,62.0
4,gemma2-2b,62.86,76.2,31253796.38,31.28,0.32,60.29,25.4,3.14,1641.11,...,57326.59,60.06,1.46,76.0,67.0,51.0,69.0,65.0,67.0,62.0
5,gemma2-2b_instruct,74.22,89.39,3014.34,34.51,0.64,62.29,21.43,3.14,4281.25,...,951.74,79.08,2.04,77.0,66.0,52.0,73.0,74.0,74.0,75.0
6,gemma2_instruct,84.4,97.89,647.66,49.61,9.0,77.08,30.95,10.69,932.04,...,273.45,61.74,3.02,80.0,76.0,59.0,82.0,81.0,76.0,80.0
7,gemma2_lm,90.22,97.53,57886677.53,49.46,0.32,81.15,33.33,7.86,648.07,...,68797.34,89.15,2.53,92.0,89.0,57.0,88.0,94.0,85.0,82.0
8,hermes_llama31,84.52,93.73,453.01,42.68,9.65,70.54,23.81,6.29,1715.18,...,295.23,80.78,3.19,84.0,80.0,61.0,84.0,84.0,67.0,74.0
9,internlm_instruct,77.93,92.73,2877.22,35.9,0.64,57.64,26.98,5.35,4789.14,...,694.68,73.17,1.47,67.0,62.0,56.0,73.0,80.0,68.0,75.0


In [4]:
# Look-up table: task name -> category, as declared in the metadata
task_to_category = {task: details['category'] for task, details in metadata['tasks'].items()}

# Collect one long-format record per (model, task metric) pair
plot_data = []
for col in df.columns[1:]:  # the first column holds the model names
    if '-' not in col:  # only "<task>-<metric>" columns carry scores
        continue
    task_name, metric = col.split('-')
    if task_name not in task_to_category:  # ignore tasks missing from the metadata
        continue
    for model in df[df.columns[0]].unique():
        score = df.loc[df[df.columns[0]] == model, col].values
        if len(score) == 0:
            continue
        # When a model occurs more than once, only its first row is used
        plot_data.append(dict(
            Model=model,
            Metric=metric,
            Score=float(score[0]),
            Category=task_to_category[task_name],
            Task=f"{task_name} - {metric}",
        ))

# Turn the collected records into a DataFrame
plot_df = pd.DataFrame(plot_data)


In [5]:
# Show the long-format table built above (columns: Model, Metric, Score, Category, Task)
plot_df

Unnamed: 0,Model,Metric,Score,Category,Task
0,aya23_35b_instruct,avg_mcauroc,90.45,Czech Language Understanding,benczechmark_agree - avg_mcauroc
1,aya23_instruct,avg_mcauroc,80.34,Czech Language Understanding,benczechmark_agree - avg_mcauroc
2,csmpt,avg_mcauroc,68.32,Czech Language Understanding,benczechmark_agree - avg_mcauroc
3,cstllama,avg_mcauroc,54.16,Czech Language Understanding,benczechmark_agree - avg_mcauroc
4,gemma2-2b,avg_mcauroc,62.86,Czech Language Understanding,benczechmark_agree - avg_mcauroc
...,...,...,...,...,...
1295,phi3mini_instruct,acc,63.00,Factual Knowledge,benczechmark_umimeto_physics - acc
1296,qwen2_70b_instruct,acc,90.00,Factual Knowledge,benczechmark_umimeto_physics - acc
1297,qwen2_instruct,acc,83.00,Factual Knowledge,benczechmark_umimeto_physics - acc
1298,qwen2_lm,acc,78.00,Factual Knowledge,benczechmark_umimeto_physics - acc


In [6]:
import json
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, FactorRange
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.layouts import column, layout
from bokeh.palettes import Category10 as palette
import seaborn as sns
from bokeh.palettes import Turbo256

def _evenly_spaced_colors(n_colors, base_palette=Turbo256):
    """Return ``n_colors`` colours spread evenly over the whole of ``base_palette``.

    Taking the first N entries of a 256-step sequential palette yields colours
    that are almost identical; sampling across the full range keeps the bars
    distinguishable. Colours repeat only if ``n_colors`` exceeds the palette size.
    """
    if n_colors <= 0:
        return []
    if n_colors == 1:
        return [base_palette[len(base_palette) // 2]]
    step = (len(base_palette) - 1) / (n_colors - 1)
    return [base_palette[round(i * step)] for i in range(n_colors)]


# 50-colour palette covering the full Turbo range (previously Turbo256[:50],
# i.e. only the dark purple-to-blue end of the palette)
palette = _evenly_spaced_colors(50)

# Generate plots for each task, ordered by category
categories = sorted(plot_df['Category'].unique())
plots = []

for category in categories:
    category_data = plot_df[plot_df['Category'] == category]
    tasks = category_data['Task'].unique()

    for task in tasks:
        task_data = category_data[category_data['Task'] == task]

        # One bar per model; the first score is used if a model occurs more than once
        factors = task_data['Model'].unique().tolist()
        scores = [task_data[task_data['Model'] == model]['Score'].values[0] for model in factors]

        # Create a ColumnDataSource
        source = ColumnDataSource(data=dict(
            x=factors,
            scores=scores,
        ))

        # Create the figure with increased size
        p = figure(x_range=factors, height=600, width=1000,
                   title=f'{task} - Category: {category}',
                   toolbar_location="above", tools="pan,wheel_zoom,box_zoom,reset")

        # One colour per model, spread over the whole palette for this plot
        color_map = factor_cmap('x', palette=_evenly_spaced_colors(len(factors)), factors=factors)

        # Create the vertical bar plot
        p.vbar(x='x', top='scores', width=0.9, source=source, line_color="white",
               fill_color=color_map)

        # Add tooltips
        p.add_tools(HoverTool(tooltips=[("Model", "@x"), ("Score", "@scores")]))

        # Customize the plot
        p.y_range.start = 0
        p.xgrid.grid_line_color = None
        p.xaxis.major_label_orientation = 1.2

        plots.append(p)

# Display the plots in a column layout within the notebook
#show(column(*plots, sizing_mode='stretch_both'))

In [7]:
from sklearn.preprocessing import MinMaxScaler,RobustScaler,QuantileTransformer
from bokeh.models import ColumnDataSource, ColorBar, LinearColorMapper, HoverTool
from bokeh.plotting import figure, show
import pandas as pd
import json
from scipy.special import softmax

# Two model-by-task tables are assembled in parallel:
#   original_scores - the raw scores (shown in tooltips and labels)
#   matrix_data     - the same scores with perplexities flipped so that higher is better
matrix_data = pd.DataFrame()
original_scores = pd.DataFrame()

for col in df.columns[1:]:  # column 0 holds the model names
    if '-' not in col:  # only "<task>-<metric>" columns carry scores
        continue
    task_name, metric = col.split('-')
    if task_name not in task_to_category:  # ignore tasks missing from the metadata
        continue

    # Scores of every model on this task, indexed by model name and named after the task
    task_df = df.set_index(df.columns[0])[col].rename(task_name)

    # Keep the untouched scores
    if original_scores.empty:
        original_scores = pd.DataFrame(task_df)
    else:
        original_scores = original_scores.join(task_df, how='outer')

    # Perplexity is lower-is-better: replace each value by its distance to the column maximum
    if metric == 'word_perplexity':
        task_df = task_df.max() - task_df

    if matrix_data.empty:
        matrix_data = pd.DataFrame(task_df)
    else:
        matrix_data = matrix_data.join(task_df, how='outer')

# Earlier normalisation variant, kept for reference:
# scaler = MinMaxScaler()
# normalized_matrix = scaler.fit_transform(matrix_data.fillna(0))

# Map every task column onto a uniform [0, 1] scale.
# The number of quantiles is one sixth of the number of models (integer division).
transformer = QuantileTransformer(
    output_distribution='uniform',
    n_quantiles=matrix_data.shape[0]//6,
)
normalized_matrix = transformer.fit_transform(matrix_data)


# Wrap the transformed values in a DataFrame carrying the original labels
heatmap_df = pd.DataFrame(normalized_matrix, index=matrix_data.index, columns=matrix_data.columns)

# Group the task columns by category (tasks unknown to the metadata sort first)
sorted_columns = sorted(heatmap_df.columns, key=lambda name: task_to_category.get(name, ''))
heatmap_df = heatmap_df[sorted_columns]
original_scores = original_scores[sorted_columns]  # keep the raw scores in the same column order

from bokeh.models import ColumnDataSource, ColorBar, LinearColorMapper, HoverTool
from bokeh.plotting import figure, show
from bokeh.models.annotations import LabelSet

def create_heatmap(matrix, original):
    """Build a Bokeh heatmap with the columns of ``matrix`` on the x-axis and its rows on the y-axis.

    In this notebook the rows are models and the columns are tasks.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Values that drive the cell colour. The colour mapper is fixed to the
        range [0, 1].
    original : pandas.DataFrame
        Scores shown in the hover tooltip. Must contain every row/column label
        of ``matrix``.

    Returns
    -------
    The Bokeh figure (not yet shown).

    Notes
    -----
    The 'benczechmark_' prefix is stripped from the column labels of BOTH
    frames in place, i.e. the caller's frames are modified. Other cells of the
    notebook address columns by the short names, so this side effect is kept.
    """
    plot_width = 1400
    plot_height = 800

    # In-place rename on the caller's frames (see Notes)
    matrix.columns = matrix.columns.str.replace('benczechmark_', '', regex=False)
    original.columns = original.columns.str.replace('benczechmark_', '', regex=False)

    # x carries the columns (tasks) and y the rows (models); the axis labels
    # follow that layout and agree with the hover tooltip below.
    p = figure(width=plot_width, height=plot_height,
               x_range=list(matrix.columns), y_range=list(matrix.index),
               toolbar_location="below", tools="pan,wheel_zoom,box_zoom,reset,save",
               x_axis_label="Task", y_axis_label="Model")

    # Create the color mapper
    color_mapper = LinearColorMapper(palette='Viridis256', low=0, high=1)

    # Flatten the matrix into one record per cell for Bokeh
    source_data = {
        'x': [],
        'y': [],
        'colors': [],
        'task_names': [],
        'scores': [],
    }
    for model, row_values in matrix.iterrows():
        for task, value in zip(matrix.columns, row_values):
            source_data['x'].append(task)
            source_data['y'].append(model)
            source_data['colors'].append(value)
            source_data['task_names'].append(task)
            # Read the tooltip score from the `original` argument rather than
            # from whatever global frame happens to be called `original_scores`.
            source_data['scores'].append(original.loc[model, task])

    source = ColumnDataSource(source_data)

    # Create the heatmap
    p.rect(x='x', y='y', width=1, height=1, source=source,
           line_color=None, fill_color={'field': 'colors', 'transform': color_mapper})

    # Add color bar
    color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
    p.add_layout(color_bar, 'right')

    # Add HoverTool
    hover = HoverTool()
    hover.tooltips = [("Task", "@task_names"), ("Model", "@y"), ("Score", "@scores")]
    p.add_tools(hover)

    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.xaxis.major_label_orientation = "vertical"

    # Tick-label font sizes
    p.yaxis.major_label_text_font_size = "11pt"
    p.xaxis.major_label_text_font_size = "11pt"

    return p


In [8]:
### SORT BY INSTRUCT

# Put the models whose name contains 'instruct' first; order alphabetically within each group
sorted_index = sorted(heatmap_df.index, key=lambda name: ("instruct" not in name, name))
heatmap_df = heatmap_df.loc[sorted_index]
original_scores = original_scores.loc[sorted_index]

# Build the heatmap and render it
heatmap = create_heatmap(heatmap_df, original_scores)
show(heatmap)


In [9]:
# Show the normalised model-by-task matrix (the column prefix was already stripped by create_heatmap)
heatmap_df

Unnamed: 0_level_0,agree,cermat_czech_mc,cermat_czech_open,cermat_czech_tf,grammarerrorcorrection,umimeto_czech,cermat_czmath_mc,cermat_czmath_open,klokan_qa,umimeto_math,...,propaganda_vina,propaganda_zamereni,propaganda_zanr,snli,belebele,cs_sqad32,sentiment_csfd,sentiment_fb,sentiment_mall,subjectivity
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aya23_35b_instruct,0.804196,0.676797,0.737865,0.664176,0.807709,0.694444,0.080141,0.400053,0.411394,0.609195,...,0.222222,0.732975,0.781473,0.725791,0.754304,0.693587,0.921239,0.840897,0.501488,1.0
aya23_instruct,0.388009,0.282387,0.441194,0.365613,0.325611,0.111111,0.507698,0.162051,0.127937,0.296296,...,0.516332,0.335564,0.467071,0.64326,0.403023,0.693587,0.9,0.788574,0.385417,0.57377
gemma2-2b_instruct,0.267395,0.282387,0.099688,0.273794,0.486975,0.148148,0.200101,0.290256,0.289817,0.574713,...,0.323761,0.332493,0.180238,0.284683,0.306706,0.416738,0.380628,0.397493,0.653274,0.545607
gemma2_instruct,0.577198,0.695487,0.646862,0.667498,0.189833,0.5,0.696376,0.732363,0.573576,0.643678,...,0.498623,0.593873,0.529985,0.349805,0.820031,0.659257,0.303668,0.260757,0.216449,0.145671
internlm_instruct,0.316849,0.322853,0.099688,0.185809,0.112613,0.296296,0.507698,0.47551,0.67362,0.367816,...,0.552932,0.166163,0.315096,0.372345,0.331758,0.179057,0.317498,0.330119,0.275629,0.320202
llama31_405b_instruct,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.974466,0.836592,1.0,0.882027,0.69823,1.0,0.590774,0.577974
llama31_70b_instruct,0.850483,0.971889,0.895146,0.938219,0.969383,0.722222,0.603079,0.859127,0.804256,0.868852,...,0.901425,0.934342,0.989905,0.826816,0.946792,0.724466,0.669912,0.935398,0.672087,0.995074
llama31_instruct,0.63032,0.59643,0.617611,0.380268,0.655792,0.736111,0.239919,0.683091,0.447758,0.505747,...,0.399449,0.726873,0.875297,0.518422,0.580185,0.78939,0.482947,0.681794,0.718157,0.72381
llama3_instruct,0.679654,0.542588,0.558653,0.399521,0.577264,0.708333,0.0,0.524755,0.429576,0.471264,...,0.285812,0.634694,0.75,0.384915,0.449202,0.67696,0.565484,0.589973,0.688347,0.71954
mistral03_instruct,0.325513,0.354288,0.529403,0.329234,0.342438,0.666667,0.080141,0.162051,0.393212,0.333333,...,0.778952,0.583389,0.56972,0.475076,0.313906,0.415598,0.494543,0.445837,0.517113,0.514502


In [10]:
from sklearn.metrics.pairwise import cosine_distances
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list

# 1. Compute the pairwise cosine distance between models (the rows of heatmap_df)
cosine_dist_matrix = pdist(heatmap_df, metric='cosine')

# 2. Perform hierarchical clustering on the distance matrix
# Using 'average' linkage method for clustering
Z = linkage(cosine_dist_matrix, method='average')

# 3. Get the order of rows (models) based on the clustering
sorted_index = leaves_list(Z)

# Reorder both frames by position so that similar models end up next to each other.
# NOTE: this overwrites heatmap_df / original_scores, so re-running the cell
# clusters the already reordered frames.
heatmap_df = heatmap_df.iloc[sorted_index]
original_scores = original_scores.iloc[sorted_index]

# Generate the heatmap
heatmap = create_heatmap(heatmap_df, original_scores)

# Display the heatmap
show(heatmap)


In [11]:
from bokeh.models import LabelSet

from bokeh.plotting import figure
from bokeh.models import LinearColorMapper, ColorBar, ColumnDataSource, HoverTool, LabelSet, FixedTicker

def create_heatmap_T(data_matrix, original_scores, selected_rows=None,hide_scores_tasks=[], width=800, height=1400):
    """Build the transposed heatmap: rows of ``data_matrix`` on the x-axis, columns on the y-axis.

    In this notebook the rows are models and the columns are tasks. Every cell
    is coloured by the value in ``data_matrix`` and annotated with the rounded
    value from ``original_scores``.

    Parameters
    ----------
    data_matrix : pandas.DataFrame
        Values that drive the cell colour. The colour mapper is fixed to [0, 1].
    original_scores : pandas.DataFrame
        Scores used for tooltips and cell labels. Must contain every row/column
        label of ``data_matrix``.
    selected_rows : list of str, optional
        Column labels of ``data_matrix`` (after prefix stripping) to keep; they
        become the rows of the plot. ``None`` keeps all columns.
    hide_scores_tasks : list of str
        Column labels for which no numeric label is drawn. The default list is
        only read, never modified.
    width, height : int
        Figure size in pixels.

    Returns
    -------
    The Bokeh figure (not yet shown).

    Notes
    -----
    The 'benczechmark_' prefix is stripped from the column labels of BOTH
    input frames in place, i.e. the caller's frames are modified. Other cells
    of the notebook address columns by the short names, so this side effect is
    kept.
    """
    plot_width = width
    plot_height = height

    # Clean column names (remove 'benczechmark_' prefix) - in place, see Notes
    data_matrix.columns = data_matrix.columns.str.replace('benczechmark_', '', regex=False)
    original_scores.columns = original_scores.columns.str.replace('benczechmark_', '', regex=False)

    if selected_rows is not None:
        # Keep only the requested columns (the rows of the transposed plot)
        data_matrix = data_matrix[selected_rows]
        original_scores = original_scores[selected_rows]

    # x carries the frame's rows (models) and y its columns (tasks); the axis
    # labels follow that layout and agree with the hover tooltip below.
    p = figure(
        width=plot_width, height=plot_height,
        x_range=list(data_matrix.index), y_range=list(data_matrix.columns),
        toolbar_location="below", tools="pan,wheel_zoom,box_zoom,reset,save",
        x_axis_label="Model", y_axis_label="Task"
    )

    # Colour mapper for the heatmap; Viridis runs from dark (low) to light (high)
    color_mapper = LinearColorMapper(palette='Viridis256', low=0, high=1)

    # One record per cell for the rectangles ...
    heatmap_data = {
        'x': [],
        'y': [],
        'colors': [],
        'model_names': [],
        'scores': [],
    }
    # ... and one record per labelled cell for the text overlay
    label_data = {
        'x': [],
        'y': [],
        'value': [],
        'text_color': [],
    }

    for model_name, task_scores in data_matrix.iterrows():
        for task_name, plot_score in zip(data_matrix.columns, task_scores):
            original_score = original_scores.loc[model_name, task_name]

            heatmap_data['x'].append(model_name)
            heatmap_data['y'].append(task_name)
            heatmap_data['colors'].append(plot_score)
            heatmap_data['model_names'].append(model_name)
            heatmap_data['scores'].append(original_score)

            if task_name in hide_scores_tasks:
                continue

            label_data['x'].append(model_name)
            label_data['y'].append(task_name)
            label_data['value'].append(round(original_score))  # Round the score

            # White text on the darker cells (value <= 0.6), black text on the lighter ones
            label_data['text_color'].append('white' if plot_score <= 0.6 else 'black')

    heatmap_source = ColumnDataSource(heatmap_data)
    label_source = ColumnDataSource(label_data)

    # Create the heatmap
    p.rect(x='x', y='y', width=1, height=1, source=heatmap_source,
           line_color=None, fill_color={'field': 'colors', 'transform': color_mapper})

    # Colour bar with fixed ticks, relabelled from the 0-1 scale to 0-100
    color_bar = ColorBar(
        color_mapper=color_mapper,
        width=8, location=(0, 0),
        ticker=FixedTicker(ticks=[0, 0.2, 0.4, 0.6, 0.8, 1]),
        major_label_overrides={0: '0', 0.2: '20', 0.4: '40', 0.6: '60', 0.8: '80', 1: '100'}
    )
    p.add_layout(color_bar, 'right')

    # Add HoverTool for interactivity
    hover = HoverTool()
    hover.tooltips = [("Model", "@x"), ("Task", "@y"), ("Score", "@scores")]
    p.add_tools(hover)

    # Add the score labels with per-cell text colour
    labels = LabelSet(x='x', y='y', text='value', source=label_source,
                      text_color='text_color', text_align='center', text_baseline='middle')
    p.add_layout(labels)

    # Customize the plot appearance
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.xaxis.major_label_orientation = "vertical"
    p.yaxis.major_label_text_font_size = "11pt"
    p.xaxis.major_label_text_font_size = "11pt"

    return p



In [12]:
# Tasks whose cells get no numeric label (create_heatmap_T skips the label for these names)
hide_scores_tasks=["correspondence", "dialect", "essay", "fiction", "havlicek", "histcorpus", "spoken"]

# Generate the transposed heatmap for all models
heatmap = create_heatmap_T(heatmap_df, original_scores, hide_scores_tasks=hide_scores_tasks)


# Display the heatmap
show(heatmap)

# Optional SVG export, currently disabled:
# from bokeh.io import export_svgs
# heatmap.output_backend = "svg"
# export_svgs(heatmap, filename="heatmap_plot.svg")


In [13]:
# Models that form the "smaller" group; the following cells plot this group
# and its complement ("larger") as separate heatmaps.
smaller = [
    'olmo7b_instruct', 
    'phi3mini_instruct', 
    'mpt7', 
    'gemma2-2b', 
    'cstllama', 
    'gemma2-2b_instruct', 
    'internlm_instruct', 
    'csmpt', 
    'mistral03_instruct', 
    'qwen2_lm', 
    'aya23_instruct', 
    'qwen2_instruct', 
    'gemma2_instruct', 
    'llama3_instruct', 
    'llama31_lm', 
    'hermes_llama31', 
    'gemma2_lm', 
    'mistral_nemo_instruct', 
    'llama31_instruct'
]

In [14]:
# "Larger" group: every model that is not in `smaller`, minus the 405B model.
# The index spells that model 'llama31_405b_instruct' (no dot), so the former
# exclusion of 'llama3.1_405b_instruct' never matched and removed nothing.
# Both spellings are excluded so the filter works with either naming.
larger = list(
    set(heatmap_df.index.tolist())
    - set(smaller)
    - {"llama3.1_405b_instruct", "llama31_405b_instruct"}
)

def minmax_normalize_rows(matrix_data):
    """Min-max scale every row of ``matrix_data`` to the range [0, 1].

    Parameters
    ----------
    matrix_data : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same labels as the input. In each row the minimum maps to 0 and the
        maximum to 1. A constant row maps to all zeros instead of NaN (0/0),
        so the result stays usable for distance computations.
    """
    shifted = matrix_data.sub(matrix_data.min(axis=1), axis=0)  # subtract each row's minimum
    span = shifted.max(axis=1)  # max - min of each row
    span = span.where(span != 0, 1)  # avoid 0/0 for constant rows
    return shifted.div(span, axis=0)
def minmax_normalize_columns(matrix_data):
    """Min-max scale every column of ``matrix_data`` to the range [0, 1].

    Parameters
    ----------
    matrix_data : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same labels as the input. In each column the minimum maps to 0 and the
        maximum to 1. A constant column maps to all zeros instead of NaN (0/0),
        so the result stays usable for distance computations.
    """
    shifted = matrix_data.sub(matrix_data.min(axis=0), axis=1)  # subtract each column's minimum
    span = shifted.max(axis=0)  # max - min of each column
    span = span.where(span != 0, 1)  # avoid 0/0 for constant columns
    return shifted.div(span, axis=1)

# Restrict both frames to the "larger" models and rescale each task column to
# [0, 1] within that group
heatmap_df_filtered = heatmap_df.loc[heatmap_df.index.isin(larger)]
normalized_m = minmax_normalize_columns(heatmap_df_filtered)
original_scores_filtered = original_scores.loc[original_scores.index.isin(larger)]

# 1. Compute the pairwise cosine distance between models (the rows)
cosine_dist_matrix = pdist(normalized_m, metric='cosine')

# 2. Perform hierarchical clustering on the distance matrix
# Using 'average' linkage method for clustering
Z = linkage(cosine_dist_matrix, method='average')

# 3. Get the order of rows (models) based on the clustering
sorted_index = leaves_list(Z)

# Reorder both frames by position according to the clustering result
heatmap_df_filtered_sorted = normalized_m.iloc[sorted_index]
original_scores_filtered_sorted = original_scores_filtered.iloc[sorted_index]


# Generate the heatmap
heatmap = create_heatmap_T(heatmap_df_filtered_sorted, original_scores_filtered_sorted, width=460, hide_scores_tasks=hide_scores_tasks)

# Display the heatmap
show(heatmap)

# Export the figure as SVG.
# NOTE: export_svgs needs a browser plus webdriver (firefox+geckodriver or
# chromium+chromedriver) on PATH; without them it raises the RuntimeError
# recorded in this cell's output.
from bokeh.io import export_svgs
heatmap.output_backend = "svg"
export_svgs(heatmap, filename="a.svg")

RuntimeError: Neither firefox and geckodriver nor a variant of chromium browser and chromedriver are available on system PATH. You can install the former with 'conda install -c conda-forge firefox geckodriver'.

In [None]:
# Models that form the "smaller" group.
# NOTE(review): this list repeats the `smaller` definition from an earlier cell; keep the two in sync.
smaller = [
    'olmo7b_instruct', 
    'phi3mini_instruct', 
    'mpt7', 
    'gemma2-2b', 
    'cstllama', 
    'gemma2-2b_instruct', 
    'internlm_instruct', 
    'csmpt', 
    'mistral03_instruct', 
    'qwen2_lm', 
    'aya23_instruct', 
    'qwen2_instruct', 
    'gemma2_instruct', 
    'llama3_instruct', 
    'llama31_lm', 
    'hermes_llama31', 
    'gemma2_lm', 
    'mistral_nemo_instruct', 
    'llama31_instruct'
]

def minmax_normalize_rows(matrix_data):
    """Min-max scale every row of ``matrix_data`` to the range [0, 1].

    Parameters
    ----------
    matrix_data : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same labels as the input. In each row the minimum maps to 0 and the
        maximum to 1. A constant row maps to all zeros instead of NaN (0/0),
        so the result stays usable for distance computations.
    """
    shifted = matrix_data.sub(matrix_data.min(axis=1), axis=0)  # subtract each row's minimum
    span = shifted.max(axis=1)  # max - min of each row
    span = span.where(span != 0, 1)  # avoid 0/0 for constant rows
    return shifted.div(span, axis=0)
def minmax_normalize_columns(matrix_data):
    """Min-max scale every column of ``matrix_data`` to the range [0, 1].

    Parameters
    ----------
    matrix_data : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Same labels as the input. In each column the minimum maps to 0 and the
        maximum to 1. A constant column maps to all zeros instead of NaN (0/0),
        so the result stays usable for distance computations.
    """
    shifted = matrix_data.sub(matrix_data.min(axis=0), axis=1)  # subtract each column's minimum
    span = shifted.max(axis=0)  # max - min of each column
    span = span.where(span != 0, 1)  # avoid 0/0 for constant columns
    return shifted.div(span, axis=1)

# Restrict both frames to the "smaller" models and rescale each task column to
# [0, 1] within that group
heatmap_df_filtered = heatmap_df.loc[heatmap_df.index.isin(smaller)]
normalized_m = minmax_normalize_columns(heatmap_df_filtered)
original_scores_filtered = original_scores.loc[original_scores.index.isin(smaller)]

# 1. Compute the pairwise cosine distance between models (the rows)
cosine_dist_matrix = pdist(normalized_m, metric='cosine')

# 2. Perform hierarchical clustering on the distance matrix
# Using 'average' linkage method for clustering
Z = linkage(cosine_dist_matrix, method='average')

# 3. Get the order of rows (models) based on the clustering
sorted_index = leaves_list(Z)

# Reorder both frames by position according to the clustering result
heatmap_df_filtered_sorted = normalized_m.iloc[sorted_index]
original_scores_filtered_sorted = original_scores_filtered.iloc[sorted_index]


# Generate the heatmap
heatmap = create_heatmap_T(heatmap_df_filtered_sorted, original_scores_filtered_sorted, hide_scores_tasks=hide_scores_tasks)

# Display the heatmap
show(heatmap)
# Export the figure as SVG.
# NOTE: export_svgs needs a browser plus webdriver (firefox+geckodriver or
# chromium+chromedriver) on PATH.
from bokeh.io import export_svgs
heatmap.output_backend = "svg"
export_svgs(heatmap, filename="b.svg")

In [None]:
# Column labels (axis 1) of the clustered, filtered heatmap frame
heatmap_df_filtered_sorted.axes[1]

In [None]:
# Generate the heatmap
heatmap = create_heatmap_T(heatmap_df, original_scores, selected_rows=["propaganda_emoce","subjectivity","sentiment_fb","sentiment_mall","sentiment_csfd"],height=300)

# Display the heatmap
show(heatmap)

In [None]:
# NOTE(review): looks like leftover debugging - this displays the `.loc`
# indexer object itself, not any data from the frame.
heatmap_df.loc

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

# Cluster the models (rows) and draw the resulting dendrogram.
#
# The distances are computed on the original scores, linearly scaled to <0,1>,
# with max-transformed perplexities - not on the quantile-transformed heatmap
# values, because the quantile transform distorts correlations and distances
# within and across features. `matrix_data` holds exactly those scores; it is
# aligned to the row order of heatmap_df so the leaf labels match.
# (Min-max scaling heatmap_df, as done before, changed nothing: its columns
# already span [0, 1] after the quantile transform.)
linear_scores = MinMaxScaler().fit_transform(matrix_data.loc[heatmap_df.index])

# 1. Compute the pairwise cosine distance between models
cosine_dist_matrix = pdist(linear_scores, metric='cosine')

# 2. Perform hierarchical clustering on the distance matrix
# Using 'average' linkage method for clustering
Z = linkage(cosine_dist_matrix, method='average')


# The leaves are models, so the title and axis label say so
plt.figure(figsize=(10, 8))
dendrogram(Z, labels=heatmap_df.index.tolist(), leaf_rotation=90, leaf_font_size=10)
plt.title('Dendrogram of Model Clustering')
plt.xlabel('Models')
plt.ylabel('Distance')
# plt.yscale('log')
plt.show()


In [None]:
import umap
from scipy.spatial.distance import pdist, squareform
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

# 1. Compute Cosine Distance between models (rows of heatmap_df)
cosine_dist_matrix = pdist(heatmap_df, metric='cosine')

# UMAP with metric='precomputed' expects a square distance matrix,
# so expand the condensed pdist output
cosine_dist_matrix_square = squareform(cosine_dist_matrix)

# 2. Apply UMAP for dimensionality reduction (fixed seed for reproducibility)
umap_model = umap.UMAP(metric='precomputed', random_state=42)
umap_embedding = umap_model.fit_transform(cosine_dist_matrix_square)

# 3. Prepare data for Bokeh plot
source = ColumnDataSource(data={
    'x': umap_embedding[:, 0],
    'y': umap_embedding[:, 1],
    'model_name': heatmap_df.index,  # Each model's name
})

# 4. Create the Bokeh plot
p = figure(title="UMAP Projection of Models", width=800, height=600,
           tools="pan,wheel_zoom,box_zoom,reset,save", 
           x_axis_label='UMAP Dimension 1', y_axis_label='UMAP Dimension 2')

# Add points to the plot. scatter() draws circle markers by default and is the
# supported way to give the size in screen units; circle(size=...) is
# deprecated in recent Bokeh releases.
p.scatter('x', 'y', size=10, source=source, fill_alpha=0.6, line_color='black')

# 5. Add hover tooltips
hover = HoverTool()
hover.tooltips = [
    ("Model", "@model_name"),
    ("UMAP Dimension 1", "@x"),
    ("UMAP Dimension 2", "@y")
]
p.add_tools(hover)

# 6. Display the plot (currently disabled)
#show(p)


In [None]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool

# 1. Apply t-SNE for dimensionality reduction on the models (rows of heatmap_df)
n_models = heatmap_df.shape[0]
# Perplexity of 3, capped at n_models - 1 so it stays below the number of samples
perplexity_value = min(3, n_models - 1)
tsne_model = TSNE(n_components=2, metric='cosine', perplexity=perplexity_value, random_state=42)
tsne_embedding = tsne_model.fit_transform(heatmap_df)

# 2. Prepare data for Bokeh plot
source = ColumnDataSource(data={
    'x': tsne_embedding[:, 0],
    'y': tsne_embedding[:, 1],
    'model_name': heatmap_df.index,  # Each model's name
})

# 3. Create the Bokeh plot
p = figure(title="t-SNE Projection of Models", width=800, height=600,
           tools="pan,wheel_zoom,box_zoom,reset,save", 
           x_axis_label='t-SNE Dimension 1', y_axis_label='t-SNE Dimension 2')

# Add points to the plot. scatter() draws circle markers by default and is the
# supported way to give the size in screen units; circle(size=...) is
# deprecated in recent Bokeh releases.
p.scatter('x', 'y', size=10, source=source, fill_alpha=0.6, line_color='black')

# 4. Add hover tooltips
hover = HoverTool()
hover.tooltips = [
    ("Model", "@model_name"),
    ("t-SNE Dimension 1", "@x"),
    ("t-SNE Dimension 2", "@y")
]
p.add_tools(hover)

# 5. Display the plot
show(p)


In [None]:
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list, dendrogram
from sklearn.metrics.pairwise import cosine_distances

# For every task category: cluster the models on that category's tasks only,
# show the reordered heatmap and draw the dendrogram.
# NOTE: relies on heatmap_df / original_scores already having the
# 'benczechmark_' prefix stripped from their columns (done in place by
# create_heatmap / create_heatmap_T in earlier cells).

# List of unique categories
categories = sorted(set(task_to_category.values()))

for category in categories:
    # Task columns that belong to the current category (metadata keys carry the prefix)
    tasks_in_category = [task for task in heatmap_df.columns if task_to_category.get("benczechmark_"+task) == category]

    # A category without any task column points at a metadata/CSV mismatch, so fail loudly
    if len(tasks_in_category) == 0:
        raise ValueError(f"No tasks in category {category}!")

    # Restrict both frames to the current category's tasks
    category_heatmap_df = heatmap_df[tasks_in_category]
    category_original_scores = original_scores[tasks_in_category]

    # 1. Compute the pairwise cosine distance between models (the rows)
    cosine_dist_matrix = pdist(category_heatmap_df, metric='cosine')

    # 2. Perform hierarchical clustering on the distance matrix
    Z = linkage(cosine_dist_matrix, method='average')

    # 3. Get the order of rows (models) based on the clustering
    sorted_index = leaves_list(Z)

    # dendrogram() expects labels[i] to name the i-th ORIGINAL observation, so
    # capture the labels in the order that was clustered, before reordering.
    model_labels = category_heatmap_df.index.tolist()

    # Reorder the DataFrames according to the clustering result
    category_heatmap_df = category_heatmap_df.iloc[sorted_index,:]
    category_original_scores = category_original_scores.iloc[sorted_index, :]

    # Generate the heatmap for the current category
    heatmap = create_heatmap(category_heatmap_df, category_original_scores)
    show(heatmap)

    # Plot the dendrogram for the current category (leaves are models)
    plt.figure(figsize=(10, 8))
    dendrogram(Z, labels=model_labels, leaf_rotation=90, leaf_font_size=10)
    plt.title(f'Dendrogram of {category} Model Clustering')
    plt.xlabel('Models')
    plt.ylabel('Distance (Cosine)')
    plt.yscale('log')
    #plt.show()
