In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import json
import numpy as np
import re


In [None]:
# Directory scanned by get_df for per-model accuracy JSON files.
ACC_DIR = Path("/Workspace/Users/rkoopal@deloitte.nl/accuracies")
# Default output directory for figures (used as plot_kiviat's image_dir and in save_path arguments).
IMAGE_DIR = Path("/Workspace/Users/rkoopal@deloitte.nl/images")

# NOTE(review): TASKS and OVERWRITE are not referenced by any visible cell — confirm they are still needed.
TASKS = ['sib200', 'xnli', 'wikiann']
OVERWRITE = True

In [None]:
def get_df(
    task,
    data_dir = ACC_DIR,
    overwrite = True,
    df = None
):
    """Build a models x languages accuracy table for one task.

    Every regular file in ``data_dir`` is parsed as JSON. The model name is the
    file stem with ``"_accuracies"`` removed, and the file's ``task`` entry is
    read as a mapping ``language -> {"accuracy": value}``.

    Parameters:
    - task: key looked up in each JSON file; files without it are skipped.
    - data_dir: directory (``pathlib.Path``) to scan.
    - overwrite: if False, models already present in ``df.index`` are left untouched.
    - df: optional existing DataFrame to extend in place; a new one is created when None.

    Returns:
    - The DataFrame, with ``strip_name`` applied to every index label.
    """
    # `not df` raises ValueError for any DataFrame ("truth value is ambiguous"),
    # so the optional argument must be compared against None explicitly.
    if df is None:
        df = pd.DataFrame()

    # iterdir() yields entries in arbitrary order; sort for a reproducible row order.
    for file in sorted(data_dir.iterdir()):
        if not file.is_file():
            continue
        with open(file, 'r', encoding='utf-8') as f:
            accuracies = json.load(f)
        model = file.stem.replace("_accuracies", "")
        task_scores = accuracies.get(task)
        if task_scores is None:
            continue
        # NOTE(review): `model` is the un-stripped name, while a DataFrame returned by
        # an earlier call carries stripped labels — confirm the overwrite check is
        # meant to compare against raw names.
        if overwrite or model not in df.index:
            for lang, scores in task_scores.items():
                df.loc[model, lang] = scores['accuracy']

    df.index = df.index.map(strip_name)

    return df

In [None]:
import matplotlib.lines as mlines
def extract_languages(model_name):
    """Return at most two language codes found in ``model_name``.

    A code is two lowercase letters preceded by '_' and followed by '-'.
    Codes are returned in order of appearance.
    """
    codes = []
    for found in re.finditer(r'_([a-z]{2})-', model_name):
        codes.append(found.group(1))
        if len(codes) == 2:
            # Only the first two occurrences are of interest.
            break
    return codes

def plot_kiviat(
    df,
    lower_bound = 0,
    upper_bound = 1,
    title = "title",
    fill = False,
    colors = None,
    dots = True,
    image_dir = IMAGE_DIR,
    name = None
):
    """Draw a Kiviat (radar) chart with one axis per column and one line per row of ``df``.

    Parameters:
    - df: DataFrame; columns become the angular axes, each row becomes one closed line.
    - lower_bound, upper_bound: radial axis limits.
    - title: figure title.
    - fill: if True, the area under each line is shaded.
    - colors: None (viridis), a matplotlib colormap name, or a sequence of colours
      (cycled if shorter than the number of rows).
    - dots: if True, a red/blue marker is drawn where a row label equals the
      first/second language code extracted from a column name.
    - image_dir, name: when ``name`` is given the figure is saved to ``image_dir / name``.
    """
    categories = list(df.columns)
    n_axes = len(categories)

    # One angle per column; the first angle is repeated to close each polygon.
    angles = [n / float(n_axes) * 2 * np.pi for n in range(n_axes)]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(8, 10), subplot_kw=dict(polar=True))

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(categories)
    ax.set_ylim(lower_bound, upper_bound)

    # One colour per ROW (each row is one plotted line). Previously a supplied
    # `colors` value was ignored and the default palette was sized by columns,
    # which raised IndexError whenever df had more rows than columns.
    n_rows = len(df)
    if colors is None:
        palette = plt.cm.viridis(np.linspace(0, 1, n_rows))
    elif isinstance(colors, str):
        palette = plt.get_cmap(colors)(np.linspace(0, 1, n_rows))
    else:
        palette = list(colors)

    # Tick-label alignment depends only on the angle, so it is set once
    # instead of once per plotted row.
    for label, angle_rad in zip(ax.get_xticklabels(), angles[:-1]):
        if angle_rad <= np.pi / 2 or angle_rad >= 3 * np.pi / 2:
            label.set_horizontalalignment("left")
        else:
            label.set_horizontalalignment("right")
        label.set_verticalalignment("bottom" if angle_rad < np.pi else "top")

    for i, (idx, row) in enumerate(df.iterrows()):
        values = row.tolist()
        values += values[:1]  # close the loop
        line_color = palette[i % len(palette)]
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=idx, color=line_color)
        if fill:
            ax.fill(angles, values, alpha=0.1, color=line_color)

        if dots:
            for cat_index, cat in enumerate(categories):
                # zip() copes with column names that yield fewer than two codes.
                for lang, dot_color in zip(extract_languages(cat), ('red', 'blue')):
                    if idx == lang:
                        ax.plot(angles[cat_index], values[cat_index], 'o', color=dot_color, markersize=4)

    ax.set_title(title, size=15, color='black', y=1.1)
    fig.tight_layout()

    handles, labels = ax.get_legend_handles_labels()

    if dots:
        # Proxy artists so the red/blue markers show up in the legend.
        handles.extend([
            mlines.Line2D([], [], color='red', marker='o', linestyle='None', markersize=4, label='Language task 1'),
            mlines.Line2D([], [], color='blue', marker='o', linestyle='None', markersize=4, label='Language task 2'),
        ])

    ax.legend(handles=handles, loc='upper right', bbox_to_anchor=(-0.1, 0))

    if name:
        fig.savefig(image_dir / name, bbox_inches='tight')



In [None]:
# def plot_bar_charts(dataframes, labels, average_axis=1, xlabel='Languages', ylabel='Accuracy', title='Title', squish=1):
#     """
#     Plots comparison bar charts of multiple dataframes with customization options.
#     Bars are grouped together based on the 'squish' parameter, and each dataframe is represented by a consistent color.
#     A small gap is maintained between each group.

#     Parameters:
#         dataframes (list of pd.DataFrame): List of dataframes to plot.
#         labels (list of str): Labels for each dataframe, used for the legend.
#         average_axis (int): Axis to compute the average (0 for models, 1 for languages).
#         xlabel (str): Label for the x-axis.
#         ylabel (str): Label for the y-axis.
#         title (str): Title of the plot.
#         squish (int): Number of models to group together.
#     """
#     if len(dataframes) != len(labels):
#         raise ValueError("Each dataframe must have a corresponding label.")

#     # Combine all labels from the dataframes
#     all_labels = pd.Index([])
#     for df in dataframes:
#         all_labels = all_labels.union(df.columns if average_axis == 0 else df.index)

#     # Calculate the average for each dataframe and reindex to include all labels
#     averages = [df.mean(axis=average_axis).reindex(all_labels, fill_value=0) for df in dataframes]

#     # Creating a figure and a set of subplots
#     fig, ax = plt.subplots(figsize=(12, 8))

#     # The total number of groups
#     num_groups = (len(all_labels) + squish - 1) // squish

#     # Set up bar width and base indices
#     base_width = 0.8 / squish
#     group_spacing = 0.1  # Additional spacing between groups
#     indices = np.arange(0, num_groups * squish * (base_width + group_spacing), squish * (base_width + group_spacing))

#     # Plotting each dataframe
#     for i, (avg, label) in enumerate(zip(averages, labels)):
#         for j in range(0, len(all_labels), squish):
#             group_labels = all_labels[j:j+squish]
#             group_values = avg[j:j+squish]
#             group_index = indices[j // squish] + i * base_width
#             ax.bar(group_index, group_values, base_width, alpha=0.8, label=label if j == 0 else "")

#     # Adjusting x-ticks to align with the middle of groups
#     ax.set_xticks(indices + (base_width * len(dataframes) / 2) - (base_width / 2))
#     ax.set_xticklabels([','.join(all_labels[i:i+squish]) for i in range(0, len(all_labels), squish)], rotation=45, ha='right')

#     # Setting labels and title
#     ax.set_xlabel(xlabel)
#     ax.set_ylabel(ylabel)
#     ax.set_title(title)
#     ax.legend(handles=[ax.patches[i] for i in range(len(dataframes))], labels=labels)

#     plt.tight_layout()

#     # Dynamic y-axis limits based on data
#     max_y_value = max([avg.max() for avg in averages])
#     ax.set_ylim([0, max_y_value * 1.1])

#     plt.show()

# # Example usage as previously defined


# def plot_bar_charts(dataframes, labels, average_axis=1, xlabel='Languages', ylabel='Accuracy', title='Title', squish=1):
#     """
#     Plots comparison bar charts of multiple dataframes with customization options.
#     Bars are grouped together based on the 'squish' parameter, ensuring even group distribution.

#     Parameters:
#         dataframes (list of pd.DataFrame): List of dataframes to plot.
#         labels (list of str): Labels for each dataframe, used for the legend.
#         average_axis (int): Axis to compute the average (0 for models, 1 for languages).
#         xlabel (str): Label for the x-axis.
#         ylabel (str): Label for the y-axis.
#         title (str): Title of the plot.
#         squish (int): Number of models to group together.
#     """
#     if len(dataframes) != len(labels):
#         raise ValueError("Each dataframe must have a corresponding label.")

#     # Combine all labels from the dataframes
#     all_labels = pd.Index([])
#     for df in dataframes:
#         all_labels = all_labels.union(df.columns if average_axis == 0 else df.index)

#     # Calculate the average for each dataframe and reindex to include all labels
#     averages = [df.mean(axis=average_axis).reindex(all_labels, fill_value=0) for df in dataframes]

#     # Creating a figure and a set of subplots
#     fig, ax = plt.subplots(figsize=(12, 8))

#     # The total number of groups
#     num_groups = (len(all_labels) + squish - 1) // squish  # This handles incomplete groups

#     # Set up bar width and base indices
#     base_width = 0.5 / squish
#     indices = np.arange(0, num_groups * squish, squish)  # Indices for each group start

#     # Plotting each dataframe
#     for i, avg in enumerate(averages):
#         # Compute individual bar positions
#         for j in range(0, len(all_labels), squish):
#             group_labels = all_labels[j:j+squish]
#             group_values = avg[j:j+squish]
#             group_index = indices[j // squish] + i * base_width
#             ax.bar(group_index, group_values, base_width, alpha=0.8, label=labels[i] if j == 0 else "")

#     # Adjusting x-ticks to align with the middle of groups
#     ax.set_xticks(indices + base_width * len(dataframes) / 2 - base_width / 2)
#     ax.set_xticklabels([','.join(all_labels[i:i+squish]) for i in range(0, len(all_labels), squish)], rotation=45, ha='right')

#     # Setting labels and title
#     ax.set_xlabel(xlabel)
#     ax.set_ylabel(ylabel)
#     ax.set_title(title)
#     ax.legend()

#     plt.tight_layout()

#     # Dynamic y-axis limits based on data
#     max_y_value = max([avg.max() for avg in averages])
#     ax.set_ylim([0, max_y_value * 1.1])

#     plt.show()


In [None]:
def create_bar_chart(dfs, df_names, agg_func=None, title='Grouped Bar Chart', ylabel='Values', xlabel='Categories', rotate_xticks=False):
    """
    Draw a grouped bar chart with one bar series per input DataFrame and show it.

    Parameters:
    - dfs: List of pandas DataFrames; the first one defines the groups (its index).
    - df_names: List of legend names, one per DataFrame.
    - agg_func: Optional aggregation passed to ``DataFrame.agg(..., axis=0)``. When it
      reduces a frame to a Series, the Series values are plotted without error bars.
    - title: Title of the plot.
    - ylabel: Y-axis label.
    - xlabel: X-axis label.
    - rotate_xticks: If True, x tick labels are rotated 45 degrees and right-aligned.

    Returns:
    - None; the figure is displayed with ``plt.show()``.
    """
    if agg_func is not None:
        dfs = [df.agg(agg_func, axis=0) for df in dfs]

    n_groups = dfs[0].shape[0]
    n_dfs = len(dfs)
    bar_width = 0.8 / n_dfs
    index = np.arange(n_groups)

    fig, ax = plt.subplots()
    for i, df in enumerate(dfs):
        if isinstance(df, pd.Series):
            # An aggregated frame is already one value per group; a Series has
            # no axis=1, so mean/std across columns cannot be taken.
            heights, errors = df, None
        else:
            heights, errors = df.mean(axis=1), df.std(axis=1)
        ax.bar(index + i * bar_width, heights, bar_width, yerr=errors, label=df_names[i])

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    # Centre each tick under its group of bars.
    ax.set_xticks(index + bar_width * (n_dfs - 1) / 2)
    if rotate_xticks:
        ax.set_xticklabels(dfs[0].index, rotation=45, ha='right')
    else:
        ax.set_xticklabels(dfs[0].index)
    ax.legend()

    plt.show()

In [None]:
def strip_name(
    name,
    model=True,
    ft = True,
    merging=False    
):
    """Shorten a model name by removing selected fragments.

    - model: rewrite ``"mt0-large--"`` to ``"base--"`` and drop ``"mt0-large_"``.
    - ft: drop every ``"ft_"``.
    - merging: drop every ``"mono"`` and ``"pooling"``.

    Leading/trailing '-' and '_' are removed afterwards, then surrounding whitespace.
    """
    # Replacements are applied in this order; order matters for the model prefixes.
    substitutions = []
    if model:
        substitutions += [("mt0-large--", "base--"), ("mt0-large_", "")]
    if ft:
        substitutions.append(("ft_", ""))
    if merging:
        substitutions += [("mono", ""), ("pooling", "")]

    for old, new in substitutions:
        name = name.replace(old, new)

    return name.strip("-_").strip()

In [None]:
def filter_models(
    df,
    terms,   # terms to keep,
    contains = all,
    out=False
):
    """Select rows of ``df`` by substring matches on the index labels.

    Parameters:
    - df: DataFrame whose index labels support the ``in`` test (model-name strings).
    - terms: substrings to look for in each label.
    - contains: reducer over the per-term matches; ``all`` requires every term,
      ``any`` requires at least one.
    - out: if True, matching rows are removed instead of kept.

    Returns:
    - The filtered DataFrame (same columns, original row order).
    """
    # Plain boolean array: also works for an empty index, where a Series.apply
    # result would not be recognised as a boolean mask.
    matches = np.array(
        [bool(contains(term in idx for term in terms)) for idx in df.index],
        dtype=bool,
    )
    if out:
        matches = ~matches
    return df[matches]

In [None]:
def method_comparison_data(
    dfs,
    tasks,
    languages = ['ar', 'de', 'el', 'es'],
    merging = ['mono'],
):
    """Collect base / base-FT / FT-FT / FT scores per (task, language) domain.

    Parameters:
    - dfs: two accuracy DataFrames, aligned with ``tasks`` (rows = model names,
      columns = languages).
    - tasks: exactly two task names.
    - languages: languages to report.
    - merging: merge-method suffixes used in the model names; when several are
      given, later ones overwrite the cells written for earlier ones.

    Returns:
    - DataFrame with rows ``base``, ``base-FT``, ``FT-FT``, ``FT`` and one
      column per ``"<task>_<language>"``.

    Raises:
    - AssertionError if ``tasks`` does not hold exactly two names.
    - KeyError if an expected model name or language is missing from a DataFrame.
    """
    # Both task names are needed to build the FT-FT model name, so fewer than
    # two tasks cannot work (the old `< 3` guard let that through to an IndexError).
    assert len(tasks) == 2, "exactly two tasks are required"
    task0, task1 = tasks

    data = pd.DataFrame()

    for task, df in zip(tasks, dfs):
        for merge in merging:
            for language in languages:
                domain_name = "_".join([task, language])

                data.loc["base", domain_name] = df.loc["base", language]
                data.loc["base-FT", domain_name] = df.loc[f"base--{task}_{language}-{merge}", language]
                data.loc["FT-FT", domain_name] = df.loc[f"{task0}_{language}--{task1}_{language}-{merge}", language]
                data.loc["FT", domain_name] = df.loc[f"{task}_{language}", language]

    return data

In [None]:
from itertools import combinations

# Scratch check: list every unordered pair that itertools.combinations draws from `test`.
test = [1, 2, 3, 4]
[t for t in combinations(test, 2)]

In [None]:
from itertools import combinations_with_replacement, combinations

import numpy as np
from itertools import combinations_with_replacement

def get_CLA(df, task0, task1, merge, language, languages=['ar', 'de', 'el', 'es']):
    """
    Average the ``language`` column over merged models in which exactly one of the
    two fine-tuning languages equals ``language``.

    Model rows are looked up as ``"{task0}_{lang1}--{task1}_{lang2}-{merge}"`` for
    every pair produced by ``combinations_with_replacement(languages, 2)``.

    Parameters:
    - df (DataFrame): accuracy table (rows = model names, columns = languages).
    - task0 (str): The first task identifier.
    - task1 (str): The second task identifier.
    - merge (str): The merge strategy suffix.
    - language (str): The evaluation language.
    - languages (list): Languages from which the pairs are drawn.

    Returns:
    - float: mean of the selected values, or NaN (after printing a message)
      when any required row/column is missing.
    """
    try:
        values = []
        for lang1, lang2 in combinations_with_replacement(languages, 2):
            # Keep the pair only when exactly one side is the evaluation language.
            if (lang1 == language) != (lang2 == language):
                row_label = f"{task0}_{lang1}--{task1}_{lang2}-{merge}"
                values.append(df.loc[row_label, language])
        return np.mean(values)
    except KeyError:
        print(f"One or more keys not found in DataFrame for language {language}")
        return np.nan
    
def method_comparison_data_CLA(
    dfs,
    tasks,
    languages = ['ar', 'de', 'el', 'es'],
    merging = ['mono'],
):
    """Collect method-comparison scores per (task, language) domain, including
    within-language (WL), within-task (WT) and cross-language (CLA) FT-FT variants.

    Parameters:
    - dfs: two accuracy DataFrames, aligned with ``tasks`` (rows = model names,
      columns = languages).
    - tasks: exactly two task names.
    - languages: languages to report; also used to build the WT and CLA model names.
    - merging: merge-method suffixes used in the model names; when several are
      given, later ones overwrite the cells written for earlier ones.

    Returns:
    - DataFrame with rows ``base``, ``base-FT``, ``FT-FT (WL)``, ``FT-FT (WT)``,
      ``FT-FT (CLA)``, ``FT`` and one column per ``"<task>_<language>"``.

    Raises:
    - AssertionError if ``tasks`` does not hold exactly two names.
    - KeyError if an expected model name or language is missing (the CLA row is
      NaN instead, because ``get_CLA`` handles its own missing keys).
    """
    # Both task names are needed to build the FT-FT model names, so fewer than
    # two tasks cannot work (the old `< 3` guard let that through to an IndexError).
    assert len(tasks) == 2, "exactly two tasks are required"
    task0, task1 = tasks

    data = pd.DataFrame()

    for task, df in zip(tasks, dfs):
        for merge in merging:
            for language in languages:
                domain_name = "_".join([task, language])

                data.loc["base", domain_name] = df.loc["base", language]

                data.loc["base-FT", domain_name] = df.loc[f"base--{task}_{language}-{merge}", language]

                data.loc["FT-FT (WL)", domain_name] = df.loc[f"{task0}_{language}--{task1}_{language}-{merge}", language]

                # Mean over every same-task model merged from two different languages.
                data.loc["FT-FT (WT)", domain_name] = np.mean([
                    df.loc[f"{task}_{lang1}--{task}_{lang2}-{merge}", language]
                    for lang1, lang2 in combinations(languages, 2)
                ])

                # Forward `languages` so a custom language list is honoured
                # (get_CLA otherwise falls back to its own default list).
                data.loc["FT-FT (CLA)", domain_name] = get_CLA(
                    df, task0=task0, task1=task1, merge=merge, language=language, languages=languages
                )

                data.loc["FT", domain_name] = df.loc[f"{task}_{language}", language]

    return data

In [None]:
from itertools import combinations
# Scratch example of the domain partition that target_switch_data builds per model:
# A/B are the two tasks, X/Y the language each task was fine-tuned on.
A = 'sib200'
B = 'xnli'
X = 'de'
Y = 'de'
langs = ['ar', 'de', 'el', 'es', 'fr', 'ru']
# Every (task, language) combination.
domains = [(task, lang) for task in (A, B) for lang in langs]
# Domains matching a fine-tuned (task, language) pair.
ft_domains = [f"{A}_{X}", f"{B}_{Y}"]
# Domains with the languages swapped between the tasks (identical to ft_domains here because X == Y).
mix_domains = [f"{A}_{Y}", f"{B}_{X}"]
# Everything that is neither a fine-tuned nor a swapped domain.
non_domains = [f"{t}_{l}" for t, l in domains if f"{t}_{l}" not in ft_domains and f"{t}_{l}" not in mix_domains]

In [None]:
# Display the swapped-domain names built in the previous cell.
mix_domains

In [None]:
import re

def extract_domains(name):
    """Split a merged-model name of the form ``<task0>_<xx>--<task1>_<yy>-...``.

    Returns ``(task0, task1, lang0, lang1)`` where the languages are the
    two-character codes, or ``None`` when the name does not match.
    """
    found = re.match(r'(\w+)_(\w{2})--(\w+)_(\w{2})-.*', name)
    if found is None:
        return None
    # Groups arrive as task0, lang0, task1, lang1; callers expect tasks first.
    first_task, first_lang, second_task, second_lang = found.groups()
    return first_task, second_task, first_lang, second_lang
    
def normalize_df(df):
    """Min-max scale every column of ``df`` in place and return the same object.

    A column whose maximum equals its minimum is set to 0.
    """
    for col in df.columns:
        lowest = df[col].min()
        highest = df[col].max()
        if (highest - lowest) != 0:
            df[col] = (df[col] - lowest) / (highest - lowest)
        else:
            # Constant column: avoid dividing by zero.
            df[col] = 0
    return df

def target_switch_data(
    dfs,
    tasks,
    languages = ['ar', 'de', 'el', 'es'],
    merge = ['mono'],
):
    """Compare merged (FT-FT) models on target, switched and remaining domains.

    For every model in ``dfs[0]`` whose name contains all of ``tasks`` and
    ``merge``, the name is parsed into tasks (A, B) and languages (X, Y). Base,
    FT and FT-FT scores are gathered for every (task, language) domain, min-max
    normalised per domain, and averaged over three groups:
    target (``A_X``, ``B_Y``), switched (``A_Y``, ``B_X``) and all other domains.

    Parameters:
    - dfs: two accuracy DataFrames; ``dfs[0]`` is used for the first task parsed
      from the model name, ``dfs[1]`` for the other one.
    - tasks: at most two task names used to select the models.
    - languages: languages that span the domains.
    - merge: merge-method terms that must also appear in the model name.

    Returns:
    - DataFrame indexed by model name with ``target-*``, ``switch-*`` and
      ``non-*`` columns for ``base``, ``FT`` and ``FT-FT``.

    Raises:
    - ValueError if a selected model name cannot be parsed.
    - KeyError if a required row/column is missing from the score tables.
    """
    assert len(tasks) < 3, "tasks cannot be above 2"

    data = pd.DataFrame()

    # Select the models from the supplied table rather than from notebook
    # globals, and sort them so the row order does not depend on set hashing.
    selected = filter_models(dfs[0], terms=list(tasks) + list(merge))
    model_names = sorted(set(selected.index))

    for model in model_names:
        parsed = extract_domains(model)
        if parsed is None:
            raise ValueError(f"Cannot parse tasks/languages from model name {model!r}")
        A, B, X, Y = parsed

        domains = [(task, lang) for task in (A, B) for lang in languages]
        ft_domains = [f"{A}_{X}", f"{B}_{Y}"]
        mix_domains = [f"{A}_{Y}", f"{B}_{X}"]
        non_domains = [f"{t}_{l}" for t, l in domains if f"{t}_{l}" not in ft_domains and f"{t}_{l}" not in mix_domains]

        tmp_df = pd.DataFrame()
        for task, lang in domains:
            df = dfs[0] if task == A else dfs[1]
            tmp_df.loc["base", f"{task}_{lang}"] = df.loc["base", lang]
            tmp_df.loc["FT", f"{task}_{lang}"] = df.loc[f"{task}_{lang}", lang]
            tmp_df.loc["FT-FT", f"{task}_{lang}"] = df.loc[model, lang]

        # Normalise each domain so base/FT/FT-FT are comparable across domains.
        normalized = normalize_df(tmp_df)

        group_means = {
            'target': normalized[ft_domains].mean(axis=1),
            'switch': normalized[mix_domains].mean(axis=1),
            'non': normalized[non_domains].mean(axis=1),
        }
        for group, means in group_means.items():
            for method in ('base', 'FT', 'FT-FT'):
                data.loc[model, f"{group}-{method}"] = means[method]

    return data

In [None]:
def plot_bar_split(
    df,
    groups=[['D_ft-base', 'D_ft-FT', 'D_ft-FT-FT'], ['D_mix-base', 'D_mix-FT', 'D_mix-FT-FT']]
):
    """Show two side-by-side bar charts, one per column group of ``df``.

    ``groups[0]`` is drawn in the left panel (titled 'D_ft Columns') and
    ``groups[1]`` in the right panel (titled 'D_mix Columns'); x ticks are the
    index labels of ``df`` rotated by 90 degrees.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

    for position, panel_title in enumerate(('D_ft Columns', 'D_mix Columns')):
        panel = axes[position]
        df[groups[position]].plot(kind='bar', ax=panel, title=panel_title)
        panel.set_ylabel('Values')
        panel.set_xticklabels(df.index, rotation=90)

    plt.tight_layout()
    plt.show()

In [None]:
def plot_data(df, show_averages=False, title=None, xlabel='Models', ylabel='Values', xticks_rotation=45, ylim=None, save_path=None, color_map='YlGnBu'):
    """
    Plot ``df`` as a bar chart, optionally save it, and show it.

    Args:
    df (pd.DataFrame): DataFrame containing the data to be plotted.
    show_averages (bool): If True, plots one bar per column holding the column mean.
        If False, plots one group per row with one bar per column.
    title (str): Title of the plot.
    xlabel (str): Label for the x-axis.
    ylabel (str): Label for the y-axis.
    xticks_rotation (int): Degrees of rotation for the x-tick labels.
    ylim (tuple, optional): A tuple (ymin, ymax) setting the limits of the y-axis;
        ignored when falsy (None or an empty tuple).
    save_path (str, optional): Path to save the figure. If None, the figure is not saved.
    color_map (str): Name of the matplotlib colormap; one colour is sampled per column.
    """
    cmap = plt.get_cmap(color_map)
    colors = cmap(np.linspace(0, 1, len(df.columns)))

    fig, ax = plt.subplots(figsize=(12, 8))
    if show_averages:
        df.mean().plot(kind='bar', ax=ax, color=colors)
        tick_labels = df.columns
    else:
        df.plot(kind='bar', ax=ax, color=colors)
        tick_labels = df.index
    ax.set_title(title)
    ax.set_xticklabels(tick_labels, rotation=xticks_rotation)

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if ylim:
        ax.set_ylim(ylim)

    # A single layout pass: the former tight_layout(rect=[0, 0, 1.2, 1.2]) call
    # was immediately overridden by a plain tight_layout() and had no effect.
    fig.tight_layout()

    if save_path:
        fig.savefig(save_path)
        print(f"Figure saved to {save_path}")

    plt.show()

In [None]:
def get_ft_langs_data(
    dfs,
    tasks,
    langs = ['ar', 'de', 'el', 'es', 'fr', 'ru']
):
    """Average target-language vs. other-language scores per task.

    For each row label, the target language is the text after the first '_'
    (up to the next '_') with ``"-mono"`` removed. The row's score in that
    column is the target value; the mean over the remaining ``langs`` is the
    non-target value. Both are printed per row and averaged per task.

    Returns a DataFrame indexed by task with columns 'Target languages' and
    'Other languages'.
    """
    data = pd.DataFrame()
    for task, df in zip(tasks, dfs):
        on_target = []
        off_target = []
        for model in df.index:
            target_lang = str(model).split("_")[1].replace("-mono", "")
            target_value = df.loc[model, target_lang]
            remaining = [df.loc[model, lang] for lang in langs if lang != target_lang]
            on_target.append(target_value)
            off_target.append(np.mean(remaining))
            print(f"For {task} - {target_lang} we have target values: {target_value}")
            print(f"For {task} - {target_lang} we have non-target values: {remaining}")
        data.loc[task, 'Target languages'] = np.mean(on_target)
        data.loc[task, 'Other languages'] = np.mean(off_target)

    return data


def get_cross_langs_data(
    dfs,
    tasks,
    langs = ['ar', 'de', 'el', 'es', 'fr', 'ru']
):
    """Average target-language vs. other-language scores for merged models.

    Each row label is parsed with ``extract_domains`` to obtain the two
    fine-tuning languages. For each of them the row's score in that column is
    recorded as a target value, together with the mean over all ``langs`` that
    are neither of the two target languages.

    Returns a DataFrame indexed by task with columns 'Target languages' and
    'Other languages'.

    Raises:
    - ValueError if a row label cannot be parsed.
    - KeyError if a required language column is missing.
    """
    data = pd.DataFrame()
    for task, df in zip(tasks, dfs):
        target_lang_scores = []
        other_lang_scores = []
        for model in df.index:
            parsed = extract_domains(model)
            if parsed is None:
                # Unpacking None would raise an opaque TypeError.
                raise ValueError(f"Cannot parse tasks/languages from model name {model!r}")
            _, _, tl1, tl2 = parsed
            target_langs = [tl1, tl2]

            # The same non-target set applies to both target languages.
            non_target_values = [df.loc[model, lang] for lang in langs if lang not in target_langs]
            non_target_mean = np.mean(non_target_values)

            for target_lang in target_langs:
                target_value = df.loc[model, target_lang]
                target_lang_scores.append(target_value)
                other_lang_scores.append(non_target_mean)
                print(f"For {task} - {target_lang} we have target values: {target_value}")
                # Report exactly the values that are averaged (both target languages excluded).
                print(f"For {task} - {target_lang} we have non-target values: {non_target_values}")
        data.loc[task, 'Target languages'] = np.mean(target_lang_scores)
        data.loc[task, 'Other languages'] = np.mean(other_lang_scores)

    return data

# Analysis

### Load the per-task accuracy DataFrames

In [None]:
# Per-task accuracy tables (rows = models, columns = languages), built by get_df from the files in ACC_DIR.
xnli_df = get_df('xnli')
wiki_df = get_df('wikiann')
sib_df = get_df('sib200')


In [None]:
# Drop (out=True) every model whose name contains any of the listed terms, per task table.
# NOTE(review): the term lists differ between tasks (e.g. 'mt0-large' only for xnli) — confirm this is intended.
ft_sib = filter_models(sib_df, terms=['base', 'mono', 'pooling', 'wikiann', 'xnli', 'experiment'], contains=any, out=True)
ft_xnli = filter_models(xnli_df, terms=['base', 'mono', 'pooling', 'experiment', 'wikiann', 'sib', 'mt0-large'], contains=any, out=True)
ft_wiki = filter_models(wiki_df, terms=['base', 'mono', 'pooling', 'experiment', 'xnli', 'sib'], contains=any, out=True)

In [None]:
# Keep only models whose name contains all three terms: 'base', the task term and 'mono'.
base_sib = filter_models(sib_df, terms=['base', 'sib', 'mono'], contains=all, out=False)
base_xnli = filter_models(xnli_df, terms=['base', 'xnli', 'mono'], contains=all, out=False)
base_wikiann = filter_models(wiki_df, terms=['base', 'wiki', 'mono'], contains=all, out=False)

In [None]:
# Net scores: subtract the 'base' row of the task table from every row (aligned on language columns).
ft_sib_net = ft_sib - sib_df.loc['base']
ft_xnli_net = ft_xnli - xnli_df.loc['base']
ft_wiki_net = ft_wiki - wiki_df.loc['base']

# Same subtraction for the base-merged model tables.
base_sib_net = base_sib - sib_df.loc['base']
base_xnli_net = base_xnli - xnli_df.loc['base']
base_wiki_net = base_wikiann - wiki_df.loc['base']

In [None]:
# Stack all six net tables row-wise.
# NOTE(review): `total` is not used by any later visible cell.
total = pd.concat([ft_sib_net, ft_xnli_net, ft_wiki_net, base_sib_net, base_xnli_net, base_wiki_net])

In [None]:
import pandas as pd

# Reduce each net table to a single row holding its per-language (column) mean.
# Inputs: ft_sib_net, ft_xnli_net, ft_wiki_net, base_sib_net, base_xnli_net, base_wiki_net.
# pd.concat below aligns them on their column labels.

# .mean() gives one value per column; .to_frame().T turns that Series into a one-row DataFrame.
ft_sib_net_mean = ft_sib_net.mean().to_frame().T
ft_xnli_net_mean = ft_xnli_net.mean().to_frame().T
ft_wiki_net_mean = ft_wiki_net.mean().to_frame().T
base_sib_net_mean = base_sib_net.mean().to_frame().T
base_xnli_net_mean = base_xnli_net.mean().to_frame().T
base_wiki_net_mean = base_wiki_net.mean().to_frame().T

# Row labels shown on the x-axis of the plot.
# NOTE(review): 'Base--FT_SIB-200' carries an 'FT_' that the other two 'Base--' labels lack — confirm.
ft_sib_net_mean.index = ['FT_SIB-200']
ft_xnli_net_mean.index = ['FT_XNLI']
ft_wiki_net_mean.index = ['FT_WikiANN']
base_sib_net_mean.index = ['Base--FT_SIB-200']
base_xnli_net_mean.index = ['Base--XNLI']
base_wiki_net_mean.index = ['Base--WikiANN']

# One row per model group, one column per language.
total_mean = pd.concat([ft_sib_net_mean, ft_xnli_net_mean, ft_wiki_net_mean, base_sib_net_mean, base_xnli_net_mean, base_wiki_net_mean])

# Plain-text dump of the table.
print(total_mean)


In [None]:
# Bar chart of the mean net differences (one group per row of total_mean), saved under IMAGE_DIR.
plot_data(total_mean, title="Accuracy for FT and base merged models presented as net difference with zero-shot scores.", xlabel="Model", ylabel="Accuracy (minus base score)", save_path= IMAGE_DIR / 'net_diff_ft_base.png')

In [None]:
# Two-step filter per task: keep names containing both 'mono' and the task term,
# then drop any name that also contains 'base' or one of the other task terms.
cl_sib = filter_models(filter_models(sib_df, terms=['mono', 'sib']), terms=['base', 'xnli', 'wiki'], contains=any, out=True)
cl_xnli = filter_models(filter_models(xnli_df, terms=['mono', 'xnli']), terms=['base', 'sib', 'wiki'], contains=any, out=True)
cl_wiki = filter_models(filter_models(wiki_df, terms=['mono', 'wiki']), terms=['base', 'sib', 'xnli'], contains=any, out=True)

In [None]:
# Target- vs. other-language averages per task for the base-merged, fine-tuned and merged-model tables.
base_data = get_ft_langs_data(dfs=[base_sib, base_xnli, base_wikiann], tasks=['sib200', 'xnli', 'wikiann'])
ft_data = get_ft_langs_data(dfs=[ft_sib, ft_xnli, ft_wiki], tasks=['sib200', 'xnli', 'wikiann'])
cl_data = get_cross_langs_data([cl_sib, cl_xnli, cl_wiki], tasks=['sib200', 'xnli', 'wikiann'])
# NOTE(review): identical call to the cl_data line above; ct_data is reassigned by a later cell — confirm this line is needed.
ct_data = get_cross_langs_data([cl_sib, cl_xnli, cl_wiki], tasks=['sib200', 'xnli', 'wikiann'])


### target vs other langs

In [None]:
# Tag the two columns of each table with its model group so they stay distinct after concatenation.
# NOTE(review): suffix capitalisation is inconsistent ('(ft)' vs '(FT)'); these strings become legend labels.
ft_data = ft_data.rename(columns={"Target languages": "Target languages (ft)", "Other languages": "Other languages (FT)"})
base_data = base_data.rename(columns={"Target languages": "Target languages (base-ft)", "Other languages": "Other languages (base-FT)"})
cl_data = cl_data.rename(columns={"Target languages": "Target languages (ft-WL)", "Other languages": "Other languages (FT-WL)"})


In [None]:
# Side-by-side (axis=1) combination: one row per task, six columns.
merged_data = pd.concat([ft_data, base_data, cl_data], axis=1)

In [None]:
# Output file for the figure; switch to the commented-out `name = None` to skip saving.
name = IMAGE_DIR / "base_base_ft-ft_wl-target-scores.png"
# name = None
plot_data(merged_data, xticks_rotation=0, ylim=(0, 1.1), ylabel='Accuracy', xlabel='Task', save_path=name, color_map='YlGnBu', title="Target vs non-target language performance")

In [None]:
# Restrict each task table to the models whose name contains that task's term.
# Each table is filtered from itself; wiki_df and sib_df were previously both
# derived from the already-filtered xnli_df (copy-paste slip).
xnli_df = filter_models(xnli_df, terms=['xnli'])
wiki_df = filter_models(wiki_df, terms=['wikiann'])
sib_df = filter_models(sib_df, terms=['sib'])

### Target vs switched domains

In [None]:
# Mono-merged models per task table. Each table is filtered from its own task
# DataFrame; all three were previously derived from sib_df (copy-paste slip).
sib_df_mono = filter_models(sib_df, terms=['mono', 'sib'])
xnli_df_mono = filter_models(xnli_df, terms=['mono', 'xnli'])
wiki_df_mono = filter_models(wiki_df, terms=['mono', 'wiki'])

In [None]:
# Target/switched/other-domain comparison for merged models.
# NOTE(review): tasks are ['sib200', 'xnli'] but the first table passed is wiki_df — confirm which task pair is intended.
ct_data = target_switch_data([wiki_df, xnli_df], tasks=['sib200', 'xnli'])

In [None]:
# Bars for the FT-FT scores on target vs. switched domains, one group per model.
# ylim=() is falsy, so plot_data leaves the y-axis limits automatic.
# NOTE(review): title/file name say Wikiann-SIB200 while ct_data was built with tasks ['sib200', 'xnli'] — confirm.
plot_data(ct_data[['target-FT-FT', 'switch-FT-FT']], show_averages=False, ylabel="Score relative to base and FT scores", xlabel="FT-FT-models", title="Comparison on target and switched domains for Wikiann-SIB200", save_path= IMAGE_DIR / "target_switch_comparison_wiki_sib.png", ylim=())

In [None]:
# Rows become the three domain groups and columns the models, as plot_kiviat expects (one line per row).
ct_pivot = ct_data[['target-FT-FT', 'switch-FT-FT', 'non-FT-FT']].rename(columns={'target-FT-FT': "Target domains", "switch-FT-FT" : "Switched domains", "non-FT-FT": "Other domains"}).transpose()
# Shorten the axis labels by dropping the merge suffix.
ct_pivot.columns = [col.replace("-mono", "") for col in ct_pivot.columns]

In [None]:
# Radar chart of the three domain groups across models; saved under plot_kiviat's default image_dir.
# NOTE(review): 'peformance' (title) and 'perfromance' (file name) are misspelled.
plot_kiviat(ct_pivot, dots=False, title="Cross task peformance FT-FT models on target and switched domains - SIB200 - XNLI", name="ct-perfromance-sib-xnli.png", colors="YlGnBu")

### method comparison

In [None]:
# Parameters of the method comparison: merge method, the two tasks and their accuracy tables
# (df1 belongs to task1, df2 to task2).
merge='mono'
task1 = 'wikiann'
task2 = 'sib200'
df1 = wiki_df
df2 = sib_df

# File name for the saved figure; the commented-out alternatives belong to the non-CLA variant / disable saving.
# name = f"method_comparison_{task1}_{task2}_{merge}.png"
cla_name = f"method_comparison_CLA_{task1}_{task2}_{merge}.png"
# cla_name = None

# Build the comparison table (rows = methods, columns = task_language domains) and draw it as a radar chart.
cla_data = method_comparison_data_CLA(dfs=[df1, df2], tasks=[task1, task2], merging=[merge])
# data = method_comparison_data(dfs=[df1, df2], tasks=[task1, task2], merging=[merge])
plot_kiviat(cla_data, lower_bound=0, upper_bound=1.0, dots=False, title=f"Method comparison for {task1} and {task2} - {merge} merged", name=cla_name)
# plot_kiviat(data, lower_bound=0, upper_bound=1.0, dots=False, title=f"Method comparison for {task1} and {task2} - {merge} merged", name=name)


### mono - pool comparison

In [None]:
def compare_mono_pool(
    df,
    task
):
    """Per-model, per-language score difference between mono and pooling merges.

    Rows of ``df`` whose name contains both 'mono' and ``task`` are paired with
    the row named identically except that "-mono" is removed and "-pooling" is
    appended. The result holds ``mono - pooling`` and is indexed by the model
    name without the "-mono" suffix.

    Raises AssertionError when the numbers of mono and pooling rows differ, and
    KeyError when a pooling counterpart is missing.
    """
    mono_df = filter_models(df, terms=['mono', task], contains=all)
    pool_df = filter_models(df, terms=['pool', task], contains=all)

    assert len(mono_df) == len(pool_df), f"different lengths {len(mono_df)}, {len(pool_df)}"

    diff_df = pd.DataFrame(columns=mono_df.columns)

    for mono_label in mono_df.index:
        model = str(mono_label).replace("-mono", "")
        # Counterpart row in the pooling table.
        pool_label = model + "-pooling"
        diff_df.loc[model] = mono_df.loc[mono_label] - pool_df.loc[pool_label]

    return diff_df

In [None]:
# Per-model mono-minus-pooling differences for each task.
diff_sib = compare_mono_pool(sib_df, task='sib200')
diff_xnli = compare_mono_pool(xnli_df, task='xnli')
diff_wiki = compare_mono_pool(wiki_df, task='wikiann')

# Per-language (column-wise) means. DataFrame.mean() is used instead of
# np.mean(DataFrame), whose result depends on the pandas version: with
# pandas >= 2.0 it collapses the whole frame to a single scalar.
diff_sib_avg = diff_sib.mean()
diff_xnli_avg = diff_xnli.mean()
diff_wiki_avg = diff_wiki.mean()

In [None]:
# One row per task holding the averaged mono-minus-pooling difference, columns as in diff_sib.
diff_df = pd.DataFrame(columns=diff_sib.columns)
diff_df.loc['sib200'] = diff_sib_avg
diff_df.loc['xnli'] = diff_xnli_avg
diff_df.loc['wikiann'] = diff_wiki_avg

In [None]:
# Grouped bars of the mono-minus-pooling difference per task, y-axis fixed to (-0.1, 0.1).
plot_data(diff_df, xticks_rotation=0, title='Difference in accuracy for merge methods (mono - pooling)', xlabel='Tasks', ylabel=('Accuracy mono - accuracy pooling'), save_path=IMAGE_DIR / "mono-pooling-acc.png", color_map='tab20', ylim=(-0.1, 0.1))

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def plot_comparison_data(dfs, labels, title=None, xlabel='Models', ylabel='Values', xticks_rotation=45, ylim=None, save_path=None, colors=None):
    """
    Plots a grouped bar chart comparing the first column of several DataFrames.

    Each DataFrame contributes one bar per row; bars that share a row position
    are drawn side by side. Only the first column of every DataFrame is plotted,
    and the x-tick labels are taken from the index of the first DataFrame.

    Args:
    dfs (list of pd.DataFrame): DataFrames to plot. All are assumed to have the same length and index.
    labels (list of str): Legend label for each DataFrame (at least one per DataFrame).
    title (str, optional): Title of the plot.
    xlabel (str, optional): Label for the x-axis.
    ylabel (str, optional): Label for the y-axis.
    xticks_rotation (int, optional): Degrees of rotation for the x-tick labels.
    ylim (tuple, optional): A tuple (ymin, ymax) setting the limits of the y-axis.
    save_path (str or Path, optional): Path to save the figure. If None, the figure is not saved.
    colors (list of str, optional): Bar color for each DataFrame (at least one per DataFrame).

    Raises:
    ValueError: If `dfs` is empty, or if `labels` / `colors` have fewer entries than `dfs`.
    """
    # Validate before any figure is created so a bad call does not leave an
    # empty figure behind or fail halfway through drawing.
    num_dfs = len(dfs)
    if num_dfs == 0:
        raise ValueError("dfs must contain at least one DataFrame")
    if len(labels) < num_dfs:
        raise ValueError(f"expected at least {num_dfs} labels, got {len(labels)}")
    if colors and len(colors) < num_dfs:
        raise ValueError(f"expected at least {num_dfs} colors, got {len(colors)}")

    fig, ax = plt.subplots(figsize=(14, 8))

    # The bars of one group share a total width of 0.8 x-units.
    bar_width = 0.8 / num_dfs

    # Left-most bar position of every group; assumes all dfs have the same length and index.
    indices = np.arange(len(dfs[0]))

    for i, df in enumerate(dfs):
        pos = indices + i * bar_width
        ax.bar(pos, df.iloc[:, 0], width=bar_width, label=labels[i], color=colors[i] if colors else None)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Centre each tick under its group of bars.
    ax.set_xticks(indices + bar_width * (num_dfs - 1) / 2)
    ax.set_xticklabels(dfs[0].index, rotation=xticks_rotation)

    if ylim:
        ax.set_ylim(ylim)

    ax.legend()

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path)
        print(f"Figure saved to {save_path}")

    plt.show()



## CF comparison

In [None]:
xnli0 = filter_models(
    xnli_df,
    terms=['pooling', 'experiment', 'xnli', 'base'],
    contains=any,
    out=True,
)

In [None]:
xnli0_wikisib = filter_models(
    xnli0,
    terms=['wikiann', 'sib'],
)

In [None]:
import re

def extract_domains(name):
    """
    Parse two task/language pairs out of a model name.

    The name must start with ``<task0>_<xx>--<task1>_<yy>-`` where ``xx`` and
    ``yy`` are two word characters each.

    Returns:
    tuple: ``(task0, task1, lang0, lang1)`` when the name matches, otherwise None.
    """
    parsed = re.match(r'(\w+)_(\w{2})--(\w+)_(\w{2})-.*', name)
    if parsed is None:
        return None
    # Groups come out in name order (task, lang, task, lang); callers get tasks first.
    first_task, first_lang, second_task, second_lang = parsed.groups()
    return first_task, second_task, first_lang, second_lang
    
def normalize_df(df):
    """
    Min-max scale every column of `df` to the range [0, 1].

    The frame is modified in place and also returned. A column whose minimum
    equals its maximum is replaced by 0.
    """
    for col_name in df.columns:
        values = df[col_name]
        lowest, highest = values.min(), values.max()
        spread = highest - lowest
        if spread != 0:
            df[col_name] = (values - lowest) / spread
        else:
            # Constant column: avoid dividing by zero.
            df[col_name] = 0
    return df

def target_switch_data_2(
    df,
    eval_task,
    languages = ['ar', 'de', 'el', 'es', 'fr', 'ru'],
    merge = ['mono'],
):
    '''
    Average each model's scores over its target and non-target languages.

    The two language codes of a model are parsed from its index label with
    ``extract_domains``. Columns of `languages` that equal one of those codes
    are averaged into 'target_lang'; the remaining ones into 'non_target_lang'.

    Args:
    df (pd.DataFrame): Scores indexed by model name, with one column per language.
    eval_task (str): Currently unused; kept for interface compatibility.
    languages (list of str): Language columns to consider.
    merge (list of str): Currently unused; kept for interface compatibility.

    Returns:
    pd.DataFrame: One row per model (label with "-mono" removed), in the order
    the models appear in `df`, with columns 'target_lang' and 'non_target_lang'.

    Raises:
    ValueError: If a model name cannot be parsed by ``extract_domains``.
    '''
    data = pd.DataFrame()
    # dict.fromkeys de-duplicates while keeping the index order; a plain set would
    # make the row order depend on string hashing and change between kernels.
    for model in dict.fromkeys(df.index):
        domains = extract_domains(model)
        if domains is None:
            raise ValueError(f"cannot parse task/language pairs from model name {model!r}")
        _, _, X, Y = domains

        target_langs = [lang for lang in languages if lang in (X, Y)]
        non_target_langs = [lang for lang in languages if lang not in (X, Y)]

        row_label = model.replace("-mono", "")
        data.loc[row_label, 'target_lang'] = df.loc[model, target_langs].mean()
        data.loc[row_label, 'non_target_lang'] = df.loc[model, non_target_langs].mean()
    return data

In [None]:
xnli0_wikisib_TS = target_switch_data_2(
    xnli0_wikisib,
    eval_task='xnli',
)

In [None]:
plot_data(
    xnli0_wikisib,
    show_averages=True,
    color_map='Pastel2',
    ylabel="Accuracy on XNLI",
    title="Performance of WikiANN-SIB200 merged models averaged per language on XNLI.",
    xticks_rotation=0,
    save_path=IMAGE_DIR / 'wikisib-on-xnli-per-lang.png',
    xlabel='Language',
)

In [None]:
plot_data(
    xnli0_wikisib_TS,
    show_averages=False,
    xlabel="Model (mono merged)",
    ylabel="Accuracy on XNLI",
    title="Performance of WikiANN-SIB200 merged models on target and non-target languages on XNLI.",
    xticks_rotation=35,
    color_map="tab20",
    save_path=IMAGE_DIR / 'target-non-wikisib-on-xnli.png',
)

In [None]:
# ft_*0: per-task frames filtered with contains=any, out=True
# (presumably drops rows matching any listed term -- verify against filter_models).
ft_sib0 = filter_models(
    sib_df,
    terms=['mono', 'pooling', 'experiment', 'sib', 'base'],
    contains=any,
    out=True,
)
ft_xnli0 = filter_models(
    xnli_df,
    terms=['mono', 'pooling', 'experiment', 'xnli', 'base'],
    contains=any,
    out=True,
)
ft_wikiann0 = filter_models(
    wiki_df,
    terms=['mono', 'pooling', 'experiment', 'wiki', 'base'],
    contains=any,
    out=True,
)

# bm_*0: two-stage filter -- first on ['mono', 'base'] with contains=all, then on the task's own term.
bm_sib0 = filter_models(
    filter_models(sib_df, terms=['mono', 'base'], contains=all, out=False),
    terms=['sib'],
    out=True,
    contains=any,
)
bm_xnli0 = filter_models(
    filter_models(xnli_df, terms=['mono', 'base'], contains=all, out=False),
    terms=['xnli'],
    out=True,
    contains=any,
)
bm_wiki0 = filter_models(
    filter_models(wiki_df, terms=['mono', 'base'], contains=all, out=False),
    terms=['wiki'],
    out=True,
    contains=any,
)

# Split every selection by the other task named in the terms.
bm_sib0_xnli, bm_sib0_wiki = (filter_models(bm_sib0, terms=[term]) for term in ('xnli', 'wiki'))

bm_xnli0_sib, bm_xnli0_wiki = (filter_models(bm_xnli0, terms=[term]) for term in ('sib', 'wiki'))

bm_wiki0_xnli, bm_wiki0_sib = (filter_models(bm_wiki0, terms=[term]) for term in ('xnli', 'sib'))

ft_sib0_xnli, ft_sib0_wiki = (filter_models(ft_sib0, terms=[term]) for term in ('xnli', 'wiki'))

ft_xnli0_sib, ft_xnli0_wiki = (filter_models(ft_xnli0, terms=[term]) for term in ('sib', 'wiki'))

ft_wiki0_xnli, ft_wiki0_sib = (filter_models(ft_wikiann0, terms=[term]) for term in ('xnli', 'sib'))

# The 'base' row of every task frame.
sib_base, xnli_base, wiki_base = (frame.loc['base'] for frame in (sib_df, xnli_df, wiki_df))

In [None]:
import pandas as pd

def create_comparison_df(base_values, ft_wiki_mean, bm_wiki_mean, ft_xnli_mean, bm_xnli_mean, columns):
    """
    Create a DataFrame to compare base values and filtered model means.

    Each input Series becomes one row of the result; the Series index supplies
    the column labels.

    Parameters:
    - base_values : pd.Series, stored in row 'base'
    - ft_wiki_mean : pd.Series, stored in row 'ft-wiki'
    - bm_wiki_mean : pd.Series, stored in row 'base-ft-wiki'
    - ft_xnli_mean : pd.Series, stored in row 'ft-xnli'
    - bm_xnli_mean : pd.Series, stored in row 'xnli_avg'
    - columns : list of str, column labels to keep, in this order. Labels
      missing from an input Series yield NaN in that row.

    Returns:
    - pd.DataFrame with the five rows above and `columns` as columns
    """
    index_labels = ['base', 'ft-wiki', 'base-ft-wiki', 'ft-xnli', 'xnli_avg']
    data = {
        'base': base_values,
        'ft-wiki': ft_wiki_mean,
        'base-ft-wiki': bm_wiki_mean,
        'ft-xnli': ft_xnli_mean,
        'xnli_avg': bm_xnli_mean
    }
    # A dict of Series builds a frame whose COLUMNS are the dict keys, so it has
    # to be transposed to get one row per label. Passing index=/columns= to the
    # constructor instead would select labels that do not exist and return an
    # all-NaN frame.
    comparison_df = pd.DataFrame(data).T
    comparison_df = comparison_df.reindex(index=index_labels, columns=columns)

    return comparison_df



In [None]:
# Comparison frame for each task
columns = ['ar', 'de', 'el', 'es', 'fr', 'ru']

# Argument order is (base row, ft mean A, bm mean A, ft mean B, bm mean B, columns),
# where A and B are the two tasks other than the one being evaluated.
# NOTE: create_comparison_df always labels the rows 'ft-wiki', 'base-ft-wiki',
# 'ft-xnli', 'xnli_avg', so for zero_xnli and zero_wikiann those labels do not
# name the tasks actually stored in them.
zero_sib = create_comparison_df(sib_base, ft_sib0_wiki.mean(), bm_sib0_wiki.mean(), ft_sib0_xnli.mean(), bm_sib0_xnli.mean(), columns)
zero_xnli = create_comparison_df(xnli_base, ft_xnli0_wiki.mean(), bm_xnli0_wiki.mean(), ft_xnli0_sib.mean(), bm_xnli0_sib.mean(), columns)
zero_wikiann = create_comparison_df(wiki_base, ft_wiki0_xnli.mean(), bm_wiki0_xnli.mean(), ft_wiki0_sib.mean(), bm_wiki0_sib.mean(), columns)


In [None]:
# Language columns and row labels for the comparison table.
columns = ['ar', 'de', 'el', 'es', 'fr', 'ru']
index_labels = ['base', 'ft-wiki', 'base-ft-wiki', 'ft-xnli', 'xnli_avg']

# Column means of the SIB200 selections, paired as (ft, bm).
ft_wiki_mean, bm_wiki_mean = ft_sib0_wiki.mean(), bm_sib0_wiki.mean()
ft_xnli_mean, bm_xnli_mean = ft_sib0_xnli.mean(), bm_sib0_xnli.mean()