In [None]:
import yaml
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional
from pathlib import Path
import pandas as pd
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names. Prefer the new name when it is available;
# otherwise fall back to the legacy name (so an installation that has neither
# still fails with Matplotlib's own "style not found" error).
_WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(
    next(
        (style for style in _WHITEGRID_STYLES if style in plt.style.available),
        'seaborn-whitegrid',
    )
)

In [None]:
# Location of the experiment log and of the directory the figures are written to
_LOG_PATH = Path('truncation-experiment.yaml')
_OUTPUT_DIR = Path('gfx')

# Plot colour for every (title-cased) language/model name
_COLOR_MAPPING = {
    'Swedish': 'red',
    'Danish': 'blue',
    'Norwegian': 'purple',
    'Icelandic': 'orange',
    'Scandi': 'green',
    'Xlmr': 'brown',
    'English': 'black',
    'Finnish': 'black',
    'German': 'black',
    'Dutch': 'black',
    'Russian': 'black',
    'Arabic': 'black',
    'Random': 'black'
}

# Plot marker for every (title-cased) language/model name
_MARKER_MAPPING = {
    'Swedish': '+',
    'Danish': '+',
    'Norwegian': '+',
    'Icelandic': '+',
    'Scandi': '+',
    'Xlmr': '+',
    'English': '+',
    'Finnish': '*',
    'German': '>',
    'Dutch': '<',
    'Russian': '^',
    'Arabic': 'v',
    'Random': '|'
}

# Name of the score shown on the y-axis, which depends on the task
_SCORE_NAME_MAPPING = {
    'ner': 'Micro-average F1-score',
    'sent': 'Matthew\'s correlation coefficient',
    'linguistic-acceptability': 'Matthew\'s correlation coefficient',
    'dep': 'Labelled attachment score'
}


def _parse_score(score: str):
    """Split a ``'<mean> ± <error>'`` string into a ``(mean, error)`` float pair."""
    mean, error = map(float, score.split(' ± '))
    return mean, error


def _plot_dataset(task: str, dataset: str, dataset_list: list) -> None:
    """Plot scores and training times for a single dataset and save the figure.

    Args:
        task: Task the dataset belongs to; selects the y-axis label of the score
            plot via ``_SCORE_NAME_MAPPING``.
        dataset: Name of the dataset, used in the title and the output file name.
        dataset_list: Non-empty list of dicts, one per training-set size. Each
            dict has a ``'samples'`` entry plus one entry per language, which is
            itself a dict with a ``'score'`` (``'<mean> ± <error>'`` string or
            ``None``) and a ``'time_taken'`` entry.

    Raises:
        KeyError: If a language has no entry in the colour/marker mappings or
            the task has no entry in the score-name mapping.
    """
    # Keep only the languages that have a score for every training-set size. The
    # original log keys are kept for look-ups; the title-cased form is only used
    # for display and for the colour/marker mappings.
    language_keys = [
        key for key in dataset_list[0].keys()
        if key != 'samples'
        and all(dct[key]['score'] is not None for dct in dataset_list)
    ]
    language_names = [key.title() for key in language_keys]

    # Set the values for the x-axis
    samples = [dct['samples'] for dct in dataset_list]

    # Initialise plot
    fig, (score_ax, time_ax) = plt.subplots(1, 2, figsize=(30, 9))

    for lang_idx, (key, name) in enumerate(zip(language_keys, language_names)):
        color = _COLOR_MAPPING[name]
        marker = _MARKER_MAPPING[name]

        # Set the y-values, along with the lower- and upper confidence bounds
        stats = [_parse_score(dct[key]['score']) for dct in dataset_list]
        y = [mean for mean, _ in stats]
        ymin = [mean - error for mean, error in stats]
        ymax = [mean + error for mean, error in stats]

        # Nudge every language sideways by a small multiplicative factor, centred
        # around 1, so that the markers of different languages do not overlap on
        # the logarithmic x-axis
        lang_idx_shifted = lang_idx - ((len(language_keys) - 1) / 2)
        factor = 1 + lang_idx_shifted / 30
        x_shifted = [value * factor for value in samples]

        # Scatter plot with the y-values, and a vline plot with the confidence
        # bounds
        score_ax.scatter(x=x_shifted, y=y, marker=marker, color=color, label=name)
        score_ax.vlines(x=x_shifted, ymin=ymin, ymax=ymax, color=color, alpha=0.8)

    # Add plot details
    score_ax.set_xlabel('Training samples', fontsize=13)
    score_ax.set_ylabel(_SCORE_NAME_MAPPING[task], fontsize=13)
    score_ax.set_xscale('log')
    score_ax.legend(fontsize=12)

    # Put the ticks at the sample sizes actually present in the log, rather than
    # assuming that they are 32, 64, 128, ...
    score_ax.set_xticks(ticks=samples, labels=samples)

    # Collect the time taken in long form: one row per (sample size, language)
    time_records = [
        {'samples': dct['samples'], 'times': dct[key]['time_taken']}
        for dct in dataset_list
        for key in language_keys
    ]
    df = pd.DataFrame(time_records, columns=['samples', 'times'])

    # Create box plot of time taken
    sns.boxplot(x='samples', y='times', data=df, ax=time_ax)

    # Add plot details
    time_ax.set_xlabel('Training samples', fontsize=13)
    time_ax.set_ylabel('Minutes', fontsize=13)

    # Set overall plot details
    fig.suptitle(f'Performance and Training Time on the {dataset.upper()} Dataset',
                 fontsize=16)

    # Save the plot, creating the output directory on the first run
    _OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig.savefig(_OUTPUT_DIR / f'{dataset}.png', facecolor='white')

    # Display the plot, then release the figure so that looping over many
    # datasets does not accumulate open figures
    plt.show()
    plt.close(fig)


def plot(task: Optional[str] = None):
    """Plot score and training time against training-set size for every dataset.

    The experiment log ``truncation-experiment.yaml`` is read once. For every
    dataset of the requested task a two-panel figure is created: the left panel
    shows each language's score with its error bars and the right panel shows a
    box plot of the time taken. Each figure is saved to ``gfx/<dataset>.png``
    and displayed.

    Args:
        task: Key of the task in the log to plot. If None (the default) then all
            top-level keys of the log except ``'model_ids'`` are plotted.

    Raises:
        KeyError: If `task` is not present in the log, or if a language or task
            is missing from the colour, marker or score-name mappings.
    """
    # Load the log. The scores contain the non-ASCII character '±', so the
    # encoding is fixed rather than left to the platform default.
    with _LOG_PATH.open(encoding='utf-8') as f:
        log = yaml.safe_load(f) or {}

    # If `task` is not specified then plot all tasks
    if task is None:
        task_names = [name for name in log if name != 'model_ids']
    else:
        task_names = [task]

    # Loop over all the datasets of the selected tasks
    for task_name in task_names:
        for dataset, dataset_list in log[task_name].items():

            # A dataset without any logged runs has nothing to plot
            if not dataset_list:
                continue

            _plot_dataset(task=task_name, dataset=dataset, dataset_list=dataset_list)

In [None]:
# No task selected: produce the figures for every task found in the log
plot(task=None)