In [None]:
import os

# Root of the local perceptual-tuning-results checkout. The historical default
# is kept, but it can be overridden without editing the notebook by setting the
# PERCEPTUAL_TUNING_RESULTS environment variable.
root = os.environ.get('PERCEPTUAL_TUNING_RESULTS',
                      '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/')

# Folder containing the ABX minimal-pair score files handed to apply_analysis.
# os.path.join is used so that a root without a trailing slash still works.
mp_folder = os.path.join(root, 'ABX', 'mp_scores')

# Folder for the RL_AmEnglish resampling analyses.
analysis_folder = os.path.join(root, 'ABX', 'analyses', 'RL_AmEnglish', 'resampling')

# Output file for the figure saved at the end of the notebook.
fig_path = os.path.join(root, 'ABX', 'figures', 'rl_input_amount.pdf')

The amount of input is operationalised as the total duration of the training set, in seconds. See the data-preparation notebook in the perceptual-tuning-pnas/data git repository.

In [None]:
# Development/debugging: render figures inline (currently enabled).
# Comment this line out when switching to the pgf setup below.
%matplotlib inline


# Finalized figures: the pgf/LaTeX setup below is currently disabled because it
# is wrapped in a string literal; remove the surrounding triple quotes to enable it.
# NOTE(review): recent matplotlib versions appear to expect "pgf.preamble" to be a
# single string rather than a list -- confirm against the installed version
# before enabling.
"""
import matplotlib as mpl
mpl.use("pgf")
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True,    # use inline math for ticks
    "pgf.rcfonts": False,   # don't setup fonts from rc parameters
    "pgf.preamble": [
         "\\usepackage{unicode-math}",  # unicode math setup
         "\\setmainfont{Doulos SIL}" # serif font via preamble
         ]
}
mpl.rcParams.update(pgf_with_custom_preamble)
"""
from scone_phobia import apply_analysis
from scone_phobia.utils.mp_scores import estimate_std
from scone_phobia.analyses.avg_error import avg_error
import scone_phobia.metadata.add_metadata as add_metadata
from scone_phobia.analyses.RL_AmEnglish import RL_AmEnglish as AE_RL
import seaborn as sns
import numpy as np
import pandas as pd
import itertools
import math
import scipy.special

# Loading subcorpus data

In [None]:
# Results of the avg_error and RL_AmEnglish analyses for the subcorpus models
# (no resampling).

# Tag that identifies the relevant models in the minimal-pair file names.
dpgmm = 'dpgmm_novtln_corpus'


def percTun_filt(mp_fname):
    """Accept file names containing the current value of the global `dpgmm`."""
    return dpgmm in mp_fname


# Average ABX error for every selected file.
analysis = avg_error
df_avg = apply_analysis(
    analysis,
    mp_folder,
    filt=percTun_filt,
    add_metadata=add_metadata.language_register,
    resampling=False,
    verbose=2,
)


def RL_filt(mp_fname):
    """Accept files passing `percTun_filt` that were tested on BUC or WSJ.

    For RL only the files tested on AE are needed. `percTun_filt` is looked
    up by name each time this function is called, so a later redefinition of
    `percTun_filt` changes what this filter selects.
    """
    return percTun_filt(mp_fname) and \
        ('BUCtest' in mp_fname or 'WSJtest' in mp_fname)


# RL analysis on the AE-tested files.
analysis = AE_RL
df_rl = apply_analysis(
    analysis,
    mp_folder,
    filt=RL_filt,
    add_metadata=add_metadata.language_register,
    resampling=False,
    verbose=2,
)

# Move 'contrast' to a 'contrast type' column, then stack both result frames.
df_rl['contrast type'] = df_rl.pop('contrast')
df = pd.concat([df_avg, df_rl])

In [None]:
# parsing number of sampling iteration, subcorpus size and id from model name and concatenating
def identify_subcorpus(df):
    """Replace the 'model type' column by three columns parsed from its values.

    Each model name must consist of exactly four '_'-separated tokens:
      * the third token, minus its first 6 characters, reads
        '<number of subsets>-<subset id>';
      * the fourth token, minus its first 4 characters, is the iteration
        number; if it contains '-final', its last 6 characters are dropped.

    The frame is modified in place ('model type' is removed and the columns
    'Number of sampling iterations', 'Training set splitting factor' and
    'Training subcorpus' are added) and the same object is returned.

    Raises AssertionError when a name does not have four tokens and
    ValueError when a numeric field cannot be parsed.
    """
    def _parse(model):
        # -> (splitting factor, subcorpus id, iteration) for one model name
        parts = model.split('_')
        assert len(parts) == 4
        split_factor, subcorpus = parts[2][6:].split('-')
        split_factor, subcorpus = int(split_factor), int(subcorpus)
        iter_field = parts[3][4:]
        if '-final' in iter_field:
            iter_field = iter_field[:-6]
        return split_factor, subcorpus, int(iter_field)

    parsed = [_parse(model) for model in df['model type']]
    df['Number of sampling iterations'] = [iteration for _, _, iteration in parsed]
    df['Training set splitting factor'] = [factor for factor, _, _ in parsed]
    df['Training subcorpus'] = [subcorpus for _, subcorpus, _ in parsed]
    del df['model type']
    return df

# identify subcorpora to be linked by a line
def subcorpus_group(e):
    """Return the group id used to link subcorpora by a line.

    Ids above 100 map to e//100 + 1, ids above 10 (up to 100) map to
    e//10 + 1, and any other id is returned unchanged.
    """
    for threshold in (100, 10):
        if e > threshold:
            return e // threshold + 1
    return e

In [None]:
# Parse the model names into explicit columns, then tag every row with the
# group of its training subcorpus.
df = identify_subcorpus(df)
df["Subcorpus group"] = list(map(subcorpus_group, df["Training subcorpus"]))

In [None]:
# Add the scores obtained with models trained on the full corpora.

# Tag that identifies the relevant models in the minimal-pair file names.
dpgmm = 'dpgmm_novtln_vad'


def percTun_filt(mp_fname):
    """Accept file names containing the current value of the global `dpgmm`."""
    return dpgmm in mp_fname


# Average ABX error for every selected file.
analysis = avg_error
df_avg_fullcorpus = apply_analysis(
    analysis,
    mp_folder,
    filt=percTun_filt,
    add_metadata=add_metadata.language_register,
    resampling=False,
    verbose=2,
)

# RL analysis. RL_filt comes from the subcorpus-loading cell; it looks up
# `percTun_filt` by name when called, so it uses the definition given above.
analysis = AE_RL
df_rl_fullcorpus = apply_analysis(
    analysis,
    mp_folder,
    filt=RL_filt,
    add_metadata=add_metadata.language_register,
    resampling=False,
    verbose=2,
)

# Move 'contrast' to a 'contrast type' column, then stack both result frames.
df_rl_fullcorpus['contrast type'] = df_rl_fullcorpus.pop('contrast')
df_fullcorpus = pd.concat([df_avg_fullcorpus, df_rl_fullcorpus])

# Full corpora: group 1, splitting factor 1, 1501 sampling iterations.
df_fullcorpus["Subcorpus group"] = 1
df_fullcorpus["Training set splitting factor"] = 1
df_fullcorpus["Number of sampling iterations"] = 1501

# Append the full-corpus rows to the subcorpus rows.
df = pd.concat([df, df_fullcorpus])

# Looking at effect trajectories as a function of training data amount

In [None]:
# Keep only the rows with 1501 sampling iterations, as an independent copy.
data = df[df['Number of sampling iterations'] == 1501].copy()

# Duration of each full training set in seconds, built from (hours, minutes)
# rounded to the minute.
training_amount = {
    corpus: hours * 3600 + minutes * 60
    for corpus, (hours, minutes) in {
        'WSJ': (19, 30), 'GPJ': (19, 33),
        'BUC': (9, 13), 'CSJ': (9, 11),
    }.items()
}

# Amount of training data per row: full-corpus duration divided by the
# splitting factor. An unknown training set raises KeyError.
data['training amount'] = [
    training_amount[corpus] / float(split)
    for corpus, split in data[['training set',
                               'Training set splitting factor']].itertuples(index=False, name=None)
]

In [None]:
# Main figure: ABX error as a function of training set size, one facet per
# (test set, contrast type) pair and one line per training set.

# Facet titles, in the row-major order of g.axes.flatten()
# (rows: WSJ then BUC test set; columns: L-R then W-Y contrast).
# Backslashes meant for LaTeX are doubled so that the Python string literals
# contain no invalid escape sequences; the resulting strings are unchanged.
facet_labels = ['Read test stimuli\nAmerican English [\\textipa{\\*r}]-[l]',
                'Read test stimuli\nAmerican English [w]-[j]',
                'Spont. test stimuli\nAmerican English [\\textipa{\\*r}]-[l]',
                'Spont. test stimuli\nAmerican English [w]-[j]']

# One colour per entry of hue_order below: WSJ and BUC in red, GPJ and CSJ in blue.
colors = ["red", "red", "blue", "blue"]
palette = sns.xkcd_palette(colors)

# Hue, size and style all encode the training set, so the four *_order lists
# must stay aligned with `colors`, `dashes` and `markers`.
g = sns.relplot(x="training amount", y="error", kind="line",
                hue="training set",
                hue_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                size="training set",
                size_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                sizes=[1, 1, 1, 1],
                style="training set",
                style_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                # (9999, 1) gives a visually solid line, (10, 3) a dashed one
                dashes=[(9999,1), (10,3), (9999,1), (10,3)],
                row="test set",
                row_order=["WSJ", "BUC"],
                col="contrast type",
                col_order=['L-R', 'W-Y'],
                markers=['s', 'D', 's', 'D'],
                markersize=10,
                #units="Subcorpus group", estimator=None,
                data=data,
                legend=False,
                palette=palette)


# Logarithmic x axis with human-readable duration ticks (values in seconds).
g = g.set(xscale="log")
for ax in g.axes.flatten():
    ax.grid()
g.set(xticks=[60, 600, 3600, 36000])
g.set_xticklabels(['1min', '10min', '1h', '10h'], fontsize=15)

# Enlarge the y tick labels of the first column of facets. tick_params is used
# instead of the Tick.label attribute, which newer matplotlib releases no
# longer provide.
for row_axes in g.axes:
    row_axes[0].tick_params(axis='y', which='major', labelsize=20)

g.set_ylabels('ABX error rate (in \\%)', fontsize=20)
g.set_xlabels('Training set size', fontsize=20)
for ax, t in zip(g.axes.flatten(), facet_labels):
    # hide the tick marks, keep the grid behind the data, set the facet title
    ax.tick_params(axis='both', which='both', width=0, length=0)
    ax.set_axisbelow(True)
    ax.set_title(t, fontsize=20)
#g.despine(left=True, bottom=True)
# y range set to half that for fig2?
g.axes[0,0].set_ylim([0, 33])
g.fig.tight_layout()


g.savefig(fig_path)