In [None]:
import os

# Root of the results tree. The default is the original hard-coded location;
# set the PERCEPTUAL_TUNING_ROOT environment variable to run the notebook
# against a checkout stored somewhere else.
root = os.environ.get('PERCEPTUAL_TUNING_ROOT',
                      '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/')

# Destination of the figure saved at the end of the notebook. os.path.join
# keeps the path valid whether or not `root` ends with a separator.
fig_path = os.path.join(root, 'rala/figures/rl_input_amount.pdf')

# DPGMM novtln and synthetic ra/la

In [None]:
#%matplotlib inline
# Uncomment to plot finalized figures

import matplotlib as mpl

# Select the PGF backend so that figure text is typeset by LaTeX.
mpl.use("pgf")
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True,    # use inline math for ticks
    "pgf.rcfonts": False,   # don't setup fonts from rc parameters
    # The preamble is passed as ONE newline-joined string: recent Matplotlib
    # releases no longer accept a list of strings for this rcParam, while a
    # comma-free string is still accepted by older releases.
    "pgf.preamble": "\n".join([
         "\\usepackage{unicode-math}",  # unicode math setup
         "\\setmainfont{Doulos SIL}" # serif font via preamble
         ]),
}
mpl.rcParams.update(pgf_with_custom_preamble)

import rala.read_features as read_features
import rala.dis
import rala.dis_fn as dis_fn
import rala.infant_xp as infant_xp
import os.path as path
import os
import h5features as h5f
import re
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# compute dissimilarity between features

def compute_dis(feat_path, segments_path, dissimilarity, dis_path, item_name=None):
    """Compute the dissimilarities for one item of a features file and save them.

    The features at `feat_path` are loaded together with `segments_path`
    through `read_features.read`. The entry stored under `item_name` is passed
    to `rala.dis.compute_dis` along with `dissimilarity`, and the outcome is
    written to `dis_path` by `rala.dis.write_dis`.

    If `item_name` is None, the loaded representation must contain exactly one
    entry (enforced with an assert) and that single entry is used.
    """
    representations = read_features.read(feat_path, segments_path)
    if item_name is None:
        # Without an explicit name the choice must be unambiguous.
        assert len(representations) == 1
        item_name = next(iter(representations.keys()))
    selected = representations[item_name]
    dis_values, stim_order = rala.dis.compute_dis(selected, dissimilarity)
    rala.dis.write_dis(dis_values, stim_order, dis_path)

def precompute_dis(feat_folder, segments_path, dis_name):
    """Compute the missing dissimilarity files for a folder of features.

    Every file in `feat_folder` whose name ends with '.features' is opened
    with `h5f.Reader` to list its items. The result for a features file
    '<feat>.features' is written to '<root>/dis/<feat>FEAT__<dis_name>DIS.txt'
    by `compute_dis`; nothing is recomputed when that file already exists.

    Parameters
    ----------
    feat_folder : folder scanned for '.features' files.
    segments_path : segments file forwarded to `compute_dis`.
    dis_name : name resolved by `rala.dis_fn.get_dissimilarity`; it is also
        embedded in the output file name.
    """
    dissimilarity = rala.dis_fn.get_dissimilarity(dis_name)
    suffix = '.features'
    for feat_file in os.listdir(feat_folder):
        if not feat_file.endswith(suffix):
            continue
        feat_path = path.join(feat_folder, feat_file)
        with h5f.Reader(feat_path) as reader:
            items = reader.items.data
        # The output path depends only on the features file and on the
        # dissimilarity name, so it is built once per file, not once per item.
        feat = feat_file[:-len(suffix)]
        res_id = "{}FEAT__{}DIS".format(feat, dis_name)
        dis_path = path.join(root, 'dis', res_id + '.txt')
        for item in items:
            # NOTE(review): all items of a file share the same dis_path, so
            # once one item has been written the remaining ones are skipped —
            # confirm that each features file is expected to hold one item.
            # NOTE(review): the item is passed exactly as h5features returns
            # it (not decoded); confirm this matches the keys produced by
            # read_features.read.
            if not path.exists(dis_path):
                compute_dis(feat_path, segments_path, dissimilarity, dis_path, item_name=item)
                
        


# Inputs of the precomputation: the model features and the stimulus segments,
# both located under <root>/rala.
rala_dir = path.join(root, 'rala')
feat_folder = path.join(rala_dir, 'features')
segments_path = path.join(rala_dir, 'stim', 'segments.txt')
dis_name = 'kl_dtw'  #'cos_dtw'
precompute_dis(feat_folder=feat_folder,
               segments_path=segments_path,
               dis_name=dis_name)

In [None]:
# assemble data


def parse_res_id(res_id):
    """Decode a result identifier into a description of the model behind it.

    `res_id` has the form '<feat>FEAT__<dis>DIS'. The '<feat>' part is read in
    one of three ways:

    * it contains 'inputfeats': the whole string is the model type, the input
      amount is 0 and the training fields are None;
    * it ends with a numeric character: it is split as
      '<model type>_<corpus>_<divisor>_<input id>', the input amount is
      1/<divisor> and the training set is '<corpus>_<input id>';
    * otherwise: it is split as '<model type>_<corpus>', the input amount is 1
      and the training set is '<corpus>'.

    Returns
    -------
    dict with the keys 'input amount', 'model type', 'training language',
    'training register', 'training set' and 'dis'.

    Raises
    ------
    ValueError
        If `res_id` or its feature part does not follow the expected layout
        (or the divisor is not a number).
    KeyError
        If the corpus is not one of WSJ, CSJ, GPJ, BUC.
    """
    # corpus code -> (training language, training register)
    corpora = {'WSJ': ('American English', 'Read'),
               'CSJ': ('Japanese', 'Spontaneous'),
               'GPJ': ('Japanese', 'Read'),
               'BUC': ('American English', 'Spontaneous')
              }
    id_match = re.match(r'(.*)FEAT__(.*)DIS', res_id)
    if id_match is None:
        raise ValueError(
            "Cannot parse result id {!r}: expected '<feat>FEAT__<dis>DIS'".format(res_id))
    feat, dis = id_match.group(1), id_match.group(2)

    if 'inputfeats' in feat:
        input_amount, training_lang, training_reg, training_set = 0, None, None, None
        model_type = feat
    else:
        # ad hoc: a trailing digit marks a subsampled training set.
        # feat[-1:] (rather than feat[-1]) keeps an empty name from raising
        # IndexError; it is rejected by the pattern check below instead.
        subsampled = feat[-1:].isnumeric()
        pattern = r'(.*)_(.*)_(.*)_(.*)' if subsampled else r'(.*)_(.*)'
        feat_match = re.match(pattern, feat)
        if feat_match is None:
            raise ValueError(
                "Cannot parse feature name {!r} in result id {!r}".format(feat, res_id))
        if subsampled:
            model_type, corpus, amount_divisor, input_id = feat_match.groups()
            input_amount = 1/float(amount_divisor)
            training_set = corpus + '_' + input_id
        else:
            model_type, corpus = feat_match.groups()
            input_amount = 1
            training_set = corpus
        training_lang, training_reg = corpora[corpus]

    descr = {'input amount': input_amount, 'model type': model_type, 'training language': training_lang,
             'training register': training_reg, 'training set': training_set,
             'dis': dis}
    return descr


#adult xp conds
conds = ['American English', 'Japanese', 'German', 'init grid']

# One list per output column; every processed dissimilarity file contributes
# exactly one value to each list, so the lists stay aligned row by row.
data = {'infant d': [], 'infant scale': [],
        'model type': [], 'input amount': [], 'training language': [],
        'training register': [], 'training set': [], 'dis': []}

dis_folder = path.join(root, 'dis')
for dis_file in os.listdir(dis_folder):
    if not dis_file.endswith('.txt'):
        continue
    dis_path = path.join(dis_folder, dis_file)
    res_id = dis_file[:-4]
    descr = parse_res_id(res_id)
    for descr_key, descr_value in descr.items():
        data[descr_key].append(descr_value)
    try:
        infant_xp_dis, infant_xp_scale = infant_xp.simulate(dis_path)
    except Exception as e:
        # Best effort: report the failing file and record missing values so
        # that the remaining files are still processed.
        print('Infant', dis_file, e, sep='\n')
        infant_xp_dis, infant_xp_scale = np.nan, np.nan
    data['infant d'].append(infant_xp_dis)
    data['infant scale'].append(infant_xp_scale)

# Turn the column lists into a table and store it under the results root.
data = pd.DataFrame(data)
data.to_csv(path.join(root, 'data_kldtw.txt'))

In [None]:
# Reload the table written to data_kldtw.txt so that the cells below work
# from the saved file.
# NOTE(review): feat_folder and segments_path omit the 'rala' path component
# used when the dissimilarities were precomputed, and no visible cell below
# reads them — confirm whether these two assignments are still needed.
feat_folder = path.join(root, 'features')
segments_path = path.join(root, 'stim', 'segments.txt')
dis_name = 'kl_dtw'  #'cos_dtw'
data = pd.read_csv(path.join(root, 'data_kldtw.txt'))

In [None]:
# Keep only the rows of the 'dpgmm_novtln_vad' model type, on a private copy
# so that the added columns do not touch `data`.
is_dpgmm = data['model type'] == 'dpgmm_novtln_vad'
df = data[is_dpgmm].copy()
df['infant raw d'] = df['infant d'] * df['infant scale']
# The corpus code is the first three characters of the training-set name;
# empty/None names are passed through unchanged.
df['training corpus'] = [name[:3] if name else name for name in df['training set']]

def f(amount, corpus):
    """Scale the full duration of `corpus` (in seconds) by `amount`.

    `corpus` must be one of 'WSJ', 'GPJ', 'BUC' or 'CSJ'; any other value
    raises KeyError.
    """
    # (hours, minutes) of each full corpus, rounded to the minute
    hours_minutes = {'WSJ': (19, 30), 'GPJ': (19, 33),
                     'BUC': (9, 13), 'CSJ': (9, 11)}
    hours, minutes = hours_minutes[corpus]
    corpus_seconds = hours*3600 + minutes*60
    return corpus_seconds*amount

# Training duration of each row, obtained by applying f to the row's input
# amount and corpus code.
df['training amount'] = list(map(f, df['input amount'], df['training corpus']))

# to get KL div in bit units (does not make sense for other dissimilarity functions)
nats_per_bit = np.log(2)
df['d in bits'] = df['infant raw d']/nats_per_bit

In [None]:
# Facet title. Backslashes meant for LaTeX are doubled so that the literal
# holds no invalid Python escape sequence ('\*' triggers a warning on recent
# Python versions); the resulting text is unchanged.
# NOTE(review): \textipa is normally provided by the 'tipa' LaTeX package,
# which the pgf preamble set earlier in this notebook does not load — confirm
# the title compiles.
facet_labels = ['Original experimental stimuli\n American English [\\textipa{\\*r}]-[l]']

# One colour per training language; the two corpora of a language are told
# apart by dash pattern and marker instead.
colors = ["blue", "blue", "red", "red"]
palette = sns.xkcd_palette(colors)

g = sns.relplot(x="training amount", y="d in bits", kind="line",
                hue="training corpus",
                hue_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                size="training corpus",
                size_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                sizes=[1, 1, 1, 1],
                style="training corpus",
                style_order=['WSJ', 'BUC', 'GPJ', 'CSJ'],
                # (9999, 1) is a dash so long that it renders as a solid line
                dashes=[(9999,1), (10,3), (9999,1), (10,3)],
                markers=['s', 'D', 's', 'D'],
                markersize=10,
                col='dis',
                col_order = ['kl_dtw'],
                #units="Subcorpus group", estimator=None,
                data=df,
                legend=False,
                palette=palette)

g = g.set(xscale="log")
for ax in g.axes.flatten():
    ax.grid()
# Tick positions are in the same unit as 'training amount' (seconds, given
# the labels below).
g.set(xticks=[60, 600, 3600, 36000])
g.set_xticklabels(['1min', '10min', '1h', '10h'], fontsize=15)
for axes in g.axes:
    # tick_params replaces the per-tick `tick.label` access, an attribute that
    # recent Matplotlib releases no longer provide.
    axes[0].tick_params(axis='y', which='major', labelsize=20)
g.set_ylabels('Dissimilarity (in bits)', fontsize=20)
g.set_xlabels('Training set size', fontsize=20)
for ax, t in zip(g.axes.flatten(), facet_labels):
    ax.tick_params(axis='both', which='both', width=0, length=0)
    ax.set_axisbelow(True)
    ax.set_title(t, fontsize=20)
#g.despine(left=True, bottom=True)
# y range set to half that for fig2?
g.axes[0,0].set_ylim([0, 10])
g.fig.tight_layout()

g.savefig(fig_path)