In [None]:
import os

# Root directory of the results tree. Every path in this notebook is built as
# `root + 'relative/path'`, so the value must end with a path separator.
# Set the PERCEPTUAL_TUNING_RESULTS_ROOT environment variable to run the
# notebook on another machine; when it is unset or empty the original location
# is used unchanged.
root = os.path.join(
    os.environ.get('PERCEPTUAL_TUNING_RESULTS_ROOT')
    or '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/',
    '')  # joining with '' appends a trailing separator only when one is missing

# Figures for the qualitative analysis (phonetic categoriness)

In [None]:
# Development/debugging setup (currently disabled): remove the surrounding
# triple quotes to enable it, and disable the pgf block below.
"""
%matplotlib inline
import matplotlib as mpl
inline_plots=True
"""

# Finalized-figure setup (currently active): render through the pgf backend.
# The figure cells only call `savefig` when `inline_plots` is False.

import matplotlib as mpl
mpl.use("pgf")
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True,    # use inline math for ticks
    "pgf.rcfonts": False,   # don't setup fonts from rc parameters
    # Matplotlib documents `pgf.preamble` as a single string; list values are
    # not accepted by newer releases, so the lines are joined with newlines.
    "pgf.preamble": "\n".join([
         "\\usepackage{unicode-math}",  # unicode math setup
         "\\setmainfont{Doulos SIL}" # serif font via preamble
         ])
}
mpl.rcParams.update(pgf_with_custom_preamble)
inline_plots=False


import pandas as pd
import seaborn

In [None]:
# Switches for the full-corpus results.
cleaned_data = True
nb_unq_dur = 46  # 0 (no correction for misalignment control) or 46 (ms)

# Read either the "cleaned" table or the plain one (as named on disk), then
# keep only the rows whose `dur` value matches the selected setting.
nb_unq = pd.read_csv(
    root + ('no_phon_cats/results/nb_unq_within_spk_cleaned.txt' if cleaned_data
            else 'no_phon_cats/results/nb_unq.txt')
)
nb_unq = nb_unq.loc[nb_unq['dur'] == nb_unq_dur]

# The other two result tables do not depend on the switches above.
nb_cat = pd.read_csv(root + 'no_phon_cats/results/nb_cat.txt')
dur = pd.read_csv(root + 'no_phon_cats/results/dur.txt')

In [None]:
def prepare_data(dur, nb_cat, nb_unq):
    """Stack the three result tables into one long-format frame.

    Each input is reduced to a common layout in which the measured value is
    stored in column ``y``, the measured item in column ``id`` and the kind of
    measurement in column ``measure type``. Columns that are not dropped or
    renamed here are passed through untouched.

    Parameters
    ----------
    dur : pandas.DataFrame
        Must contain ``duration`` (multiplied by 1000 here; the figures label
        the result in ms) and ``unit`` (becomes ``id``).
    nb_cat : pandas.DataFrame
        Must contain ``nb cat`` (becomes ``y``); ``id`` is set to 0.
    nb_unq : pandas.DataFrame
        Must contain ``nunique`` (becomes ``y``), ``word_trans`` (becomes
        ``id``) and the boolean ``by_spk`` column, whose True rows are tagged
        ``'nb unq within'`` and whose False rows are tagged ``'nb unq across'``.

    Returns
    -------
    pandas.DataFrame
        Concatenation, in this order, of the ``'nb cat'``, ``'duration'``,
        ``'nb unq within'`` and ``'nb unq across'`` rows. Original row indices
        are kept.

    Notes
    -----
    The inputs are never modified. Columns that are only discarded
    (``Unnamed: 0``, ``info``, ``test``, ``dur``, ``count type``) may be
    absent, and a ``by_spk`` group without rows is simply left out instead of
    raising ``KeyError``.
    """
    dur = (dur.assign(duration=1000 * dur['duration'])
              .drop(columns=['Unnamed: 0', 'info', 'test'], errors='ignore')
              .rename(columns={'duration': 'y', 'unit': 'id'})
              .assign(**{'measure type': 'duration'}))

    nb_cat = (nb_cat.drop(columns=['Unnamed: 0'], errors='ignore')
                    .rename(columns={'nb cat': 'y'})
                    .assign(**{'measure type': 'nb cat', 'id': 0}))

    nb_unq = (nb_unq.drop(columns=['Unnamed: 0', 'test', 'dur', 'count type'],
                          errors='ignore')
                    .rename(columns={'nunique': 'y', 'word_trans': 'id'}))

    # Split on by_spk. `drop` and `assign` return new frames, so the groupby
    # slices are never written to (no SettingWithCopyWarning).
    by_spk = {flag: frame for flag, frame in nb_unq.groupby('by_spk')}
    parts = [nb_cat, dur]
    for flag, label in [(True, 'nb unq within'), (False, 'nb unq across')]:
        if flag in by_spk:
            parts.append(by_spk[flag].drop(columns='by_spk')
                                     .assign(**{'measure type': label}))
    return pd.concat(parts)

In [None]:
data = prepare_data(dur, nb_cat, nb_unq)

# keep only one baseline: GMM rows are kept for every training corpus, rows of
# any other model only when that model was trained on WSJ or GPJ.
# `.isin` avoids combining a Series with a plain list in a logical op, and
# `.copy()` makes `data` an independent frame so the column assignments below
# do not write into a slice (SettingWithCopyWarning).
data = data[(data['model'] == 'GMM') | data['train'].isin(['WSJ', 'GPJ'])].copy()


def cond(e, f):
    """Return the plotted condition label for training corpus `e` and model `f`."""
    if f == 'GMM':
        return 'GMM (read)' if e in ['WSJ', 'GPJ'] else 'GMM (spont.)'
    elif f == 'HMM-phone':
        return 'Phoneme recognizer baseline'
    else:
        return 'Phone-state baseline'


data['test lang'] = ['AE' if e in ['WSJ', 'BUC'] else 'JP' for e in data['train']]
data['cond'] = [cond(e, f) for e, f in zip(data['train'], data['model'])]
del data['train']


colors = ["white", "white", "dusty pink"]
my_palette = seaborn.xkcd_palette(colors)
# `title` and `ylab` below are indexed by facet column, so the column order is
# pinned explicitly instead of relying on the row order of `data`.
g = seaborn.catplot(data=data, y='y', x='cond', col='measure type', row='test lang',
                    kind='bar', sharey=False, order=['GMM (read)', 'GMM (spont.)', 'Phoneme recognizer baseline'],
                    col_order=['nb cat', 'duration', 'nb unq within', 'nb unq across'],
                    palette=my_palette)

edge_colors = ["xkcd:putty", "xkcd:putty", "xkcd:dusty pink"]
# NOTE(review): the last two y labels differ slightly ("No. distinct" vs
# "No. of distinct", stray space after the newline) - confirm which is intended.
title = ["Number of\nlearned units", "Duration\n of activation", "Acoustic (in)variance\n(within speaker)", "Acoustic (in)variance\n(across speaker)"]
ylab = ["No. units", "Duration (ms)", "No. distinct units\n for 10 repetitions", "No. of distinct units\nfor 10 repetitions"]
xlab = []  #["Read speech model", "Spont. speech model", "Phoneme recognizer\n(supervised baseline)"]
g.set_xticklabels(xlab, rotation=45, ha='right', fontsize=15)
for i, row in enumerate(g.axes):
    for j, axis in enumerate(row):
        axis.set_ylabel(ylab[j], fontsize=25)
        axis.set_xlabel('')
        # tick_params replaces the per-tick `tick.label` access, an attribute
        # that recent Matplotlib releases no longer provide.
        axis.tick_params(axis='y', which='major', labelsize=25)
        axis.tick_params(axis='x', which='major', labelsize=20)
        # Only the top row carries column titles.
        if i == 0:
            axis.set_title(title[j], fontsize=25)
        else:
            axis.set_title('')
        # `axis.patches` holds the bars only; scanning all children for
        # Rectangles could also pick up the axes background patch.
        for color, patch in zip(edge_colors, axis.patches):
            patch.set_edgecolor(color)
            patch.set_linewidth(3)
        if j == 0:
            axis.set_ylim([0, 1000])
            #axis.set_yscale('log')
        if j == 1:
            axis.set_ylim([0, 100])
        if j in [2, 3]:
            if nb_unq_dur==0:
                axis.set_ylim([1, 10])
                axis.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
            else:
                axis.set_ylim([1, 4])
                axis.set_yticks([1, 2, 3, 4])

# Define some hatches
hatches = ['\\', '.', '', '\\', '*', 'o']

for axis in g.axes.flatten():
    # Set a different hatch for each bar
    for thisbar, hatch in zip(axis.patches, hatches):
        thisbar.set_hatch(hatch)
    # Hide the tick marks and draw horizontal grid lines behind the bars.
    axis.tick_params(axis='both', which='both', width=0, length=0)
    axis.set_axisbelow(True)
    axis.grid(axis='y')
#g.despine(left=True)

g.fig.tight_layout()
if not inline_plots:
    # File name: main[_cleaned][_nomisalignmentcorrection].pdf
    out = root + "no_phon_cats/figures/main"
    if cleaned_data:
        out += "_cleaned"
    if nb_unq_dur == 0:
        out += "_nomisalignmentcorrection"
    out += ".pdf"
    g.savefig(out)

In [None]:
# Supplementary figure: same layout as the main figure, with the
# 'Phone-state baseline' condition added as a fourth bar.
colors = ["white", "white", "dusty pink", "blue grey"]
my_palette = seaborn.xkcd_palette(colors)
# `title` and `ylab` below are indexed by facet column, so the column order is
# pinned explicitly instead of relying on the row order of `data`.
g = seaborn.catplot(data=data, y='y', x='cond', col='measure type', row='test lang',
                    kind='bar', sharey=False, order=['GMM (read)', 'GMM (spont.)', 'Phoneme recognizer baseline',
                                                     'Phone-state baseline'],
                    col_order=['nb cat', 'duration', 'nb unq within', 'nb unq across'],
                    palette=my_palette)

edge_colors = ["xkcd:putty", "xkcd:putty", "xkcd:dusty pink", "xkcd:blue grey"]
title = ["Number of\nlearned units", "Duration\n of activation", "Acoustic (in)variance\n(within speaker)", "Acoustic (in)variance\n(across speaker)"]
ylab = ["No. units", "Duration (ms)", "No. distinct units\n for 10 repetitions", "No. of distinct units\nfor 10 repetitions"]
xlab = []  #["Read speech model", "Spont. speech model", "Phoneme recognizer\n(supervised baseline)"]
g.set_xticklabels(xlab, rotation=45, ha='right', fontsize=15)
for i, row in enumerate(g.axes):
    for j, axis in enumerate(row):
        axis.set_ylabel(ylab[j], fontsize=25)
        axis.set_xlabel('')
        # tick_params replaces the per-tick `tick.label` access, an attribute
        # that recent Matplotlib releases no longer provide.
        axis.tick_params(axis='y', which='major', labelsize=25)
        axis.tick_params(axis='x', which='major', labelsize=20)
        # Only the top row carries column titles.
        if i == 0:
            axis.set_title(title[j], fontsize=25)
        else:
            axis.set_title('')
        # `axis.patches` holds the bars only; scanning all children for
        # Rectangles could also pick up the axes background patch.
        for color, patch in zip(edge_colors, axis.patches):
            patch.set_edgecolor(color)
            patch.set_linewidth(3)
        if j == 0:
            axis.set_ylim([0, 3000])
            #axis.set_yscale('log')
        if j == 1:
            axis.set_ylim([0, 100])
        if j in [2, 3]:
            if nb_unq_dur==0:
                axis.set_ylim([1, 10])
                axis.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
            else:
                axis.set_ylim([1, 4])
                axis.set_yticks([1, 2, 3, 4])

# Define some hatches
hatches = ['\\', '.', '', '', '*', 'o']

for axis in g.axes.flatten():
    # Set a different hatch for each bar
    for thisbar, hatch in zip(axis.patches, hatches):
        thisbar.set_hatch(hatch)
    # Hide the tick marks and draw horizontal grid lines behind the bars.
    axis.tick_params(axis='both', which='both', width=0, length=0)
    axis.set_axisbelow(True)
    axis.grid(axis='y')
#g.despine(left=True)

g.fig.tight_layout()
if not inline_plots:
    # File name: supp[_cleaned][_nomisalignmentcorrection].pdf
    out = root + "no_phon_cats/figures/supp"
    if cleaned_data:
        out += "_cleaned"
    if nb_unq_dur == 0:
        out += "_nomisalignmentcorrection"
    out += ".pdf"
    g.savefig(out)

# 1-2h models

In [None]:
def prepare_data_small(dur, nb_cat, nb_unq):
    """Stack the per-subcorpus result tables into one long-format frame.

    Same output layout as ``prepare_data`` (value in ``y``, item in ``id``,
    kind of measurement in ``measure type``) plus a ``subcorpus`` column.

    Parameters
    ----------
    dur : pandas.DataFrame
        Must contain ``duration`` (multiplied by 1000 here), ``unit`` (becomes
        ``id``) and ``train``, whose values look like ``<corpus>_<n>``: the
        text before the first underscore is kept as ``train`` and the integer
        after the last underscore becomes ``subcorpus``.
    nb_cat : pandas.DataFrame
        Must contain ``nb cat`` (becomes ``y``) and ``train subset`` (becomes
        ``subcorpus``); ``id`` is set to 0.
    nb_unq : dict
        Maps a subcorpus number to its table. Each table must contain
        ``nunique``, ``word_trans``, ``by_spk`` and a ``train`` column whose
        numeric suffix equals the dictionary key.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ``'nb cat'``, ``'duration'``, ``'nb unq within'``
        (``by_spk`` True) and ``'nb unq across'`` (``by_spk`` False) rows.

    Raises
    ------
    AssertionError
        If a table in ``nb_unq`` holds ``train`` values whose suffix differs
        from its key.

    Notes
    -----
    Neither the frames nor the dictionary passed in are modified, so the
    function can be called repeatedly on the same inputs. Every entry of
    ``nb_unq`` is processed, in dictionary order, whatever the number of
    subcorpora.
    """
    dur = (dur.assign(duration=1000 * dur['duration'])
              .drop(columns=['Unnamed: 0', 'test'], errors='ignore')
              .rename(columns={'duration': 'y', 'unit': 'id'}))
    dur_train = list(dur['train'])
    dur = dur.assign(subcorpus=[int(e.split('_')[-1]) for e in dur_train],
                     train=[e.split('_')[0] for e in dur_train])
    dur = dur.assign(**{'measure type': 'duration'})

    nb_cat = (nb_cat.drop(columns=['Unnamed: 0'], errors='ignore')
                    .rename(columns={'nb cat': 'y', 'train subset': 'subcorpus'})
                    .assign(**{'measure type': 'nb cat', 'id': 0}))

    per_subcorpus = []
    for i, frame in nb_unq.items():
        train = list(frame['train'])
        assert all(int(e.split('_')[-1]) == i for e in train), \
            'train suffix does not match subcorpus {}'.format(i)
        # drop/assign/rename build a new frame: the caller's table is not touched
        per_subcorpus.append(
            frame.drop(columns=['Unnamed: 0', 'test', 'dur', 'count type'],
                       errors='ignore')
                 .assign(train=[e.split('_')[0] for e in train], subcorpus=i)
                 .rename(columns={'nunique': 'y', 'word_trans': 'id'}))
    nb_unq = pd.concat(per_subcorpus)

    by_spk = {flag: frame for flag, frame in nb_unq.groupby('by_spk')}
    parts = [nb_cat, dur]
    for flag, label in [(True, 'nb unq within'), (False, 'nb unq across')]:
        if flag in by_spk:
            parts.append(by_spk[flag].drop(columns='by_spk')
                                     .assign(**{'measure type': label}))
    return pd.concat(parts)

In [None]:
# Switches for this section.
cleaned_data = True
nb_unq_dur = 46  # use 46 or 0 for control with no correction for misalignment

# One table per subcorpus file (numbered 1 to 10), keyed by that number; the
# "cleaned" or plain variant is chosen by file name.
nb_unq = {
    i: pd.read_csv(root + ('no_phon_cats_1h/results/nb_unq_within_spk_cleaned_{}.txt' if cleaned_data
                           else 'no_phon_cats_1h/results/nb_unq_{}.txt').format(i))
    for i in range(1, 11)
}
# Keep only the rows whose `dur` value matches the selected setting.
nb_unq = {i: nb_unq[i].loc[nb_unq[i]['dur'] == nb_unq_dur] for i in range(1, 11)}

nb_cat = pd.read_csv(root + 'no_phon_cats_1h/results/nb_cat.txt')
dur = pd.read_csv(root + 'no_phon_cats_1h/results/dur.txt')


# Matching tables from the no_phon_cats results directory, selected and
# filtered with the same switches.
nb_unq_full = pd.read_csv(
    root + ('no_phon_cats/results/nb_unq_within_spk_cleaned.txt' if cleaned_data
            else 'no_phon_cats/results/nb_unq.txt')
)
nb_unq_full = nb_unq_full.loc[nb_unq_full['dur'] == nb_unq_dur]

nb_cat_full = pd.read_csv(root + 'no_phon_cats/results/nb_cat.txt')
dur_full = pd.read_csv(root + 'no_phon_cats/results/dur.txt')

In [None]:
data = prepare_data_small(dur, nb_cat, nb_unq)

# Condition label of the per-subcorpus models; it depends on the training
# corpus only. It is written inline so that `cond` is defined once in this cell.
data['test lang'] = ['AE' if e in ['WSJ', 'BUC'] else 'JP' for e in data['train']]
data['cond'] = ['GMM (read) 1-2h' if e in ['WSJ', 'GPJ'] else 'GMM (spont.) 1-2h'
                for e in data['train']]
del data['train']

# Average data over the 10 subcorpus
# numeric_only=True keeps this working on pandas >= 2, where averaging the
# remaining text columns raises instead of silently dropping them.
# NOTE(review): 'y' is one of the grouping keys, so `y` itself is never
# averaged - this keeps one row per distinct (id, measure type, y, test lang,
# cond) combination. Left unchanged on purpose; confirm whether the intent was
# to group without 'y' and take the mean of `y`.
data_avg = data.groupby(['id', 'measure type', 'y', 'test lang', 'cond'],
                        as_index=False).mean(numeric_only=True)
del data_avg['subcorpus']

# Also get full models
data_full = prepare_data(dur_full, nb_cat_full, nb_unq_full)

# keep only one baseline: GMM rows are kept for every training corpus, rows of
# any other model only when that model was trained on WSJ or GPJ.
# `.isin` avoids combining a Series with a plain list in a logical op, and
# `.copy()` makes `data_full` an independent frame so the column assignments
# below do not write into a slice (SettingWithCopyWarning).
data_full = data_full[(data_full['model'] == 'GMM')
                      | data_full['train'].isin(['WSJ', 'GPJ'])].copy()


def cond(e, f):
    """Return the plotted condition label for training corpus `e` and model `f`."""
    if f == 'GMM':
        return 'GMM (read)' if e in ['WSJ', 'GPJ'] else 'GMM (spont.)'
    elif f == 'HMM-phone':
        return 'Phoneme recognizer baseline'
    else:
        return 'Phone-state baseline'


data_full['test lang'] = ['AE' if e in ['WSJ', 'BUC'] else 'JP' for e in data_full['train']]
data_full['cond'] = [cond(e, f) for e, f in zip(data_full['train'], data_full['model'])]
del data_full['train']


# Per-subcorpus rows first, then the full-model rows.
full_data = pd.concat([data_avg, data_full])

In [None]:
# Without misalignment correction only the two (in)variance panels are drawn.
if nb_unq_dur == 0:
    corder = ['nb unq within', 'nb unq across']
else:
    corder = ['nb cat', 'duration', 'nb unq within', 'nb unq across']

colors = ["white", "white", "white", "white", "dusty pink"]
my_palette = seaborn.xkcd_palette(colors)
g = seaborn.catplot(data=full_data, y='y', x='cond', col='measure type', row='test lang',
                    kind='bar', sharey=False, order=['GMM (read)', 'GMM (spont.)',
                                                     'GMM (read) 1-2h', 'GMM (spont.) 1-2h',
                                                     'Phoneme recognizer baseline'],
                    col_order=corder,
                    palette=my_palette)

edge_colors = ["xkcd:putty", "xkcd:putty", "xkcd:baby blue", "xkcd:baby blue", "xkcd:dusty pink"]
# Titles and y labels are listed in the same order as `corder`.
if nb_unq_dur == 0:
    title = ["Acoustic (in)variance\n(within speaker)", "Acoustic (in)variance\n(across speaker)"]
    ylab = ["No. distinct units\n for 10 repetitions", "No. of distinct units\nfor 10 repetitions"]
else:
    title = ["Number of\nlearned units", "Duration\n of activation", "Acoustic (in)variance\n(within speaker)", "Acoustic (in)variance\n(across speaker)"]
    ylab = ["No. units", "Duration (ms)", "No. distinct units\n for 10 repetitions", "No. of distinct units\nfor 10 repetitions"]
xlab = []  #["Read speech model", "Spont. speech model", "Phoneme recognizer\n(supervised baseline)"]
g.set_xticklabels(xlab, rotation=45, ha='right', fontsize=15)
for i, row in enumerate(g.axes):
    for j, axis in enumerate(row):
        axis.set_ylabel(ylab[j], fontsize=25)
        axis.set_xlabel('')
        # tick_params replaces the per-tick `tick.label` access, an attribute
        # that recent Matplotlib releases no longer provide.
        axis.tick_params(axis='y', which='major', labelsize=25)
        axis.tick_params(axis='x', which='major', labelsize=20)
        # Only the top row carries column titles.
        if i == 0:
            axis.set_title(title[j], fontsize=25)
        else:
            axis.set_title('')
        # `axis.patches` holds the bars only; scanning all children for
        # Rectangles could also pick up the axes background patch.
        for color, patch in zip(edge_colors, axis.patches):
            patch.set_edgecolor(color)
            patch.set_linewidth(3)
        if nb_unq_dur==0:
            axis.set_ylim([1, 8])
            axis.set_yticks([1, 2, 3, 4, 5, 6, 7, 8])
        else:
            if j == 0:
                axis.set_ylim([0, 1000])
                #axis.set_yscale('log')
            if j == 1:
                axis.set_ylim([0, 100])
            if j in [2, 3]:
                axis.set_ylim([1, 4])
                axis.set_yticks([1, 2, 3, 4])

# Define some hatches
hatches = ['\\', '.', '\\', '.', '*', 'o']

for axis in g.axes.flatten():
    # Set a different hatch for each bar
    for thisbar, hatch in zip(axis.patches, hatches):
        thisbar.set_hatch(hatch)
    # Hide the tick marks and draw horizontal grid lines behind the bars.
    axis.tick_params(axis='both', which='both', width=0, length=0)
    axis.set_axisbelow(True)
    axis.grid(axis='y')
#g.despine(left=True)

g.fig.tight_layout()
if not inline_plots:
    # File name: main[_cleaned][_nomisalignmentcorrection].pdf
    out = root + "no_phon_cats_1h/figures/main"
    if cleaned_data:
        out += "_cleaned"
    if nb_unq_dur == 0:
        out += "_nomisalignmentcorrection"
    out += ".pdf"
    g.savefig(out)