In [None]:
import os

# Root of the perceptual-tuning results tree. The historical location stays the
# default; set PERCEPTUAL_TUNING_RESULTS_ROOT to run the notebook elsewhere.
# os.path.join(..., '') guarantees a trailing separator, which any code that
# concatenates a relative path onto base_root relies on.
base_root = os.path.join(
    os.environ.get('PERCEPTUAL_TUNING_RESULTS_ROOT',
                   '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/'),
    '')

# Counting distinct model representations of a phonetic category

In [None]:
%matplotlib inline

import os.path as path
import os
import numpy as np
import pandas as pd
import seaborn as sns

# Utility functions

In [None]:
# data pre-processing

def get_counts(root, train, test, dur=10, by_spk=True, seed=0, within_exclusion=True):
    if by_spk:
        spk_str = ''
    else:
        spk_str = '_multispk'
    if not(within_exclusion) and by_spk:
        exc_str =  '_noexclusion'
    else:
        exc_str = ''
    template = ('dominant_units_{}ms_around_central_frame_{}'
                '_{}_most_conservative{}{}{}.txt')
    filename = path.join(root, template.format(dur, train, test, spk_str, seed, exc_str))
    df = pd.read_csv(filename, index_col=0)
    return df


def get_phonetic_categories(root, test, by_spk=True, within_exclusion=True):
    if by_spk:
        spk_str = ''
    else:
        spk_str = 'multispk_'
    if not(within_exclusion) and by_spk:
        exc_str =  '_noexclusion'
    else:
        exc_str = ''
    template = 'contextphones_most_conservative_{}{}{}_phoncat_{}.txt'
    filename = path.join(root, template.format(spk_str, test, exc_str, 'types'))
    types = pd.read_csv(filename, index_col=0)
    filename = path.join(root, template.format(spk_str, test, exc_str, 'items'))
    items = pd.read_csv(filename, index_col=0)
    return types, items


def _relabel_bound(counts, bound_col, other_bound_col, label):
    # Keep one bound column, tag the rows with `label`, and expose that bound as 'nunique'.
    kept = [col for col in counts.columns if col != other_bound_col]
    bound_df = counts[kept].copy()  # explicit copy, so the edits below never touch `counts`
    bound_df['count type'] = label
    del bound_df['nunique']
    return bound_df.rename(columns={bound_col: 'nunique'})


def augment_counts(counts, types, **kwargs):
    """
    Return a long-format copy of `counts` enriched with category information.

    Steps:
      1. every column of `types` except 'size' is added to the table, looked up
         in `types` (by index label) from each row's 'context+phone ID';
      2. the table is stacked twice: once with 'nunique_ub' exposed as
         'nunique' and 'count type' set to 'upper bound', once with
         'nunique_lb' exposed as 'nunique' and 'count type' set to
         'lower bound' (the original 'nunique' column is dropped);
      3. every keyword argument is added as a constant column.

    Parameters
    ----------
    counts : DataFrame with at least the columns 'context+phone ID',
        'nunique', 'nunique_lb' and 'nunique_ub'. It is not modified.
    types : DataFrame whose index contains every 'context+phone ID' value.
    **kwargs : extra regressors, one constant column per keyword.

    Returns
    -------
    DataFrame with the upper-bound rows followed by the lower-bound rows
    (the original index labels are kept, so each label appears twice).

    Raises
    ------
    KeyError if a 'context+phone ID' is missing from `types`' index or a
    required column is absent.
    """
    # work on a private copy so the caller's table is left untouched
    counts = counts.copy()
    # add various columns from types
    ids = counts['context+phone ID']
    for col in types.columns:
        if col == 'size':
            continue
        lookup = types[col]
        counts[col] = [lookup.loc[key] for key in ids]
    # add lb-ub column
    ub_df = _relabel_bound(counts, 'nunique_ub', 'nunique_lb', 'upper bound')
    lb_df = _relabel_bound(counts, 'nunique_lb', 'nunique_ub', 'lower bound')
    counts = pd.concat([ub_df, lb_df])
    # add other regressors
    for arg_name, value in kwargs.items():
        counts[arg_name] = value
    return counts


def prepare_data(root, train_corpora, test_corpora, by_spks, durs, seeds, within_exclusion=True):
    """
    Main loading function.

    `train_corpora` and `test_corpora` are walked in parallel; for every
    (train, test) pair, every `by_spk` setting, every duration and every seed,
    the matching count table is read with get_counts, enriched with
    augment_counts and tagged with the columns 'train', 'test', 'by_spk',
    'dur' and 'seed' (plus 'spk' = None when `by_spk` is false).

    The corpus names and the by_spk flag are printed as they are processed.
    Returns the concatenation of all the enriched tables.
    """
    tables = []
    for train, test in zip(train_corpora, test_corpora):
        print(train)
        print(test)
        for by_spk in by_spks:
            print(by_spk)
            # the category tables depend only on (test, by_spk): load them once here;
            # the items table is returned by the loader but not used in this function
            types, _items = get_phonetic_categories(root, test, by_spk=by_spk,
                                                    within_exclusion=within_exclusion)
            for dur in durs:
                for seed in seeds:
                    raw_counts = get_counts(root, train, test, by_spk=by_spk, dur=dur,
                                            seed=seed, within_exclusion=within_exclusion)
                    regressors = dict(train=train, test=test, by_spk=by_spk, dur=dur, seed=seed)
                    if not by_spk:
                        regressors['spk'] = None
                    tables.append(augment_counts(raw_counts, types, **regressors))
    return pd.concat(tables)

In [None]:
# various groupings
# **Possible extensions: words of the same family, plurals, etc.?**

def agg_even_words(counts):
    """
    Aggregate on both central phones in words of even length.

    Concretely: average 'nunique' over all rows that share the same 'test',
    'train', 'by_spk', 'spk', 'dur', 'seed', 'model', 'count type' and
    'word_trans'. Returns one row per such combination, with the grouping
    keys as regular columns followed by the mean 'nunique'.
    """
    keys = ['test', 'train', 'by_spk', 'spk', 'dur', 'seed', 'model', 'count type', 'word_trans']
    grouped = counts.groupby(keys, as_index=False)
    return grouped['nunique'].mean()


def agg_multispk(counts):
    """
    Pool the by-speaker rows over speakers and drop the 'spk' column.

    Rows whose 'by_spk' flag is true are averaged ('nunique' mean) over all
    rows sharing 'test', 'train', 'by_spk', 'dur', 'seed', 'model',
    'count type' and 'word_trans', i.e. over all available speakers for each
    word type. Rows whose flag is false are passed through unchanged.
    The pooled rows come first in the result, which no longer has 'spk'.
    """
    keys = ['test', 'train', 'by_spk', 'dur', 'seed', 'model', 'count type', 'word_trans']
    per_speaker_mask = counts['by_spk']
    remaining_mask = [not flag for flag in per_speaker_mask]
    pooled = counts[per_speaker_mask].groupby(keys, as_index=False)['nunique'].mean()
    # placeholder label so both parts carry a 'spk' column until it is dropped
    pooled['spk'] = 'Aggregated'
    merged = pd.concat([pooled, counts[remaining_mask]])
    return merged.drop(columns='spk')


def agg_seeds(counts):
    """
    Summarise 'nunique' over every row sharing the same 'test', 'train',
    'by_spk', 'dur', 'model', 'count type' and 'word_trans' ('seed' is not a
    grouping key, so rows differing only by seed are pooled).

    Returns (count_means, count_stds): two DataFrames with the grouping keys
    as regular columns and, respectively, the mean and the standard deviation
    of 'nunique'.
    """
    keys = ['test', 'train', 'by_spk', 'dur','model', 'count type', 'word_trans']
    # for std there is some bug with using as_index=False, so we use reset_index instead
    stds = counts.groupby(keys)['nunique'].std().reset_index()
    means = counts.groupby(keys, as_index=False)['nunique'].mean()
    return means, stds

# Main

In [None]:
# Directory the count and phonetic-category tables are read from. path.join
# (rather than string concatenation) gives a valid path whether or not
# base_root ends with a path separator.
root = path.join(base_root, 'no_phon_cats/phone_rep_count')

## Count unique representations

In [None]:
# pre-process data (done only once)
AE_corpora = ['WSJ', 'BUC']
JP_corpora = ['GPJ', 'CSJ']
# every training corpus is paired with the single test corpus listed for its group
corp_AE = [(train_corpus, 'WSJ') for train_corpus in AE_corpora]
corp_JP = [(train_corpus, 'GPJ') for train_corpus in JP_corpora]
train_corpora, test_corpora = zip(*(corp_AE + corp_JP))
durs = [0, 46]  #10*np.arange(1, 6)
seeds = np.arange(10)
by_spks = [True, False]

# one output table per stimulus-exclusion setting; the no-cleaning table is
# built last, so `counts` refers to it once the cell has run
for within_exclusion, out_name in [(True, 'all_counts_clean_within_spk_stims.txt'),
                                   (False, 'all_counts_no_stim_cleaning.txt')]:
    counts = prepare_data(root, train_corpora, test_corpora, by_spks, durs, seeds,
                          within_exclusion=within_exclusion)
    counts.to_csv(path.join(root, out_name))

In [None]:
# load preprocessed data

def load_data(res_file):
    """
    Load a pre-processed count table and label the rows that have no speaker.

    The CSV at `res_file` is read with its first column as index. Every row
    whose 'by_spk' flag is false then gets the string "None" in its 'spk'
    column; rows whose flag is true keep their speaker label.
    NOTE(review): presumably the explicit "None" label is there because pandas
    groupby drops rows whose key is missing by default -- confirm.

    Raises AssertionError if "None" already occurs as a speaker label, since
    it could then not be used as an unambiguous placeholder.
    """
    counts = pd.read_csv(res_file, index_col=0, low_memory=False)
    # `'None' in counts['spk']` would test the index labels of the Series, not
    # its values, so the check has to be made on the values explicitly.
    assert not counts['spk'].isin(['None']).any(), \
        "'None' is already used as a speaker label in {}".format(res_file)
    counts['spk'] = [spk if by_spk else "None" for by_spk, spk in zip(counts['by_spk'], counts['spk'])]
    return counts


# Resolve both pre-processed tables under `root`, then load them in the same
# order as before (within-speaker-cleaned stimuli first, uncleaned second).
res_file_within_clean, res_file_no_clean = [
    path.join(root, fname)
    for fname in ('all_counts_clean_within_spk_stims.txt', 'all_counts_no_stim_cleaning.txt')
]
counts_within_clean, counts = map(load_data, (res_file_within_clean, res_file_no_clean))


In [None]:
# Do some structured averaging
def process_counts(counts):
    """
    Run the three aggregation steps in order: agg_even_words, then
    agg_multispk, then agg_seeds.

    Returns the (count_means, count_stds) pair produced by agg_seeds.
    """
    per_word = agg_even_words(counts)
    across_speakers = agg_multispk(per_word)
    return agg_seeds(across_speakers)

def _keep_reported_bound(count_means):
    # keep the lower-bound rows for the 'GMM' model and the upper-bound rows for every other model
    is_gmm = count_means['model'] == 'GMM'
    is_upper = count_means['count type'] == 'upper bound'
    is_lower = count_means['count type'] == 'lower bound'
    return count_means[(is_upper & ~is_gmm) | (is_lower & is_gmm)]


# Same processing for both stimulus sets. The within-speaker-cleaned set comes
# last, so count_means / count_stds describe it once the cell has run.
# Output paths are built with path.join so they do not depend on base_root
# ending with a path separator.
for _table, _out_name in [(counts, 'no_phon_cats/results/nb_unq.txt'),
                          (counts_within_clean, 'no_phon_cats/results/nb_unq_within_spk_cleaned.txt')]:
    count_means, count_stds = process_counts(_table)
    count_means = _keep_reported_bound(count_means)
    count_means.to_csv(path.join(base_root, _out_name))

## Count unique representations for 1-2h models

In [None]:
# Directory the count and phonetic-category tables of the 1h models are read
# from. path.join (rather than string concatenation) gives a valid path
# whether or not base_root ends with a path separator.
root = path.join(base_root, 'no_phon_cats_1h/phone_rep_count')

In [None]:
# pre-process data (done only once)
AE_corpora = ['WSJ', 'BUC']
JP_corpora = ['GPJ', 'CSJ']
# these settings are the same for every sub-corpus
durs = [0, 46]  #10*np.arange(1, 6)
seeds = np.arange(10)
by_spks = [True, False]

for subcorpus in range(1, 11):
    # training corpora are named '<corpus>_<subcorpus>'; test corpora keep their plain name
    corp_AE = [("{}_{}".format(train_corpus, subcorpus), 'WSJ') for train_corpus in AE_corpora]
    corp_JP = [("{}_{}".format(train_corpus, subcorpus), 'GPJ') for train_corpus in JP_corpora]
    train_corpora, test_corpora = zip(*(corp_AE + corp_JP))

    # one output table per stimulus-exclusion setting, cleaned stimuli first
    for within_exclusion, out_template in [
            (True, 'all_counts_clean_within_spk_stims_subcorpus{}.txt'),
            (False, 'all_counts_no_stim_cleaning_subcorpus{}.txt')]:
        counts = prepare_data(root, train_corpora, test_corpora, by_spks, durs, seeds,
                              within_exclusion=within_exclusion)
        counts.to_csv(path.join(root, out_template.format(subcorpus)))

In [None]:
# load preprocessed data

def load_data(res_file):
    """
    Load a pre-processed count table and label the rows that have no speaker.

    The CSV at `res_file` is read with its first column as index. Every row
    whose 'by_spk' flag is false then gets the string "None" in its 'spk'
    column; rows whose flag is true keep their speaker label.
    NOTE(review): presumably the explicit "None" label is there because pandas
    groupby drops rows whose key is missing by default -- confirm.

    Raises AssertionError if "None" already occurs as a speaker label, since
    it could then not be used as an unambiguous placeholder.
    """
    counts = pd.read_csv(res_file, index_col=0, low_memory=False)
    # `'None' in counts['spk']` would test the index labels of the Series, not
    # its values, so the check has to be made on the values explicitly.
    assert not counts['spk'].isin(['None']).any(), \
        "'None' is already used as a speaker label in {}".format(res_file)
    counts['spk'] = [spk if by_spk else "None" for by_spk, spk in zip(counts['by_spk'], counts['spk'])]
    return counts


# One pre-processed table per sub-corpus (numbered 1 to 10) and per stimulus set.
fnames_within_clean = []
fnames_no_clean = []
for subcorp in range(1, 11):
    fnames_within_clean.append(
        path.join(root, 'all_counts_clean_within_spk_stims_subcorpus{}.txt'.format(subcorp)))
    fnames_no_clean.append(
        path.join(root, 'all_counts_no_stim_cleaning_subcorpus{}.txt'.format(subcorp)))

# Tables are keyed by sub-corpus number; all cleaned tables are loaded before the uncleaned ones.
counts_within_clean = {i: load_data(fname)
                       for i, fname in enumerate(fnames_within_clean, start=1)}
counts = {i: load_data(fname) for i, fname in enumerate(fnames_no_clean, start=1)}


In [None]:
# Do some structured averaging
def process_counts(counts):
    """
    Run the three aggregation steps in order: agg_even_words, then
    agg_multispk, then agg_seeds.

    Returns the (count_means, count_stds) pair produced by agg_seeds.
    """
    per_word = agg_even_words(counts)
    across_speakers = agg_multispk(per_word)
    return agg_seeds(across_speakers)

# We ignore the count deviations due to random seeds here (the standard
# deviations returned by process_counts are discarded), as we are not
# conceptually interested in this source of noise.
# We keep only the lower bounds, since all models considered are GMMs and we
# want to be conservative.
# Both stimulus sets go through the same processing; the within-speaker-cleaned
# set comes last, so count_means describes it once the cell has run.
# Output paths are built with path.join so they do not depend on base_root
# ending with a path separator.
for _tables, _out_template in [
        (counts, 'no_phon_cats_1h/results/nb_unq_{}.txt'),
        (counts_within_clean, 'no_phon_cats_1h/results/nb_unq_within_spk_cleaned_{}.txt')]:
    count_means = {}
    for i in range(1, 11):
        _means = process_counts(_tables[i])[0]
        count_means[i] = _means[_means['count type'] == 'lower bound']
        count_means[i].to_csv(path.join(base_root, _out_template.format(i)))
