In [None]:
import os

# Root of the perceptual-tuning results tree. The cells below build their paths
# as `root + 'subdir/...'`, so the value must end with a path separator.
# Set the PERCEPTUAL_TUNING_RESULTS environment variable to run the notebook on
# another machine; the original location remains the default.
root = os.path.join(
    os.environ.get('PERCEPTUAL_TUNING_RESULTS',
                   '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/'),
    '')  # joining with '' appends the trailing separator when it is missing

In [None]:
# Development/debugging: render figures inline in the notebook.
%matplotlib inline


# Finalized figures: the pgf setup below is disabled by wrapping it in a
# string literal; remove the surrounding triple quotes to enable it.
# NOTE(review): recent matplotlib releases expect "pgf.preamble" to be a single
# string rather than a list — confirm against the installed version first.
"""
import matplotlib as mpl
mpl.use("pgf")
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True,    # use inline math for ticks
    "pgf.rcfonts": False,   # don't setup fonts from rc parameters
    "pgf.preamble": [
         "\\usepackage{unicode-math}",  # unicode math setup
         "\\setmainfont{Doulos SIL}" # serif font via preamble
         ]
}
mpl.rcParams.update(pgf_with_custom_preamble)
"""

import pandas as pd
import os.path as path
import numpy as np
import pandas as pd
import seaborn


# Forced alignment

In [None]:
# Forced alignment phone durations

In [None]:
# Load one table of forced-alignment durations per corpus, keyed by corpus name.
corpora = ['WSJ', 'GPJ']
durs = {}
for corpus in corpora:
    durs[corpus] = pd.read_csv(
        path.join(root + 'no_phon_cats/unit_activation',
                  'forced_alignment_durs_{}.txt'.format(corpus)),
        index_col=0,  # the first CSV column is used as the row index
    )

In [None]:
# Summary statistics of the forced-alignment durations, printed per corpus.
# `numeric_only=True` restricts every aggregation to numeric columns: pandas >= 2.0
# no longer drops non-numeric columns silently and raises instead (which would
# happen here if 'spk' / 'phone' hold string labels). Older pandas versions
# give the same result as before.

# duration by averaging first on speakers then on phones
for corpus in durs:
    print('{}: {}'.format(corpus, durs[corpus].groupby(['phone']).mean(numeric_only=True).mean()))

# standard deviation of speaker average across phones
for corpus in durs:
    print('{}: {}'.format(corpus, durs[corpus].groupby(['phone']).mean(numeric_only=True).std()))


# standard deviation of phone average across speakers
for corpus in durs:
    print('{}: {}'.format(corpus, durs[corpus].groupby(['spk']).mean(numeric_only=True).std()))


# standard deviation across phones and speakers
for corpus in durs:
    print('{}: {}'.format(corpus, durs[corpus].std(numeric_only=True)))

# conclusion regarding variability: speaker estimate very reliable, but there is some variability from one phone to another


In [None]:
# Half of the mean of the two literals, i.e. ((a + b) / 2) / 2.
# NOTE(review): the two literals look like per-corpus mean phone durations (in
# seconds) copied by hand from the output of the statistics cell — confirm, and
# refresh them if the forced-alignment files change.
(0.097518+0.087964)/4. # -> take +/- 46ms window for our phone rep analyses

In [None]:
# Sanity check: display the set of phone labels of each corpus to verify that
# no silence label is included.
[set(corpus_durs['phone']) for corpus_durs in durs.values()]

# Activation

In [None]:
def read_unit_ids(root, train_corpus):
    test_corpora = {'WSJ': 'WSJ', 'BUC': 'WSJ', 'GPJ': 'GPJ', 'CSJ': 'GPJ'}
    test_corpus = test_corpora[train_corpus]
    unit_ids = {'unit': [], 'model': [], 'info': [], 'train': []}
    model = 'hmm_state'
    filename = path.join(root, 'activation_{}_{}_skip0_basic_{}_info.txt'.format(train_corpus,
                                                                                 test_corpus,
                                                                                 model))
    with open(filename, 'r') as fh:
        for i, line in enumerate(fh):
            phone, hmm_state, pdf_id = line.strip().split()
            hmm_state, pdf_id = int(hmm_state), int(pdf_id)
            unit_ids['unit'].append(i)
            unit_ids['model'].append('HMM-state')
            unit_ids['info'].append((phone, hmm_state, pdf_id))
            unit_ids['train'].append(train_corpus)
    model = 'hmm_phone'
    filename = path.join(root, 'activation_{}_{}_skip0_basic_{}_info.txt'.format(train_corpus,
                                                                                 test_corpus,
                                                                                 model))
    with open(filename, 'r') as fh:
        lines = fh.readlines()
    assert len(lines) == 1
    phones = lines[0].strip().split()
    for i, phone in enumerate(phones):
        unit_ids['unit'].append(i)
        unit_ids['model'].append('HMM-phone')
        unit_ids['info'].append(phone)
        unit_ids['train'].append(train_corpus)
    return pd.DataFrame(unit_ids)

## Duration

In [None]:
def read_durs(res_file, avg=True):
    """Read a durations file holding whitespace-separated floats on each line.

    Parameters
    ----------
    res_file : str
        Path of the text file to read.
    avg : bool, optional
        If True (default), each line is reduced to the mean of its values,
        or to NaN when the line holds no value. If False, each line is
        returned as the list of its values.

    Returns
    -------
    list
        One entry per line of the file, in file order.
    """
    def parse(row):
        values = [float(token) for token in row.strip().split()]
        if not avg:
            return values
        # An empty line has no mean: mark it as missing.
        return np.mean(values) if values else np.nan

    with open(res_file, 'r') as fh:
        return [parse(row) for row in fh]

def format_durs(root, file_template):
    """Gather the per-unit durations of every (train, test, model) combination.

    For each train/test corpus pair and each model ('GMM', 'HMM-phone',
    'HMM-state'), the file `file_template.format(train, test, model)` located
    in `root` is read with `read_durs` (averaged durations).

    Returns
    -------
    pandas.DataFrame
        One row per unit with columns 'train', 'test', 'model', 'duration'
        and 'unit' (position of the value in its file).
    """
    corpus_pairs = [('WSJ', 'WSJ'), ('BUC', 'WSJ'), ('GPJ', 'GPJ'), ('CSJ', 'GPJ')]
    model_names = ['GMM', 'HMM-phone', 'HMM-state']
    columns = {'train': [], 'test': [], 'model': [], 'duration': [], 'unit': []}
    for train, test in corpus_pairs:
        for model in model_names:
            unit_durs = read_durs(path.join(root, file_template.format(train, test, model)))
            n_units = len(unit_durs)
            # Repeat the identifying labels once per unit found in the file.
            columns['train'].extend([train] * n_units)
            columns['test'].extend([test] * n_units)
            columns['model'].extend([model] * n_units)
            columns['duration'].extend(unit_durs)
            columns['unit'].extend(range(n_units))
    return pd.DataFrame(columns)

def get_data(root, analysis_type, skip, cond='all-utts'):
    """Load unit durations and attach the available unit descriptions.

    Parameters
    ----------
    root : str
        Directory containing the activation result and info files.
    analysis_type : str
        Analysis tag inserted in the result file names.
    skip : int
        Skip value inserted in the result file names (as 'skip<skip>').
    cond : str, optional
        Condition tag ending the result file names (default 'all-utts').

    Returns
    -------
    pandas.DataFrame
        Durations from `format_durs` merged with the unit descriptions from
        `read_unit_ids` on 'model', 'unit' and 'train'.
    """
    # The three remaining '{}' placeholders are filled by format_durs with
    # the train corpus, the test corpus and the model name.
    template = 'activation_{}_{}_skip' + str(skip) + '_' + analysis_type + '_duration_{}_' + cond + '.txt'
    durations = format_durs(root, template)
    corpora = ['WSJ', 'BUC', 'GPJ', 'CSJ']
    unit_ids = pd.concat([read_unit_ids(root, corpus) for corpus in corpora])
    # Outer join: rows without a counterpart on the other side are kept, e.g.
    # durations of models for which read_unit_ids provides no description.
    return pd.merge(durations, unit_ids, on=['model', 'unit', 'train'], how='outer')


In [None]:
# Collect the 'basic' / skip 0 durations for the 'no-sil-utts' condition and
# store them as CSV next to the other results.
data = get_data(root + 'no_phon_cats/unit_activation',
                analysis_type='basic', skip=0, cond='no-sil-utts')
data.to_csv(root + 'no_phon_cats/results/dur.txt')

### Duration 1-2h models

In [None]:
def format_durs(root, file_template):
    """Gather the per-unit GMM durations of the batch-trained models.

    Same role as the `format_durs` defined earlier in this notebook, whose
    name it shadows once this cell has run. Here the train identifier is the
    corpus name followed by a batch suffix '_10_1' ... '_10_10', and only the
    'GMM' model is read.

    Returns
    -------
    pandas.DataFrame
        One row per unit with columns 'train' (corpus name plus batch
        suffix), 'test', 'model', 'duration' and 'unit' (position of the
        value in its file).
    """
    batch_suffixes = ['_10_{}'.format(i) for i in range(1, 11)]  # ad hoc...
    corpus_pairs = [('WSJ', 'WSJ'), ('BUC', 'WSJ'), ('GPJ', 'GPJ'), ('CSJ', 'GPJ')]
    model_names = ['GMM']
    columns = {'train': [], 'test': [], 'model': [], 'duration': [], 'unit': []}
    for suffix in batch_suffixes:
        for train, test in corpus_pairs:
            train_id = train + suffix
            for model in model_names:
                unit_durs = read_durs(path.join(root, file_template.format(train_id, test, model)))
                n_units = len(unit_durs)
                # Repeat the identifying labels once per unit found in the file.
                columns['train'].extend([train_id] * n_units)
                columns['test'].extend([test] * n_units)
                columns['model'].extend([model] * n_units)
                columns['duration'].extend(unit_durs)
                columns['unit'].extend(range(n_units))
    return pd.DataFrame(columns)

def get_data(root, analysis_type, skip, cond='all-utts'):
    """Load the unit durations matching the given analysis settings.

    Shadows the `get_data` defined earlier in this notebook once this cell
    has run; this version returns the output of `format_durs` as is, without
    merging any unit description.

    Parameters
    ----------
    root : str
        Directory containing the activation result files.
    analysis_type : str
        Analysis tag inserted in the result file names.
    skip : int
        Skip value inserted in the result file names (as 'skip<skip>').
    cond : str, optional
        Condition tag ending the result file names (default 'all-utts').
    """
    # The '{}' placeholders are left for format_durs to fill in.
    name_parts = ['activation_{}_{}_skip', str(skip), '_', analysis_type,
                  '_duration_{}_', cond, '.txt']
    return format_durs(root, ''.join(name_parts))

In [None]:
# Same export as for the 'no_phon_cats' models, applied to the
# 'no_phon_cats_1h' results tree.
data = get_data(root + 'no_phon_cats_1h/unit_activation',
                analysis_type='basic', skip=0, cond='no-sil-utts')
data.to_csv(root + 'no_phon_cats_1h/results/dur.txt')