In [None]:
import os

# Root directory of the results tree.
# It can be overridden with the PERCEPTUAL_TUNING_RESULTS environment variable
# so the notebook does not have to be edited on another machine; the original
# hard-coded location is kept as the default.
# Paths are built elsewhere in this notebook by plain string concatenation
# (root + 'sub/dir/...'), so os.path.join(..., '') is used to guarantee that
# the value ends with a path separator.
root = os.path.join(
    os.environ.get('PERCEPTUAL_TUNING_RESULTS',
                   '/Users/admin/Documents/PhD/Code/perceptual-tuning-results/'),
    '')

In [None]:
# Development/debugging: render figures inline in the notebook (currently enabled)
%matplotlib inline


# Finalized figures: the pgf backend setup below is disabled by being wrapped
# in a bare triple-quoted string; remove the surrounding quotes to enable it.
# NOTE(review): recent matplotlib versions may expect "pgf.preamble" to be a
# single string rather than a list of strings -- confirm before enabling.
"""
import matplotlib as mpl
mpl.use("pgf")
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True,    # use inline math for ticks
    "pgf.rcfonts": False,   # don't setup fonts from rc parameters
    "pgf.preamble": [
         "\\usepackage{unicode-math}",  # unicode math setup
         "\\setmainfont{Doulos SIL}" # serif font via preamble
         ]
}
mpl.rcParams.update(pgf_with_custom_preamble)
"""

import scipy.io as io
import os.path as path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

## GMM

In [None]:

def load_model(filename):
    """
    Read a GMM saved as a .mat file by Jason Chang's library.

    Input:
        filename : path of the .mat file; it must contain a 'clusters'
            struct array whose entries have 'logpi', 'mu' and 'Sigma' fields.

    Output:
        Dictionary with entries (inserted in this order):
            log_weights : array of floats (n_components,)
                Taken from the 'logpi' field of each cluster.

            means : array of floats (n_components,dim)
                Contains the computed means of the mixture.

            cov : array of floats (n_components,dim,dim)
                Contains the computed covariance matrices of the mixture.
    """
    clusters = io.loadmat(filename)['clusters']
    n_rows = clusters.shape[0]
    # The file format is not consistent between Chang's init and update steps:
    # the struct array is stored either as a row (1,K) or as a column (K,1).
    if n_rows == 1:
        components = [clusters[0, k] for k in range(clusters.shape[1])]
    else:
        components = [clusters[k, 0] for k in range(n_rows)]
    n_components = len(components)

    # one 'logpi' entry per component, flattened to (K,)
    log_weights = np.concatenate(
        [comp['logpi'] for comp in components]).reshape((n_components,))
    # each 'mu' becomes one column, hence the transpose to get (K,d)
    means = np.column_stack([comp['mu'] for comp in components]).T
    dim = means.shape[1]
    # (K,d,d)
    cov = np.stack(
        [comp['Sigma'].reshape((dim, dim)) for comp in components], axis=0)

    return {'log_weights': log_weights, 'means': means, 'cov': cov}

In [None]:

def get_GMM_cat(model_folder, corpora):
    """
    Count the number of mixture components of the GMMs trained on each corpus.

    Input:
        model_folder : folder containing the saved models
        corpora : iterable of corpus names used as file-name prefixes

    Output:
        nb_cat : dict mapping each corpus to the number of components of the
            model stored in '<corpus>_1501-final.mat'
        nb_cat_small_models : dict mapping each corpus to the list of the
            numbers of components of the ten models '<corpus>_10_1' ...
            '<corpus>_10_10', in that order
    """
    def count_components(basename):
        # number of components == number of log-weights in the loaded model
        loaded = load_model(path.join(model_folder, basename))
        return len(loaded['log_weights'])

    full_counts = {}
    subset_counts = {}
    for corpus in corpora:
        full_counts[corpus] = count_components(corpus + '_1501-final.mat')
        # No explicit '.mat' extension on these names.
        # NOTE(review): presumably relies on loadmat appending it by default
        # (appendmat=True) -- confirm.
        subset_counts[corpus] = [
            count_components(corpus + '_10_{}'.format(k)) for k in range(1, 11)
        ]
    return full_counts, subset_counts

## Supervised

In [None]:
def get_nlines(filename):
    """Return the number of lines in the text file `filename`."""
    count = 0
    with open(filename, 'r') as fh:
        for _ in fh:
            count += 1
    return count


def get_nitems(filename):
    """
    Return the number of whitespace-separated items found on the single line
    of the text file `filename`.

    The file must contain exactly one line (checked with an assert).
    """
    with open(filename, 'r') as fh:
        content = list(fh)
    assert len(content) == 1
    # split() without argument already ignores leading/trailing whitespace
    items = content[0].split()
    return len(items)


def get_sup_cat(activation_folder, corpora):
    """
    Collect the number of categories of the supervised HMM models.

    For every training corpus, two info files are read from
    `activation_folder`:
        - the 'hmm_phone' file, whose number of categories is the number of
          whitespace-separated items on its single line;
        - the 'hmm_state' file, whose number of categories is its number of
          lines.

    Output:
        Dictionary of three parallel lists, with keys 'nb cat', 'train' and
        'model' (model is 'HMM-phone' or 'HMM-state').
    """
    # test corpus appearing in the info file name for each training corpus
    paired_test_corpus = {'BUC': 'WSJ', 'CSJ': 'GPJ', 'WSJ': 'WSJ', 'GPJ': 'GPJ'}
    # (tag used in the file name, label used in the table, counting function)
    model_specs = (
        ('hmm_phone', 'HMM-phone', get_nitems),
        ('hmm_state', 'HMM-state', get_nlines),
    )
    counts, trains, labels = [], [], []
    for corpus in corpora:
        test_corpus = paired_test_corpus[corpus]
        for tag, label, count_categories in model_specs:
            info_name = 'activation_{}_{}_skip0_basic_{}_info.txt'.format(
                corpus, test_corpus, tag)
            counts.append(count_categories(path.join(activation_folder, info_name)))
            trains.append(corpus)
            labels.append(label)
    return {'nb cat': counts, 'train': trains, 'model': labels}



# Main

In [None]:
# Number of categories of the GMM models, per training corpus
corpora = ['BUC', 'WSJ', 'CSJ', 'GPJ']
nb_cat_GMM, nb_cat_small_models_GMM = get_GMM_cat(root + 'no_phon_cats/models/', corpora)

# Number of categories of the supervised (HMM-phone / HMM-state) models
corpora = ['WSJ', 'GPJ', 'BUC', 'CSJ']
nb_cat = get_sup_cat(root + 'no_phon_cats/unit_activation/', corpora)

# Append the GMM counts to the same table (columns: 'nb cat', 'train', 'model')
for corpus, gmm_count in nb_cat_GMM.items():
    nb_cat['nb cat'].append(gmm_count)
    nb_cat['train'].append(corpus)
    nb_cat['model'].append('GMM')
nb_cat = pd.DataFrame(nb_cat)
# root already ends with '/', so the sub-path must not start with one
# (the previous version produced '...//no_phon_cats/...'); this now matches
# how the other result files of this notebook are addressed.
nb_cat.to_csv(root + 'no_phon_cats/results/nb_cat.txt')

## nb_cat.txt for the 1-2h models

In [None]:
corpora = ['BUC', 'WSJ', 'CSJ', 'GPJ']
nb_cat_GMM, nb_cat_small_models_GMM = get_GMM_cat(root + 'no_phon_cats_1h/models/', corpora)

# One row per (training corpus, training subset); subsets are numbered from 1
small_rows = []
for corpus, subset_counts in nb_cat_small_models_GMM.items():
    for subset_id, nb in enumerate(subset_counts, start=1):
        small_rows.append({'model': 'GMM',
                           'nb cat': nb,
                           'train': corpus,
                           'train subset': subset_id})
nb_cat_small = pd.DataFrame(small_rows,
                            columns=['model', 'nb cat', 'train', 'train subset'])
nb_cat_small.to_csv(root + 'no_phon_cats_1h/results/nb_cat.txt')


## Number of categories as a function of iteration number for all models

In [None]:
def get_nb_cat(model_folder, corpus, split_factor, subset, nb_iter):
    """
    Return the number of components of the model saved in `model_folder` as
    '<corpus>_<split_factor>_<subset>_<iteration>.mat'.

    The iteration part of the file name is str(nb_iter), except for
    iteration 1501 which is stored as '1501-final'.
    """
    iteration_tag = '1501-final' if nb_iter == 1501 else str(nb_iter)
    name_parts = [corpus] + [str(part) for part in (split_factor, subset)] + [iteration_tag]
    model_file = path.join(model_folder, '_'.join(name_parts) + '.mat')
    return len(load_model(model_file)['log_weights'])


corpora = ['BUC', 'WSJ', 'CSJ', 'GPJ']
split_factors = [1, 10, 100, 1000]
# Subset ids considered for each split factor: a single subset for factor 1,
# ten subsets otherwise (1..10, then every 10th id, then every 100th id)
subsets = {1: [1],
           10: list(range(1, 11)),
           100: list(range(1, 100, 10)),
           1000: list(range(1, 1000, 100))}
nb_iters = [1, 21, 61, 141, 301, 621, 1261, 1501]

# One row per (corpus, split factor, subset, iteration) combination
convergence_folder = root + 'convergence/models/'
convergence_rows = []
for corpus in corpora:
    for split in split_factors:
        for subset in subsets[split]:
            for nb_iter in nb_iters:
                convergence_rows.append({
                    'nb_cat': get_nb_cat(convergence_folder, corpus, split, subset, nb_iter),
                    'train': corpus,
                    'split_factor': split,
                    'subset': subset,
                    'nb_iter': nb_iter,
                })
df = pd.DataFrame(convergence_rows,
                  columns=['nb_cat', 'train', 'split_factor', 'subset', 'nb_iter'])

In [None]:
# Write the convergence table to disk as CSV
convergence_csv = root + 'convergence/results/nb_cat.txt'
df.to_csv(convergence_csv)