In [4]:
def _load_model():
    """
    Build the attention visualizer and open a session on the saved checkpoint

    :return: (session, attention visualizer) tuple
    """
    # Load the user directory that lives next to the notebook folder
    usr_dir.import_usr_dir('../submodule')

    attention_viz = visualization.AttentionVisualizer(
        'transformer_base', 'transformer', "data_dir", 'word_to_phonetic', beam_size=5)

    # Created only for its effect on the graph, the Python handle is not kept.
    # NOTE(review): presumably the checkpoint restore expects a 'global_step' variable -- confirm
    tf.Variable(0, dtype=tf.int64, trainable=False, name='global_step')

    checkpoint_path = "../checkpoints/word_to_phonetic/transformer-transformer_base_single_gpu-fr-best_model"
    session = tf.train.MonitoredTrainingSession(
        checkpoint_dir=checkpoint_path,
        save_summaries_secs=0,
    )

    return session, attention_viz

def _get_attention_matrices(input_word, sess, visualizer):
    """
    Run the model on one word and return its tokens with a normalized attention matrix

    :param input_word: word to transcribe
    :param sess: tf session
    :param visualizer: attention visualizer
    :return: inp_text, out_text (phonetic) and the normalized attention matrix
             cropped to len(out_text) rows and len(inp_text) columns
    """
    _, raw_inp, raw_out, att_mats = visualizer.get_vis_data_from_string(sess, input_word)

    # Tokens are decoded as Latin-1 because of French and Spanish special characters
    decoded_inp = [str(token, 'Latin-1') for token in raw_inp]
    decoded_out = [str(token, 'Latin-1') for token in raw_out]

    # Input side drops the "end of sequence" marker, output side drops it together with padding
    inp_text = [token for token in decoded_inp if token != '<EOS>']
    out_text = [token for token in decoded_out if not (token == '<EOS>' or token == '<pad>')]

    # Keep entries 0 and 4 along the first axis (the layers) of the "inp_out" attention
    inp_out_att = np.array(attention._get_attention(inp_text, out_text, *att_mats)["inp_out"]["att"])
    kept_layers = np.array([0, 4])
    att_matrices = inp_out_att[kept_layers, :, :, :]

    # Collapse the heads (axis 1), then the two kept layers (axis 0)
    per_layer = np.sum(att_matrices, axis=1)
    combined = np.sum(per_layer, axis=0)

    # Crop to the real token counts before normalizing
    cropped = combined[:len(out_text), :len(inp_text)]
    sum_all_layers = _normalize(cropped)

    return inp_text, out_text, sum_all_layers

def _normalize(matrix):
    """
        input: a numpy matrix
        return: matrix with 0 mean and 1 std
    """
    return (matrix - np.mean(matrix))/np.std(matrix)

def _mapping(inp_text, out_text, sum_all_layers):
    # Base threshold
    # fr : 0.75
    # es : 0.4
    if len(out_text) > 4:
        threshold = 0.75
    else:
        threshold = 0

    # While we have too many silent_letters detected
    while (True):
        # Gets the silent_letters indices
        # We consider that a letter is silent if its attention value is below mean attention + threshold * std attention
        silent_letters_idx = [i for i, idx in enumerate(np.argmax(sum_all_layers, axis=0))
                              if sum_all_layers[idx, i] < np.mean(sum_all_layers[idx, :])
                              + threshold * np.std(sum_all_layers[idx, :])]
        # Reduces threshold if too many silent letters are detected
        # Can happen in french when we have 3 lettres graphemes
        if len(silent_letters_idx) > 1 / 3 * len(inp_text):
            threshold -= 0.1
        else:
            break

    # Creates the phoneme attribution list
    phon_list = np.array(out_text)[np.argmax(sum_all_layers, axis=0)]
    phon_list[silent_letters_idx] = "$"  # "$" is our encoding for silent letters
    phon_list = phon_list.tolist()  # needed for the += just below

    # Checks if all the phonemes are attributed and if they are only present the correct number of time in the list
    # If not, the phoneme is concatenated to its most probable neighbor
    # and the least probable phoneme is replaced by a silent letter (this can happen for small datasets)
    for i, phon in enumerate(out_text):
        if phon not in phon_list:
            phon_list[np.argmax(sum_all_layers[i, :])] += phon

    # test = np.where(np.array(phon_list) == phon)[0]
    #     if len(test > 1):
    #         phon_list[np.max(test)] = "%"

    ##NOT WORKING PROPERLY

    # Creates the g_p tupple list
    g_p = [(l, phon_list[i]) for i, l in enumerate(inp_text)]

    # Creates the final g_p mapping
    mapping = []
    for phon, letters in groupby(g_p, lambda x: x[1]):
        graph = "".join([letter[0] for letter in letters])
        mapping.append(graph + "-" + phon)

    return ["".join(inp_text), "".join(out_text), mapping]

def _load_gp_prog(progression):
    gpProg = pd.read_csv(os.path.join('../api/word_to_phonetic/fr/files/','gp_prog.csv'), sep=";")
    gpProg.columns = [["GP", "LESSON"]]
    gpProg = gpProg.loc[gpProg["GP"].notnull()]

    return gpProg

def _get_unique_words(wordGp):
    uniqueWordList = []
    for word, pred, gpMatch, copy in wordGp:
        if (word, pred) not in uniqueWordList:
            uniqueWordList.append((word, pred))
        else:
            wordGp.remove((word, pred, gpMatch, copy))
    return wordGp


def _generate_word_list(wordGp, gpProg):
    
    tempList = []
    for i in range(len(gpProg)):
        lesson = gpProg.loc[i]
        
        for word, pred, gpMatch, copy in wordGp[:]:
            for gp in gpMatch[:]:
                if gp == lesson["GP"]:
                    gpMatch.remove(gp)
            if len(gpMatch) == 0:
                tempList.append(((int(lesson["LESSON"])),("").join(word),(".").join(pred),(".").join(copy)))
                wordGp.remove((word, pred, gpMatch, copy))
    for word, pred, gpMatch, copy in wordGp[:]:
        tempList.append((999,("").join(word),(".").join(pred),(".").join(copy)))
    
    wordList = pd.DataFrame()
    wordList = wordList.append(tempList,ignore_index=True)
    wordList.columns = [["LESSON", "GRAPHEME","PHONEME", "GPMATCH"]]
    return wordList

In [2]:
from __future__ import print_function

import os

import tensorflow as tf
import numpy as np
import pandas as pd

from copy import deepcopy
from tensor2tensor.visualization import attention
from tensor2tensor.visualization import visualization
from tensor2tensor.utils import usr_dir
from itertools import groupby


  from ._conv import register_converters as _register_converters


In [7]:
# Draft body of the planned `g2p_mapping_file(corpus, progression)` helper.

# The progression table and the restored model are expensive to build and must not be
# rebuilt on every re-run of this cell, so they are created only when the kernel does
# not hold them yet. Without this the cell raises NameError on a fresh kernel.
if "gpProg" not in globals():
    gpProg = _load_gp_prog("progression")
if "sess" not in globals() or "visualizer" not in globals():
    sess, visualizer = _load_model()

corpus = ['test', 'valet', 'valaient']
results = [_get_attention_matrices(word, sess, visualizer) for word in corpus]

g_p_results = [_mapping(r[0], r[1], r[2])[2] for r in results]
phonetic_results = [r[1] for r in results]
print(phonetic_results)
print(g_p_results)

# _generate_word_list empties the gpMatch lists in place, hence the two independent copies
wordGp = list(zip(corpus, phonetic_results, deepcopy(g_p_results), deepcopy(g_p_results)))
wordGp = _get_unique_words(wordGp)
wordList = _generate_word_list(wordGp, gpProg)

# Planned return value once this cell is wrapped in a function: wordList.to_json()

[['t', 'E', 's', 't'], ['v', 'a', 'l', 'E'], ['v', 'a', 'l', 'E']]
[['t-t', 'e-E', 's-s', 't-t'], ['v-v', 'a-a', 'l-l', 'et-E'], ['v-v', 'a-a', 'l-l', 'aie-E', 'nt-$']]


In [None]:
#g2p_mapping_file(['test', 'valet', 'valaient'], '')