In [1]:
import pandas as pd
import alignment_graph_phon.graph_phon_alignment as gpa
import csv
import os

In [2]:
def extract_grapheme_phoneme_from_g2p_output(phonetic_lexicon_path, encoding="utf-8"):
    """Read a tab-separated phonetic lexicon and align every entry.

    Each non-blank line of the file is split on tab characters. The first
    field is taken as the grapheme string and the second as the phoneme
    string; any further fields are ignored. Every pair is passed to
    ``gpa.align_word_and_phon_trans`` and the two values it returns are
    collected alongside the raw strings.

    Parameters
    ----------
    phonetic_lexicon_path : str or path-like
        Path of the lexicon file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``"utf-8"`` so the
        result does not depend on the locale encoding of the machine the
        notebook happens to run on.

    Returns
    -------
    tuple of four lists
        ``(graphemes, phonemes, graphemes_aligned, phonemes_aligned)``, each
        holding one item per lexicon entry, in file order.

    Raises
    ------
    IndexError
        If a non-blank line does not contain a tab separator.
    """
    # Save lexicon in grapheme and phoneme lists
    graphemes = []
    graphemes_aligned = []
    phonemes = []
    phonemes_aligned = []

    with open(phonetic_lexicon_path, encoding=encoding) as file:
        for raw_line in file:
            line = raw_line.rstrip("\n")

            # Skip empty and whitespace-only lines (e.g. trailing blank lines);
            # they carry no entry and would otherwise fail on the field lookup.
            if not line.strip():
                continue

            fields = line.split("\t")
            grapheme = fields[0]
            phoneme = fields[1]
            grapheme_align, phoneme_align = gpa.align_word_and_phon_trans(grapheme, phoneme)

            graphemes.append(grapheme)
            phonemes.append(phoneme)
            graphemes_aligned.append(grapheme_align)
            phonemes_aligned.append(phoneme_align)

    return graphemes, phonemes, graphemes_aligned, phonemes_aligned

def create_alignment_lexicon(graphemes, phonemes, graph_align_list, phon_align_list):
    """Combine raw and aligned lexicon entries into a single DataFrame.

    The four sequences are read position by position; the number of rows is
    taken from the length of ``phonemes``.

    Parameters
    ----------
    graphemes : sequence
        Grapheme entries; they become the index of the result.
    phonemes : sequence
        Phoneme entries; rows whose phoneme is missing are dropped.
    graph_align_list : sequence
        Aligned grapheme entries, stored in the ``graphemes_align`` column.
    phon_align_list : sequence
        Aligned phoneme entries, stored in the ``phonemes_align`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``graphemes`` with the columns ``phonemes``,
        ``graphemes_align`` and ``phonemes_align``.
    """
    column_names = ["graphemes", "phonemes", "graphemes_align", "phonemes_align"]

    # One row per lexicon entry, in input order.
    rows = [
        [graphemes[i], phonemes[i], graph_align_list[i], phon_align_list[i]]
        for i in range(len(phonemes))
    ]

    return (
        pd.DataFrame(rows, columns=column_names)
        .set_index("graphemes")
        .dropna(subset=["phonemes"])
    )

In [4]:
# Define inputs and outputs
# The input lexicon is read from, and the aligned lexicon written to, this directory.
lexiconDir = '../../astla-data/lrec-data/'
# Alternative input/output pair, currently disabled:
# phonetic_lexicon = '2-lexicon-basiscript-g4p1-checked.tsv'
# aligned_lexicon = "3-aligned-lexicon-basiscript-g4p1.csv"
phonetic_lexicon = '2-lexicon-dart-preposttest-checked.tsv'
aligned_lexicon = "3-aligned-lexicon-dart-preposttest.csv"

# Create input and output paths
phonetic_lexicon_path = os.path.join(lexiconDir, phonetic_lexicon)
aligned_lexicon_path = os.path.join(lexiconDir, aligned_lexicon)
# Read the lexicon and align the graphemes and phonemes of each entry
graphemes, phonemes, graphemes_aligned, phonemes_aligned = extract_grapheme_phoneme_from_g2p_output(phonetic_lexicon_path)

# Collect the raw and aligned entries in a DataFrame indexed by grapheme
# (entries without a phoneme are dropped)
lexiconDF = create_alignment_lexicon(graphemes, phonemes, graphemes_aligned, phonemes_aligned)

# Export lexicon as CSV; QUOTE_NONNUMERIC wraps every non-numeric field in quotes
lexiconDF.to_csv(aligned_lexicon_path, quoting = csv.QUOTE_NONNUMERIC, quotechar='"')
print("Alignment Lexicon created, see: ", aligned_lexicon_path)

Alignment Lexicon created, see:  ../../astla-data/lrec-data/3-aligned-lexicon-dart-preposttest.csv
