In [2]:
import pandas as pd
import os
import glob
import re

# ADAPT
import alignment_adapt.run_adapt as run_adapt
import alignment_adapt.cgn2_adapt_map as cgn2_adapt_map
import alignment_adapt.deduce_pcus_orig_phon as deduce_pcus

#ADAGT
import alignment_adagt.adagt as adagt
import alignment_adagt.deduce_pcus_orig_graph as deduce_pcus_graph


# INPUTS - Language Resources

Phonetic Lexicons

In [3]:
lrecDataDir = '../../astla-data/lrec-data'
phoneticLexiconFileDART = os.path.join(lrecDataDir, '2-lexicon-dart-preposttest-checked.tsv')
phoneticLexiconFileBS= os.path.join(lrecDataDir, '2-lexicon-basiscript-g4p1-checked.tsv')
phoneticLexiconFileDART_BS = os.path.join(lrecDataDir, '2-lexicon.tsv')

In [4]:
with open(phoneticLexiconFileDART, 'r') as f:
    lexiconDART = [line.replace('\n', '').split('\t') for line in f.readlines()]

with open(phoneticLexiconFileBS, 'r') as f:
    lexiconBS = [line.replace('\n', '').split('\t') for line in f.readlines()]

lexiconDictDART= pd.DataFrame(lexiconDART).set_index(0)
lexiconDictBS = pd.DataFrame(lexiconBS).set_index(0)
lexicon_dict = pd.concat([lexiconDictDART,lexiconDictBS]).drop_duplicates().sort_index()

lexicon_dict.to_csv(phoneticLexiconFileDART_BS)
lexicon_dict.loc['bakken', 1]

'b A k @'

Aligned Phonetic lexicons

In [5]:
lrecDataDir = '../../astla-data/lrec-data'
alignedPhoneticLexiconFileDART = os.path.join(lrecDataDir, '3-aligned-lexicon-dart-preposttest.csv')
alignedPhoneticLexiconFileBS= os.path.join(lrecDataDir, '3-aligned-lexicon-basiscript-g4p1.csv')
alignedPhoneticLexiconFileDART_BS= os.path.join(lrecDataDir, '3-aligned-lexicon.csv')

In [6]:
pcuLexiconDART = pd.read_csv(alignedPhoneticLexiconFileDART)
pcuLexiconBS = pd.read_csv(alignedPhoneticLexiconFileBS)

pcu_lexicon = pd.concat([pcuLexiconDART,pcuLexiconBS]).drop_duplicates().sort_index()

# Convert strings to list of strings
pcu_lexicon['graphemes_align'] = pcu_lexicon['graphemes_align'].apply(lambda x: eval(x))
pcu_lexicon['phonemes_align'] = pcu_lexicon['phonemes_align'].apply(lambda x: eval(x))

pcu_lexicon.to_csv(alignedPhoneticLexiconFileDART_BS)
pcu_lexicon

Unnamed: 0,graphemes,phonemes,graphemes_align,phonemes_align
0,bal,b A l,"[b, a, l]","[b, A, l]"
0,bakken,b A k @,"[b, a, kk, e, n]","[b, A, k, @, *]"
1,bank,b A N k,"[b, a, n, k]","[b, A, N, k]"
2,bestaat,b @ s t a t,"[b, e, s, t, aa, t]","[b, @, s, t, a, t]"
3,bewaar,b @ w a r,"[b, e, w, aa, r]","[b, @, w, a, r]"
...,...,...,...,...
106,web,w E p,"[w, e, b]","[w, E, p]"
107,zanger,z A N @ r,"[z, a, ng, e, r]","[z, A, N, @, r]"
108,zieke,z i k @,"[z, ie, k, e]","[z, i, k, @]"
109,zoutje,z AU t j @,"[z, ou, t, j, e]","[z, AU, t, j, @]"


Sound Pure PCUs

In [7]:
klankzuiver_dict = {
    "a": ["aa"],
    "A": ["a"],
    "b": ["b"],
    "d": ["d"],
    "e": ["ee"], 
    "E": ["e"],
    "EI": ["ei", "ij"],
    "f": ["f"],
    "G": ["g", "ch"],
    "h": ["h"],
    "i": ["ie"],
    "I": ["i"],
    "j": ["j"],
    "k": ["k"],
    "l": ["l"],
    "m": ["m"],
    "n": ["n"],
    "N": ["ng"],
    "o": ["oo"],
    "EU": ["eu"],
    "U": ["u"],
    "UI": ["ui"],
    "O": ["o"],
    "p": ["p"],
    "r": ["r"],
    "s": ["s"],
    "t": ["t"],
    "u": ["oe"],
    "v": ["v"],
    "w": ["w"], 
    "AU": ["au", "ou"],
    "x": ["g", "ch"],
    "y": ["uu"],
    "U": ["u"],
    "z": ["z"],
}

# Getter & Setter Functions

In [8]:
def getPhoneticTranscription(prompt):
    try:
        return lexicon_dict.loc[prompt, 1]
    except:
        return '<unk>'
    
getPhoneticTranscription('blauw')

'b l AU'

In [9]:
# Functions
def getPCUS(pcu_lexicon, prompt):
    try:
        selected_row = pcu_lexicon[pcu_lexicon['graphemes'] == prompt].reset_index()
        pcus_aligned = selected_row.loc[0,'graphemes_align']
        phonemes_aligned = selected_row.loc[0,'phonemes_align']
        return pcus_aligned, phonemes_aligned
    
    except:
        print('NOT IN LEXION:', prompt)
        return [],[]
    
getPCUS(pcu_lexicon, 'web') # web, worstje

(['w', 'e', 'b'], ['w', 'E', 'p'])

In [10]:
"""
Input: 
pcus_aligned: ['w', 'e', 'b']
phones_aligned: ['w', 'E', 'p']

Output:
For each PCU-phoneme pair whether it is sound pure (klankzuiver) or not: [True, True, False]
"""
def isKlankzuiver(graphemes_aligned, phonemes_aligned):
    return [phone in klankzuiver_dict.keys() and graphemes_aligned[idx] in klankzuiver_dict[phone] for idx, phone in enumerate(phonemes_aligned)]

# Label each target word with "klankzuiver": False or True
def wordKlankzuiver(row):
    return row['klankzuiver_pcus_total'] == len(row['target_phones']) and row['klankzuiver_pcus_total']>0

isKlankzuiver(getPCUS(pcu_lexicon, 'web')[0], getPCUS(pcu_lexicon, 'web')[1])

[True, True, False]

# Read original data

DART read word lists

In [11]:
### Fill DART DF List with different dart preposttest subsets with annotations
dart_df_list = []

### DART: Stephanies annotations (SPECOM set)
csv_file_list = glob.glob('../../astla-data/dart-preposttest/specom-data/chunks_attempts_matched_6sep_manAnn_whispert/*.csv')

for csv_file in csv_file_list:

    # Read all annotations
    df = pd.read_csv(csv_file).fillna('<unk>')
    
    # Add phonetic transcription of prompt
    df['prompt_in_phonemes'] = df['prompt'].apply(getPhoneticTranscription)
    df['phonTrans'] = df['phonTrans'].apply(lambda x: re.sub('\s\s+', ' ', x.replace('-', ' ')))
    # print(df['assessmentDescription'])

    # Select relevant data
    df_selection = df[['index', 'prompt', 'prompt_in_phonemes', 'phonTrans', 'assessmentDescription']].rename(columns={'index': 'filename', 'assessmentDescription': 'error_cat'})
    
    dart_df_list.append(df_selection)

print(len(dart_df_list))

### DART Core set
dart_core_df = pd.read_csv('../../astla-data/dart-preposttest/slate-data/10-output/dart-coreset-all-information.csv')
# print(dart_core_df['phon_category'])

def extractPromptFromFilename(filename):
    return filename.split('_')[1]
    
dart_core_df['prompt'] = dart_core_df['filename'].apply(extractPromptFromFilename)
dart_core_df['prompt_in_phonemes_prep']= dart_core_df['prompt_in_phonemes'].apply(lambda x: re.sub('\s\s+', ' ', x.replace('-', ' ')))

dart_df = dart_core_df.loc[:, ['filename', 'prompt', 'prompt_in_phonemes_prep', 'phonetician1', 'phon_category']].rename(columns={'phonetician1': 'phonTrans', 'prompt_in_phonemes_prep': 'prompt_in_phonemes', 'phon_category':'error_cat'})
dart_df.head(2)

dart_df_list.append(dart_df)
print(len(dart_df_list))


22
23


In [12]:
# Concatenate both DART datasets
dart_df = pd.concat(dart_df_list).reset_index(drop=True)
dart_df.to_csv('/vol/tensusers5/wharmsen/astla-data/lrec-data/dart-reading-selection.csv')
dart_df

Unnamed: 0,filename,prompt,prompt_in_phonemes,phonTrans,error_cat
0,250ab45e-a370-4e97-94d6-0d12c2791a34_lesje,lesje,l E S @,l E S @,cor
1,250ab45e-a370-4e97-94d6-0d12c2791a34_kijken,kijken,k EI k @,k EI k @,cor
2,250ab45e-a370-4e97-94d6-0d12c2791a34_hagel,hagel,h a x @ l,h A N h a x @ r,multi-sub
3,250ab45e-a370-4e97-94d6-0d12c2791a34_jaap,jaap,j a p,j a p,cor
4,250ab45e-a370-4e97-94d6-0d12c2791a34_sterk,sterk,s t E r k,s t E r k,cor
...,...,...,...,...,...
667,e5920085-8c1b-415c-b25d-24b5e5a4a2bf_stuwdam,stuwdam,s t y w d A m,s t y w d A m,prompt
668,e5920085-8c1b-415c-b25d-24b5e5a4a2bf_vorst,vorst,v O r s t,s O r v,multi
669,e5920085-8c1b-415c-b25d-24b5e5a4a2bf_warmst,warmst,w A r m s t,w A r m s,del
670,e5920085-8c1b-415c-b25d-24b5e5a4a2bf_web,web,w E p,h E p,sub


Basiscript dictations

In [13]:
# BasiScript
basiscript_df = pd.read_csv('/vol/tensusers5/wharmsen/spelling-data/aseda/3-classdata/word-dataframe-basiscript-dictee.csv', converters={'target_phones': eval,
                                                           'target_pcus': eval,
                                                           'original_pcus': eval})

def isGroep4p1(pair_id):
    return pair_id.find('.g4.p1') != -1

def isGroep4p2(pair_id):
    return pair_id.find('.g4.p2') != -1

groep4DF = basiscript_df[basiscript_df['pair_id'].apply(isGroep4p1)]
groep4p2DF = basiscript_df[basiscript_df['pair_id'].apply(isGroep4p2)]

groep4DF.head()

Unnamed: 0,pair_id,original,target_aligned,original_aligned,correct,target_phones,target_pcus,original_pcus
35,bsdict.g4.p1-pair0,huis,huis,huis,1,"[h, UI, s]","[h, ui, s]","[h, ui, s]"
36,bsdict.g4.p1-pair1,keus,keus,keus,1,"[k, EU, s]","[k, eu, s]","[k, eu, s]"
37,bsdict.g4.p1-pair2,fee,fee,fee,1,"[f, e]","[f, ee]","[f, ee]"
38,bsdict.g4.p1-pair3,leet,leed,leet,0,"[l, e, t]","[l, ee, d]","[l, ee, t]"
39,bsdict.g4.p1-pair4,bank,bank,bank,1,"[b, A, N, k]","[b, a, n, k]","[b, a, n, k]"


In [14]:
# Restore Student IDs
studentIDlist = []
for i in range(int(len(groep4DF)/25)):
    studentID = 's' + str(i)
    studentIDlist += [studentID] * 25

groep4DF.loc[:,'studentID'] = studentIDlist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groep4DF.loc[:,'studentID'] = studentIDlist


In [15]:
# Make data selection and add prompts
groep4DF.loc[:, 'target'] = groep4DF.loc[:,'target_aligned'].apply(lambda x: x.replace("*", "") )
groep4DF = groep4DF[['target', 'original', 'studentID']].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  groep4DF.loc[:, 'target'] = groep4DF.loc[:,'target_aligned'].apply(lambda x: x.replace("*", "") )


# Alignment 1: Target graph-phon

pcu_lexicon contains for each prompt the alignment between target graphemes and phonemes.
For each prompt in the data set, we look up the corresponding alignment in this pcu_lexicon

DART

In [16]:
target_pcus = []
target_phones = []
klankzuiver = []

for idx, row in dart_df.iterrows():
    prompt = dart_df.loc[idx, 'prompt']
    pcus_aligned, phonemes_aligned = getPCUS(pcu_lexicon, prompt)

    pcus_isKlankzuiver = isKlankzuiver(pcus_aligned, phonemes_aligned)

    target_pcus.append(pcus_aligned)
    target_phones.append(phonemes_aligned)
    klankzuiver.append(pcus_isKlankzuiver)

dart_df['target_phones'] = target_phones
dart_df['target_pcus'] = target_pcus
dart_df['klankzuiver'] = klankzuiver
dart_df['klankzuiver_pcus_total'] = dart_df['klankzuiver'].apply(lambda x: sum(x))
dart_df['word_klankzuiver'] = dart_df.apply(wordKlankzuiver, axis = 1)

dart_df.head(2)

Unnamed: 0,filename,prompt,prompt_in_phonemes,phonTrans,error_cat,target_phones,target_pcus,klankzuiver,klankzuiver_pcus_total,word_klankzuiver
0,250ab45e-a370-4e97-94d6-0d12c2791a34_lesje,lesje,l E S @,l E S @,cor,"[l, E, S, @]","[l, e, sj, e]","[True, True, False, False]",2,False
1,250ab45e-a370-4e97-94d6-0d12c2791a34_kijken,kijken,k EI k @,k EI k @,cor,"[k, EI, k, @, *]","[k, ij, k, e, n]","[True, True, True, False, False]",3,False


BasiScript

In [17]:
target_pcus = []
target_phones = []
klankzuiver = []

for idx, row in groep4DF.iterrows():
    prompt = groep4DF.loc[idx, 'target']
    pcus_aligned, phonemes_aligned = getPCUS(pcu_lexicon, prompt)

    pcus_isKlankzuiver = isKlankzuiver(pcus_aligned, phonemes_aligned)

    target_pcus.append(pcus_aligned)
    target_phones.append(phonemes_aligned)
    klankzuiver.append(pcus_isKlankzuiver)

groep4DF['target_phones'] = target_phones
groep4DF['target_pcus'] = target_pcus
groep4DF['klankzuiver'] = klankzuiver
groep4DF['klankzuiver_pcus_total'] = groep4DF['klankzuiver'].apply(lambda x: sum(x))
groep4DF['word_klankzuiver'] = groep4DF.apply(wordKlankzuiver, axis = 1)
groep4DF.head(2)

Unnamed: 0,target,original,studentID,target_phones,target_pcus,klankzuiver,klankzuiver_pcus_total,word_klankzuiver
0,huis,huis,s0,"[h, UI, s]","[h, ui, s]","[True, True, True]",3,True
1,keus,keus,s0,"[k, EU, s]","[k, eu, s]","[True, True, True]",3,True


# ADAPT and ADAGT

ADAPT: Align phonetic strings of DART data

In [18]:
# ADAPT alignment

# prompt = "kijken"
# prompt_ref = "k EI k @"
# prompt_hyp = "k EI k @ n"

prompt = 'hagel'
prompt_ref = 'h a x @ l'
prompt_hyp = 'h A N h a x @ r'

# PCU lexicon uses * for ins/del
target_pcus, target_phones = getPCUS(pcu_lexicon, prompt)
target_phones_adapt = [cgn2_adapt_map.cgn2_to_adapt_dict[phone] for phone in target_phones] #o.a. change * to - 
print(target_pcus)
print(target_phones)
print(target_phones_adapt)

# ADAPT uses '-' for ins/del
align_ref, align_hyp, align_ref_adapt, align_hyp_adapt = run_adapt.reverse_align_two_phone_strings(prompt_ref, prompt_hyp)
print(align_ref)
print(align_hyp)
print(align_ref_adapt)
print(align_hyp_adapt)

pcus_target_graph, pcus_target_phon_adapt, pcus_orig_phon_adapt = deduce_pcus.computePCUs(align_ref_adapt, align_hyp_adapt, target_phones_adapt, target_pcus, '-')

# Postprocess
pcus_target_phon = [cgn2_adapt_map.adapt_to_cgn2_dict[phone] for phone in pcus_target_phon_adapt[0]]
pcus_orig_phon = [cgn2_adapt_map.adapt_to_cgn2_dict[phone] for phone in pcus_orig_phon_adapt[0]]
print(pcus_target_graph, pcus_target_phon, pcus_orig_phon)

['h', 'a', 'g', 'e', 'l']
['h', 'a', 'x', '@', 'l']
['h', 'a', 'x', '@', 'l']


KeyError: '-'

In [None]:
# Initialize new columns
pcus_target_graph_list = []
pcus_target_phon_list = []
pcus_orig_phon_list = []

for idx, row in dart_df.iterrows():
    # try:
    # Preprocess
    prompt_ref = row['prompt_in_phonemes']
    prompt_hyp = row['phonTrans']

    # The case that the prompt word is not read
    if(prompt_ref == '<unk>' or prompt_hyp == '<unk>'):
        print(prompt_ref, prompt_hyp)
        pcus_target_graph = []
        pcus_target_phon = []
        pcus_orig_phon = []

    else:
        try:
            # ADAPT alignment
            align_ref, align_hyp, align_ref_adapt, align_hyp_adapt = run_adapt.reverse_align_two_phone_strings(prompt_ref, prompt_hyp)

            target_phones = row['target_phones']
            target_phones_adapt = [cgn2_adapt_map.cgn2_to_adapt_dict[phone] for phone in target_phones]

            target_pcus = row['target_pcus']

            # Alignment with original phonemes
            pcus_target_graph, pcus_target_phon_adapt, pcus_orig_phon_adapt = deduce_pcus.computePCUs(align_ref_adapt, align_hyp_adapt, target_phones_adapt, target_pcus, '-')

            # Postprocess
            pcus_target_phon = [cgn2_adapt_map.adapt_to_cgn2_dict[phone] for phone in pcus_target_phon_adapt[0]]
            pcus_orig_phon = [cgn2_adapt_map.adapt_to_cgn2_dict[phone] for phone in pcus_orig_phon_adapt[0]]
        except:
            print(target_pcus)
            print(target_phones)
            print(target_phones_adapt)
            print(align_ref)
            print(align_hyp)
            print(align_ref_adapt)
            print(align_hyp_adapt)
            print(pcus_target_graph, pcus_target_phon_adapt, pcus_orig_phon_adapt)
        
    pcus_target_graph_list.append(pcus_target_graph[0]) if len(pcus_target_graph) > 0 else pcus_target_graph_list.append(pcus_target_graph)
    pcus_target_phon_list.append(pcus_target_phon)
    pcus_orig_phon_list.append(pcus_orig_phon)

dart_df['pcus_target_graph'] = pcus_target_graph_list
dart_df['pcus_target_phon'] = pcus_target_phon_list
dart_df['pcus_orig_phon'] = pcus_orig_phon_list
dart_df.head(3)

s t u p <unk>
m o j <unk>
d e k @ <unk>
s p y w <unk>
s p i r k r A x t <unk>
f l I t s <unk>
j O p <unk>
f l I t s <unk>
<unk> <unk>


Unnamed: 0,filename,prompt,prompt_in_phonemes,phonTrans,error_cat,target_phones,target_pcus,klankzuiver,klankzuiver_pcus_total,word_klankzuiver,pcus_target_graph,pcus_target_phon,pcus_orig_phon
0,250ab45e-a370-4e97-94d6-0d12c2791a34_lesje,lesje,l E S @,l E S @,cor,"[l, E, S, @]","[l, e, sj, e]","[True, True, False, False]",2,False,"[l, e, sj, e]","[l, E, S, @]","[l, E, S, @]"
1,250ab45e-a370-4e97-94d6-0d12c2791a34_kijken,kijken,k EI k @,k EI k @,cor,"[k, EI, k, @, *]","[k, ij, k, e, n]","[True, True, True, False, False]",3,False,"[k, ij, k, e, n]","[k, EI, k, @, *]","[k, EI, k, @, *]"
2,250ab45e-a370-4e97-94d6-0d12c2791a34_hagel,hagel,h a x @ l,h A N h a x @ r,multi-sub,"[h, a, x, @, l]","[h, a, g, e, l]","[True, False, True, False, True]",3,False,"[-, -, -, h, a, g, e, l]","[*, *, *, h, a, x, @, l]","[h, A, N, h, a, x, @, r]"


In [None]:
pcuSegmFileDART = os.path.join(lrecDataDir, '4-pcu-segmentations-dart-preposttest.csv')
dart_df.to_csv(pcuSegmFileDART, sep=';')

ADAGT: Align grapheme strings of BasiScript data

In [None]:
groep4DF.head(1)

Unnamed: 0,target,original,studentID,target_phones,target_pcus,klankzuiver,klankzuiver_pcus_total,word_klankzuiver
0,huis,huis,s0,"[h, UI, s]","[h, ui, s]","[True, True, True]",3,True


In [None]:
# Initialize new columns
target_aligned_list = []
original_aligned_list = []
pcus_target_phon_list = []
pcus_target_graph_list = []
pcus_orig_graph_list = []

for idx, row in groep4DF.iterrows():

    try:
        # Preprocess
        prompt_ref = row['target']
        prompt_hyp = row['original']

        # ADAGT alignment
        align_ref, align_hyp = adagt.align(prompt_ref, prompt_hyp)

        target_phones = row['target_phones']
        target_pcus = row['target_pcus']

        # Alignment with original phonemes
        pcus_target_phon, pcus_target_graph, pcus_orig_graph = deduce_pcus_graph.computePCUs(align_ref, align_hyp, target_pcus, target_phones, '-')
        
    # Except case is activated in case word is not read (in that case "target_phones" is NaN)
    except:
        align_ref = ''
        align_hyp = ''
        pcus_target_phon = []
        pcus_target_graph = []
        pcus_orig_graph = []

    target_aligned_list.append(align_ref)
    original_aligned_list.append(align_hyp)
    pcus_target_phon_list.append(pcus_target_phon)
    pcus_target_graph_list.append(pcus_target_graph[0]) if len(pcus_target_graph) > 0 else pcus_target_graph_list.append(pcus_target_graph)
    pcus_orig_graph_list.append(pcus_orig_graph)

In [None]:
groep4DF['target_aligned'] = target_aligned_list
groep4DF['original_aligned'] = original_aligned_list
groep4DF['pcus_target_graph'] = pcus_target_graph_list
groep4DF['pcus_target_phon'] = [x[0] if len(x)>0 else [] for x in pcus_target_phon_list ]
groep4DF['pcus_orig_graph'] = [x[0] if len(x)>0 else [] for x in pcus_orig_graph_list ]
groep4DF.head(3)

Unnamed: 0,target,original,studentID,target_phones,target_pcus,klankzuiver,klankzuiver_pcus_total,word_klankzuiver,pcus_target_graph,pcus_target_phon,pcus_orig_graph,target_aligned,original_aligned
0,huis,huis,s0,"[h, UI, s]","[h, ui, s]","[True, True, True]",3,True,"[h, ui, s]","[h, UI, s]","[h, ui, s]",huis,huis
1,keus,keus,s0,"[k, EU, s]","[k, eu, s]","[True, True, True]",3,True,"[k, eu, s]","[k, EU, s]","[k, eu, s]",keus,keus
2,fee,fee,s0,"[f, e]","[f, ee]","[True, True]",2,True,"[f, ee]","[f, e]","[f, ee]",fee,fee


In [None]:
pcuSegmFileBS = os.path.join(lrecDataDir, '4-pcu-segmentations-basiscript-g4p1.csv')
groep4DF.to_csv(pcuSegmFileBS, sep=';')

Test for adagt and deduce_pcus_graph

In [None]:
# Test function
import alignment_adagt.adagt as adagt
import alignment_adagt.deduce_pcus_orig_graph as deduce_pcus_graph

# Preprocess
prompt_ref = 'monteur'
prompt_hyp = 'montur'

# ADAGT alignment
align_ref, align_hyp = adagt.align(prompt_ref, prompt_hyp)
print(align_ref, align_hyp)

target_phones, target_pcus = getPCUS(pcu_lexicon, align_ref)
print(target_phones, target_pcus)

# Alignment with original phonemes
pcus_target_phon, pcus_target_graph, pcus_orig_graph = deduce_pcus_graph.computePCUs(align_ref, align_hyp, target_pcus, target_phones, '-')
print(pcus_target_phon, pcus_target_graph, pcus_orig_graph)

monteur mont*ur
['m', 'o', 'n', 't', 'eu', 'r'] ['m', 'O', 'n', 't', 'EU', 'r']
[['m', 'o', 'n', 't', 'eu', 'r']] [['m', 'O', 'n', 't', 'EU', 'r']] [['m', 'o', 'n', 't', 'u', 'r']]


In [None]:
deduce_pcus_graph.computePCUs('rei-s', 'r-ijs', ['r', 'ei', 's'], ['r', 'EI', 's'], '*')
deduce_pcus_graph.computePCUs('vrouw', 'vraow', ['v', 'r', 'auw'], ['v', 'r', 'AU'], '*')

([['v', 'r', 'AU', '*']], [['v', 'r', 'auw', '*']], [['v', 'r', 'a', 'ow']])