In [2]:
import scipy.io
from scipy.io import savemat
import numpy as np

In [3]:
def read_file(file_path):
    """Return the text of ``file_path`` with every newline turned into a space.

    The file is opened in text mode using the default encoding of ``open``.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    # Flatten the file onto one line so callers can split on spaces only.
    return raw_text.replace('\n', ' ')

### words.mat

In [5]:
def process_word_boundaries_data(tokenized_data):
    """Turn a flat token list into parallel word / start / end lists.

    Tokens are consumed in records of five. Within a record, field 2 is
    parsed as the start time, field 3 is added to it to obtain the end time,
    and field 4 is kept as the word; fields 0 and 1 are not read.

    When a following record exists and the computed end time is greater than
    that record's start time, the end time is replaced by the next start time.

    Returns:
        (words, start, end): three lists with one entry per record; ``start``
        and ``end`` contain floats.
    """
    record_size = 5
    total = len(tokenized_data)
    words, start, end = [], [], []
    for base in range(0, total, record_size):
        onset = float(tokenized_data[base + 2])
        start.append(onset)

        offset = onset + float(tokenized_data[base + 3])
        next_onset_index = base + 2 + record_size
        if next_onset_index < total:
            next_onset = float(tokenized_data[next_onset_index])
            if offset > next_onset:
                # Clip the interval so it stops where the next word begins.
                offset = next_onset
        end.append(offset)

        words.append(tokenized_data[base + 4])
    return words, start, end
    
def save_words_mat(file_path, filename, save_directory):
    """Build the word-boundary payload for one alignment file and print it.

    Args:
        file_path: path of the word-alignment text file to read.
        filename: name of that file; its last four characters are dropped
            when the output file name is built.
        save_directory: output root; it is concatenated directly with
            "words/", so it must end with a slash.

    The payload has two entries: "spurtWordTimes" (float64 array of
    [start, end] rows) and "words" (list of word strings).

    NOTE(review): the ``savemat`` call at the end is commented out, so this
    function currently only prints the intervals and the payload; nothing is
    written to disk. Re-enable it once the printed output looks right.
    """
    data = read_file(file_path)
    # Tabs and spaces both separate fields. Filtering in one pass drops the
    # empty strings produced by runs of separators (the previous
    # `while "" in ...: remove("")` loop rescanned the list for every removal).
    tokenized_data = [token for token in data.replace('\t', ' ').split(' ') if token]

    words, start, end = process_word_boundaries_data(tokenized_data)
    word_time_intervals = list(zip(start, end))
    print(word_time_intervals)
    # np.float64 instead of np.float_: the np.float_ alias was removed in
    # NumPy 2.0 and raises AttributeError there; the dtype is the same.
    mat_data = {"spurtWordTimes": np.array(word_time_intervals, dtype=np.float64), "words": words}
    mat_filename = save_directory + "words/" + filename[:-4] + "_words.mat"
    print(mat_data)
#     savemat(mat_filename, mat_data)

### vowel.mat & syllable.mat

In [4]:
import syllabifier
# Language definition that is passed through to syllabifier.syllabify().
language = syllabifier.English # or: syllabifier.loadLanguage("english.cfg")
# Names of input files in which process_phonemes() encountered an "spn" label.
spn_containing_files = []
# Phoneme labels of exactly this many characters get their last character
# dropped before vowel lookup and syllable output
# (presumably a trailing stress digit — TODO confirm against the aligner output).
PHONEME_LENGTH = 3

In [5]:
def process_phonemes(data_arr):
    """Collect phoneme labels and their time stamps from a flat token list.

    Tokens are read three at a time as (start, end, label). Records whose
    label is "sil" are skipped. If a label is "spn", the notebook-level
    ``filename`` is appended to ``spn_containing_files`` and printed, and the
    function returns ``None`` straight away.

    Returns:
        ``(start, end, ph, final)`` where the first three are parallel lists
        of the raw (unconverted) tokens and ``final`` is the upper-cased
        labels, each followed by one space; or ``None`` if "spn" was found.
    """
    onsets, offsets, labels = [], [], []
    for base in range(0, len(data_arr), 3):
        label = data_arr[base + 2]
        if label == "sil":
            continue
        if label == "spn":
            # `filename` is not a parameter: it is looked up in the notebook
            # namespace, where the driver loop assigns it for each file.
            spn_containing_files.append(filename)
            print(filename)
            return None
        onsets.append(data_arr[base])
        offsets.append(data_arr[base + 1])
        labels.append(label)
    # Every label is followed by a space, including the last one.
    final = "".join(label.upper() + ' ' for label in labels)
    return onsets, offsets, labels, final

def syllabify_and_extract_times(language, final, start, end):
    """Group phonemes into syllables and derive each syllable's time span.

    ``final`` is syllabified with ``syllabifier.syllabify`` and rendered with
    ``syllabifier.stringify``; the rendered string is split on "." to get one
    chunk per syllable, and each chunk is split on spaces to get its phonemes.

    Args:
        language: language definition forwarded to ``syllabifier.syllabify``.
        final: space-separated phoneme string.
        start: per-phoneme start times, in the same order as ``final``.
        end: per-phoneme end times, in the same order as ``final``.

    Returns:
        ``(final_st, final_end, syl)``: for every non-empty syllable, the
        start time of its first phoneme, the end time of its last phoneme,
        and the list of its phoneme strings.
    """
    syllables = syllabifier.syllabify(language, final)
    main_str = syllabifier.stringify(syllables)

    final_st, final_end, syl = [], [], []
    cursor = 0  # index of the first phoneme of the current syllable
    for chunk in main_str.split("."):
        phonemes = [token for token in chunk.split(" ") if token]
        if not phonemes:
            # A chunk without phonemes (e.g. when the phoneme string is
            # empty) owns no time span. Indexing for it would either raise
            # IndexError or pick up the previous phoneme's end time.
            continue
        final_st.append(start[cursor])
        final_end.append(end[cursor + len(phonemes) - 1])
        syl.append(phonemes)
        cursor += len(phonemes)
    return final_st, final_end, syl

def extract_vowels(start, end, ph):
    """Return the start and end times of every phoneme that is a vowel.

    A label of exactly ``PHONEME_LENGTH`` characters has its last character
    removed before the lookup; the result is upper-cased and tested against
    ``syllabifier.English["vowels"]``.

    Returns:
        ``(vowel_start, vowel_end)``: two lists of floats, one entry per
        vowel, in input order.
    """
    vowel_inventory = syllabifier.English["vowels"]
    vowel_start, vowel_end = [], []
    for idx, label in enumerate(ph):
        base = label[:-1] if len(label) == PHONEME_LENGTH else label
        if base.upper() not in vowel_inventory:
            continue
        vowel_start.append(float(start[idx]))
        vowel_end.append(float(end[idx]))
    return vowel_start, vowel_end

def process_syllables(syl, start, end):
    """Format syllables as lower-case strings and pair up their time stamps.

    Each syllable (a list of phoneme labels) becomes one space-joined,
    lower-cased string; labels of exactly ``PHONEME_LENGTH`` characters lose
    their last character. Syllables that render to an empty string are left
    out of the string list.

    Returns:
        ``(syllables, syllable_times)``: the list of syllable strings and a
        list of ``[start, end]`` float pairs, one per entry of ``start``.
    """
    def _render(label):
        lowered = label.lower()
        return lowered[:-1] if len(label) == PHONEME_LENGTH else lowered

    syllables = []
    for group in syl:
        rendered = " ".join(map(_render, group))
        if rendered:
            syllables.append(rendered)

    syllable_times = [[float(onset), float(end[idx])] for idx, onset in enumerate(start)]
    return syllables, syllable_times

In [6]:
def save_vowel_and_syllable_mat(file_path, filename, save_directory):
    """Write the vowel and syllable .mat files for one phoneme-alignment file.

    Args:
        file_path: path of the phoneme-alignment text file to read.
        filename: name of that file; its last four characters are dropped
            when the output file names are built.
        save_directory: output root; it is concatenated directly with
            "vowel/" and "syllable/", so it must end with a slash.

    Files written:
        ``<save_directory>vowel/<name>_vowel.mat`` with float32 arrays
        "vowelStartTime" and "vowelEndTime".
        ``<save_directory>syllable/<name>_syllable.mat`` with "spurtSyl"
        (syllable strings) and "spurtSylTimes" (float64 [start, end] rows).

    Nothing is written when ``process_phonemes`` returns ``None``, which it
    does after recording a file that contains an "spn" label.
    """
    data = read_file(file_path)
    # Tabs and spaces both separate fields. Filtering in one pass drops the
    # empty strings produced by runs of separators.
    tokenized_data = [token for token in data.replace('\t', ' ').split(' ') if token]

    phoneme_data = process_phonemes(tokenized_data)
    if phoneme_data is None:
        # Unpacking None would raise TypeError and abort the whole batch;
        # the file has already been recorded by process_phonemes, so skip it.
        return
    start, end, ph, final = phoneme_data

    final_st, final_end, syl = syllabify_and_extract_times(language, final, start, end)

    vowel_start, vowel_end = extract_vowels(start, end, ph)
    mdic_vowel = {"vowelStartTime": np.array(vowel_start, dtype=np.float32), "vowelEndTime": np.array(vowel_end, dtype=np.float32)}
    savemat("{}vowel/{}_vowel.mat".format(save_directory, filename[:-4]), mdic_vowel)

    syllables, syllable_times = process_syllables(syl, final_st, final_end)
    # np.float64 instead of np.float_: the np.float_ alias was removed in
    # NumPy 2.0 and raises AttributeError there; the dtype is the same.
    mdic_syllable = {"spurtSyl": np.array(syllables), "spurtSylTimes": np.array(syllable_times, dtype=np.float64)}
    savemat("{}syllable/{}_syllable.mat".format(save_directory, filename[:-4]), mdic_syllable)

### Generate mat files

In [8]:
import os
from tqdm import tqdm

# All three directories must end with a slash: file paths are built from them
# by plain string concatenation.
# Input: word-alignment files, read by save_words_mat().
word_directory = "../old_TCSSBC/word_and_phoneme_boundaries/DUC2001_FA_result/"
# Input: phoneme-alignment files, read by save_vowel_and_syllable_mat().
vowel_syl_directory = "../old_TCSSBC/word_and_phoneme_boundaries/DUC2001_FA_result_phn/"
# Output root; results go to its "words/", "vowel/" and "syllable/" subfolders.
save_directory = "./mat_files/"

In [10]:
empty_files = []  # names of zero-byte files in word_directory (having 0 words)
for filename in tqdm(os.listdir(word_directory)):
    size_in_bytes = os.path.getsize(word_directory + filename)
    if size_in_bytes == 0:
        # Nothing to convert; remember the name so it can be reported later.
        empty_files.append(filename)
        continue
    if not filename.endswith(".txt"):
        continue

    file_path = word_directory + filename
    print(filename)
    save_words_mat(file_path, filename, save_directory)
    # NOTE(review): this break stops the loop after the first non-empty .txt
    # file — it looks like a debugging leftover; confirm before a full run.
    break

  0%|          | 0/10707 [00:00<?, ?it/s]

FT911-2650_62.txt
[(0.03, 0.11), (0.11, 0.41), (0.41, 0.5599999999999999), (0.56, 1.09), (1.09, 1.18), (1.18, 1.27), (1.27, 1.67), (1.67, 1.7999999999999998), (1.8, 2.06), (2.06, 2.68), (2.68, 3.13), (3.13, 3.34), (3.34, 3.7199999999999998), (3.72, 4.09), (4.09, 4.27), (4.27, 4.3999999999999995), (4.4, 4.69), (4.69, 5.11), (5.11, 5.31), (5.31, 5.789999999999999), (5.79, 5.91), (5.91, 6.6), (6.6, 6.84), (6.84, 7.04), (7.04, 7.64)]
{'spurtWordTimes': array([[0.03, 0.11],
       [0.11, 0.41],
       [0.41, 0.56],
       [0.56, 1.09],
       [1.09, 1.18],
       [1.18, 1.27],
       [1.27, 1.67],
       [1.67, 1.8 ],
       [1.8 , 2.06],
       [2.06, 2.68],
       [2.68, 3.13],
       [3.13, 3.34],
       [3.34, 3.72],
       [3.72, 4.09],
       [4.09, 4.27],
       [4.27, 4.4 ],
       [4.4 , 4.69],
       [4.69, 5.11],
       [5.11, 5.31],
       [5.31, 5.79],
       [5.79, 5.91],
       [5.91, 6.6 ],
       [6.6 , 6.84],
       [6.84, 7.04],
       [7.04, 7.64]]), 'words': ['the', 'eb




In [8]:
for filename in tqdm(os.listdir(vowel_syl_directory)):
    if not filename.endswith(".txt"):
        continue
    # The size check deliberately looks at the file of the same name in
    # word_directory: a file with no words is skipped here as well.
    if os.path.getsize(word_directory + filename) <= 0:
        continue
    file_path = vowel_syl_directory + filename
    save_vowel_and_syllable_mat(file_path, filename, save_directory)

100%|██████████| 10708/10708 [01:55<00:00, 92.70it/s] 


In [11]:
empty_files # zero-byte input files collected above; no .mat files are generated for these

['WSJ910208-0130_8.txt',
 'LA103089-0043_46.txt',
 'WSJ910710-0148_12.txt',
 'SJMN91-06283083_37.txt',
 'SJMN91-06283083_26.txt',
 'WSJ911121-0136_15.txt',
 'FBIS3-41_21.txt',
 'LA061589-0143_29.txt',
 'SJMN91-06246065_10.txt',
 'LA010890-0031_72.txt',
 'WSJ880621-0079_39.txt',
 'LA052289-0050_90.txt',
 'SJMN91-06246065_11.txt',
 'LA102190-0045_61.txt',
 'LA011889-0067_21.txt',
 'LA081490-0030_18.txt',
 'LA103089-0043_45.txt',
 'SJMN91-06276078_14.txt',
 'LA050889-0075_45.txt',
 'LA061589-0143_8.txt',
 'SJMN91-06187248_69.txt',
 'LA103089-0043_67.txt',
 'WSJ920114-0145_40.txt',
 'LA102190-0045_62.txt',
 'SJMN91-06246065_12.txt',
 'LA103089-0043_63.txt',
 'LA102190-0045_73.txt',
 'LA032589-0044_33.txt',
 'SJMN91-06283083_27.txt',
 'LA042790-0205_18.txt',
 'WSJ910702-0078_44.txt',
 'WSJ910107-0139_40.txt',
 'SJMN91-06246065_38.txt',
 'LA061589-0143_7.txt',
 'LA102190-0045_52.txt',
 'LA081490-0030_19.txt',
 'SJMN91-06290146_13.txt',
 'WSJ880621-0079_70.txt',
 'LA050889-0075_46.txt',
 'WSJ

In [14]:
# Sanity check: load one generated words file back and print its contents.
# NOTE(review): this reads from '../mat_files/' while save_directory is set to
# './mat_files/' — confirm which location is intended.
test = scipy.io.loadmat('../mat_files/words/FT911-2650_62_words.mat')
print(test)

{'spurtWordTimes': array([[0.03, 0.11],
       [0.11, 0.41],
       [0.41, 0.56],
       [0.56, 1.09],
       [1.09, 1.18],
       [1.18, 1.27],
       [1.27, 1.67],
       [1.67, 1.8 ],
       [1.8 , 2.06],
       [2.06, 2.68],
       [2.68, 3.13],
       [3.13, 3.34],
       [3.34, 3.72],
       [3.72, 4.09],
       [4.09, 4.27],
       [4.27, 4.4 ],
       [4.4 , 4.69],
       [4.69, 5.11],
       [5.11, 5.31],
       [5.31, 5.79],
       [5.79, 5.91],
       [5.91, 6.6 ],
       [6.6 , 6.84],
       [6.84, 7.04],
       [7.04, 7.64]]), '__version__': '1.0', '__header__': 'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Dec  6 17:53:29 2023', 'words': array([u'the        ', u'ebrd       ', u'is         ', u'ministering',
       u'to         ', u'a          ', u'region     ', u'that       ',
       u'has        ', u'historical ', u'ties       ', u'with       ',
       u'western    ', u'europe     ', u'and        ', u'that       ',
       u'has        ', u'declared   ', u'its    