# In [60]:
import os
import csv

# === CONFIG ===
# Directory that is scanned for .ust files.
input_folder = "Teto"
# Path of the CSV file the collected rows are written to.
output_csv = "syllable_dataset.csv"

# Vowel-counting and syllable-splitting helpers.
vowels = {"a", "e", "i", "o", "u"}

# Words that are never split: they are returned (lower-cased) as one unit.
_WHOLE_WORD_EXCEPTIONS = frozenset({
    "name", "this", "that", "these", "those", "some", "none",
    "pear", "pair", "song", "share", "you", "your", "our",
    "hour", "power", "meme", "screen", "screens", "read",
    "real", "have", "face", "even", "they", "change",
    "fruit", "care", "heard", "heart", "ready", "chain",
    "greet", "are",
})

# Word endings that are split off as the final syllable.
# Ordered longest first so that when several endings match the same word
# ("ning"/"ing", "thing"/"ing", "phone"/"one") the most specific one wins.
# Iterating a plain set here would make the choice depend on string hash
# order, which differs between interpreter runs.
_SUFFIXES = tuple(sorted(
    {
        "tion", "sion", "cian", "ding", "ling", "nion", "terred",
        "vous", "toire", "cent", "cause", "cient", "stand",
        "fone", "phone", "nia", "tched", "ted", "kov",
        "phane", "quence", "one", "thing", "ing", "ning",
    },
    key=lambda s: (-len(s), s),
))


def count_vowels(w):
    """Return the number of characters in *w* that are in ``vowels``.

    The check is case-sensitive: ``vowels`` only holds lower-case letters.
    """
    return sum(1 for c in w if c in vowels)


def split_into_syllables(word):
    """Split *word* into a list of lower-case syllable strings.

    The word is lower-cased first. Then:

    * a word listed in the whole-word exceptions is returned as a
      single-element list;
    * otherwise the longest matching known suffix, if any, is cut off and
      becomes the last syllable;
    * the remaining stem is kept whole when it has at most one vowel, and is
      otherwise cut after every vowel (trailing consonants form a final piece).

    Empty pieces are never returned, so a word that consists solely of a
    suffix yields just that suffix, and an empty word yields ``[]``.
    """
    word_l = word.lower()
    if word_l in _WHOLE_WORD_EXCEPTIONS:
        return [word_l]

    # _SUFFIXES is sorted longest first, so the first hit is the longest match.
    suffix = next((sfx for sfx in _SUFFIXES if word_l.endswith(sfx)), None)
    stem = word_l[:-len(suffix)] if suffix else word_l

    if count_vowels(stem) <= 1:
        # Nothing to split; skip the stem entirely when it is empty.
        parts = [stem] if stem else []
    else:
        # Close a syllable after each vowel of the stem.
        parts = []
        cur = ""
        for c in stem:
            cur += c
            if c in vowels:
                parts.append(cur)
                cur = ""
        if cur:
            parts.append(cur)

    if suffix:
        parts.append(suffix)
    return parts

# the actual UST parser
def parse_ust(path):
    """Read the .ust file at *path* and return its notes as syllable rows.

    The file is decoded as UTF-8; if that raises ``UnicodeDecodeError`` it is
    re-read as Shift-JIS. Every stripped line is inspected:

    * a line starting with ``[#`` begins a new section and discards any
      half-collected note;
    * ``Lyric=...`` and ``NoteNum=...`` lines supply the lyric and the pitch.

    As soon as both values are known the note is emitted and the state is
    cleared. Notes whose lyric is ``R`` (any case) are skipped; presumably
    that is the rest marker of the format.

    Returns a list of ``(syllable, position, pitch)`` tuples, where
    *position* is the index of the syllable within its lyric and *pitch* is
    the ``NoteNum`` value converted with ``int``.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            raw_lines = list(handle)
    except UnicodeDecodeError:
        with open(path, "r", encoding="shift_jis") as handle:
            raw_lines = list(handle)

    lyric_key = "Lyric="
    pitch_key = "NoteNum="

    rows = []
    lyric = pitch = None
    for raw in raw_lines:
        text = raw.strip()

        if text.startswith("[#"):
            # New section: forget whatever was collected so far.
            lyric = pitch = None
            continue

        if text.startswith(lyric_key):
            lyric = text[len(lyric_key):]
        elif text.startswith(pitch_key):
            pitch = text[len(pitch_key):]

        if lyric is None or pitch is None:
            # The note is not complete yet.
            continue

        if lyric.upper() != "R":
            rows.extend(
                (syllable, index, int(pitch))
                for index, syllable in enumerate(split_into_syllables(lyric))
            )
        lyric = pitch = None

    return rows

# Gather the rows of every .ust file in the input folder and write the CSV.
all_entries = []
# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the dataset reproducible between runs and across platforms.
for fn in sorted(os.listdir(input_folder)):
    path = os.path.join(input_folder, fn)
    # Only regular files: a directory that happens to end in ".ust" cannot be
    # opened for reading and would abort the whole run.
    if fn.lower().endswith(".ust") and os.path.isfile(path):
        all_entries.extend(parse_ust(path))

# newline="" lets the csv module control line endings itself.
with open(output_csv, "w", newline="", encoding="utf-8") as out:
    w = csv.writer(out)
    w.writerow(["syllable", "position", "pitch"])
    w.writerows(all_entries)

print(f"✅ Parsed {len(all_entries)} syllables → {output_csv}")

# Output: ✅ Parsed 1740 syllables → syllable_dataset.csv
