# Prepare PHN

### Analyse

In [4]:
import os
from collections import Counter

# Directory containing the .phn label files
labels_dir = "data/en/phn"

# Running tallies
label_counter = Counter()
total_frames = 0

# Collect all label files. Sorted so the processing order, and with it the
# order of equally frequent labels in the report, does not depend on the
# arbitrary order returned by os.listdir.
label_files = sorted(f for f in os.listdir(labels_dir) if f.endswith(".phn"))

for file in label_files:
    # Explicit UTF-8 so decoding does not depend on the platform's locale
    # default encoding.
    with open(os.path.join(labels_dir, file), "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    for line in lines:
        parts = line.split()
        # Only lines with exactly three whitespace-separated fields are
        # counted; the third field is taken as the label.
        if len(parts) == 3:
            label = parts[2]
            label_counter[label] += 1
            total_frames += 1

# Report
# NOTE(review): every counted line is one three-field entry, so the "Frames"
# figures below are per-entry counts - confirm whether an entry is a single
# frame or a longer segment.
print("--- Label Verteilung ---")
for label, count in label_counter.most_common():
    percent = count / total_frames * 100
    print(f"{label}: {count} Frames ({percent:.2f}%)")

print(f"\nGesamtanzahl Frames: {total_frames}")


--- Label Verteilung ---
SIL: 110639 Frames (12.37%)
ah: 73024 Frames (8.17%)
t: 59095 Frames (6.61%)
n: 51704 Frames (5.78%)
ih: 50133 Frames (5.61%)
d: 35973 Frames (4.02%)
s: 35545 Frames (3.97%)
r: 30188 Frames (3.38%)
l: 29738 Frames (3.33%)
iy: 28853 Frames (3.23%)
dh: 26305 Frames (2.94%)
m: 23660 Frames (2.65%)
k: 22102 Frames (2.47%)
ae: 21505 Frames (2.40%)
eh: 21444 Frames (2.40%)
z: 21277 Frames (2.38%)
w: 20140 Frames (2.25%)
er: 19586 Frames (2.19%)
ay: 16468 Frames (1.84%)
hh: 16232 Frames (1.82%)
b: 15881 Frames (1.78%)
uw: 15026 Frames (1.68%)
p: 14862 Frames (1.66%)
v: 13115 Frames (1.47%)
f: 12782 Frames (1.43%)
ey: 12553 Frames (1.40%)
ao: 12197 Frames (1.36%)
aa: 10806 Frames (1.21%)
ow: 10668 Frames (1.19%)
ng: 10474 Frames (1.17%)
y: 7617 Frames (0.85%)
g: 7293 Frames (0.82%)
oov: 7018 Frames (0.78%)
aw: 5966 Frames (0.67%)
th: 5878 Frames (0.66%)
sh: 4853 Frames (0.54%)
uh: 4306 Frames (0.48%)
jh: 3558 Frames (0.40%)
ch: 3118 Frames (0.35%)
oy: 2268 Frames (0.25%)

# Clean up 
Remove suffixes and combine the silence variants into a single `SIL` label.

In [3]:
import os
from pathlib import Path

# Directory containing the PHN files
phn_dir = Path("data/en/phn")

# Silence spellings that are normalised to "SIL". The lookup below is done on
# the lower-cased base label, so matching is effectively case-insensitive.
sil_labels = {"sil", "SIL", "silence", "SILENCE"}


def clean_phn_label(label):
    """Return ``label`` without its suffix, mapping silence variants to "SIL".

    Everything from the first underscore onwards is dropped. If the
    lower-cased remainder is in ``sil_labels`` the result is "SIL"; otherwise
    the remainder is returned with its original casing.
    """
    base_label = label.split("_")[0]
    return "SIL" if base_label.lower() in sil_labels else base_label


# Number of non-blank lines dropped because they do not have three fields
dropped_lines = 0

# Materialise the file list up front so files are not rewritten while the
# directory is still being scanned.
for phn_file in sorted(phn_dir.glob("*.phn")):
    with open(phn_file, "r", encoding="utf-8") as f:
        original_text = f.read()

    new_lines = []
    for line in original_text.splitlines():
        parts = line.split()
        if len(parts) != 3:
            # Invalid lines are not carried over; count the non-blank ones so
            # the loss is visible in the cell output.
            if parts:
                dropped_lines += 1
            continue

        start, end, label = parts
        new_lines.append(f"{start} {end} {clean_phn_label(label)}")

    # Overwrite the file only when the cleaned text actually differs, so a
    # re-run of this cell does not rewrite already clean files.
    new_text = "\n".join(new_lines)
    if new_text != original_text:
        with open(phn_file, "w", encoding="utf-8") as f:
            f.write(new_text)

print("Alle .phn-Dateien wurden bereinigt (Suffixe entfernt, SIL vereinheitlicht).")
if dropped_lines:
    print(f"Hinweis: {dropped_lines} ungültige Zeilen wurden verworfen.")


Alle .phn-Dateien wurden bereinigt (Suffixe entfernt, SIL vereinheitlicht).


### Remove OOV

In [1]:
from pathlib import Path
import numpy as np
from tqdm import tqdm

# Input and output directories
base_dir = Path("data/en")
features_in = base_dir / "features"
labels_in = base_dir / "labels"
features_out = base_dir / "features_clean"
labels_out = base_dir / "labels_clean"
features_out.mkdir(exist_ok=True)
labels_out.mkdir(exist_ok=True)

# All feature files, in a stable order
files = sorted(features_in.glob("*.npy"))

removed_frames = 0
total_frames = 0
kept_files = 0
skipped_files = 0  # feature files that have no matching label file

for feat_path in tqdm(files, desc="OOV-Entfernung", unit="dateien"):
    base = feat_path.stem
    label_path = labels_in / f"{base}.txt"

    if not label_path.exists():
        # Nothing to filter against; remember the skip so it can be reported.
        skipped_files += 1
        continue

    # Load the features and one label per line
    features = np.load(feat_path)
    with open(label_path) as f:
        labels = [line.strip() for line in f]

    assert len(features) == len(labels), f"Mismatch in {base}"

    # Keep every entry whose label is not "oov". A single boolean mask drives
    # both the feature rows and the labels, so the two cannot drift apart.
    mask = np.array([lab != "oov" for lab in labels], dtype=bool)
    features_clean = features[mask]
    labels_clean = [lab for lab, keep in zip(labels, mask) if keep]

    removed = len(labels) - len(labels_clean)
    removed_frames += removed
    total_frames += len(labels)
    kept_files += 1

    # Save the filtered pair under the same base name
    np.save(features_out / f"{base}.npy", features_clean)
    with open(labels_out / f"{base}.txt", "w") as f:
        f.write("\n".join(labels_clean))

print("\n--- OOV-Bereinigung abgeschlossen ---")
print(f"📦 Verarbeitete Dateien: {kept_files}")
print(f"🧹 Entfernte OOV-Frames: {removed_frames}")
print(f"✅ Verbleibende Frames: {total_frames - removed_frames}")
print(f"📁 Gespeichert in: {features_out} und {labels_out}")
if skipped_files:
    print(f"⚠️ Übersprungen (keine Label-Datei): {skipped_files}")


OOV-Entfernung: 100%|███████████████████████████████████████████████████████| 26482/26482 [05:37<00:00, 78.43dateien/s]


--- OOV-Bereinigung abgeschlossen ---
📦 Verarbeitete Dateien: 26482
🧹 Entfernte OOV-Frames: 293337
✅ Verbleibende Frames: 13943888
📁 Gespeichert in: data\en\features_clean und data\en\labels_clean



