# Inference with the model

In [6]:
import librosa
import numpy as np
import tensorflow as tf

# Audio front-end parameters shared by feature extraction and time-stamping.
sr = 16000        # sample rate in Hz
n_fft = 400       # analysis window in samples (400 / 16000 = 25 ms)
hop_length = 160  # frame step in samples (160 / 16000 = 10 ms)
n_mfcc = 13       # number of cepstral coefficients per frame

# Trained Keras model used for prediction.
model = tf.keras.models.load_model("viseme_model.h5")

# Viseme decoder: class index → viseme label (inverse LabelEncoder if needed).
idx_to_viseme = dict(enumerate(("AE", "FV", "MPB", "SIL", "SK", "WURO", "YIL")))

def extract_mfcc(y, sr):
    """Build per-frame MFCC features with first- and second-order deltas.

    Uses the module-level ``n_mfcc``, ``n_fft`` and ``hop_length`` settings.

    Args:
        y: Audio samples as accepted by ``librosa.feature.mfcc``.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        The static coefficients, their deltas and delta-deltas stacked
        row-wise and then transposed, i.e. one row per frame with
        ``3 * n_mfcc`` columns ([T, 39] with the current settings).
    """
    static = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    # Both derivatives are taken from the static coefficients (order 1, then 2).
    derivatives = [librosa.feature.delta(static, order=k) for k in (1, 2)]
    return np.vstack([static, *derivatives]).T

def predict_visemes(wav_path):
    """Print the predicted viseme for every feature frame of a WAV file.

    The audio is loaded at the module-level sample rate ``sr``, converted to
    features by ``extract_mfcc`` and run through ``model`` as a batch of one.
    For each frame the highest-scoring class is looked up in
    ``idx_to_viseme`` (``"UNK"`` if the index is not in the table).

    Args:
        wav_path: Path of the audio file to analyse.

    Returns:
        None. One line per frame is printed as ``<seconds>s<TAB><viseme>``.
    """
    signal, _ = librosa.load(wav_path, sr=sr)

    # Add a leading batch axis: per the original notes the model takes
    # [1, T, 39] and yields [T, 7] scores once the batch axis is dropped.
    batch = extract_mfcc(signal, sr)[np.newaxis, ...]
    frame_scores = model.predict(batch)[0]
    class_ids = np.argmax(frame_scores, axis=-1)

    # Seconds covered by one frame step (10 ms for hop_length=160 at 16 kHz).
    seconds_per_frame = hop_length / sr

    for frame_index, class_id in enumerate(class_ids):
        timestamp = frame_index * seconds_per_frame
        label = idx_to_viseme.get(class_id, "UNK")
        print(f"{timestamp:.2f}s\t{label}")

# Example: predict the visemes of a single WAV file.
# The path is a raw string because "\T", "\D", "\F" and "\S" are not valid
# escape sequences; a plain literal raises an invalid-escape warning on
# current Python versions. The raw form denotes the exact same path.
predict_visemes(r"data_orginal\TEST\DR1\FAKS0\SA1.WAV.wav")





  predict_visemes("data_orginal\TEST\DR1\FAKS0\SA1.WAV.wav")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
0.00s	SIL
0.01s	SIL
0.02s	SIL
0.03s	SIL
0.04s	SIL
0.05s	SIL
0.06s	SIL
0.07s	SIL
0.08s	SIL
0.09s	SIL
0.10s	SIL
0.11s	SIL
0.12s	SIL
0.13s	SIL
0.14s	SIL
0.15s	SIL
0.16s	SIL
0.17s	SIL
0.18s	SIL
0.19s	SIL
0.20s	SIL
0.21s	SIL
0.22s	SIL
0.23s	SK
0.24s	SIL
0.25s	SIL
0.26s	SIL
0.27s	SIL
0.28s	SIL
0.29s	SIL
0.30s	SIL
0.31s	SIL
0.32s	SIL
0.33s	SIL
0.34s	SIL
0.35s	SIL
0.36s	SIL
0.37s	SIL
0.38s	SIL
0.39s	SIL
0.40s	SIL
0.41s	SIL
0.42s	SIL
0.43s	SIL
0.44s	SIL
0.45s	SIL
0.46s	SIL
0.47s	SIL
0.48s	SIL
0.49s	SIL
0.50s	SIL
0.51s	SIL
0.52s	SIL
0.53s	SIL
0.54s	SIL
0.55s	SIL
0.56s	SIL
0.57s	SIL
0.58s	SIL
0.59s	SIL
0.60s	SK
0.61s	SK
0.62s	SK
0.63s	SK
0.64s	SK
0.65s	SK
0.66s	SK
0.67s	SK
0.68s	SK
0.69s	SK
0.70s	YIL
0.71s	YIL
0.72s	YIL
0.73s	YIL
0.74s	YIL
0.75s	YIL
0.76s	YIL
0.77s	YIL
0.78s	YIL
0.79s	YIL
0.80s	SIL
0.81s	SIL
0.82s	SIL
0.83s	SIL
0.84s	SIL
0.85s	SIL
0.86s	SIL
0.87s	SIL
0.88s	AE
0.89s	AE
0.90s	AE
0.91s	AE
0.92s	AE
0.93s	AE
0.94s	A

In [9]:
import re 

# Lookup table from a lower-case phoneme label to its viseme class.
# It is generated from one group per viseme so that each mouth shape is
# described in a single place; group and member order fix the key order.
phoneme_to_viseme = {
    phoneme: viseme
    for viseme, phonemes in (
        # MPB: lips closed
        ("MPB", ("m", "p", "b", "em", "en")),
        # FV: teeth on lip
        ("FV", ("f", "v")),
        # AE: open mouth, vowels
        ("AE", ("ae", "axr", "aa", "aw", "ay", "ah", "ao", "eh", "er", "ax", "ix")),
        # YIL: tongue forward, narrow mouth
        ("YIL", ("iy", "ih", "ey", "el", "l", "y")),
        # WURO: rounded, lips pushed forward
        ("WURO", ("w", "oy", "ux", "uw", "uh", "r", "ow", "ax-h")),
        # SK: tongue raised or back
        ("SK", ("s", "eng", "z", "t", "d", "dx", "n", "nx", "ng", "k", "g",
                "ch", "jh", "sh", "zh", "th", "dh")),
        # SIL: all pauses, noises, etc.
        ("SIL", ("sil", "pau", "h#", "epi", "pcl", "tcl", "kcl", "bcl", "dcl",
                 "gcl", "q", "hh", "hv")),
    )
    for phoneme in phonemes
}


def clean_phoneme(phoneme):
    """Normalise a phoneme label for lookup.

    The label is lower-cased and a single digit at its very end is removed
    (presumably a stress marker, e.g. ``"AH1"`` becomes ``"ah"``).

    Args:
        phoneme: Phoneme label as a string.

    Returns:
        The normalised label.
    """
    lowered = phoneme.lower()
    trailing_digit = re.compile(r"\d$")
    return trailing_digit.sub("", lowered)

def log_phn_as_viseme_with_seconds(phn_path, sr=16000):
    """Print each segment of a .PHN file with times in seconds and its viseme.

    Every line made of exactly three whitespace-separated fields
    (start sample, end sample, phoneme) is reported; any other line is
    skipped. Phonemes missing from ``phoneme_to_viseme`` are shown as "SIL".

    Args:
        phn_path: Path of the phoneme annotation file.
        sr: Sample rate used to convert sample offsets to seconds.

    Returns:
        None. One line per segment is printed.
    """
    with open(phn_path, "r") as phn_file:
        for raw_line in phn_file:
            fields = raw_line.split()
            if len(fields) == 3:
                start_sec = int(fields[0]) / sr
                end_sec = int(fields[1]) / sr
                phoneme = fields[2]
                viseme = phoneme_to_viseme.get(clean_phoneme(phoneme), "SIL")
                print(f"{start_sec:.3f}s - {end_sec:.3f}s: {phoneme} → {viseme}")
# Raw string: "\T", "\D", "\F" and "\S" are not valid escape sequences, so a
# plain literal raises an invalid-escape warning on current Python versions.
# The raw form denotes the exact same path.
log_phn_as_viseme_with_seconds(r"data_orginal\TEST\DR1\FAKS0\SA1.PHN")

0.000s - 0.603s: h# → SIL
0.603s - 0.703s: sh → SK
0.703s - 0.799s: iy → YIL
0.799s - 0.880s: hv → SIL
0.880s - 1.010s: ae → AE
1.010s - 1.055s: dcl → SIL
1.055s - 1.069s: d → SK
1.069s - 1.099s: y → YIL
1.099s - 1.173s: er → AE
1.173s - 1.232s: dcl → SIL
1.232s - 1.248s: d → SK
1.248s - 1.345s: aa → AE
1.345s - 1.417s: r → WURO
1.417s - 1.488s: kcl → SIL
1.488s - 1.506s: k → SK
1.506s - 1.643s: s → SK
1.643s - 1.787s: uw → WURO
1.787s - 1.824s: dx → SK
1.824s - 1.896s: ih → YIL
1.896s - 1.992s: ng → SK
1.992s - 2.031s: gcl → SIL
2.031s - 2.073s: g → SK
2.073s - 2.114s: r → WURO
2.114s - 2.197s: iy → YIL
2.197s - 2.336s: s → SK
2.336s - 2.410s: iy → YIL
2.410s - 2.534s: w → WURO
2.534s - 2.647s: aa → AE
2.647s - 2.820s: sh → SK
2.820s - 2.852s: epi → SIL
2.852s - 2.928s: w → WURO
2.928s - 3.042s: aa → AE
3.042s - 3.078s: dx → SK
3.078s - 3.190s: er → AE
3.190s - 3.274s: q → SIL
3.274s - 3.406s: ao → AE
3.406s - 3.466s: l → YIL
3.466s - 3.587s: y → YIL
3.587s - 3.699s: iy → YIL
3.699s -

  log_phn_as_viseme_with_seconds("data_orginal\TEST\DR1\FAKS0\SA1.PHN")
