In [2]:
import os
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import joblib
from tqdm import tqdm

Config

In [3]:
SR = 48000  # Sample rate
N_MFCC = 20  # MFCC features
GMM_COMPONENTS = 32  # For faster training; use 512 in production
IVECTOR_DIM = 100  # i-vector size (total variability space)
MFCC_DIR = r"trials\features\48k_mfcc_extra_hfcc_extra"
Y_DIR = r"trials\features\48k_mfcc_extra_hfcc_extra"

Utils

In [4]:
def train_ubm(mfccs_list, n_components=GMM_COMPONENTS):
    all_feats = np.vstack(mfccs_list)
    ubm = GaussianMixture(n_components=n_components, covariance_type='diag', max_iter=100)
    ubm.fit(all_feats)
    return ubm

def train_t_matrix(mfccs_list, ubm, R=100, n_iter=5):
    K, D = ubm.means_.shape
    T = np.random.randn(K * D, R) * 0.1

    # Precompute per-utterance stats
    stats = []
    for mfccs in tqdm(mfccs_list, desc="Computing Baum-Welch stats"):
        N, F = compute_bw_stats(mfccs, ubm)
        stats.append((N, F))

    for iteration in range(n_iter):
        T_num = np.zeros((K * D, R))
        T_den = np.zeros((R, R))

        for N, F in tqdm(stats, desc=f"T-matrix EM iter {iteration+1}"):
            # Centered stats
            S = np.zeros((K, D))
            for k in range(K):
                S[k] = F[k] - N[k] * ubm.means_[k]
            S = S.flatten()

            # Inverse sigma (diagonal covs)
            sigma = ubm.covariances_.reshape(K, D).flatten() + 1e-6
            inv_sigma = 1. / sigma
            T_invSigma = T.T * inv_sigma[None, :]

            # E-step: compute posterior for i-vector w
            cov_w = np.linalg.inv(T_invSigma @ T + np.eye(R))
            mean_w = cov_w @ (T_invSigma @ S)

            # M-step accumulators
            T_num += np.outer(S, mean_w)
            T_den += N.sum() * (np.outer(mean_w, mean_w) + cov_w)

        # Update T
        T = T_num @ np.linalg.inv(T_den)

    return T, stats

def compute_bw_stats(mfccs, ubm):
    if mfccs.ndim == 1:
        mfccs = mfccs.reshape(1, -1)
    responsibilities = ubm.predict_proba(mfccs)
    N = np.sum(responsibilities, axis=0)  # [K]
    F = np.dot(responsibilities.T, mfccs)  # [K, D]
    return N, F

def extract_ivec(N, F, ubm, T):
    """
    Extract an i-vector using full per-component covariances.
    N: [K] - zero order stats
    F: [K, D] - first order stats
    T: [K*D, R] - total variability matrix
    """
    K, D = ubm.means_.shape
    R = T.shape[1]

    # Flattened UBM means and covariances
    m = ubm.means_.flatten()
    covs = ubm.covariances_.reshape(K, D)  # still diagonal, but per component
    T_blocks = T  # shape: (K*D, R)

    # Compute centered stats
    F_dev = (F - N[:, None] * ubm.means_).flatten()  # (K*D,)

    # Construct precision matrix (inverse of covariance)
    inv_sigma = 1.0 / covs.flatten()  # (K*D,)
    T_transpose_Sigma_inv = T_blocks.T * inv_sigma[None, :]  # (R, K*D)

    # Compute posterior covariance of i-vector (R x R)
    cov_i = np.linalg.inv(T_transpose_Sigma_inv @ T_blocks + np.eye(R))

    # Compute posterior mean of i-vector (R,)
    mean_i = cov_i @ (T_transpose_Sigma_inv @ F_dev)

    return mean_i

Main Pipeline

In [None]:
def main():
    print("Loading features...")
    mfcc_train = joblib.load(os.path.join(MFCC_DIR, 'X_train.joblib'))[:,:150]
    mfcc_test = joblib.load(os.path.join(MFCC_DIR, 'X_test.joblib'))[:,:150]
    print(f"Loaded {len(mfcc_train) + len(mfcc_test)} samples.")

    print("Loading labels...")
    y_train = joblib.load(os.path.join(Y_DIR, 'y_train.joblib'))
    y_test = joblib.load(os.path.join(Y_DIR, 'y_test.joblib'))

    print("Training UBM...")
    ubm = train_ubm(mfcc_train)

    print("Training T-matrix...")
    T, stats = train_t_matrix(mfcc_train, ubm, R=IVECTOR_DIM, n_iter=5)

    print("Extracting i-vectors...")
    ivecs = []
    for (N, F) in tqdm(stats, desc="Extracting i-vectors"):
        ivec = extract_ivec(N, F, ubm, T)
        ivecs.append(ivec)
    ivecs = np.vstack(ivecs)

    print("Training classifier...")
    # Train classifier using the extracted i-vectors
    clf = KNeighborsClassifier(n_neighbors=4)
    clf.fit(ivecs, y_train)

    # For test data, need to extract i-vectors first
    test_stats = []
    for mfcc in tqdm(mfcc_test, desc="Computing test stats"):
        N, F = compute_bw_stats(mfcc, ubm)
        test_stats.append((N, F))

    test_ivecs = []
    for (N, F) in tqdm(test_stats, desc="Extracting test i-vectors"):
        ivec = extract_ivec(N, F, ubm, T)
        test_ivecs.append(ivec)
    test_ivecs = np.vstack(test_ivecs)

    # Evaluate on i-vectors
    print("Evaluating classifier...")
    preds = clf.predict(test_ivecs)
    print(classification_report(y_test, preds, target_names=["M_20s", "F_20s", "M_50s", "F_50s"]))

if __name__ == "__main__":
    main()

Loading features...
Loaded 181880 samples.
Loading labels...
Training UBM...
Training T-matrix...


Computing Baum-Welch stats: 100%|██████████| 172786/172786 [01:28<00:00, 1945.02it/s]
T-matrix EM iter 1: 100%|██████████| 172786/172786 [1:05:26<00:00, 44.00it/s]
T-matrix EM iter 2:  12%|█▏        | 21097/172786 [05:15<2:28:39, 17.01it/s]