In [1]:
#  Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Set Working Directory
import os
BASE_DIR = '/content/drive/MyDrive/speech_understanding_project'
DATA_DIR = os.path.join(BASE_DIR, 'data')
os.makedirs(DATA_DIR, exist_ok=True)


Mounted at /content/drive


In [2]:
!pip install scikit-learn numpy tqdm



In [3]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm
import glob

In [4]:
# Step 4: Load Feature Files
def load_all_features(feature_dir, max_utts=None, max_frames=1000):
    all_feats = []
    files = sorted(glob.glob(os.path.join(feature_dir, '*.npy')))
    if max_utts: files = files[:max_utts]

    for f in tqdm(files):
        feat = np.load(f)
        if feat.shape[0] > max_frames:
            # randomly sample `max_frames` for speed
            idx = np.random.choice(feat.shape[0], max_frames, replace=False)
            feat = feat[idx]
        all_feats.append(feat)

    return np.concatenate(all_feats, axis=0)

# Load LRL features for clustering (e.g., 10K frames max)
lrl_feat_dir = os.path.join(DATA_DIR, 'features_lrl')
all_lrl_feats = load_all_features(lrl_feat_dir, max_utts=100, max_frames=100)
print("All feature shape:", all_lrl_feats.shape)


0it [00:00, ?it/s]


ValueError: need at least one array to concatenate

In [5]:
# Train K-Means to Get Pseudo-Units
N_CLUSTERS = 100  # ← number of pseudo-phoneme units

kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=42, batch_size=1024)
kmeans.fit(all_lrl_feats)

# 📁 Save the model
import joblib
joblib.dump(kmeans, os.path.join(DATA_DIR, f'kmeans_{N_CLUSTERS}.joblib'))
print(f"KMeans model saved.")


0it [00:00, ?it/s]


ValueError: need at least one array to concatenate

In [None]:
# Quantize Utterances into Pseudo-Units
def quantize_features_to_units(feature_dir, kmeans_model, output_path):
    files = sorted(glob.glob(os.path.join(feature_dir, '*.npy')))
    with open(output_path, 'w') as f:
        for file in tqdm(files):
            utt_id = os.path.basename(file).replace('.npy', '')
            features = np.load(file)
            units = kmeans_model.predict(features)
            unit_seq = " ".join(map(str, units))
            f.write(f"{utt_id}\t{unit_seq}\n")
    print(f"Discrete units saved to: {output_path}")


In [None]:
# Step 7: Save Discrete Sequences
quantized_dir = os.path.join(DATA_DIR, 'units')
os.makedirs(quantized_dir, exist_ok=True)

# LRL Pseudo-Units
quantize_features_to_units(
    feature_dir=lrl_feat_dir,
    kmeans_model=kmeans,
    output_path=os.path.join(quantized_dir, 'lrl_units.txt')
)

# HRL Units (optional, for supervised comparison or decoder training)
quantize_features_to_units(
    feature_dir=os.path.join(DATA_DIR, 'features_hrl'),
    kmeans_model=kmeans,
    output_path=os.path.join(quantized_dir, 'hrl_units.txt')
)
