In [12]:
import os
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
import warnings
from pydub import AudioSegment
from sklearn.preprocessing import StandardScaler

#ignore warnings
#warnings.filterwarnings("ignore")

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# Install the ffmpeg audio processing tool
#!sudo apt-get install -y ffmpeg

!pip install pydub



In [None]:
def convert_mp3_to_wav(mp3_folder, wav_folder):
    if not os.path.exists(wav_folder):
        os.makedirs(wav_folder)
        print(f"Created directory: {wav_folder}")

        for filename in os.listdir(mp3_folder):
            if filename.endswith(".mp3"):
                mp3_path = os.path.join(mp3_folder, filename)

                wav_filename = os.path.splitext(filename)[0] + '.wav'
                wav_path = os.path.join(wav_folder, wav_filename)

                print(f"Converting {mp3_path} to {wav_path}")

                try:
                    audio = AudioSegment.from_mp3(mp3_path)
                    audio.export(wav_path, format="wav")
                    print("...Done.")
                except Exception as e:
                    print(f"Could not convert {mp3_path}. Error: {e}")

    else:
        print(f"Directory {wav_folder} already exists. Skipping conversion.")

In [4]:
speaker1_mp3_path = "../data/samples/kimura_kimi_stories"
speaker2_mp3_path = "../data/samples/oda_ito_stories"

speaker1_wav_path = "../data/samples/kimura_kimi_stories/wav"
speaker2_wav_path = "../data/samples/oda_ito_stories/wav"

convert_mp3_to_wav(speaker1_mp3_path, speaker1_wav_path)
convert_mp3_to_wav(speaker2_mp3_path, speaker2_wav_path)

Directory ../data/samples/kimura_kimi_stories/wav already exists. Skipping conversion.
Directory ../data/samples/oda_ito_stories/wav already exists. Skipping conversion.


In [5]:
speaker1_path = speaker1_wav_path
speaker2_path = speaker2_wav_path


speaker_paths = {
    "Speaker1": [os.path.join(speaker1_path, f) for f in os.listdir(speaker1_path) if f.endswith('.wav')],
    "Speaker2": [os.path.join(speaker2_path, f) for f in os.listdir(speaker2_path) if f.endswith('.wav')]
}

print(f"Found {len(speaker_paths['Speaker1'])} WAV files for Speaker1.")
print(f"Found {len(speaker_paths['Speaker2'])} WAV files for Speaker2.")

Found 23 WAV files for Speaker1.
Found 15 WAV files for Speaker2.


In [15]:
def extract_features(file_path):
    try:
        audio, sample_rate = librosa.load(file_path, sr=16000)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=20)

        # each row is a time frame and each column is an MFCC coefficient
        return mfccs.T
    
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

all_features = {}
speaker_features = {}
#store the scaler for each speaker
scalers = {}
for speaker, paths in speaker_paths.items():
    combined_features = []
    print(f"Extracting features for {speaker}...")
    for path in paths:
        features = extract_features(path)
        if features is not None:
            combined_features.append(features)
    
    all_features[speaker] = np.vstack(combined_features)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(all_features[speaker])

    speaker_features[speaker] = scaled_features
    scalers[speaker] = scaler
    
    print(f"Finished extracting features for {speaker}. Shape: {speaker_features[speaker].shape}")

Extracting features for Speaker1...
Finished extracting features for Speaker1. Shape: (711208, 20)
Extracting features for Speaker2...
Finished extracting features for Speaker2. Shape: (257415, 20)


In [16]:
gmm_models = {}

print("Training GMM for each speaker...")
for speaker, features in speaker_features.items():
    gmm = GaussianMixture(n_components=16, random_state=0)
    gmm.fit(features)
    gmm_models[speaker] = gmm
    print(f"Model for {speaker} trained successfully.")

print("\nAll models have been trained.")

Training GMM for each speaker...
Model for Speaker1 trained successfully.
Model for Speaker2 trained successfully.

All models have been trained.
