In [1]:
import gc
import torch
import torchaudio
import librosa
import keras
import numpy as np
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from torchsummary import summary
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import warnings

tqdm.pandas()

# suppress all warnings
warnings.filterwarnings("ignore")




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
RATE_HZ = 16000 # resampling rate in Hz
MAX_LENGTH = 128000 # maximum audio interval length to consider (= RATE_HZ * SECONDS)
CSV_FILE_PATH: str = "bio_metadata.csv"
NATIVE_FILE_PATH: str = "native_bio_metadata.csv"
ALL_SPEAKERS_PATH: str = "speakers_all.csv"
NON_NATIVE_FILE_PATH: str = "non_native_bio_metadata.csv"
COL_SIZE: int = 30
SILENCE_THRESHOLD: float = .01
RATE: int = 2400
N_MFCC: int = 13

In [3]:
# extract acoustic features from audio files function
def extract_mfcc_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=N_MFCC)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    return mfccs_processed

# extract, chroma_stft, spectral_centroid, 
# spectral_bandwidth, spectral_rolloff, zero_crossing_rate function
def extract_accoustic_features(file_name):
    """
        Extracts accoustic features from audio file
        Takes in the file name and returns the following features:
        Args:
            :param file_name: str: name of the file to extract features from
        Returns:
            :return mfcc: np.array: Mel-frequency cepstral coefficients
            :return chroma_stft: np.array: Chroma short-time Fourier transform
            :return spectral_centroid: np.array: Spectral centroid
            :return spectral_bandwidth: np.array: Spectral bandwidth
            :return spectral_rolloff: np.array: Spectral rolloff
            :return zero_crossing_rate: np.array: Zero crossing rate
    """
    try:
        audio, sample_rate = librosa.load(f'./data/audio/{file_name}')
        mfcc = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=COL_SIZE).T, axis=0)
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).T, axis=0)
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sample_rate).T, axis=0)
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sample_rate).T, axis=0)
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio).T, axis=0)
        return mfcc, chroma_stft, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    

# extract pitch intensity, duration, loudness, jitter, shimmer, hnr function (prosodic features)
def extract_prosodic_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        pitch_intensity = np.mean(librosa.pyin.piptrack(y=audio, sr=sample_rate).T, axis=0)
        duration = np.mean(librosa.effects.time_stretch(audio, 1.0).T, axis=0)
        loudness = np.mean(librosa.feature.rms(y=audio).T, axis=0)
        jitter = np.mean(librosa.effects.jitter(y=audio).T, axis=0)
        shimmer = np.mean(librosa.effects.shimmer(y=audio).T, axis=0)
        hnr = np.mean(librosa.effects.harmonic(y=audio).T, axis=0)
        return pitch_intensity, duration, loudness, jitter, shimmer, hnr
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    

# extract plp features function
def extract_plp_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        plp = np.mean(librosa.beat.plp(y=audio, sr=sample_rate, n_mfcc=COL_SIZE).T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    return plp

In [4]:
# get wav from file function
def get_wav(file_name):
    try:
        audio, sample_rate = librosa.load(f'./data/audio/{file_name}.wav')
        return librosa.core.resample(y=audio, orig_sr=sample_rate, target_sr=RATE, scale=True)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None

# convert wave to mfcc function
def wave_to_mfcc(audio, sample_rate):
    try:
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=N_MFCC)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing audio")
        return None
    return mfccs_processed

# normalize mfcc function
def normalize_mfcc(mfcc):
    mms = MinMaxScaler()
    return mms.fit_transform(np.abs(mfcc))

# to categorical function
def to_categorical(y):
    lang_dict = {}
    for index, language in enumerate(set(y)):
        lang_dict[language] = index
    y = list(map(lambda x: lang_dict[x],y))
    return keras.utils.to_categorical(y, len(lang_dict)), lang_dict


In [5]:
# load the native_bio_metadata.csv
native_bio_metadata = pd.read_csv(NATIVE_FILE_PATH)
native_bio_metadata.head()

Unnamed: 0,href,language_num,sex,birth_place,native_language,other_languages,age_sex,age_of_english_onset,english_learning_method,english_residence,length_of_english_residence,age
0,http://accent.gmu.edu/browse_language.php?func...,mandarin1,female,"['shanxi,', 'china']",mandarin\n(cmn),['none'],"['26,', 'female', '']",13.0,academic,usa,2.0,26.0
1,http://accent.gmu.edu/browse_language.php?func...,mandarin2,female,"['nanjing,', 'china']",mandarin\n(cmn),"['japanese', '']","['38,', 'female', '']",14.0,academic,usa,0.8,38.0
2,http://accent.gmu.edu/browse_language.php?func...,mandarin3,male,"['jilin,', 'china']",mandarin\n(cmn),"['italian', 'german', 'french', '']","['43,', 'male', '']",10.0,academic,usa,14.0,43.0
3,http://accent.gmu.edu/browse_language.php?func...,mandarin4,female,"['shanghai,', 'china']",mandarin\n(cmn),"['japanese', '']","['24,', 'female', '']",6.0,academic,usa,1.0,24.0
4,http://accent.gmu.edu/browse_language.php?func...,mandarin5,female,"['beijing,', 'china']",mandarin\n(cmn),['none'],"['31,', 'female', '']",12.0,academic,usa,2.0,31.0


In [6]:
# drop href, age_sex, age_of_english_onset, other_languages, birthplace
native_bio_metadata.drop(columns=['href', 'age', 'age_of_english_onset', 'other_languages', 'birth_place', 'age_sex', 'length_of_english_residence', 'english_learning_method'], inplace=True)
native_bio_metadata.head()

Unnamed: 0,language_num,sex,native_language,english_residence
0,mandarin1,female,mandarin\n(cmn),usa
1,mandarin2,female,mandarin\n(cmn),usa
2,mandarin3,male,mandarin\n(cmn),usa
3,mandarin4,female,mandarin\n(cmn),usa
4,mandarin5,female,mandarin\n(cmn),usa


In [7]:
# describe the native_bio_metadata
native_bio_metadata.describe()

Unnamed: 0,language_num,sex,native_language,english_residence
count,1224,1224,1224,1224
unique,1169,2,27,3
top,english578,female,english\n(eng),usa
freq,3,635,604,1031


In [8]:
# remove \n and select first one in native_bio_metadata['english_residence']
native_bio_metadata['english_residence'] = native_bio_metadata['english_residence'].apply(lambda x: x.split('\n')[0])
native_bio_metadata.sample(10)

Unnamed: 0,language_num,sex,native_language,english_residence
427,english200,female,english\n(eng),usa
372,english123,female,english\n(eng),usa
719,english591,female,english\n(eng),usa
347,english86,male,english\n(eng),usa
457,english235,female,english\n(eng),usa
852,german15,male,german\n(deu),usa
1024,portuguese61,female,portuguese\n(por),usa
825,french59,female,french\n(fra),usa
840,french84,female,french\n(fra),usa
1122,french25,male,french\n(fra),uk


In [9]:
# value counts
native_bio_metadata['english_residence'].value_counts()

english_residence
usa       1031
uk         115
canada      78
Name: count, dtype: int64

In [10]:
native_bio_metadata.shape

(1224, 4)

In [11]:
# to categorical function
def to_categorical(y):
    lang_dict = {}
    for index, language in enumerate(set(y)):
        lang_dict[language] = index
    y = list(map(lambda x: lang_dict[x],y))
    return keras.utils.to_categorical(y, len(lang_dict)), lang_dict

In [12]:
# add a new column 'file' with the full path of the audio file, the audio files are location in './data/native_combined/'
native_bio_metadata.loc[:, 'file'] = native_bio_metadata['language_num'].apply(lambda x: f"data/audio/{x}.wav")
native_bio_metadata.head()

Unnamed: 0,language_num,sex,native_language,english_residence,file
0,mandarin1,female,mandarin\n(cmn),usa,data/audio/mandarin1.wav
1,mandarin2,female,mandarin\n(cmn),usa,data/audio/mandarin2.wav
2,mandarin3,male,mandarin\n(cmn),usa,data/audio/mandarin3.wav
3,mandarin4,female,mandarin\n(cmn),usa,data/audio/mandarin4.wav
4,mandarin5,female,mandarin\n(cmn),usa,data/audio/mandarin5.wav


In [13]:
native_bio_metadata['english_residence'].value_counts()

english_residence
usa       1031
uk         115
canada      78
Name: count, dtype: int64

In [14]:
def get_transform_audio(file):
    audio,rate = torchaudio.load(str(file))
    transform = torchaudio.transforms.Resample(rate,RATE_HZ)
    audio = transform(audio).squeeze(0).numpy()
    audio = audio[:MAX_LENGTH] # truncate to first part of audio to save RAM
    return audio

In [15]:
native_bio_metadata['audio'] = native_bio_metadata['file'].progress_apply(get_transform_audio)

100%|██████████| 1224/1224 [00:15<00:00, 77.91it/s]


In [16]:
native_bio_metadata['english_residence'].value_counts()

english_residence
usa       1031
uk         115
canada      78
Name: count, dtype: int64

In [17]:
native_bio_metadata.sample(5)

Unnamed: 0,language_num,sex,native_language,english_residence,file,audio
270,arabic179,male,arabic\n(ars),usa,data/audio/arabic179.wav,"[-4.7082267e-06, -1.7670247e-05, -1.8126171e-0..."
385,english146,male,english\n(eng),usa,data/audio/english146.wav,"[0.0028268, 0.0045301947, 0.0040544877, 0.0043..."
883,italian17,male,italian\n(ita),usa,data/audio/italian17.wav,"[-0.0003382139, -0.0004581136, -0.0011106753, ..."
836,french73,male,french\n(fra),usa,data/audio/french73.wav,"[-0.004545769, -0.0034042199, -0.0011074148, -..."
569,english398,female,english\n(eng),usa,data/audio/english398.wav,"[0.0022351844, 0.004397306, 0.0042619184, 0.00..."


In [18]:
X = native_bio_metadata['file'].values
y = native_bio_metadata['english_residence'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((979,), (245,), (979,), (245,))

In [19]:
# create a counter for train and test labels
train_counter = Counter(y_train)
test_counter = Counter(y_test)

print(f"Train Count: {train_counter}")
print(f"Test Count: {test_counter}")

Train Count: Counter({'usa': 825, 'uk': 92, 'canada': 62})
Test Count: Counter({'usa': 206, 'uk': 23, 'canada': 16})


In [20]:
y_train_cat, _ = to_categorical(y_train)
y_test_cat, lang_dict = to_categorical(y_test)

In [21]:
lang_dict

{'usa': 0, 'uk': 1, 'canada': 2}

In [28]:
# create a transformer model for the training

# transformer model
class TransformerModel(nn.Module):
    def __init__(self, num_classes, num_features):
        super(TransformerModel, self).__init__()
        self.num_classes = num_classes
        self.num_features = num_features
        self.encoder = nn.TransformerEncoderLayer(d_model=self.num_features, nhead=8)
        self.transformer = nn.TransformerEncoder(self.encoder, num_layers=6)
        self.fc = nn.Linear(self.num_features, self.num_classes)
        
    def forward(self, x):
        x = self.transformer(x)
        x = self.fc(x)
        return x
    
# create a dataset class
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        audio, rate = torchaudio.load(self.X[idx])
        transform = torchaudio.transforms.Resample(rate, RATE_HZ)
        audio = transform(audio).squeeze(0).numpy()
        audio = audio[:MAX_LENGTH]
        return audio, self.y[idx]
    
# create a dataloader
train_dataset = AudioDataset(X_train, y_train_cat)
test_dataset = AudioDataset(X_test, y_test_cat)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# get the number of features from the first batch shape
num_features = next(iter(train_loader))[0].shape[1]


# create a model
model = TransformerModel(num_classes=len(lang_dict), num_features=num_features)

# create a loss function
criterion = nn.CrossEntropyLoss()

# create an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# create a scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# train the model
def train(model, train_loader, criterion, optimizer, scheduler, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (audio, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(audio)
            loss = criterion(outputs, torch.max(labels, 1)[1])
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 10 == 9:
                print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {running_loss/10}")
                running_loss = 0.0
        scheduler.step()
    print("Training Finished")
    
# train the model
train(model, train_loader, criterion, optimizer, scheduler, num_epochs=10)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 196608000000 bytes.