In [82]:
import gc
import torch
import torchaudio
import librosa
import numpy as np
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from torchsummary import summary
import keras
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import warnings

tqdm.pandas()

In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from librosa / torchaudio.
warnings.simplefilter("ignore")

In [83]:
# --- Metadata tables --------------------------------------------------------
CSV_FILE_PATH: str = "bio_metadata.csv"
NATIVE_FILE_PATH: str = "native_bio_metadata.csv"
NON_NATIVE_FILE_PATH: str = "non_native_bio_metadata.csv"
ALL_SPEAKERS_PATH: str = "speakers_all.csv"

# --- Audio sampling ---------------------------------------------------------
RATE_HZ: int = 16_000       # resampling rate in Hz (used by get_transform_audio)
MAX_LENGTH: int = 128_000   # maximum number of samples kept per clip (= RATE_HZ * 8 seconds)
# NOTE(review): 2400 Hz is unusually low for speech; confirm 24000 was not intended.
RATE: int = 2_400           # target rate in Hz used by get_wav
SILENCE_THRESHOLD: float = 0.01

# --- Feature sizes ----------------------------------------------------------
COL_SIZE: int = 30          # number of MFCCs computed by extract_accoustic_features
N_MFCC: int = 13            # number of MFCCs computed by extract_mfcc_features / wave_to_mfcc

In [84]:
def extract_mfcc_features(file_name):
    """Compute the time-averaged MFCC vector of one audio file.

    Args:
        file_name: path passed as-is to ``librosa.load``.

    Returns:
        The mean over frames of the ``N_MFCC`` coefficients, or ``None`` when
        loading or feature extraction raises (a message is printed instead).
    """
    try:
        signal, sr = librosa.load(file_name)
        coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)
        # Transpose so frames are on axis 0, then average them away.
        return np.mean(coefficients.T, axis=0)
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None

def extract_accoustic_features(file_name):
    """Extract six time-averaged acoustic descriptors from one audio file.

    Args:
        file_name: file name (including extension) relative to ``./data/audio/``.

    Returns:
        A 6-tuple ``(mfcc, chroma_stft, spectral_centroid, spectral_bandwidth,
        spectral_rolloff, zero_crossing_rate)``, each averaged over frames.
        ``mfcc`` uses ``COL_SIZE`` coefficients.

        Returns ``None`` (after printing a message) if anything raises, so
        callers that unpack the result must check for ``None`` first.
    """
    def _frame_mean(feature):
        # Put frames on axis 0 and average them away.
        return np.mean(feature.T, axis=0)

    try:
        signal, sr = librosa.load(f'./data/audio/{file_name}')
        return (
            _frame_mean(librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=COL_SIZE)),
            _frame_mean(librosa.feature.chroma_stft(y=signal, sr=sr)),
            _frame_mean(librosa.feature.spectral_centroid(y=signal, sr=sr)),
            _frame_mean(librosa.feature.spectral_bandwidth(y=signal, sr=sr)),
            _frame_mean(librosa.feature.spectral_rolloff(y=signal, sr=sr)),
            _frame_mean(librosa.feature.zero_crossing_rate(signal)),
        )
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None
    

def _relative_perturbation(values):
    """Mean absolute step between consecutive values, relative to their mean (NaN if undefined)."""
    values = np.asarray(values, dtype=float)
    if values.size < 2:
        return float("nan")
    mean_value = float(np.mean(values))
    if mean_value == 0.0:
        return float("nan")
    return float(np.mean(np.abs(np.diff(values))) / mean_value)


def extract_prosodic_features(file_name):
    """Extract coarse prosodic descriptors from one audio file.

    The previous version called functions that librosa does not provide
    (``librosa.pyin.piptrack``, ``librosa.effects.jitter``,
    ``librosa.effects.shimmer``), so it always hit the ``except`` branch and
    returned ``None``. This version uses only existing librosa calls. Jitter,
    shimmer and HNR are *frame-level approximations*, not the cycle-level
    definitions used by tools such as Praat.
    NOTE(review): confirm these approximations are adequate for the experiment.

    Args:
        file_name: path passed as-is to ``librosa.load``.

    Returns:
        A 6-tuple of floats ``(pitch_intensity, duration, loudness, jitter,
        shimmer, hnr)``, or ``None`` (after printing a message) on any error.

        * pitch_intensity: mean of the strongest ``piptrack`` pitch per frame,
          over frames where a pitch was found (0.0 if none).
        * duration: clip length in seconds.
        * loudness: mean frame RMS.
        * jitter: relative variation of the pitch period between successive
          pitched frames (NaN with fewer than two pitched frames).
        * shimmer: relative variation of frame RMS between successive frames.
        * hnr: energy ratio, in dB, between the harmonic component and the
          residual (NaN if either energy is zero).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

        # piptrack returns (pitches, magnitudes); keep the strongest bin of each frame.
        pitches, magnitudes = librosa.piptrack(y=audio, sr=sample_rate)
        strongest_bin = magnitudes.argmax(axis=0)
        frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
        pitched = frame_pitch[frame_pitch > 0]
        pitch_intensity = float(np.mean(pitched)) if pitched.size else 0.0

        duration = float(librosa.get_duration(y=audio, sr=sample_rate))

        frame_rms = librosa.feature.rms(y=audio)[0]
        loudness = float(np.mean(frame_rms))

        jitter = _relative_perturbation(1.0 / pitched)
        shimmer = _relative_perturbation(frame_rms)

        harmonic = librosa.effects.harmonic(y=audio)
        harmonic_energy = float(np.sum(harmonic ** 2))
        residual_energy = float(np.sum((audio - harmonic) ** 2))
        if harmonic_energy > 0.0 and residual_energy > 0.0:
            hnr = float(10.0 * np.log10(harmonic_energy / residual_energy))
        else:
            hnr = float("nan")

        return pitch_intensity, duration, loudness, jitter, shimmer, hnr
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None
    

def extract_plp_features(file_name):
    """Return the time-averaged predominant local pulse (PLP) of one audio file.

    ``librosa.beat.plp`` is the *predominant local pulse* rhythm curve, not
    perceptual linear prediction, and it has no ``n_mfcc`` parameter. The
    previous call passed ``n_mfcc=COL_SIZE``, which raised ``TypeError`` on
    every file, so the function always returned ``None``.
    NOTE(review): if perceptual-linear-prediction coefficients were intended,
    librosa does not provide them and another library is needed.

    Args:
        file_name: path passed as-is to ``librosa.load``.

    Returns:
        The mean of the PLP curve over frames, or ``None`` (after printing a
        message) on any error.
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        plp = np.mean(librosa.beat.plp(y=audio, sr=sample_rate).T, axis=0)
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None
    return plp

In [85]:
def get_wav(file_name):
    """Load ``./data/audio/<file_name>.wav`` and resample it to ``RATE`` Hz.

    Args:
        file_name: file stem, without directory or ``.wav`` extension.

    Returns:
        The resampled waveform, or ``None`` (after printing a message) if
        loading or resampling raises.
    """
    try:
        signal, native_rate = librosa.load(f'./data/audio/{file_name}.wav')
        resampled = librosa.core.resample(
            y=signal,
            orig_sr=native_rate,
            target_sr=RATE,
            scale=True,
        )
    except Exception:
        print("Error encountered while parsing file: ", file_name)
        return None
    return resampled

def wave_to_mfcc(audio, sample_rate):
    """Convert an in-memory waveform to its time-averaged MFCC vector.

    Args:
        audio: waveform samples.
        sample_rate: sampling rate of ``audio`` in Hz.

    Returns:
        The mean over frames of the ``N_MFCC`` coefficients, or ``None``
        (after printing a message) if feature extraction raises.
    """
    try:
        coefficients = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=N_MFCC)
        # Frames on axis 0, then average them away.
        return np.mean(coefficients.T, axis=0)
    except Exception:
        print("Error encountered while parsing audio")
        return None

def normalize_mfcc(mfcc):
    """Min-max scale the absolute values of an MFCC matrix.

    A fresh ``MinMaxScaler`` is fitted on every call, so the scaling is
    specific to the array passed in.

    Args:
        mfcc: array of MFCC values.
            NOTE(review): presumably 2-D, as ``MinMaxScaler`` expects; confirm against callers.

    Returns:
        The scaler's ``fit_transform`` output for ``abs(mfcc)``.
    """
    magnitudes = np.abs(mfcc)
    return MinMaxScaler().fit_transform(magnitudes)

def to_categorical(y):
    """One-hot encode a sequence of class labels.

    Class indices are assigned in *sorted* label order. Previously they
    followed ``set`` iteration order, which is not stable across runs or
    calls, so two calls (e.g. train and test) could silently disagree on
    which index means which class.

    Args:
        y: iterable of hashable, mutually comparable labels (e.g. strings).

    Returns:
        ``(one_hot, lang_dict)`` where ``one_hot`` is the result of
        ``keras.utils.to_categorical`` with one row per element of ``y`` and
        ``lang_dict`` maps each distinct label to its column index.
    """
    lang_dict = {label: index for index, label in enumerate(sorted(set(y)))}
    encoded = [lang_dict[label] for label in y]
    return keras.utils.to_categorical(encoded, len(lang_dict)), lang_dict


In [86]:
# Read the native-speaker metadata table (path set in NATIVE_FILE_PATH) and preview it.
native_bio_metadata = pd.read_csv(filepath_or_buffer=NATIVE_FILE_PATH)
native_bio_metadata.head(n=5)

Unnamed: 0,href,language_num,sex,birth_place,native_language,other_languages,age_sex,age_of_english_onset,english_learning_method,english_residence,length_of_english_residence,age
0,http://accent.gmu.edu/browse_language.php?func...,mandarin1,female,"['shanxi,', 'china']",mandarin\n(cmn),['none'],"['26,', 'female', '']",13.0,academic,usa,2.0,26.0
1,http://accent.gmu.edu/browse_language.php?func...,mandarin2,female,"['nanjing,', 'china']",mandarin\n(cmn),"['japanese', '']","['38,', 'female', '']",14.0,academic,usa,0.8,38.0
2,http://accent.gmu.edu/browse_language.php?func...,mandarin3,male,"['jilin,', 'china']",mandarin\n(cmn),"['italian', 'german', 'french', '']","['43,', 'male', '']",10.0,academic,usa,14.0,43.0
3,http://accent.gmu.edu/browse_language.php?func...,mandarin4,female,"['shanghai,', 'china']",mandarin\n(cmn),"['japanese', '']","['24,', 'female', '']",6.0,academic,usa,1.0,24.0
4,http://accent.gmu.edu/browse_language.php?func...,mandarin5,female,"['beijing,', 'china']",mandarin\n(cmn),['none'],"['31,', 'female', '']",12.0,academic,usa,2.0,31.0


In [87]:
# Drop the columns that are not used later: href, age, age_of_english_onset,
# other_languages, birth_place, age_sex, length_of_english_residence and
# english_learning_method.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: running it a second time no longer raises KeyError.
native_bio_metadata = native_bio_metadata.drop(
    columns=[
        'href',
        'age',
        'age_of_english_onset',
        'other_languages',
        'birth_place',
        'age_sex',
        'length_of_english_residence',
        'english_learning_method',
    ],
    errors="ignore",
)
native_bio_metadata.head()

Unnamed: 0,language_num,sex,native_language,english_residence
0,mandarin1,female,mandarin\n(cmn),usa
1,mandarin2,female,mandarin\n(cmn),usa
2,mandarin3,male,mandarin\n(cmn),usa
3,mandarin4,female,mandarin\n(cmn),usa
4,mandarin5,female,mandarin\n(cmn),usa


In [88]:
# Summary of the remaining columns (count / unique / top / freq per column).
native_bio_metadata.describe()

Unnamed: 0,language_num,sex,native_language,english_residence
count,1224,1224,1224,1224
unique,1169,2,27,3
top,english578,female,english\n(eng),usa
freq,3,635,604,1031


In [89]:
# Keep only the text before the first newline in `english_residence`.
# The vectorised .str accessor propagates missing values instead of raising
# AttributeError, which the previous per-row lambda did.
# NOTE(review): `native_language` still contains "\n(code)" suffixes; confirm whether
# it should be cleaned the same way.
native_bio_metadata['english_residence'] = (
    native_bio_metadata['english_residence'].str.split('\n').str[0]
)
native_bio_metadata.sample(10)

Unnamed: 0,language_num,sex,native_language,english_residence
420,english192,female,english\n(eng),usa
92,mandarin136,female,mandarin\n(cmn),usa
320,english46,male,english\n(eng),usa
892,italian35,male,italian\n(ita),usa
53,mandarin82,female,mandarin\n(cmn),usa
405,english175,male,english\n(eng),usa
417,english189,male,english\n(eng),usa
781,dutch27,female,dutch\n(nld),usa
367,english114,female,english\n(eng),usa
1151,english28,male,english\n(eng),canada


In [90]:
# Class balance of the target label: number of speakers per country of English residence.
native_bio_metadata["english_residence"].value_counts()

english_residence
usa       1031
uk         115
canada      78
Name: count, dtype: int64

In [91]:
# Sanity check: (rows, columns) of the metadata table after the column drop.
native_bio_metadata.shape

(1224, 4)

In [92]:
# NOTE(review): this repeats the `to_categorical` definition from an earlier cell and
# shadows it; keep a single copy once the notebook is cleaned up.
def to_categorical(y):
    """One-hot encode a sequence of class labels.

    Class indices are assigned in *sorted* label order. Previously they
    followed ``set`` iteration order, which is not stable across runs or
    calls, so two calls (e.g. train and test) could silently disagree on
    which index means which class.

    Args:
        y: iterable of hashable, mutually comparable labels (e.g. strings).

    Returns:
        ``(one_hot, lang_dict)`` where ``one_hot`` is the result of
        ``keras.utils.to_categorical`` with one row per element of ``y`` and
        ``lang_dict`` maps each distinct label to its column index.
    """
    lang_dict = {label: index for index, label in enumerate(sorted(set(y)))}
    encoded = [lang_dict[label] for label in y]
    return keras.utils.to_categorical(encoded, len(lang_dict)), lang_dict

In [93]:
# Add a `file` column holding each speaker's recording path: data/audio/<language_num>.wav
native_bio_metadata['file'] = [
    f"data/audio/{speaker_id}.wav"
    for speaker_id in native_bio_metadata['language_num']
]
native_bio_metadata.head()

Unnamed: 0,language_num,sex,native_language,english_residence,file
0,mandarin1,female,mandarin\n(cmn),usa,data/audio/mandarin1.wav
1,mandarin2,female,mandarin\n(cmn),usa,data/audio/mandarin2.wav
2,mandarin3,male,mandarin\n(cmn),usa,data/audio/mandarin3.wav
3,mandarin4,female,mandarin\n(cmn),usa,data/audio/mandarin4.wav
4,mandarin5,female,mandarin\n(cmn),usa,data/audio/mandarin5.wav


In [94]:
# Class balance of the target label (same check as earlier, repeated before resampling).
native_bio_metadata["english_residence"].value_counts()

english_residence
usa       1031
uk         115
canada      78
Name: count, dtype: int64

In [None]:
# Rebalance the `english_residence` classes.
#
# The over-sampled and the under-sampled tables are now kept under separate names.
# Previously both were assigned to `X_resampled, y_resampled`, so the over-sampler's
# result was silently overwritten by the under-sampler run on the original data.
# NOTE(review): imports belong in the first cell of the notebook.
import random
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# `english_residence` is deliberately left in X: later cells read it from X_resampled.
X = native_bio_metadata.drop(columns=['native_language', 'sex'])
y = native_bio_metadata['english_residence']

ros = RandomOverSampler(random_state=42)
X_oversampled, y_oversampled = ros.fit_resample(X, y)

rus = RandomUnderSampler(random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

# Later cells consume X_resampled / y_resampled. They stay bound to the under-sampled
# data, which is what this cell has always left behind.
# NOTE(review): the original comment said "upsampler"; switch to the over-sampled pair if
# that is intended, but then resample only the training split to avoid duplicate leakage.
X_resampled, y_resampled = X_undersampled, y_undersampled

In [None]:
# Class counts before and after resampling (the same counter used to be printed twice).
print(sorted(Counter(native_bio_metadata['english_residence']).items()))
print(sorted(Counter(y_resampled).items()))

In [None]:
# Class balance of the resampled table.
X_resampled["english_residence"].value_counts()

In [None]:
# Preview the first rows of the resampled table.
X_resampled.head(n=5)

In [95]:
def get_transform_audio(file):
    """Load an audio file as a mono waveform at ``RATE_HZ``, capped at ``MAX_LENGTH`` samples.

    Channels are averaged into one. The previous ``squeeze(0)`` only worked
    for single-channel files: on multi-channel input it was a no-op and the
    following ``[:MAX_LENGTH]`` sliced the channel axis instead of time.
    Single-channel files give the same result as before.

    Args:
        file: path (or path-like) of the audio file; converted with ``str``.

    Returns:
        1-D NumPy array with at most ``MAX_LENGTH`` samples.
    """
    waveform, source_rate = torchaudio.load(str(file))
    resampler = torchaudio.transforms.Resample(source_rate, RATE_HZ)
    # torchaudio.load yields (channels, time); average over the channel axis.
    mono = resampler(waveform).mean(dim=0)
    return mono.numpy()[:MAX_LENGTH]

In [97]:
# Speaker ids (file stems) are the inputs; the country of English residence is the label.
X = X_resampled['language_num'].values

# `fit_resample` returns the labels in the container type it was given. With a Series,
# `y_resampled['english_residence']` raises KeyError, so only index by column when the
# labels really are a DataFrame.
labels = y_resampled['english_residence'] if isinstance(y_resampled, pd.DataFrame) else y_resampled
y = np.asarray(labels)

# Stratify so every class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((979,), (245,), (979,), (245,))

In [98]:
# Tally the labels of each split to confirm the stratified split kept the class ratios.
train_counter, test_counter = Counter(y_train), Counter(y_test)

for split_name, split_counter in (("Train", train_counter), ("Test", test_counter)):
    print(f"{split_name} Count: {split_counter}")

Train Count: Counter({'usa': 825, 'uk': 92, 'canada': 62})
Test Count: Counter({'usa': 206, 'uk': 23, 'canada': 16})


In [99]:
# Build the label -> index mapping once, on the training labels, and reuse it for the
# test labels. Encoding the two splits with separate to_categorical() calls could assign
# different indices to the same class, and the training mapping was being discarded.
y_train_cat, lang_dict = to_categorical(y_train)
y_test_cat = keras.utils.to_categorical(
    [lang_dict[label] for label in y_test],  # KeyError if a test label never occurs in train
    len(lang_dict),
)

In [100]:
# Extract the acoustic features (time-averaged MFCC and chroma) of every training recording.
X_train_acoustic = []

for file_stem in tqdm(X_train, desc="train features"):
    extracted = extract_accoustic_features(file_stem + '.wav')
    if extracted is None:
        # The extractor returns None on failure. Skipping the sample would misalign
        # features and labels, so stop with a clear message instead of an unpack TypeError.
        raise RuntimeError(f"Acoustic feature extraction failed for '{file_stem}.wav'")

    # Order: mfcc, chroma_stft, spectral_centroid, spectral_bandwidth,
    # spectral_rolloff, zero_crossing_rate. Only the first two are used.
    mfcc, chroma_stft = extracted[:2]
    X_train_acoustic.append([
        mfcc,
        chroma_stft,
    ])

In [101]:
# Extract the acoustic features (time-averaged MFCC and chroma) of every test recording.
X_test_acoustic = []

for file_stem in tqdm(X_test, desc="test features"):
    extracted = extract_accoustic_features(file_stem + '.wav')
    if extracted is None:
        # The extractor returns None on failure. Skipping the sample would misalign
        # features and labels, so stop with a clear message instead of an unpack TypeError.
        raise RuntimeError(f"Acoustic feature extraction failed for '{file_stem}.wav'")

    # Only the first two descriptors (mfcc, chroma_stft) are used.
    mfcc, chroma_stft = extracted[:2]
    X_test_acoustic.append([
        mfcc,
        chroma_stft,
    ])

In [102]:
class AcousticDataset(Dataset):
    """Dataset pairing per-sample feature groups with a class-index label.

    Args:
        X: sequence where ``X[i]`` is a list of 1-D feature arrays for sample
            ``i`` (e.g. ``[mfcc, chroma_stft]``); they are concatenated into
            one vector.
        y: sequence of labels aligned with ``X``. Each label may be a class
            index or a one-hot row, which is converted to its index.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)``: a float32 vector and a scalar long class index."""
        # Concatenate the feature groups of this sample into one flat vector.
        features = torch.tensor(np.concatenate(self.X[idx], axis=0), dtype=torch.float32)

        # The notebook passes one-hot rows as labels, but a class index is what is
        # produced here (and what an argmax-based accuracy compares against), so one-hot
        # rows are collapsed with argmax. Scalar labels pass through unchanged.
        target = np.asarray(self.y[idx])
        if target.size > 1:
            class_index = int(target.argmax())
        else:
            class_index = int(target.reshape(-1)[0])
        label = torch.tensor(class_index, dtype=torch.long)
        return features, label

In [103]:
# Wrap the extracted features and encoded labels in datasets and batch them.
train_dataset = AcousticDataset(X_train_acoustic, y_train_cat)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation data is not shuffled: order does not affect the metrics, and a fixed
# order keeps evaluation runs comparable.
val_dataset = AcousticDataset(X_test_acoustic, y_test_cat)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [113]:
class AudioClassifier(nn.Module):
    """Small convolutional classifier with three output classes.

    Three stride-2 ``Conv2d -> ReLU -> BatchNorm2d`` stages are followed by
    global average pooling, dropout and a linear layer.

    The normalisation layers were ``BatchNorm1d``, which rejects the 4-D
    activations produced by ``Conv2d``; ``BatchNorm2d`` is the matching
    layer. Parameter names and shapes are the same for both.

    Input: ``(batch, 2, height, width)``, as the first convolution takes two
    channels. Output: ``(batch, 3)`` unnormalised class scores.
    NOTE(review): the flat feature vectors yielded by AcousticDataset do not
    have this shape; confirm how inputs are reshaped before the model is called.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor; each stage halves the spatial size.
        self.conv = nn.Sequential(
            nn.Conv2d(2, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
            nn.ReLU(),
            nn.BatchNorm2d(32),
        )
        # Classifier head: global average pool -> dropout -> linear.
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.dropout = nn.Dropout(0.5)
        self.lin = nn.Linear(in_features=32, out_features=3)

    def forward(self, inp_x):
        """Map a ``(batch, 2, H, W)`` tensor to ``(batch, 3)`` class scores."""
        inp_x = self.conv(inp_x)
        inp_x = self.ap(inp_x)
        # Flatten (batch, 32, 1, 1) to (batch, 32) for the linear layer.
        inp_x = inp_x.view(inp_x.shape[0], -1)
        inp_x = self.dropout(inp_x)
        inp_x = self.lin(inp_x)
        return inp_x

In [114]:
class AccuracyMetric:
    """Running classification accuracy accumulated over batches.

    Attributes:
        correct: number of predictions that matched the target so far.
        total: number of targets seen so far.
    """

    def __init__(self):
        self.correct = 0
        self.total = 0
        self.reset()

    def update(self, y_pred, y_true):
        """Accumulate one batch.

        Args:
            y_pred: tensor of per-class scores; the predicted class is the
                argmax over the last dimension.
            y_true: tensor of class indices compared element-wise with the
                predicted classes.
        """
        self.correct += torch.sum(y_pred.argmax(-1) == y_true).item()
        self.total += y_true.size(0)

    def compute(self):
        """Return ``correct / total``, or ``0.0`` if nothing has been accumulated yet."""
        if self.total == 0:
            # Avoid ZeroDivisionError when called before any update or right after reset.
            return 0.0
        return self.correct / self.total

    def reset(self):
        """Clear the accumulated counts."""
        self.correct = 0
        self.total = 0