In [2]:
import os
import librosa
import random
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer
from concurrent.futures import ThreadPoolExecutor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import keras
from sklearn.preprocessing import MinMaxScaler
import soundfile as sf

In [3]:
# get and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
OUTPUT_DIR: str = "output/"
CSV_FILE_PATH: str = "bio_metadata.csv"
NATIVE_FILE_PATH: str = "native_bio_metadata.csv"
NON_NATIVE_FILE_PATH: str = "non_native_bio_metadata.csv"
NATIVE_LANGUAGES: list[str] = ['uk', 'usa', 'canada']
NON_NATIVE_LANGUAGES: list[str] = [
    'australia',
    'new zealand',
    'ireland',    
    'singapore',  
    'south',     
    'africa',   
    'jamaica',    
    'scotland',   
    'islands',
]
DATASET_DIR: str = "data/"
NATIVE_DIR: str = "data/native/"
NATIVE_COMBINED_DIR: str = "data/native_combined/"
NON_NATIVE_DIR: str = "data/non_native/"
AUDIO_DATA_DIR: str = "data/audio/"
AUDIO_FILE_PATH: str = "data/audio/{}.wav"
SILENCE_THRESHOLD: float = .01
RATE: int = 2400
N_MFCC: int = 13
COL_SIZE: int = 10
EPOCHS: int = 50 #35 #50 #250
LEARNING_RATE = 0.001
WAIT: float = 1.2
DEBUG: bool = True


In [5]:
# extract acoustic features from audio files function
def extract_mfcc_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=N_MFCC)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    return mfccs_processed

# extract, chroma_stft, spectral_centroid, 
# spectral_bandwidth, spectral_rolloff, zero_crossing_rate function
def extract_accoustic_features(file_name):
    """
        Extracts accoustic features from audio file
        Takes in the file name and returns the following features:
        Args:
            :param file_name: str: name of the file to extract features from
        Returns:
            :return mfcc: np.array: Mel-frequency cepstral coefficients
            :return chroma_stft: np.array: Chroma short-time Fourier transform
            :return spectral_centroid: np.array: Spectral centroid
            :return spectral_bandwidth: np.array: Spectral bandwidth
            :return spectral_rolloff: np.array: Spectral rolloff
            :return zero_crossing_rate: np.array: Zero crossing rate
    """
    try:
        audio, sample_rate = librosa.load(f'./data/native_combined/{file_name}')
        mfcc = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=COL_SIZE).T, axis=0)
        chroma_stft = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate).T, axis=0)
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sample_rate).T, axis=0)
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sample_rate).T, axis=0)
        zero_crossing_rate = np.mean(librosa.feature.zero_crossing_rate(audio).T, axis=0)
        return mfcc, chroma_stft, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    

# extract pitch intensity, duration, loudness, jitter, shimmer, hnr function (prosodic features)
def extract_prosodic_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        pitch_intensity = np.mean(librosa.pyin.piptrack(y=audio, sr=sample_rate).T, axis=0)
        duration = np.mean(librosa.effects.time_stretch(audio, 1.0).T, axis=0)
        loudness = np.mean(librosa.feature.rms(y=audio).T, axis=0)
        jitter = np.mean(librosa.effects.jitter(y=audio).T, axis=0)
        shimmer = np.mean(librosa.effects.shimmer(y=audio).T, axis=0)
        hnr = np.mean(librosa.effects.harmonic(y=audio).T, axis=0)
        return pitch_intensity, duration, loudness, jitter, shimmer, hnr
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    

# extract plp features function
def extract_plp_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        plp = np.mean(librosa.beat.plp(y=audio, sr=sample_rate, n_mfcc=COL_SIZE).T, axis=0)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None
    return plp

In [6]:
# get wav from file function
def get_wav(file_name):
    try:
        audio, sample_rate = librosa.load(f'./data/native_combined/{file_name}.wav')
        return librosa.core.resample(y=audio, orig_sr=sample_rate, target_sr=RATE, scale=True)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None

# convert wave to mfcc function
def wave_to_mfcc(audio, sample_rate):
    try:
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=N_MFCC)
        mfccs_processed = np.mean(mfccs.T, axis=0)
    except Exception as e:
        print("Error encountered while parsing audio")
        return None
    return mfccs_processed

# normalize mfcc function
def normalize_mfcc(mfcc):
    mms = MinMaxScaler()
    return mms.fit_transform(np.abs(mfcc))

In [7]:
# to categorical function
def to_categorical(y):
    lang_dict = {}
    for index, language in enumerate(set(y)):
        lang_dict[language] = index
    y = list(map(lambda x: lang_dict[x],y))
    return keras.utils.to_categorical(y, len(lang_dict)), lang_dict


In [8]:
# load native speakers function
def load_native_speakers(path: str) -> list[str]:
    file_paths = [file for file in os.listdir(path) if file.endswith('.wav')]
    return file_paths

# combine native speakers function, take file paths, number of samples
def combine_native_speakers(
    file_path: str, 
    paths:list[str],
    n_samples: int = 2, 
    seed_duration:int = 1,
) -> list[np.array]:
    # load the primary audio
    primary_audio, sample_rate = librosa.load(f'./data/audio/{file_path}', sr=None)
    # get the primary one second seed
    primary_one_seed = librosa.util.fix_length(primary_audio, size=seed_duration)
    # randonly select the secondary audio in paths
    secondary_audio, _ = librosa.load(f'./data/audio/{random.choice(paths)}')
    # random 2 second seed
    secondary_one_seed = librosa.util.fix_length(secondary_audio, size=n_samples)
    
    # combine the primary and secondary audio
    combined_audio = np.concatenate((primary_one_seed, secondary_one_seed))
    return combined_audio, sample_rate


In [9]:
def split_people(
    dataframe: pd.DataFrame, 
    test_size: float = 0.2, 
    first_column: str = 'language_num',
    second_column: str = 'english_residence',
) -> any:
    """
        Create train test split of DataFrame
        Args:
            :param dataframe: DataFrame to be split
            :param test_size: Percentage of total files to be split into test
        Return:
            :return X_train, X_test, y_train, y_test (tuple): Xs are list of
            df['language_num'] and Ys are df['english_residence']
    """
    x_train, x_test, y_train, y_test = train_test_split(
        dataframe[first_column],
        dataframe[second_column],
        test_size=test_size,
        train_size= 1 - test_size,
        random_state=1234
    )
    return x_train, x_test, y_train, y_test

In [10]:
# load the native_bio_metadata.csv
native_bio_metadata = pd.read_csv(NATIVE_FILE_PATH)
native_bio_metadata.head()

Unnamed: 0,href,language_num,sex,birth_place,native_language,other_languages,age_sex,age_of_english_onset,english_learning_method,english_residence,length_of_english_residence,age
0,http://accent.gmu.edu/browse_language.php?func...,mandarin1,female,"['shanxi,', 'china']",mandarin\n(cmn),['none'],"['26,', 'female', '']",13.0,academic,usa,2.0,26.0
1,http://accent.gmu.edu/browse_language.php?func...,mandarin2,female,"['nanjing,', 'china']",mandarin\n(cmn),"['japanese', '']","['38,', 'female', '']",14.0,academic,usa,0.8,38.0
2,http://accent.gmu.edu/browse_language.php?func...,mandarin3,male,"['jilin,', 'china']",mandarin\n(cmn),"['italian', 'german', 'french', '']","['43,', 'male', '']",10.0,academic,usa,14.0,43.0
3,http://accent.gmu.edu/browse_language.php?func...,mandarin4,female,"['shanghai,', 'china']",mandarin\n(cmn),"['japanese', '']","['24,', 'female', '']",6.0,academic,usa,1.0,24.0
4,http://accent.gmu.edu/browse_language.php?func...,mandarin5,female,"['beijing,', 'china']",mandarin\n(cmn),['none'],"['31,', 'female', '']",12.0,academic,usa,2.0,31.0


In [11]:
# # audio feature extraction instance
# load native speaker files
native_speakers_data = load_native_speakers(path=NATIVE_DIR)
# native_speakers_data

In [12]:
# combine native speakers
native_speakers = []
for file in native_speakers_data:
    aud, sr = combine_native_speakers(file, native_speakers_data)
    fwx = file.split('.')[0]
    # get the english residence from the native_bio_metadata['english_residence'] based on the filename
    class_category = native_bio_metadata[native_bio_metadata['language_num'] == fwx]['english_residence'].values[0]
    # create three columns, audio, sample rate, and file name
    native_speakers.append([file, class_category])
    os.makedirs(NATIVE_COMBINED_DIR, exist_ok=True)
    sf.write(f'./data/native_combined/{file}', aud, sr, subtype='PCM_24')
    
# create a dataframe from the native speakers

In [14]:
# create a dataframe from the native speakers
df = pd.DataFrame(
    native_speakers, 
    columns=[
        'file_name',
        'english_residence',
    ]
)
df.head()

Unnamed: 0,file_name,english_residence
0,arabic1.wav,usa
1,arabic10.wav,usa
2,arabic100.wav,usa
3,arabic101.wav,usa
4,arabic102.wav,usa


In [15]:
df.describe()

Unnamed: 0,file_name,english_residence
count,1169,1169
unique,1169,3
top,arabic1.wav,usa
freq,1,1031


In [16]:
df['english_residence'].value_counts()

english_residence
usa       1031
uk          81
canada      57
Name: count, dtype: int64

In [17]:
# split the native_combined_df into train and test
X_train, X_test, y_train, y_test = split_people(
    df, 
    first_column='file_name', 
    second_column='english_residence', 
    test_size=0.2
)

In [18]:
print(X_test.shape, y_test.shape)

(234,) (234,)


In [19]:
y_test = y_test.apply(lambda x: x.split('\n')[0])
y_train = y_train.apply(lambda x: x.split('\n')[0])

In [20]:
# Get statistics
train_count = Counter(y_train)
test_count = Counter(y_test)


print(f"Train Count: {train_count}")
print(f"Test Count: {test_count}")

Train Count: Counter({'usa': 832, 'uk': 64, 'canada': 39})
Test Count: Counter({'usa': 199, 'canada': 18, 'uk': 17})


In [21]:
y_train_cat, _ = to_categorical(y_train)
y_test_cat, lang_dict = to_categorical(y_test)

In [22]:
lang_dict

{'usa': 0, 'canada': 1, 'uk': 2}

In [23]:
X_train[0]

'arabic1.wav'

In [24]:
# extract acoustic features
X_train_acoustic = []

for audio in X_train:
    # audio is a waveform, load it and extract the features
    # mfcc, chroma_stft, spectral_centroid, spectral_bandwidth, spectral_rolloff, zero_crossing_rate
    mfcc, chroma_stft, _, _, _, _ = extract_accoustic_features(audio)
    
    X_train_acoustic.append([
        mfcc, 
        chroma_stft,
    ])

  return pitch_tuning(


In [25]:
X_train_acoustic[0]

[array([-7.3805859e+02, -7.8013401e+00,  3.7200711e+00, -1.6094973e+00,
         6.4021707e-01, -3.1855926e-01,  1.4145039e-01, -9.8202810e-02,
         4.3154776e-02, -4.5751110e-02], dtype=float32),
 array([0.9135525 , 0.94166386, 0.96907395, 0.99474573, 1.        ,
        0.8222989 , 0.74025804, 0.76661295, 0.7962481 , 0.8263944 ,
        0.8554899 , 0.8836181 ], dtype=float32)]

In [26]:
X_test_acoustic = []

for file in X_test:
    audio = file
    mfcc, chroma_stft, _, _, _, _ = extract_accoustic_features(audio)
    X_test_acoustic.append([
        mfcc,
        chroma_stft,
    ])

In [27]:
X_train_acoustic[0]

[array([-7.3805859e+02, -7.8013401e+00,  3.7200711e+00, -1.6094973e+00,
         6.4021707e-01, -3.1855926e-01,  1.4145039e-01, -9.8202810e-02,
         4.3154776e-02, -4.5751110e-02], dtype=float32),
 array([0.9135525 , 0.94166386, 0.96907395, 0.99474573, 1.        ,
        0.8222989 , 0.74025804, 0.76661295, 0.7962481 , 0.8263944 ,
        0.8554899 , 0.8836181 ], dtype=float32)]

In [68]:
# create pytorch model to train the data on
class AcousticDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Concatenate the features within each sample
        features = torch.tensor(np.concatenate(self.X[idx], axis=0), dtype=torch.float32)
        label = torch.tensor(self.y[idx], dtype=torch.long)  # Assuming self.y[idx] is the label index
        return features, label

class AcousticModelSequntial(nn.Module):
    def __init__(self, input_size, output_size):
        super(AcousticModelSequntial, self).__init__()
        # self.model = nn.Sequential(
        #     nn.Linear(input_size, 32),
        #     nn.Conv1d(32, 64, 3),
        #     nn.ReLU(),
        #     nn.Linear(64, output_size),
        #     nn.Softmax(dim=1),
        #     nn.Flatten()
        # )
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_size),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.model(x)

# create the model
class AcousticModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(AcousticModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x



In [69]:
# get the dimension of input features
input_size = X_train_acoustic[0][0].shape[0] + X_train_acoustic[0][1].shape[0]  # + X_train_acoustic[0][2].shape[0] + X_train_acoustic[0][n].shape[0]

# model_1 = AcousticModelSequntial(input_size=input_size, output_size=len(lang_dict))
# model_1.to(device)
# optimizer_1 = optim.Adam(model_1.parameters(), lr=LEARNING_RATE)
# criterion = nn.CrossEntropyLoss()
# model_1.train()


# create the model
model = AcousticModel(input_size=input_size, output_size=len(lang_dict))
model.to(device)
# create the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
# create the loss function
criterion = nn.CrossEntropyLoss()
# create the data loaders
train_dataset = AcousticDataset(X_train_acoustic, y_train_cat)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# set the model to training mode
model.train()

AcousticModel(
  (fc1): Linear(in_features=22, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (softmax): Softmax(dim=1)
)

In [70]:
train_dataset = AcousticDataset(X_train_acoustic, y_train_cat)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [71]:
# create the data loaders
val_dataset = AcousticDataset(X_test_acoustic, y_test_cat)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [72]:
# # fit the model
# for epoch in range(EPOCHS):
#     for i, (X, y) in enumerate(train_loader):
#         X = X.clone().detach().to(device)  # Use clone().detach() to copy and detach the tensor
#         y = torch.argmax(y.clone().detach(), dim=1).to(device)  # Similarly, clone().detach() for y

#         optimizer_1.zero_grad()
#         outputs = model_1(X)
#         loss = criterion(outputs, y)
#         loss.backward()
        
#         optimizer_1.step()
        
#         if i % 10 == 0:
#             print(f"Epoch: {epoch} Loss: {loss.item()}")

In [73]:
# fit the model
for epoch in range(EPOCHS):
    for i, (X, y) in enumerate(train_loader):
        X = X.clone().detach().to(device)  # Use clone().detach() to copy and detach the tensor
        y = torch.argmax(y.clone().detach(), dim=1).to(device)  # Similarly, clone().detach() for y

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        
        optimizer.step()
        
        if i % 10 == 0:
            print(f"Epoch: {epoch} Loss: {loss.item()}")

Epoch: 0 Loss: 0.645207405090332
Epoch: 0 Loss: 0.6139446496963501
Epoch: 0 Loss: 0.6451946496963501
Epoch: 1 Loss: 0.6139446496963501
Epoch: 1 Loss: 0.6764446496963501
Epoch: 1 Loss: 0.7076947689056396
Epoch: 2 Loss: 0.6764446496963501
Epoch: 2 Loss: 0.6139446496963501
Epoch: 2 Loss: 0.6764446496963501
Epoch: 3 Loss: 0.7076947093009949
Epoch: 3 Loss: 0.7389447093009949
Epoch: 3 Loss: 0.7389446496963501
Epoch: 4 Loss: 0.6764446496963501
Epoch: 4 Loss: 0.6451946496963501
Epoch: 4 Loss: 0.5826946496963501
Epoch: 5 Loss: 0.6139446496963501
Epoch: 5 Loss: 0.6451946496963501
Epoch: 5 Loss: 0.7076947093009949
Epoch: 6 Loss: 0.6139446496963501
Epoch: 6 Loss: 0.6764446496963501
Epoch: 6 Loss: 0.6139446496963501
Epoch: 7 Loss: 0.6451946496963501
Epoch: 7 Loss: 0.6139446496963501
Epoch: 7 Loss: 0.7389446496963501
Epoch: 8 Loss: 0.6451946496963501
Epoch: 8 Loss: 0.6764446496963501
Epoch: 8 Loss: 0.7389447093009949
Epoch: 9 Loss: 0.8014447689056396
Epoch: 9 Loss: 0.6764446496963501
Epoch: 9 Loss: 

In [None]:
# Assuming you have a validation or test DataLoader called val_loader
correct = 0
total = 0

model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    for X_val, y_val in val_loader:
        X_val = X_val.to(device)
        y_val = y_val.to(device)

        outputs = model(X_val)
        _, predicted = torch.max(outputs, 1)  # Get the index of the max logit as prediction
        print(y_val.shape, predicted.shape)
        print(predicted)
        # correct += (predicted == y_val).sum().item()
        # total += y_val.size(0)

# accuracy = correct / total
# print(f"Accuracy: {accuracy}")
