In [15]:
import os
import librosa
import random
import numpy as np
import pandas as pd
import soundfile as sf

In [2]:
# --- Metadata CSV files -------------------------------------------------------
CSV_FILE_PATH: str = "bio_metadata.csv"
ALL_SPEAKERS_PATH: str = "speakers_all.csv"
NATIVE_FILE_PATH: str = "native_bio_metadata.csv"
NON_NATIVE_FILE_PATH: str = "non_native_bio_metadata.csv"

# --- Directory layout (all relative to the notebook's working directory) ------
OUTPUT_DIR: str = "output/"
DATASET_DIR: str = "data/"
RECORDING_DIR: str = "data/recordings/"
AUDIO_DATA_DIR: str = "data/audio/"
NATIVE_DIR: str = "data/native/"
NATIVE_COMBINED_DIR: str = "data/native_combined/"
NON_NATIVE_DIR: str = "data/non_native/"
# Template for a single recording; fill the placeholder with the speaker id.
AUDIO_FILE_PATH: str = "data/audio/{}.wav"

# --- Label vocabularies -------------------------------------------------------
# Presumably matched against the `english_residence` metadata column — TODO confirm.
NATIVE_LANGUAGES: list[str] = ["uk", "usa", "canada"]
# NOTE(review): 'south' and 'africa' are separate entries; this looks like it
# mirrors whitespace-tokenised residence values — confirm it is intentional.
NON_NATIVE_LANGUAGES: list[str] = [
    "australia",
    "new zealand",
    "ireland",
    "singapore",
    "south",
    "africa",
    "jamaica",
    "scotland",
    "islands",
]

# --- Audio / feature settings -------------------------------------------------
SILENCE_THRESHOLD: float = 0.01
# NOTE(review): if this is an audio sample rate in Hz, 2400 is unusually low
# (24000 would be more typical) — confirm before relying on it.
RATE: int = 2400
N_MFCC: int = 13
COL_SIZE: int = 30

# --- Training settings --------------------------------------------------------
EPOCHS: int = 150  # other values noted by the author: 35, 50, 250
LEARNING_RATE: float = 0.001

# --- Miscellaneous ------------------------------------------------------------
WAIT: float = 1.2
DEBUG: bool = True


In [6]:
# load native speakers function
def load_native_speakers(path: str) -> list[str]:
    """Return the names of the ``.wav`` files located directly inside ``path``.

    Only bare file names are returned (not full paths), and sub-directories
    are not searched.

    Args:
        path: Directory to scan.

    Returns:
        The matching file names in ascending lexicographic order.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by ``os.listdir``).
    """
    # os.listdir makes no promise about ordering; sorting keeps the result (and
    # any random draws made over this list later) stable across runs and
    # platforms.
    return sorted(file for file in os.listdir(path) if file.endswith('.wav'))

# combine native speakers function, take file paths, number of samples
def combine_native_speakers(
    file_path: str,
    paths: list[str],
    n_samples: int = 1,
    seed_duration: int = 2,
    audio_dir: str = './data/audio/',
) -> tuple[np.ndarray, int]:
    """Concatenate a clip of one speaker with clips of randomly chosen speakers.

    The first ``seed_duration`` seconds of ``file_path`` are followed by the
    first ``seed_duration`` seconds of ``n_samples`` recordings drawn at random
    (with replacement) from ``paths``. Recordings shorter than the clip length
    are zero-padded by ``librosa.util.fix_length``.

    NOTE(review): the original passed ``seed_duration`` and ``n_samples``
    straight to ``fix_length(size=...)``, which counts audio samples, so the
    result was only ``seed_duration + n_samples`` samples long. Here
    ``seed_duration`` is treated as seconds and ``n_samples`` as the number of
    secondary clips — confirm this matches the intended design.

    Args:
        file_path: File name of the primary recording, relative to ``audio_dir``.
        paths: Candidate file names (relative to ``audio_dir``) for the
            secondary clips.
        n_samples: Number of secondary clips to append.
        seed_duration: Length of every clip, in seconds.
        audio_dir: Directory that holds the recordings.

    Returns:
        ``(combined_audio, sample_rate)`` where ``sample_rate`` is the native
        rate of the primary recording.

    Raises:
        IndexError: If ``paths`` is empty and ``n_samples`` is positive
            (raised by ``random.choice``).
    """
    # sr=None keeps the primary recording at its native sample rate.
    primary_audio, sample_rate = librosa.load(
        os.path.join(audio_dir, file_path), sr=None
    )
    # fix_length works in samples, so convert the duration from seconds.
    clip_length = int(round(seed_duration * sample_rate))
    segments = [librosa.util.fix_length(primary_audio, size=clip_length)]

    for _clip in range(n_samples):
        # Load at the primary's rate; librosa's default would resample to a
        # different rate and the concatenated halves would not match.
        secondary_audio, _rate = librosa.load(
            os.path.join(audio_dir, random.choice(paths)), sr=sample_rate
        )
        segments.append(librosa.util.fix_length(secondary_audio, size=clip_length))

    # combine the primary and secondary audio
    combined_audio = np.concatenate(segments)
    return combined_audio, sample_rate


In [7]:
# Load the speaker metadata table from NATIVE_FILE_PATH.
# In the visible cells only its `language_num` and `english_residence` columns
# are used (to look up a residence label for each recording).
native_bio_metadata = pd.read_csv(NATIVE_FILE_PATH)
# Preview the first rows to sanity-check the columns.
native_bio_metadata.head()

Unnamed: 0,href,language_num,sex,birth_place,native_language,other_languages,age_sex,age_of_english_onset,english_learning_method,english_residence,length_of_english_residence,age
0,http://accent.gmu.edu/browse_language.php?func...,mandarin1,female,"['shanxi,', 'china']",mandarin\n(cmn),['none'],"['26,', 'female', '']",13.0,academic,usa,2.0,26.0
1,http://accent.gmu.edu/browse_language.php?func...,mandarin2,female,"['nanjing,', 'china']",mandarin\n(cmn),"['japanese', '']","['38,', 'female', '']",14.0,academic,usa,0.8,38.0
2,http://accent.gmu.edu/browse_language.php?func...,mandarin3,male,"['jilin,', 'china']",mandarin\n(cmn),"['italian', 'german', 'french', '']","['43,', 'male', '']",10.0,academic,usa,14.0,43.0
3,http://accent.gmu.edu/browse_language.php?func...,mandarin4,female,"['shanghai,', 'china']",mandarin\n(cmn),"['japanese', '']","['24,', 'female', '']",6.0,academic,usa,1.0,24.0
4,http://accent.gmu.edu/browse_language.php?func...,mandarin5,female,"['beijing,', 'china']",mandarin\n(cmn),['none'],"['31,', 'female', '']",12.0,academic,usa,2.0,31.0


In [8]:
# List the .wav file names found in NATIVE_DIR.
# NOTE(review): combine_native_speakers loads these names from './data/audio/'
# by default, not from NATIVE_DIR — confirm both directories hold the same files.
native_speakers_data = load_native_speakers(path=NATIVE_DIR)

In [9]:
# combine native speakers
# Build the speaker-id -> english_residence lookup once instead of re-filtering
# the whole metadata frame for every file. drop_duplicates keeps the first row
# per speaker, which is the row the per-file `.values[0]` lookup would pick.
residence_by_speaker = (
    native_bio_metadata
    .drop_duplicates(subset='language_num')
    .set_index('language_num')['english_residence']
)

# The output directory only has to exist before the first write.
os.makedirs(NATIVE_COMBINED_DIR, exist_ok=True)

native_speakers = []
for file in native_speakers_data:
    # The speaker id is the file name without its extension; splitext only
    # strips the final suffix, so ids containing dots stay intact.
    fwx = os.path.splitext(file)[0]
    # Look up the label first so a missing metadata row fails (KeyError naming
    # the speaker id) before any audio work is done.
    class_category = residence_by_speaker[fwx]

    aud, sr = combine_native_speakers(file, native_speakers_data)
    # Each row holds two fields: the file name and its residence label.
    native_speakers.append([file, class_category])
    sf.write(os.path.join(NATIVE_COMBINED_DIR, file), aud, sr, subtype='PCM_24')

In [10]:
# Tabulate the collected [file name, residence label] rows, one row per recording.
df = pd.DataFrame(native_speakers, columns=['file_name', 'english_residence'])
df.head()

Unnamed: 0,file_name,english_residence
0,arabic1.wav,usa
1,arabic10.wav,usa
2,arabic100.wav,usa
3,arabic101.wav,usa
4,arabic102.wav,usa


In [13]:
# Summarise the label table: per column, the row count, number of distinct
# values, most frequent value and its frequency.
df.describe()

Unnamed: 0,file_name,english_residence
count,1169,1169
unique,1169,3
top,arabic1.wav,usa
freq,1,1031


In [14]:
# Class balance of the residence labels. In the recorded output 'usa' accounts
# for 1031 of 1169 files, so the classes are heavily imbalanced.
df['english_residence'].value_counts()

english_residence
usa       1031
uk          81
canada      57
Name: count, dtype: int64