Import wav audio samples into a pandas dataframe

Create base dataset csv file with columns:
language, speaker, audio_raw_data

In [2]:
#import libraries
import numpy as np
from scipy.io import wavfile
import os
import pandas as pd

# Load every wav recording found in ../rec/ into one int16 matrix (one row per
# recording) and build the base DataFrame with the columns
# language, speaker, audio_raw_data.
sample_length = 60 * 16000  # samples per recording: 60 seconds at 16 kHz

# Keep only the wav files and sort them: os.listdir returns every directory
# entry (including stray non-audio files) in arbitrary order, which would
# either crash wavfile.read or make the row order differ between runs.
folder = os.getcwd() + "/../rec/"
audio_files_list = sorted(
    name for name in os.listdir(folder) if name.lower().endswith(".wav")
)
print(len(audio_files_list))

# memorize audio recordings and their associated language and speaker
audio_recs = np.zeros((len(audio_files_list), sample_length), dtype="int16")
languages = []
speakers = []
print(audio_recs.shape)

# create starting point: dataset with all the audio recordings
for i, audio in enumerate(audio_files_list):
    # NOTE: the sample rate returned by wavfile.read is not validated here;
    # the rest of the notebook assumes 16 kHz.
    sample_rate, audio_rec = wavfile.read(os.path.join(folder, audio))
    if audio_rec.shape != (sample_length,):
        # Fail with the file name instead of an opaque broadcasting error
        # (or a silent broadcast of a 1-sample file over the whole row).
        raise ValueError(
            f"{audio}: expected {sample_length} mono samples, "
            f"got array of shape {audio_rec.shape}"
        )
    audio_recs[i] = audio_rec
    # Language and speaker are taken from fixed character positions of the
    # file name: characters 4-6 are the language code, and the speaker name
    # runs from character 8 up to the last 7 characters.
    # NOTE(review): presumably names look like "xxx_<lang>_<speaker>" followed
    # by a 7-character suffix -- confirm against the files in ../rec/.
    languages.append(audio[4:7])
    speakers.append(audio[8:-7])

# print unique speakers and languages
print(np.unique(speakers))
print(np.unique(languages))

# Create a dictionary with the structured data; every recording is stored as
# a plain Python list of samples in the audio_raw_data column.
data_dictionary = {
    'language': languages,
    'speaker': speakers,
    'audio_raw_data': audio_recs.tolist(),
}

# Column data types that were considered for this dataset (currently unused):
#     'audio_raw_data': 'object'
#     'language': 'str'
dataset = pd.DataFrame(data_dictionary)
# Optionally save the DataFrame to a CSV file:
# dataset.to_csv('dataset0.csv', index=False)

print(dataset.head())


53
(53, 960000)
['alessandro' 'donia' 'elena' 'francesco' 'gabriele' 'lorenzo' 'omar'
 'thiago']
['ara' 'eng' 'esp' 'ita' 'por']
  language     speaker                                     audio_raw_data
0      ita  alessandro  [21, -4, -25, -74, -23, -43, -177, -195, -236,...
1      por      thiago  [-561, -550, -536, -539, -545, -573, -573, -58...
2      ita       elena  [43, 41, 39, 39, 39, 38, 39, 31, 34, 38, 34, 3...
3      eng     lorenzo  [-653, -649, -667, -681, -701, -673, -694, -70...
4      esp       elena  [-415, -409, -412, -411, -407, -413, -421, -43...


In [10]:
# Split every 60-second recording into fixed-length, overlapping windows and
# build dataset_windowed with one row per window
# (columns: audio_raw_data, language, speaker).
window = 5.6  # seconds of audio in input
hop = 1.875  # seconds between the starts of two consecutive windows
frequency = 16000  # samples per second

window_samples = int(frequency * window)
# Start offsets (in samples) of the windows inside one recording. The same
# range object drives both the pre-allocation and the loop below, so the
# number of allocated rows always equals the number of windows written.
# (A separate closed-form count can disagree with the range for other
# window/hop values, leaving uninitialised np.empty rows or raising
# IndexError.)
window_starts = range(0, int((60 - window) * frequency), int(hop * frequency))

# create dataset where the audio files are split in windows
audio_splits = np.empty(
    (len(window_starts) * len(dataset), window_samples), dtype="int16"
)
print("audio splits = ", audio_splits.shape)
languages = []
speakers = []
split_index = 0

for index, sample in dataset.iterrows():
    # audio_raw_data is used as stored in the in-memory DataFrame (a sequence
    # of samples); no string parsing is needed here.
    raw_data = sample["audio_raw_data"]
    print("file n: ", index, " with length ", len(raw_data))
    for start in window_starts:
        # slice width is window_samples, i.e. exactly the row width
        audio_splits[split_index] = raw_data[start : start + window_samples]
        languages.append(sample["language"])
        speakers.append(sample["speaker"])
        split_index += 1


# Create a dictionary with the structured data
data_dictionary = {
    'audio_raw_data': audio_splits.tolist(),
    'language': languages,
    'speaker': speakers,
}

# Create a DataFrame from the dictionary
dataset_windowed = pd.DataFrame(data_dictionary)




audio splits =  (1590, 89600)
file n:  0  with length  960000
file n:  1  with length  960000
file n:  2  with length  960000
file n:  3  with length  960000
file n:  4  with length  960000
file n:  5  with length  960000
file n:  6  with length  960000
file n:  7  with length  960000
file n:  8  with length  960000
file n:  9  with length  960000
file n:  10  with length  960000
file n:  11  with length  960000
file n:  12  with length  960000
file n:  13  with length  960000
file n:  14  with length  960000
file n:  15  with length  960000
file n:  16  with length  960000
file n:  17  with length  960000
file n:  18  with length  960000
file n:  19  with length  960000
file n:  20  with length  960000
file n:  21  with length  960000
file n:  22  with length  960000
file n:  23  with length  960000
file n:  24  with length  960000
file n:  25  with length  960000
file n:  26  with length  960000
file n:  27  with length  960000
file n:  28  with length  960000
file n:  29  with lengt

## Training - Validation - Test splitting criteria

Create training and validation dataset by splitting the data based on the speaker and language:

- all the speakers having 1 or 2 languages associated will have their data split 75-25 between training and validation. Each audio sample is split in a piece of 45 seconds (used for training) and another separate piece of 15 seconds.
- speakers having more than 2 languages associated will have 75% of languages in training and 25% of languages in validation. Each audio sample here is taken entirely without splitting, and placed in the corresponding dataset.

The data collected are audio samples of 60 seconds. The split between training and validation is done so that the two sets contain completely separate audio frames.
This way the computed audio frames never overlap between training and validation, which is important in order to have validation data that is unseen in the training set.
Overlap would otherwise occur because the frames of each audio sample are computed as sequences of frame_size with some overlap (hop_size).
In order to have zero overlap between frames in the 2 datasets, we divide each audio sample into 2 separate non-overlapping pieces, from which the frames are computed.

This way the validation set can be used to estimate:
- known speakers speaking in languages that have been already heard from them
- known speakers speaking in languages that were never heard from them -> useful to understand quality of model's knowledge

The test set will be created ad-hoc when a lot of data is collected. A few separate test sets will be created, needed to evaluate the performance of the model in different scenarios:

1. known speakers in heard languages -> evaluate model performance in tested scenarios
2. known speakers in un-heard languages -> evaluate performance for recognizing language instead of the speaker vocal characteristics
3. unknown speakers -> evaluate performance for recognizing language from an unseen speaker

We expect the performance in case 3 to be lower than in case 2. Having separate test sets is useful in order to have an unbiased estimate of the model's performance on different tasks.

In [50]:

# Speakers present in the windowed data, each listed once.
speakers_list = np.unique(speakers)

# Languages used as classes; substitute with np.unique(languages) to obtain
# the whole set of languages.
classes_list = ["ita", "eng"]

# Target share of training data: 75% training, 25% validation.
split_ratio = 0.75

print(speakers_list)


['alessandro' 'donia' 'elena' 'francesco' 'gabriele' 'lorenzo' 'omar'
 'thiago']


In [77]:
# Select the samples of dataset_windowed whose language is in classes_list.
# A boolean mask replaces the row-by-row rebuild: it keeps the original index
# and columns, and still yields a DataFrame with the expected columns when no
# row matches (a DataFrame built from an empty list has no columns at all).
valid_samples = dataset_windowed[dataset_windowed["language"].isin(classes_list)]

print("samples considered: ", len(valid_samples))
print("entire dataset: ", dataset_windowed.shape)

samples considered:  1050
entire dataset:  (1590, 3)


In [91]:
# Build the training / validation split one speaker at a time.
#
# For every speaker with at least one supported language, one random draw
# ("choice") selects the strategy:
#   * choice < 0.5, or 0.5 <= choice < 0.85 for a single-language speaker:
#       every sample goes to training when its own random draw is below
#       split_ratio, otherwise to validation;
#   * 0.5 <= choice < 0.85 for a speaker with several languages:
#       all samples of one randomly chosen language go to validation and the
#       samples of the other languages go to training;
#   * choice >= 0.85: every sample of the speaker goes to validation.
#
# Rows are stored in the two lists as numpy arrays (one array per sample).
#
# NOTE(review): the samples are windows extracted with a hop shorter than the
# window length, so the per-sample random split can place overlapping audio
# in both sets -- confirm this is acceptable given the goal of non-overlapping
# training and validation frames.
dataset_train = []
dataset_validation = []

for speaker in speakers_list:
    # take slice of the dataset containing the samples associated to one speaker
    data_speaker = valid_samples[valid_samples["speaker"] == speaker]

    # languages spoken by the speaker (computed once, reused below)
    langs_spoken = np.unique(data_speaker["language"])
    n_langs = langs_spoken.shape[0]

    print("speaker: ", speaker, " speaks: ", langs_spoken)
    print("samples quantity: ", data_speaker.shape)
    print("langs_spoken: ", langs_spoken)

    if n_langs == 0:
        # the speaker doesn't speak any of the supported languages:
        # no sample is added
        print("skipping speaker: ", speaker)
        print("")
        continue  # skip to next speaker

    # Draw as a size-1 array (same random stream and printed format as
    # before), then compare as a plain float: using size-1 arrays as scalars
    # is deprecated in recent NumPy releases.
    choice_draw = np.random.uniform(0, 1, size=1)
    print("choice value: ", choice_draw)
    choice = float(choice_draw[0])

    samples_number = len(data_speaker)
    print("selected ", samples_number, " samples from speaker ", speaker)

    if choice < 0.5 or (choice < 0.85 and n_langs == 1):
        # pick, sample by sample, which go in training and which in validation
        rows = np.array(data_speaker)
        random_split = np.random.uniform(0, 1, size=samples_number)
        to_train = random_split < split_ratio

        dataset_train.extend(rows[to_train])
        dataset_validation.extend(rows[~to_train])

        print("added ", int(to_train.sum()), " samples to training")
        print("added ", int((~to_train).sum()), " samples to validation")

    elif choice < 0.85:
        # Reached only with 0.5 <= choice < 0.85 and more than one language:
        # data from one language goes in validation, the rest in training.
        random_split = np.random.uniform(0, 1, size=1)
        lang_choice = langs_spoken[int(float(random_split[0]) * n_langs)]

        for lang in langs_spoken:
            lang_rows = np.array(data_speaker[data_speaker["language"] == lang])
            if lang == lang_choice:
                print("adding ", lang_rows.shape[0], " samples of language ", lang, " to validation")
                dataset_validation.extend(lang_rows)
            else:
                print("adding ", lang_rows.shape[0], " samples of language ", lang, " to training")
                dataset_train.extend(lang_rows)

    else:
        # data from this speaker goes entirely in validation
        print("adding ", samples_number, " samples to validation")
        dataset_validation.extend(np.array(data_speaker))

    print("")


len_train = len(dataset_train)
len_valid = len(dataset_validation)
total = len_train + len_valid
print("dataset_train: ", len_train)
print("dataset_validation: ", len_valid)
if total > 0:
    print("training ratio: ", len_train / total * 100.0, "%")
    print("validation ratio: ", len_valid / total * 100.0, "%")
else:
    # nothing was selected: the ratios are undefined, avoid dividing by zero
    print("no samples selected: training and validation ratios are undefined")
print("total number of samples: ", total)

speaker:  alessandro  speaks:  ['eng' 'ita']
samples quantity:  (210, 3)
langs_spoken:  ['eng' 'ita']
choice value:  [0.31184309]
selected  210  samples from speaker  alessandro
added  159  samples to training
added  51  samples to validation

speaker:  donia  speaks:  []
samples quantity:  (0, 3)
langs_spoken:  []
skipping speaker:  donia

speaker:  elena  speaks:  ['eng' 'ita']
samples quantity:  (300, 3)
langs_spoken:  ['eng' 'ita']
choice value:  [0.6799766]
selected  300  samples from speaker  elena
adding  150  samples of language  eng  to training
adding  150  samples of language  ita  to validation

speaker:  francesco  speaks:  ['eng' 'ita']
samples quantity:  (120, 3)
langs_spoken:  ['eng' 'ita']
choice value:  [0.29999873]
selected  120  samples from speaker  francesco
added  89  samples to training
added  31  samples to validation

speaker:  gabriele  speaks:  ['eng' 'ita']
samples quantity:  (120, 3)
langs_spoken:  ['eng' 'ita']
choice value:  [0.96788489]
selected  120  s

In [92]:
# Convert dataset_train and dataset_validation to DataFrames and save each of
# them into a csv file inside the "datasets" folder that sits next to the
# current working directory.
folder = os.path.dirname(os.getcwd()) + "/datasets/"
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(folder, exist_ok=True)

# NOTE(review): the rows in both lists are plain arrays, so the DataFrames
# (and the CSV headers) appear to get default integer column labels rather
# than audio_raw_data / language / speaker -- confirm what the readers of
# these files expect before naming the columns.
train_df = pd.DataFrame(dataset_train)
train_df.to_csv(folder + 'dataset_train.csv', index=False)

valid_df = pd.DataFrame(dataset_validation)
valid_df.to_csv(folder + 'dataset_validation.csv', index=False)