In [32]:
import os
import IPython.display as ipd
import numpy as np
import librosa, librosa.display
from scipy.fft import rfft
from sklearn.neighbors import KNeighborsClassifier

#### The raw training/validation/test data are the paths to the audio files, labels for now are just the emotions. The gender and intensity lists maybe further used as features

In [5]:
raw_train_data = []
raw_val_data = []
raw_test_data = []

train_label = []
val_label = []
test_label = []

train_gender = []
val_gender = []
test_gender = []

train_intensity = []
val_intensity = []
test_intensity = []

for i in range(0, 16):
    # Get all file names in the dir
    actor = "Actor_%02d" % (i + 1)
    inputs = os.listdir('./data/RAV/' + actor)
    
    for ele in inputs:
        raw_train_data.append('./data/RAV/' + actor + '/' + ele) # Form paths to the files
        
        file_name = ele.split('-')
        train_label.append(int(file_name[2]))                    # Get emotion label
        train_intensity.append(int(file_name[3]))                # Get intensity
        train_gender.append((i + 1) % 2)                         # Get gender (1 for male, 0 for female)
    
for i in range(16, 20):
    # Get all file names in the dir
    actor = "Actor_%02d" % (i + 1)
    inputs = os.listdir('./data/RAV/' + actor)

    for ele in inputs:
        raw_val_data.append('./data/RAV/' + actor + '/' + ele) # Form paths to the files
        
        file_name = ele.split('-')
        val_label.append(int(file_name[2]))                    # Get emotion label
        val_intensity.append(int(file_name[3]))                # Get intensity
        val_gender.append((i + 1) % 2)                         # Get gender (1 for male, 0 for female)
        
for i in range(20, 24):
    # Get all file names in the dir
    actor = "Actor_%02d" % (i + 1)
    inputs = os.listdir('./data/RAV/' + actor)

    for ele in inputs:
        raw_test_data.append('./data/RAV/' + actor + '/' + ele) # Form paths to the files
        
        file_name = ele.split('-')
        test_label.append(int(file_name[2]))                    # Get emotion label
        test_intensity.append(int(file_name[3]))                # Get intensity
        test_gender.append((i + 1) % 2)                         # Get gender (1 for male, 0 for female)

#### Use librosa to load the audio files

In [47]:
lbr_train_data = []
for ele in raw_train_data:
    temp, _ = librosa.load(ele)
    lbr_train_data.append(np.pad(temp, (0, 116247 - len(temp))))

In [48]:
lbr_val_data = []
for ele in raw_val_data:
    temp, _ = librosa.load(ele)
    lbr_val_data.append(np.pad(temp, (0, 116247 - len(temp))))

In [49]:
lbr_test_data = []
for ele in raw_test_data:
    temp, _ = librosa.load(ele)
    lbr_test_data.append(np.pad(temp, (0, 116247 - len(temp))))

In [46]:
np.pad([1,2,3], (0, 5))

array([1, 2, 3, 0, 0, 0, 0, 0])

#### Get the Fourier transform of the data as features
The result of from the Fourier transform on the entire data set may not offer much information since all information about time in the sound is inherently lost. In human speech, having a high pitch at the beginning may indicate very different emotions from having a high pitch at the end. I do not expect great results from such features.

In [50]:
fft_train_data = []
for ele in lbr_train_data:
    temp = rfft(ele)
    fft_train_data.append(temp.real)
        
fft_val_data = []
for ele in lbr_val_data:
    temp = rfft(ele)
    fft_val_data.append(temp.real)
        
fft_test_data = []
for ele in lbr_test_data:
    temp = rfft(ele)
    fft_test_data.append(temp.real)

#### Using KNN classifier for FFT features
As seen here, FFT+KNN yields validation errors slightly better than chance, I would expect similar results for the test error, therefore, I did not evaluate based on the test data.

In [51]:
# Training the knn classifier
knn_classifier = KNeighborsClassifier(n_neighbors = 30)
knn_classifier.fit(fft_train_data, train_label)

KNeighborsClassifier(n_neighbors=30)

In [52]:
# optimizing the knn classifier
knn_classifier.score(fft_val_data, val_label)

0.14166666666666666

#### Results from testing with different number of neighbors for FFT features
n = 1: 0.1167  
n = 3: 0.1167  
n = 5: 0.125  
n = 10: 0.1   
n = 15: 0.125  
n = 20: 0.1083  
n = 25: 0.1208  
n = 30: 0.1417  
n = 35: 0.1333  
n = 40: 0.1333  
n = 50: 0.1333  

#### Get spectrograms of the audio input as features as to preserve the time dimension of the input

In [73]:
stft_train_data = []
for ele in lbr_train_data:
    temp = librosa.stft(ele, n_fft = 512)
    stft_train_data.append(temp.real.reshape(-1))
        
stft_val_data = []
for ele in lbr_val_data:
    temp = librosa.stft(ele, n_fft = 512)
    stft_val_data.append(temp.real.reshape(-1))
        
stft_test_data = []
for ele in lbr_test_data:
    temp = librosa.stft(ele, n_fft = 512)
    stft_test_data.append(temp.real.reshape(-1))

#### Using KNN with spectrogram as features

In [74]:
# Training the knn classifier
knn_classifier = KNeighborsClassifier(n_neighbors = 30)
knn_classifier.fit(stft_train_data, train_label)

KNeighborsClassifier(n_neighbors=30)

In [None]:
for n in [1,3,5,10,15,20,30,40,50]:
    knn_classifier = KNeighborsClassifier(n_neighbors = n)
    knn_classifier.fit(stft_train_data, train_label)
    print(str(n) + ":" + str(knn_classifier.score(stft_val_data, val_label)))

n = 1:  
n = 3  
n = 5  
n = 10  
n = 15  
n = 20  
n = 30  
n = 40  
n = 50  