In [10]:
import soundfile, os, glob, librosa
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings; warnings.filterwarnings('ignore')

In [11]:
emotions ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

### Data for binary classification


In [12]:
def load_extract_features(data_path):

    '''
    load_extract_features() is a function that is used to load all the audio files one at a time, compute their features and return the features as well as the target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Returns:
    1. Features
    2. Binary Target Values
    '''
    final_features,target_emotions, binary_label = [],[], []
    count = 0

    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.

        name = os.path.basename(i)
        #We split the name of the file to understand the emotion associated with the file.
        split = name.split("-")
        #We know that the third identifier is associated with the emotion of the audio file. Hence, we use [2] as it represents the third identifier.
        emotion = emotions[split[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(-1)
        elif emotion in ['angry', 'fearful']:
            binary_label.append(1)
        else:
            continue

        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate

            #Below is the code to extract the Mel spectrogram features
            #128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr / 2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,):
                melspectrogram = np.zeros(128)

            #Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)

            #12 is the number of pitch classes
            if chromagram.shape != (12,):
                chromagram = np.zeros(12)

            features=np.array([])
            features=np.hstack((chromagram, melspectrogram))

            final_features.append(features)
            target_emotions.append(emotion)

            count += 1
            if count % 100 == 0:
                print("Processed Audio File Number: ", count)

    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

In [13]:
#Please change the path below to the path of the folder saved in your computer.
data_path = './Audio_Speech_Actors_01-24'
X, y = load_extract_features(data_path)

Processed Audio File Number:  100
Processed Audio File Number:  200
Processed Audio File Number:  300
Processed Audio File Number:  400
Processed Audio File Number:  500
Processed Audio File Number:  600
Processed Audio File Number:  700


## SVM

In [14]:

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=2)

STEP_SIZE = 0.001

def train(X, y, max_iters=1000):
    N, N_f = X.shape

    w = np.zeros((N_f, ))
    b = 0

    for _ in range(max_iters):
        dw = np.zeros((N_f, ))
        db = 0

        distances = 1 - y * (np.dot(X, w) + b)
        distances[distances < 0] = 0

        for idx, distance in enumerate(distances):
            if distance == 0:
                dw += STEP_SIZE * w
                db += 0
            else:
                dw += STEP_SIZE * w - y[idx] * X[idx]
                db += -y[idx]

        w -= STEP_SIZE/N * dw
        b -= STEP_SIZE/N * db


    return w, b

def predict(X, w, b):
    return np.sign(np.dot(X, w) + b)

w, b = train(X_train, y_train)

y_pred = predict(X_train, w, b)
print("Train Accuracy: ", accuracy_score(y_train, y_pred))

y_pred = predict(X_test, w, b)
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

Train Accuracy:  0.6629422718808193
Test Accuracy:  0.6493506493506493


## PCA

In [15]:
def PCA(X, threshold=0.99):
    X_meaned = X - np.mean(X, axis = 0)
    cov = np.cov(X_meaned, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eig(cov)

    sort_order = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalue = eigenvalues[sort_order]
    sorted_eigenvectors = eigenvectors[sort_order]

    num_components = X_meaned.shape[-1]
    for num_feature in range(1, X.shape[-1] + 1):
        h = np.sum(sorted_eigenvalue[:num_feature])/np.sum(sorted_eigenvalue)
        if h >= threshold:
            num_components = num_feature
            break

    W = sorted_eigenvectors[:,0:num_components]
    return np.dot(W.T, X_meaned.T).T

X_reduced = PCA(X)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, train_size=0.7, test_size=0.3, random_state=2)

w, b = train(X_train, y_train)

y_pred = predict(X_train, w, b)
print("Train Accuracy: ", accuracy_score(y_train, y_pred))

y_pred = predict(X_test, w, b)
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

Train Accuracy:  0.6610800744878957
Test Accuracy:  0.6623376623376623
