In [1]:
import warnings; warnings.filterwarnings('ignore')
import soundfile, os, glob, librosa
import numpy as np
from pandas import DataFrame
from copy import deepcopy
from IPython.display import display_html
from sklearn.model_selection import train_test_split

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [2]:
EMOTIONS ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

LABELS = {
  'positive': 1,
  'negative': -1
}

### Data for binary classification


In [3]:
def load_extract_features(data_path):
    """
    Loads all Audio Files, Computes their features and target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Args:
        data_path (str): _description_

    Returns:
        tuple: Features and Binary Target Values
    """

    final_features, binary_label = [], []
    count = 0

    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.

        name = os.path.basename(i)
        # We split the name of the file to understand the emotion associated with the file.
        # We know that the third identifier is associated with the emotion of the audio file.
        # Hence, we use [2] as it represents the third identifier.
        emotion = EMOTIONS[name.split("-")[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(LABELS['positive'])
        elif emotion in ['angry', 'fearful']:
            binary_label.append(LABELS['negative'])
        else:
            continue

        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate

            # Below is the code to extract the Mel spectrogram features
            # 128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr/2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,): melspectrogram = np.zeros(128)

            # Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)

            # 12 is the number of pitch classes
            if chromagram.shape != (12,): chromagram = np.zeros(12)

            final_features.append(np.array([*chromagram, *melspectrogram]))

            count += 1
            if count % 100 == 0: print(f"Processed {count} Audio Files")

    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

#Please change the path below to the path of the folder saved in your computer.
data_path = './Audio_Speech_Actors_01-24'
X, y = load_extract_features(data_path)

Processed 100 Audio Files
Processed 200 Audio Files
Processed 300 Audio Files
Processed 400 Audio Files
Processed 500 Audio Files
Processed 600 Audio Files
Processed 700 Audio Files


## SVM

In [4]:
STEP_SIZE = 0.0001  # Learning Rate for the Gradient Descent

def train(X, y, max_iters=1000):
    """
    Finds the Support Vector Machine model parameter estimations using Gradient Descent.

    Args:
        X (np.ndarray (Shape: (N, k))): A Nxk matrix containing Mel spectrogram and chromagram data.
        y (np.ndarray (Shape: (N, ))): A array representing 1 as 'positive' emotion and -1 as 'negative' emotion.
        max_iters (int, optional): Number of iterations for the Gradient Descent Algorithm. Defaults to 1000.

    Returns:
        w, b: (np.ndarray (Shape: (k, )), float): Estimated Weights and Bias Term for the model.
    """

    assert X.shape[0] == y.shape[0], f"Number of inputs and outputs are different. (X: {X.shape[0]}, y: {y.shape[0]})"
    assert np.array_equal(np.unique(y), [-1, 1]), f"Labels must be either -1 or 1. (labels: {np.unique(y)}"

    N, N_f = X.shape

    # Initializes the weights and bias
    w, b = np.zeros((N_f, )), 0

    for _ in range(max_iters):
        sw = np.zeros((N_f, ))
        sb = 0

        # Calculates the max(0, 1 - y * (X @ w + b))
        distances = 1 - y * (np.dot(X, w) + b)
        distances[distances < 0] = 0

        # Sums up the delta for weights and bias gradients
        for idx, distance in enumerate(distances):
            if distance == 0:
                sw += STEP_SIZE * w
                sb += 0
            else:
                sw += STEP_SIZE * w - y[idx] * X[idx]
                sb += -y[idx]

       # Moves the weights and bias closer to the estimate
        w -= STEP_SIZE * sw/N
        b -= STEP_SIZE * sb/N

    return w, b

def predict(X, w, b):
    """
    Returns predictions using estimated weights and bias term for the SVM model.

    Args:
        X (np.ndarray (Shape: (N, k))): A Nxk matrix containing Mel spectrogram and chromagram data.
        w (ndarray (Shape: (k, ))): Estimated weights term for the SVM model.
        b (float): Estimated bias term for the SVM model.

    Returns:
        (np.ndarray (Shape: (N, ))): Array representing 1 as 'positive' emotion and -1 as 'negative' emotion.
    """

    assert X.shape[1] == w.shape[0], f"Number of inputs and weights are different. (X: {X.shape[1]}, y: {w.shape[0]})"

    return np.asarray(np.sign(np.dot(X, w) + b), dtype=int)

## PCA

In [5]:
def PCA(X, threshold=0.99):
    """
    Runs a PCA on X with a certain threshold

    Args:
        X (np.ndarray (Shape: (N, d))): A Nxd matrix containing Mel spectrogram and chromagram data.
        threshold (float, optional): Threshold for Varaince Maximization. Defaults to 0.99.

    Returns:
        np.ndarray (Shape: (N, k)): A Nxk matrix containing the basis eigenvectors where k << d.
    """

    assert 0.01 <= threshold <= 0.99 , f"threshold should be a percent between [0.01 , 0.99] (threshold: {threshold})"

    N, _ = X.shape
    X_meaned = X - np.mean(X, axis=0)
    cov = np.cov(X, rowvar=False) * (N-1)/N
    eigenvalues, eigenvectors = np.linalg.eig(cov)

    sort_order = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalue = eigenvalues[sort_order]
    sorted_eigenvectors = eigenvectors[sort_order]

    num_components = X_meaned.shape[-1]
    for k in range(1, X.shape[-1] + 1):
        h = np.sum(sorted_eigenvalue[:k])/np.sum(sorted_eigenvalue)
        if h >= threshold:
            num_components = k
            break

    W = sorted_eigenvectors[:,0:num_components]
    return np.dot(W.T, X_meaned.T).T

## Helper

In [6]:
def get_accuracy(expected: np.ndarray, actual: np.ndarray) -> tuple:
    """
    Calculates the accuracy of each label and overall accuracy

    Args:

        expected (np.ndarray (Shape: (N, ))): Expected Labels
        actual (np.ndarray (Shape: (N, ))): Predicted Labels

    Returns:
        (DataFrame, str): Label Accuracy, Overall Accuracy
    """

    label_accuracies = []

    for label, label_num in LABELS.items():
        # Filters the actual values based on the expected index for each label
        label_accuracy = np.mean(actual[expected == label_num] == label_num)
        label_accuracies.append((label, f'{round(label_accuracy * 100, 2)} %' ))

    label_acc = DataFrame(label_accuracies, columns=['Label','Accuracy'])
    overall_acc = f'{round(np.average(expected == actual) * 100, 2)}'

    return label_acc, overall_acc

def dataframe_to_html(df: DataFrame, caption: str) -> str:
    """
    Converts the DataFrame to Inline HTML for Jupyter Notebook

    Args:
        df (DataFrame): DataFrame to convert
        caption (str): Short Description about the DataFrame

    Returns:
        str: Inline HTML for Jupyter Notebook
    """

    return df.style.set_table_attributes("style='display:inline;'") \
		.set_properties(**{'text-align': 'left'}) \
		.set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])]) \
		.set_caption(f'{caption} Accuracy').hide(axis='index')._repr_html_()

def create_report(label_acc_train: DataFrame, label_acc_test: DataFrame, \
    overall_acc_train: str, overall_acc_test: str) -> None:
    """
    Creates a Report for Training and Testing Accuracy Comparison

    Args:
        label_acc_train (DataFrame): Training Label Accuracy in percentage
        label_acc_test (DataFrame): Testing Label Accuracy in percentage
        overall_acc_train (str): Overall Training Accuracy in percentage
        overall_acc_test (str): Overall Testing Accuracy in percentage
    """

    # Converts DataFrames to Inline HTML
    train_df_html = dataframe_to_html(label_acc_train, 'Training')
    test_df_html = dataframe_to_html(label_acc_test, 'Testing')

    # Organizes the Tables and Text
    df_html = f"<center>{train_df_html}{'&nbsp;'*5}{test_df_html}</center>\n"
    acc_html = f"<center><p>Training Overall Accuracy: {overall_acc_train} %</p><p>Testing Overall Accuracy: {overall_acc_test} %</p></center>"

    # Renders the raw HTML onto a Jupyter Notebook
    display_html(df_html + acc_html, raw=True)

## Part 2a - SVM

In [7]:
# Potential Kernal Trick
# X = np.mod(X, 12) - Based on # of Pitches improves about 1%

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=2)

w, b = train(X_train, y_train)

y_pred_train = predict(X_train, w, b)
y_pred_test = predict(X_test, w, b)

label_acc_train, overall_acc_train = get_accuracy(y_train, y_pred_train)
label_acc_test, overall_acc_test = get_accuracy(y_test, y_pred_test)

create_report(label_acc_train, label_acc_test, overall_acc_train, overall_acc_test)

Label,Accuracy
positive,73.61 %
negative,70.15 %

Label,Accuracy
positive,67.83 %
negative,62.93 %


## Part 2b - SVM with PCA

In [8]:
X_reduced = PCA(X)

print('Inital Feature Size of X:', X.shape[-1])
print('New Feature Size of X:', X_reduced.shape[-1])

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, train_size=0.7, test_size=0.3, random_state=2)

w, b = train(X_train, y_train)

y_pred_train = predict(X_train, w, b)
y_pred_test = predict(X_test, w, b)

label_acc_train, overall_acc_train = get_accuracy(y_train, y_pred_train)
label_acc_test, overall_acc_test = get_accuracy(y_test, y_pred_test)

create_report(label_acc_train, label_acc_test, overall_acc_train, overall_acc_test)

Inital Feature Size of X: 140
New Feature Size of X: 23


Label,Accuracy
positive,92.19 %
negative,38.06 %

Label,Accuracy
positive,94.78 %
negative,34.48 %
