# Dependencies

In [25]:
import scipy
from scipy.io import wavfile
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import signal
import osascript
from gtts import gTTS 
import os 
import pyaudio
import wave
import keyboard  # using module keyboard
import soundfile as sf
import math
import pyloudnorm as pyln
import sys
from sys import byteorder
from array import array
from struct import pack
import librosa
from scipy.signal import butter, sosfiltfilt
import python_speech_features
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 
import pysptk
from  conch.analysis.formants import lpc
import os

# Constants

In [26]:
BANDPASS_FREQ = [300, 3400]
NUM_EMOTIONS = 6
NUM_FEATURES = 131
NUM_SAMPLES_CREMA = 7441
NUM_SAMPLES_TESS = 2400
NUM_SAMPLES_RAVDESS = 1196
NUM_SAMPLES_LDC = 1379
NUM_SAMPLES = NUM_SAMPLES_CREMA + NUM_SAMPLES_TESS + NUM_SAMPLES_RAVDESS + NUM_SAMPLES_LDC
#CHUNK_LENGTH = 512 #ms
FS = 16000

#global variables
mean_vector = []
std_vector = []

In [27]:
def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

# Emotion Classification

In [28]:
EMOTIONS = {
    #CREMA
    "ANG": 0,
    "DIS": 1,
    "FEA": 2,
    "HAP": 3,
    "NEU": 4,
    "SAD": 5,
    #TESS
    "angry": 0,
    "disgust": 1,
    "fear": 2,
    "happy": 3,
    "neutral": 4,
    "sad": 5,
    #RAVDESS
    "05": 0, #angry ravdess
    "07": 1, #disgust ravdess
    "06": 2, #fear ravdess
    "03": 3, #happy ravdess
    "01": 4, #neutral ravdess
    "02": 4, #calm ravdess (neutral)
    "04": 5, #sad ravdess
    
    #LDC
    "panic": 2,   #fear LDC
    "hot anger": 0,
    "cold anger": 0,
    "despair": 5,
    "sadness": 5,
    "elation": 3
}

In [5]:
def get_emotion_vector(emotion_index, gender):
    vec = np.zeros((1, NUM_EMOTIONS*2))
    vec[0][emotion_index + gender*NUM_EMOTIONS] = 1
    return vec

# Process Sound

In [6]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    sos = butter(order, [low, high], btype='band', analog=False, output='sos')
    return sos

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    sos = butter_bandpass(lowcut, highcut, fs, order=order)
    y = sosfiltfilt(sos, data)
    return y
def normalize(snd_data):
    "Average the volume out"
    MAXIMUM = 16384
    times = float(MAXIMUM)/max(abs(i) for i in snd_data)

    r = array('h')
    for i in snd_data:
        r.append(int(i*times))
    return r

In [7]:
def remove_silence_from(amplitudes, threshold):
    silenced = []
    for x in amplitudes:
        if x >= threshold:
            silenced.append(x)
    return silenced

In [8]:
def clean_sound(data, fs):
    data = butter_bandpass_filter(data, BANDPASS_FREQ[0], BANDPASS_FREQ[1], fs)
    data = normalize(data)
    data = np.asarray(data)
    return data, fs

# Extract Individual Datasets

In [9]:
def data_extract_LDC(npload=False, CHUNK_LENGTH=512):
    LOCAL_PATH = "../../LDC/"
    
    dataset = []
    y = np.empty((0, NUM_EMOTIONS*2), int)
    
    if not npload:
        prog = 0 #for the progress bar
        for file in os.listdir(LOCAL_PATH + "transcr/"): #loop through each text file
            if file.endswith(".txt"):
                metadata = pd.read_csv(LOCAL_PATH + "transcr/" + file, sep="A:", skiprows=1, header=None, engine="python")
                #remove non-data entries
                metadata = metadata[metadata[1] != ' [MISC]']
                metadata = metadata[metadata[1].str.startswith(' (') == False]

                #split into start, stop, and emotion
                metadata[2] = metadata[0].str.split(' ', expand=True)[0]
                metadata[3] = metadata[0].str.split(' ', expand=True)[1]
                metadata[4] = metadata[1].str.split(',', expand=True)[0]
                metadata[4] = metadata[4].str[1:]
                metadata = metadata.drop([0,1], axis=1)
                metadata.columns = ["Start", "Stop", "Emotion"]

                #load sound file
                soundfile_name = file[:-3] + "wav"
                soundfile, fs = librosa.load(LOCAL_PATH + "speech/" + soundfile_name)

                for index, row in metadata.iterrows(): #loop through each sound clip
                    emotion = row.Emotion
                    #Check if we're using this emotion
                    e = EMOTIONS.get(emotion, None)
                    if(e != None):
                        progress(prog, NUM_SAMPLES_LDC, status="Reading files")
                        #male = 0, female  = 1
                        gender = 0 if (file.startswith("cc") or
                                       file.startswith("mf") or
                                       file.startswith("cl")) else 1
                                              
                        #process sound
                        data = soundfile[int(float(metadata.Start[index])*fs):int(float(metadata.Stop[index])*fs)]
                        data = librosa.resample(data, fs, FS)
                        fs = FS
                        data, fs = clean_sound(data, fs)

                        chunk_size = int(CHUNK_LENGTH * fs / 1000)
                        num_chunks = int(len(data) // chunk_size)
                        for i in range(num_chunks):
                            snippet = data[i*chunk_size:i*chunk_size+chunk_size]
                            y = np.append(y, e+ gender*NUM_EMOTIONS)
                            dataset.append(snippet)
                        if (len(data) % chunk_size) > 0:
                            snippet = data[num_chunks*chunk_size:]
                            snippet = np.resize(snippet, chunk_size)
                            y = np.append(y, e + gender*NUM_EMOTIONS)
                            dataset.append(snippet)
                        prog+=1 #for progress bar
        dataset = np.array(dataset)
        print("dataset shape: " + str(dataset.shape))
        np.save(LOCAL_PATH + "LDCdatafsy", (dataset, y, 0))
    else:
        dataset, y, useless = np.load(LOCAL_PATH + "LDCdatafsy.npy", allow_pickle=True)
    #get feature vector
    print("Getting all features...")
    #features = get_all_features(np.array(dataset), fs)
    features = np.array(dataset)
    print("Feature Vector Shape: " + str(features.shape))
    return features, y

In [10]:
def data_extract_RAVDESS(npload=False, CHUNK_LENGTH=512):
    LOCAL_PATH = '../../RAVDESS/'
    
    dataset = []
    y = np.empty((0, NUM_EMOTIONS*2), int)
    
    if not npload:
        prog = 0 # for progress bar
        for file in os.listdir(LOCAL_PATH):
            if  file.endswith('.wav'):
                emotion = (file.split('-')[2])
                #Check if we're using this emotion
                e = EMOTIONS.get(emotion, None)
                if(e != None):
                    progress(prog, NUM_SAMPLES_RAVDESS, status="Reading files")
                    #male = 0, female  = 1
                    gender = (int(file.split('-')[-1].split('.')[0]) + 1) % 2
                    
                    #process sound
                    data, fs = librosa.load(LOCAL_PATH + file, sr=None)
                    data = librosa.resample(data, fs, FS)
                    fs = FS
                    data, fs = clean_sound(data, fs)
                    
                    chunk_size = int(CHUNK_LENGTH * fs / 1000)
                    num_chunks = int(len(data) // chunk_size)
                    for i in range(num_chunks):
                        snippet = data[i*chunk_size:i*chunk_size+chunk_size]
                        y = np.append(y, e+ gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    if (len(data) % chunk_size) > 0:
                        snippet = data[num_chunks*chunk_size:]
                        snippet = np.resize(snippet, chunk_size)
                        y = np.append(y, e + gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    prog+=1 #for progress bar
        np.save(LOCAL_PATH + "RAVDESSdatafsy", (dataset, y, 0))
    else:
        dataset, y, useless= np.load(LOCAL_PATH + "RAVDESSdatafsy.npy", allow_pickle=True)
    #get feature vector
    print("Getting all features...")
    #features = get_all_features(np.array(dataset), fs)
    features = np.array(dataset)
    print("Feature Vector Shape: " + str(features.shape))
    return features, y

In [11]:
def data_extract_CREMA(npload=False, CHUNK_LENGTH=512):
    LOCAL_PATH = "../../CREMA/"
    WAV_PATH = LOCAL_PATH + "AudioWAV/"
    demographics = pd.read_csv(LOCAL_PATH + "VideoDemographics.csv")
    
    dataset = []
    y = np.empty((0, 1), int)
    
    if not npload:
        prog = 0 # for progress bar
        for file in os.listdir(WAV_PATH):
            if file.endswith('.wav'):
                emotion = file[9:12]
                #Check if we're using this emotion
                e = EMOTIONS.get(emotion, None)
                if(e != None):
                    progress(prog, NUM_SAMPLES_CREMA, status="Reading files")
                     #Get actor ID from filename
                    actor_id = int(file[0:4])
                    #get the gender from demographics pd dataframe. 0 for Male, 1 for female
                    gender = 0 if demographics["Sex"][actor_id - 1001] == "Male" else 1
                    
                    #process sound
                    data, fs = librosa.load(WAV_PATH + file, sr=None)
                    data = librosa.resample(data, fs, FS)
                    fs = FS
                    data, fs = clean_sound(data, fs)
                    
                    chunk_size = int(CHUNK_LENGTH * fs / 1000)
                    num_chunks = int(len(data) // chunk_size)
                    for i in range(num_chunks):
                        snippet = data[i*chunk_size:i*chunk_size+chunk_size]
                        y = np.append(y, e+ gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    if (len(data) % chunk_size) > 0:
                        snippet = data[num_chunks*chunk_size:]
                        snippet = np.resize(snippet, chunk_size)
                        y = np.append(y, e + gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    prog+=1 #for progress bar
        np.save(LOCAL_PATH + "CREMAdatafsy", (dataset, y, 0))
    else:
        dataset, y, useless = np.load(LOCAL_PATH + "CREMAdatafsy.npy", allow_pickle=True)
    #get feature vector
    print("Getting all features...")
    #features = get_all_features(np.array(dataset), fs)
    features = np.array(dataset)
    print("Feature Vector Shape: " + str(features.shape))
    return features, y


In [12]:
def data_extract_TESS(npload=False, CHUNK_LENGTH=512):
    LOCAL_PATH = "../../TESS/"
    
    dataset = []
    y = np.empty((0, NUM_EMOTIONS*2), int)
    
    if not npload:
        prog = 0 # for progress bar
        for file in os.listdir(LOCAL_PATH):
            if file.endswith('.wav'):
                emotion = file[file.index('_', 4) + 1 : file.index('.')]
                #Check if we're using this emotion
                e = EMOTIONS.get(emotion, None)
                if(e != None):
                    progress(prog, NUM_SAMPLES_TESS, status="Reading files")
                    #TESS is all female so gender = 1
                    gender = 1
                    
                    #process sound
                    data, fs = librosa.load(LOCAL_PATH + file, sr=None)
                    data = librosa.resample(data, fs, FS)
                    fs = FS
                    data, fs = clean_sound(data, fs)
                    
                    chunk_size = int(CHUNK_LENGTH * fs / 1000)
                    num_chunks = int(len(data) // chunk_size)
                    for i in range(num_chunks):
                        snippet = data[i*chunk_size:i*chunk_size+chunk_size]
                        y = np.append(y, e+ gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    if (len(data) % chunk_size) > 0:
                        snippet = data[num_chunks*chunk_size:]
                        snippet = np.resize(snippet, chunk_size)
                        y = np.append(y, e + gender*NUM_EMOTIONS)
                        dataset.append(snippet)
                    prog+=1 #for progress bar
        np.save(LOCAL_PATH + "TESSdatafsy", (dataset, y, 0))
    else:
        dataset, y, useless = np.load(LOCAL_PATH + "TESSdatafsy.npy", allow_pickle=True)
    #get feature vector
    print("Getting all features...")
    #features = get_all_features(np.array(dataset), fs)
    features = np.array(dataset)
    print("Feature Vector Shape: " + str(features.shape))
    return features, y

In [13]:
def data_extract_all(CHUNK_LENGTH=512):
    y = np.empty((0, 1), int)
    chunked = np.empty((0, int(CHUNK_LENGTH * FS / 1000)))
    print("----------------------------------------------\n")
    print("Extracting LDC....")
    chunk_arr, y_arr = data_extract_LDC(npload=False, CHUNK_LENGTH)
    print(chunked.shape)
    print(chunk_arr.shape)
    chunked = np.append(chunked, chunk_arr, axis=0)
    y = np.append(y, y_arr)
    print("Done extracting LDC")
    print("----------------------------------------------\n")
    print("Extracting CREMA....")
    chunk_arr, y_arr = data_extract_CREMA(npload=False, CHUNK_LENGTH)
    chunked = np.append(chunked, chunk_arr, axis=0)
    y = np.append(y, y_arr)
    print("Done extracting CREMA")
    print("\n----------------------------------------------\n")
    print("Extracting TESS....")
    chunk_arr, y_arr = data_extract_TESS(npload=False, CHUNK_LENGTH)
    chunked = np.append(chunked, chunk_arr, axis=0)
    y = np.append(y, y_arr)
    print("Done extracting TESS")
    print("\n----------------------------------------------\n")
    print("Extracting RAVDESS....")
    chunk_arr, y_arr = data_extract_RAVDESS(npload=False, CHUNK_LENGTH=1000)
    chunked = np.append(chunked, chunk_arr, axis=0)
    y = np.append(y, y_arr)
    print("Done extracting RAVDESS")
    print("\n----------------------------------------------")
#     mean_vector = np.mean(features, axis=0)
#     std_vector = np.std(features, axis=0)
    
    #normalize the data
    #features = (features - mean_vector) / std_vector
    return chunked, y
    

SyntaxError: positional argument follows keyword argument (<ipython-input-13-dfd0c8fef4bd>, line 6)

# Feature Extraction

In [14]:
def MFCC_algorithm(np_data, fs):
    MFCC2 = []
    #for progess bar
    i = 0
    prog = np_data.shape[0]
    for one_sound in np_data:
        progress(i, prog, "Calculating MFCC's")
        one_sound = np.asarray(one_sound)
        MFCC2.append(python_speech_features.base.mfcc(one_sound, samplerate=fs, 
                                     winlen=0.025, winstep=0.01, numcep=13, 
                                     nfilt=26, nfft=1200).T)
        i+=1
    MFCC3 = []
    cached_variables = []
    for one_point in MFCC2:
        cache_grad = (np.gradient(one_point, axis = 1))
        cached_variables.append(np.asarray([np.mean(one_point, axis = 1), np.median(one_point, axis = 1),
                                 np.var(one_point, axis = 1), 
                           np.min(one_point, axis = 1), np.max(one_point, axis = 1), 
                                 np.mean(cache_grad, axis = 1), np.var(cache_grad, axis = 1)]).flatten()
                               )
    return np.array(cached_variables)
    

In [15]:
def get_pitch_vector(data, fs):
    data = np.float32(data)
    pitch = pysptk.sptk.rapt(data, fs, hopsize = 50)
    silenced = remove_silence_from(pitch, np.mean(pitch))
    return silenced

In [16]:
def get_spectral_vector(data, fs):
    data = np.float32(data)
    cent = librosa.feature.spectral_centroid(y=data, sr=fs)
    return cent

In [17]:
def get_lpc_vector(data):
    vec = lpc.lpc_ref(data, 12)
    return vec

In [18]:
def get_rms_vector(data):
    temp_data = np.float32(data)
    cent = librosa.feature.rms(y=temp_data)
    return cent

In [19]:
def get_zero_vector(data):
    temp_data = np.float32(data)
    cent = librosa.feature.zero_crossing_rate(y=temp_data)
    return cent

In [20]:
def get_sr_vector(data):
    temp_data = np.float32(data)
    cent = librosa.feature.spectral_rolloff(y=temp_data)
    return cent

In [21]:
def get_stats(pitch_vector):
    mean = np.mean(pitch_vector)
    median = np.median(pitch_vector)
    low = np.min(pitch_vector)
    high = np.max(pitch_vector)
    variance = np.var(pitch_vector)
    
    #derivative
    derivative = np.diff(pitch_vector)
    d_mean = np.mean(derivative)
    d_min = np.min(derivative)
    d_max = np.max(derivative)
    return [mean, median, low, high, variance, d_mean, d_min, d_max]

In [22]:
def get_all_features(np_array, fs):
    x = MFCC_algorithm(np_array, fs)
    print("MFCC DONE", end=" ")
    print("dimensions are "+str([len(x), len(x[0])]))
    x1 = []
    x2 = []
    x3 = []
    x4 = []
    x5 = []
    #for progress bar
    size = np_array.shape[0]
    i = 0
    for data in np_array:
        progress(i, size, status="Calculating stats")
        pitch_vector = get_pitch_vector(data, fs)
        stats = get_stats(pitch_vector)
        x1.append(stats)
        
        spectral_vector = get_spectral_vector(data, fs)
        stats = get_stats(spectral_vector)
        x2.append(stats)
        
        rms_vector = get_rms_vector(data)
        stats = get_stats(rms_vector)
        x3.append(stats)
        
        sr_vector = get_sr_vector(data)
        stats = get_stats(sr_vector)
        x4.append(stats)
    
        zero_vector = get_zero_vector(data)
        stats = get_stats(zero_vector)
        x5.append(stats)
        
        i+=1
    print("MFCC dimensions:" + str([len(x), len(x[0])]))
    print("Pitch dimensions:" + str([len(x1), len(x1[0])]))
    print("Spectral dimensions:" + str([len(x2), len(x2[0])]))
    print("RMS dimensions:" + str([len(x3), len(x3[0])]))
    print("SR dimensions:" + str([len(x4), len(x4[0])]))
    print("Zero dimensions:" + str([len(x5), len(x5[0])]))
    x = np.concatenate((x,x1,x2,x3,x4,x5), axis=1)
    return x

# Ready Dataset and output
Put all of the extracted features into X and the classifications into y and split into training and testing group

In [23]:
def x_y_split(x, y):
    
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)
    
    num_labels = y_train.shape[1]
    num_features = X_train.shape[1]
    print("x train shape: " +str(X_train.shape))
    print("y train shape: " +str(y_train.shape))
    print("x test shape: " +str(X_test.shape))
    print("y test shape: " +str(y_test.shape))
    print("x validation shape: " +str(X_val.shape))
    print("y validation shape: " +str(y_val.shape))
    for i in range(num_labels):
        print("y_train for emotion "+str(i)+": "+ str(np.sum(y_train[:,i])))
    for i in range(num_labels): 
        print("y_test for emotion "+str(i)+": "+ str(np.sum(y_test[:,i])))
    return X_train, X_val, X_test, y_train, y_val, y_test

In [24]:
features, y = data_extract_all(CHUNK_LENGTH=1000)

NameError: name 'data_extract_all' is not defined

In [60]:
chunk_arr, y_arr = data_extract_CREMA(npload=False)

Feature Vector Shape: (40624, 8192)


In [68]:
chunk_arr, y_arr, fs_arr

(array([[    7,    -3,    -7, ...,    47,   -71,  -113],
        [  -86,   -24,    67, ...,   845,   170,  -124],
        [  186,   604,   624, ..., -1955, -1108,  -103],
        ...,
        [-1610,  -631,   288, ...,    -3,    44,   144],
        [  229,   232,   127, ...,   -26,   -50,   -81],
        [  -88,   -61,   -18, ...,   -76,   -75,   -73]], dtype=int16),
 array([ 0,  0,  0, ..., 10, 10, 10]),
 array([16000., 16000., 16000., ..., 16000., 16000., 16000.]))

In [318]:
features.shape

(65504, 8192)

1

In [35]:
y = np.append(y, y)

In [36]:
y

array([1, 2, 1, 2])

In [258]:
len(features[0])

8192

In [259]:
np.save("../../CREMA_chunked", (features, y, 0))

In [None]:
features, y, useless_number = np.load("../../CREMA_chunked.npy", allow_pickle=True)

In [60]:
x = np.zeros((1, 4))

In [63]:
x[0][2] = 1

In [64]:
x

array([[0., 0., 1., 0.]])

In [66]:
y = np.zeros((1,4))
y[0][1] = 1

In [67]:
y

array([1, 2, 1, 2])

In [28]:
#np.save("../../most_of_the_data", (y,features, 0))

#X_train, X_val, X_test, y_train, y_val, y_test = x_y_split(features, y)

In [29]:
#np.save("../../splitdata", (X_train, X_val, X_test, y_train, y_val, y_test))

In [126]:
#X_train, X_val, X_test, y_train, y_val, y_test = np.load("../../splitdata.npy", allow_pickle=True)