# Dependencies

In [80]:
import scipy
from scipy.io import wavfile
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import signal
import osascript
from gtts import gTTS 
import os 
import pyaudio
import wave
import keyboard  # using module keyboard
import soundfile as sf
import math
import pyloudnorm as pyln
import sys
from sys import byteorder
from array import array
from struct import pack
import librosa
from scipy.signal import butter, sosfiltfilt
import python_speech_features
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 
import pysptk
from  conch.analysis.formants import lpc
import os

# Constants

In [81]:
# Lower and upper cut-off (Hz) of the band-pass filter applied to every clip.
BANDPASS_FREQ = [300, 3400]

# Number of emotion classes per gender. Labels are one-hot over
# 2 * NUM_EMOTIONS columns: index = emotion + gender * NUM_EMOTIONS.
NUM_EMOTIONS = 6

# Expected number of usable clips per corpus. These size the label matrix
# and the progress bars.
NUM_SAMPLES_CREMA = 7441
NUM_SAMPLES_TESS = 2400
NUM_SAMPLES_RAVDESS = 1196
NUM_SAMPLES_LDC = 1379
NUM_SAMPLES = sum((
    NUM_SAMPLES_CREMA,
    NUM_SAMPLES_TESS,
    NUM_SAMPLES_RAVDESS,
    NUM_SAMPLES_LDC,
))

# Maps each corpus-specific emotion tag to a shared class index:
# 0 = angry, 1 = disgust, 2 = fear, 3 = happy, 4 = neutral, 5 = sad.
# Tags missing from this table are skipped by the extraction functions.
EMOTIONS = {
    # CREMA (three-letter code embedded in the file name)
    **{"ANG": 0, "DIS": 1, "FEA": 2, "HAP": 3, "NEU": 4, "SAD": 5},
    # TESS (word embedded in the file name)
    **{"angry": 0, "disgust": 1, "fear": 2, "happy": 3, "neutral": 4, "sad": 5},
    # RAVDESS (two-digit code embedded in the file name)
    **{
        "05": 0,  # angry
        "07": 1,  # disgust
        "06": 2,  # fear
        "03": 3,  # happy
        "01": 4,  # neutral
        "02": 4,  # calm, folded into neutral
        "04": 5,  # sad
    },
    # LDC (label taken from the transcript)
    **{
        "panic": 2,  # fear
        "hot anger": 0,
        "cold anger": 0,
        "despair": 5,
        "sadness": 5,
        "elation": 3,
    },
}

# Module-level holders for the per-feature mean and standard-deviation vectors.
mean_vector, std_vector = [], []

In [82]:
def progress(count, total, status=''):
    """Draw a single-line text progress bar on stdout.

    The line ends with a carriage return so the next call overwrites it.

    Args:
        count: Number of items processed so far.
        total: Number of items expected. A total of 0 draws an empty bar
            at 0.0% instead of raising ZeroDivisionError.
        status: Short message shown after the percentage.
    """
    bar_len = 60
    if total:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to process: avoid dividing by zero.
        filled_len, percents = 0, 0.0

    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()

# Process Sound

In [83]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Args:
        lowcut: Lower cut-off frequency, in the same unit as `fs`.
        highcut: Upper cut-off frequency, in the same unit as `fs`.
        fs: Sampling rate of the signal to be filtered.
        order: Order passed to `scipy.signal.butter`.

    Returns:
        The filter as second-order sections (`output='sos'`).
    """
    nyquist = 0.5 * fs
    # scipy expects the band edges as fractions of the Nyquist frequency.
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band', analog=False, output='sos')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass `data` between `lowcut` and `highcut`.

    The filter comes from `butter_bandpass` and is applied with
    `sosfiltfilt` (forward-backward filtering).

    Returns:
        The filtered signal.
    """
    return sosfiltfilt(butter_bandpass(lowcut, highcut, fs, order=order), data)
def normalize(snd_data):
    """Rescale a signal so its largest absolute sample equals 16384.

    Each scaled sample is truncated toward zero and stored as a signed
    16-bit integer.

    Args:
        snd_data: Sequence of numeric samples. It is iterated twice, so it
            must not be a one-shot iterator.

    Returns:
        array('h') with one entry per input sample. Empty or all-zero
        input yields an equally long array of zeros instead of raising.
    """
    MAXIMUM = 16384
    peak = max((abs(sample) for sample in snd_data), default=0)
    if peak == 0:
        # Silent or empty clip: there is nothing to scale.
        return array('h', (0 for _ in snd_data))

    times = float(MAXIMUM) / peak
    return array('h', (int(sample * times) for sample in snd_data))

In [84]:
def remove_silence_from(amplitudes, threshold):
    """Return, in order, the values of `amplitudes` that are >= `threshold`."""
    return [value for value in amplitudes if value >= threshold]

In [85]:
def clean_sound(data, fs):
    """Band-pass filter and peak-normalise one clip.

    The signal is filtered to the BANDPASS_FREQ band, rescaled by
    `normalize`, and converted to a NumPy array.

    Returns:
        (cleaned signal as a NumPy array, fs unchanged).
    """
    filtered = butter_bandpass_filter(data, *BANDPASS_FREQ[:2], fs)
    return np.asarray(normalize(filtered)), fs

# Extract Individual Datasets

In [86]:
def extract_data(file_location, transcript_path="LDC2002S28-txt.txt"):
    """Cut one LDC recording into utterances using its transcript.

    Args:
        file_location: Path of the WAV recording.
        transcript_path: Path of the transcript. Each line is split on
            "A:" into "<start> <stop> ..." and a description. Defaults to
            the file name that used to be hard-coded here.

    Returns:
        (segments, descriptions, number_of_segments, fs). `segments` is a
        1-D object array with one band-pass-filtered NumPy array per
        utterance. The utterances differ in length, so they cannot form a
        rectangular array. `descriptions` holds the text before the first
        comma of each transcript description.
    """
    fs, data = wavfile.read(file_location)

    meta_data = pd.read_csv(transcript_path, sep="A:", header=None, engine='python')
    meta_data.columns = ["sound limits", "description"]

    # Dual channel to one channel; mono recordings have no channel axis.
    if data.ndim > 1:
        data = np.average(data, axis=1)
    # Remove noise outside the configured band.
    data = butter_bandpass_filter(data, BANDPASS_FREQ[0], BANDPASS_FREQ[1], fs)

    # Remove transcript rows that do not describe an utterance.
    meta_data = meta_data[meta_data.description != ' [MISC]']
    meta_data = meta_data[~meta_data['description'].astype(str).str.startswith(' (')]
    meta_data = meta_data[~meta_data['description'].astype(str).str.startswith(' Emotion category elation')]
    meta_data = meta_data[~meta_data['description'].astype(str).str.startswith('  [MISC]')]

    # Start/stop times (seconds) and the short description of each utterance.
    voice_time_limits = np.array(
        [limits.split(" ")[0:2] for limits in meta_data["sound limits"]]
    ).astype(float)  # np.float was removed from NumPy; the builtin is equivalent
    description = [text.split(",")[0].strip() for text in meta_data["description"]]

    # Divide the recording into one slice per utterance.
    divided_data = [
        data[int(start * fs):int(stop * fs)] for start, stop in voice_time_limits
    ]

    # Fill an object array element by element: np.asarray on ragged input
    # raises ValueError on current NumPy.
    np_data = np.empty(len(divided_data), dtype=object)
    for position, segment in enumerate(divided_data):
        np_data[position] = segment
    return np_data, description, len(divided_data), fs

In [87]:
def data_extract_LDC(features, y, counter, npload=False):
    """Add the LDC corpus to the feature matrix and label matrix.

    Args:
        features: Feature rows gathered so far (an empty array for the
            first corpus).
        y: One-hot label matrix. Row `counter` onwards is filled in place,
            at column emotion + gender * NUM_EMOTIONS (male = 0, female = 1).
        counter: Index of the next free row of `y`.
        npload: When True, read the cached clips/labels written by an
            earlier run instead of re-reading the audio. `y` is then
            replaced by the cached label matrix.

    Returns:
        (features with the LDC rows appended, y, updated counter).
    """
    LOCAL_PATH = "../../LDC/"
    dataset = []
    fs = 0
    if not npload:
        i = 0  # for the progress bar
        for file in os.listdir(LOCAL_PATH + "transcr/"):  # loop through each text file
            if not file.endswith(".txt"):
                continue
            metadata = pd.read_csv(LOCAL_PATH + "transcr/" + file, sep="A:", skiprows=1, header=None, engine="python")
            # Remove non-data entries.
            metadata = metadata[metadata[1] != ' [MISC]']
            metadata = metadata[metadata[1].str.startswith(' (') == False]

            # Split into start, stop, and emotion (without the leading space).
            limits = metadata[0].str.split(' ', expand=True)
            clips = pd.DataFrame({
                "Start": limits[0],
                "Stop": limits[1],
                "Emotion": metadata[1].str.split(',', expand=True)[0].str[1:],
            })

            # Load the recording that belongs to this transcript.
            # NOTE(review): unlike the other corpora this call does not pass
            # sr=None, so librosa applies its default sample rate — confirm
            # that this is intended.
            soundfile_name = file[:-3] + "wav"
            soundfile, freq = librosa.load(LOCAL_PATH + "speech/" + soundfile_name)

            # male = 0, female = 1; these file prefixes are treated as male.
            gender = 0 if file.startswith(("cc", "mf", "cl")) else 1

            for _, row in clips.iterrows():  # loop through each sound clip
                # Skip emotions that are not part of the shared label set.
                e = EMOTIONS.get(row.Emotion, None)
                if e is None:
                    continue
                progress(i, NUM_SAMPLES_LDC, status="Reading files")
                emotion_index = e + gender*NUM_EMOTIONS
                y[counter][emotion_index] = 1
                i += 1
                counter += 1

                # Process sound.
                start = int(float(row.Start)*freq)
                stop = int(float(row.Stop)*freq)
                data, fs = clean_sound(soundfile[start:stop], freq)
                dataset.append(data)

        # Store (clips, fs, y) as a 3-element object array. Passing the
        # ragged tuple straight to np.save fails on current NumPy.
        cache = np.empty(3, dtype=object)
        cache[0], cache[1], cache[2] = dataset, fs, y
        np.save(LOCAL_PATH + "LDCdatafsy", cache)

    else:
        # allow_pickle unpickles arbitrary objects: only load caches that
        # this notebook wrote itself.
        dataset, fs, y = np.load(LOCAL_PATH + "LDCdatafsy.npy", allow_pickle=True)
        # The cached labels already cover these clips; advance the row
        # pointer so the next corpus does not overwrite them.
        counter += len(dataset)

    # Clips differ in length, so hold them in a 1-D object array.
    clip_array = np.empty(len(dataset), dtype=object)
    for position, clip in enumerate(dataset):
        clip_array[position] = clip

    # Get feature vector.
    print("Getting all features...")
    feature_vector = get_all_features(clip_array, fs)
    print("Feature Vector Shape: " + str(feature_vector.shape))

    if features.shape[0] == 0:
        features = feature_vector
    else:
        features = np.concatenate((np.array(features), np.array(feature_vector)))
    print("Features shape: " + str(features.shape))
    return features, y, counter

In [88]:
# Root directory of the LDC corpus, relative to this notebook.
LOCAL_PATH = "../../LDC/"

# One LDC recording, used for a quick manual load check.
PATH = "../../LDC/speech/" + "cc_001.wav"

In [89]:
# PATH already contains the corpus directory; prefixing it again with
# LOCAL_PATH + "speech/" only worked because the embedded ".." segments
# cancelled the duplicate prefix.
soundfile, fs = librosa.load(PATH)

In [90]:
def data_extract_RAVDESS(features, y, counter, npload=False):
    """Add the RAVDESS corpus to the feature matrix and label matrix.

    The emotion code is the third dash-separated field of the file name
    and the actor number is the last one.

    Args:
        features: Feature rows gathered so far (an empty array for the
            first corpus).
        y: One-hot label matrix. Row `counter` onwards is filled in place,
            at column emotion + gender * NUM_EMOTIONS (male = 0, female = 1).
        counter: Index of the next free row of `y`.
        npload: When True, read the cached clips/labels written by an
            earlier run instead of re-reading the audio. `y` is then
            replaced by the cached label matrix.

    Returns:
        (features with the RAVDESS rows appended, y, updated counter).
    """
    LOCAL_PATH = '../../RAVDESS/'
    dataset = []
    fs = 0
    if not npload:
        i = 0  # for progress bar
        for file in os.listdir(LOCAL_PATH):
            if not file.endswith('.wav'):
                continue
            emotion = file.split('-')[2]
            # Skip emotions that are not part of the shared label set.
            e = EMOTIONS.get(emotion, None)
            if e is None:
                continue
            progress(i, NUM_SAMPLES_RAVDESS, status="Reading files")
            # male = 0, female = 1: odd actor numbers map to 0, even to 1.
            actor_number = int(file.split('-')[-1].split('.')[0])
            gender = (actor_number + 1) % 2
            emotion_index = e + gender*NUM_EMOTIONS
            y[counter][emotion_index] = 1
            i += 1
            counter += 1
            # Process sound.
            data, fs = librosa.load(LOCAL_PATH + file, sr=None)
            data, fs = clean_sound(data, fs)
            dataset.append(data)

        # Store (clips, fs, y) as a 3-element object array. Passing the
        # ragged tuple straight to np.save fails on current NumPy.
        cache = np.empty(3, dtype=object)
        cache[0], cache[1], cache[2] = dataset, fs, y
        np.save(LOCAL_PATH + "RAVDESSdatafsy", cache)
    else:
        # allow_pickle unpickles arbitrary objects: only load caches that
        # this notebook wrote itself.
        dataset, fs, y = np.load(LOCAL_PATH + "RAVDESSdatafsy.npy", allow_pickle=True)
        # The cached labels already cover these clips; advance the row
        # pointer so the next corpus does not overwrite them.
        counter += len(dataset)

    # Clips differ in length, so hold them in a 1-D object array.
    clip_array = np.empty(len(dataset), dtype=object)
    for position, clip in enumerate(dataset):
        clip_array[position] = clip

    # Get feature vector.
    print("Getting all features...")
    feature_vector = get_all_features(clip_array, fs)
    print("Feature Vector Shape: " + str(feature_vector.shape))

    if features.shape[0] == 0:
        features = feature_vector
    else:
        features = np.concatenate((np.array(features), np.array(feature_vector)))
    print("Features shape: " + str(features.shape))
    return features, y, counter

In [91]:
def data_extract_CREMA(features, y, counter, npload=False):
    """Add the CREMA corpus to the feature matrix and label matrix.

    The actor id is the first four characters of the file name and the
    emotion code is characters 9-11. Gender is looked up in
    VideoDemographics.csv at row `actor_id - 1001`.

    Args:
        features: Feature rows gathered so far (an empty array for the
            first corpus).
        y: One-hot label matrix. Row `counter` onwards is filled in place,
            at column emotion + gender * NUM_EMOTIONS (male = 0, female = 1).
        counter: Index of the next free row of `y`.
        npload: When True, read the cached clips/labels written by an
            earlier run instead of re-reading the audio. `y` is then
            replaced by the cached label matrix.

    Returns:
        (features with the CREMA rows appended, y, updated counter).
    """
    LOCAL_PATH = "../../CREMA/"
    WAV_PATH = LOCAL_PATH + "AudioWAV/"
    demographics = pd.read_csv(LOCAL_PATH + "VideoDemographics.csv")
    dataset = []
    fs = 0
    if not npload:
        i = 0  # for progress bar
        for file in os.listdir(WAV_PATH):
            if not file.endswith('.wav'):
                continue
            emotion = file[9:12]
            # Skip emotions that are not part of the shared label set.
            e = EMOTIONS.get(emotion, None)
            if e is None:
                continue
            progress(i, NUM_SAMPLES_CREMA, status="Reading files")
            # Get actor ID from filename.
            actor_id = int(file[0:4])
            # 0 for male, 1 for female, from the demographics table.
            gender = 0 if demographics["Sex"][actor_id - 1001] == "Male" else 1
            emotion_index = e + gender*NUM_EMOTIONS
            y[counter][emotion_index] = 1
            i += 1
            counter += 1
            # Process sound.
            data, fs = librosa.load(WAV_PATH + file, sr=None)
            data, fs = clean_sound(data, fs)
            dataset.append(data)

        # Store (clips, fs, y) as a 3-element object array. Passing the
        # ragged tuple straight to np.save fails on current NumPy.
        cache = np.empty(3, dtype=object)
        cache[0], cache[1], cache[2] = dataset, fs, y
        np.save(LOCAL_PATH + "CREMAdatafsy", cache)
    else:
        # allow_pickle unpickles arbitrary objects: only load caches that
        # this notebook wrote itself.
        dataset, fs, y = np.load(LOCAL_PATH + "CREMAdatafsy.npy", allow_pickle=True)
        # The cached labels already cover these clips; advance the row
        # pointer so the next corpus does not overwrite them.
        counter += len(dataset)

    # Clips differ in length, so hold them in a 1-D object array.
    clip_array = np.empty(len(dataset), dtype=object)
    for position, clip in enumerate(dataset):
        clip_array[position] = clip

    # Get feature vector.
    print("Getting all features...")
    feature_vector = get_all_features(clip_array, fs)
    print("Feature Vector Shape: " + str(feature_vector.shape))
    print("Features shape: " + str(len(features)))
    if features.shape[0] == 0:
        features = feature_vector
    else:
        features = np.concatenate((np.array(features), np.array(feature_vector)))
    return features, y, counter


In [92]:
def data_extract_TESS(features, y, counter, npload=False):
    """Add the TESS corpus to the feature matrix and label matrix.

    The emotion is the text between the first underscore at or after
    position 4 of the file name and the first dot. Every clip is labelled
    female (gender = 1).

    Args:
        features: Feature rows gathered so far (an empty array for the
            first corpus).
        y: One-hot label matrix. Row `counter` onwards is filled in place,
            at column emotion + gender * NUM_EMOTIONS.
        counter: Index of the next free row of `y`.
        npload: When True, read the cached clips/labels written by an
            earlier run instead of re-reading the audio. `y` is then
            replaced by the cached label matrix.

    Returns:
        (features with the TESS rows appended, y, updated counter).
    """
    LOCAL_PATH = "../../TESS/"
    fs = 0
    dataset = []
    if not npload:
        i = 0  # for progress bar
        for file in os.listdir(LOCAL_PATH):
            if not file.endswith('.wav'):
                continue
            emotion = file[file.index('_', 4) + 1 : file.index('.')]
            # Skip emotions that are not part of the shared label set.
            e = EMOTIONS.get(emotion, None)
            if e is None:
                continue
            progress(i, NUM_SAMPLES_TESS, status="Reading files")
            # TESS is all female so gender = 1.
            gender = 1
            emotion_index = e + gender*NUM_EMOTIONS
            y[counter][emotion_index] = 1
            i += 1
            counter += 1

            # Process sound.
            data, fs = librosa.load(LOCAL_PATH + file, sr=None)
            data, fs = clean_sound(data, fs)
            dataset.append(data)

        # Store (clips, fs, y) as a 3-element object array. Passing the
        # ragged tuple straight to np.save fails on current NumPy.
        cache = np.empty(3, dtype=object)
        cache[0], cache[1], cache[2] = dataset, fs, y
        np.save(LOCAL_PATH + "TESSdatafsy", cache)
    else:
        # allow_pickle unpickles arbitrary objects: only load caches that
        # this notebook wrote itself.
        dataset, fs, y = np.load(LOCAL_PATH + "TESSdatafsy.npy", allow_pickle=True)
        # The cached labels already cover these clips; advance the row
        # pointer so the next corpus does not overwrite them.
        counter += len(dataset)

    # Clips differ in length, so hold them in a 1-D object array.
    clip_array = np.empty(len(dataset), dtype=object)
    for position, clip in enumerate(dataset):
        clip_array[position] = clip

    # Get feature vector.
    print("Getting all features...")
    feature_vector = get_all_features(clip_array, fs)
    print("Feature Vector Shape: " + str(feature_vector.shape))
    print("Features shape: " + str(len(features)))
    if features.shape[0] == 0:
        features = feature_vector
    else:
        features = np.concatenate((np.array(features), np.array(feature_vector)))
    return features, y, counter

In [93]:
def data_extract_all():
    """Extract all four corpora and return standardised features and labels.

    The corpora are processed in the order LDC, CREMA, TESS, RAVDESS.
    Each feature column is then shifted by its mean and divided by its
    standard deviation.

    Side effects:
        Stores the per-feature mean and standard deviation in the
        module-level `mean_vector` and `std_vector`.

    Returns:
        (features, y): the standardised feature matrix and the one-hot
        label matrix of shape (NUM_SAMPLES, NUM_EMOTIONS * 2).
    """
    # Without this declaration the assignments below created locals and
    # the module-level vectors stayed empty.
    global mean_vector, std_vector

    y = np.zeros((NUM_SAMPLES, NUM_EMOTIONS*2))
    print(y.shape)
    features = np.array([])
    counter = 0
    print("----------------------------------------------\n")
    print("Extracting LDC....")
    features, y, counter = data_extract_LDC(features, y, counter, npload=False)
    print("Done extracting LDC")
    print("----------------------------------------------\n")
    print("Extracting CREMA....")
    features, y, counter = data_extract_CREMA(features, y, counter, npload=False)
    print("Done extracting CREMA")
    print("\n----------------------------------------------\n")
    print("Extracting TESS....")
    features, y, counter = data_extract_TESS(features, y, counter, npload=False)
    print("Done extracting TESS")
    print("\n----------------------------------------------\n")
    print("Extracting RAVDESS....")
    features, y, counter = data_extract_RAVDESS(features, y, counter, npload=False)
    print("Done extracting RAVDESS")
    print("\n----------------------------------------------")

    mean_vector = np.mean(features, axis=0)
    std_vector = np.std(features, axis=0)
    # A constant column has zero spread; dividing by 1 leaves it at 0
    # instead of producing NaN.
    std_vector = np.where(std_vector == 0, 1.0, std_vector)

    # Normalize the data.
    features = (features - mean_vector) / std_vector
    return features, y
    


# Feature Extraction

In [94]:
def MFCC_algorithm(np_data, fs):
    """Summarise the MFCCs of every clip as a fixed-length row.

    For each clip, 13 cepstral coefficients are computed per frame
    (25 ms window, 10 ms step, 26 filters, nfft=1200). Each coefficient
    track is reduced to seven statistics: mean, median, variance, minimum
    and maximum of the track, plus mean and variance of its gradient.

    Args:
        np_data: Array of clips, indexed along the first axis.
        fs: Sample rate handed to the MFCC routine.

    Returns:
        2-D array with one row per clip: the seven statistics, each over
        the 13 coefficients, flattened statistic by statistic.
    """
    total = np_data.shape[0]
    coefficient_tracks = []
    for done, clip in enumerate(np_data):
        progress(done, total, "Calculating MFCC's")
        cepstra = python_speech_features.base.mfcc(
            np.asarray(clip), samplerate=fs,
            winlen=0.025, winstep=0.01, numcep=13,
            nfilt=26, nfft=1200)
        # Transpose so each row is one coefficient over time.
        coefficient_tracks.append(cepstra.T)

    summaries = []
    for track in coefficient_tracks:
        slope = np.gradient(track, axis = 1)
        statistics = [
            np.mean(track, axis = 1),
            np.median(track, axis = 1),
            np.var(track, axis = 1),
            np.min(track, axis = 1),
            np.max(track, axis = 1),
            np.mean(slope, axis = 1),
            np.var(slope, axis = 1),
        ]
        summaries.append(np.asarray(statistics).flatten())
    return np.array(summaries)
    

In [95]:
def get_pitch_vector(data, fs):
    """Return the pitch values of a clip that are at or above their mean.

    The contour comes from pysptk's RAPT with a hop size of 50, run on
    the clip cast to float32.
    """
    contour = pysptk.sptk.rapt(np.float32(data), fs, hopsize = 50)
    return remove_silence_from(contour, np.mean(contour))

In [96]:
def get_spectral_vector(data, fs):
    """Return librosa's spectral centroid of the clip, cast to float32."""
    samples = np.float32(data)
    return librosa.feature.spectral_centroid(y=samples, sr=fs)

In [97]:
def get_lpc_vector(data):
    """Return the result of conch's `lpc_ref` on `data` with order 12."""
    return lpc.lpc_ref(data, 12)

In [98]:
def get_rms_vector(data):
    """Return librosa's RMS feature of the clip, cast to float32."""
    samples = np.float32(data)
    return librosa.feature.rms(y=samples)

In [99]:
def get_zero_vector(data):
    """Return librosa's zero-crossing rate of the clip, cast to float32."""
    samples = np.float32(data)
    return librosa.feature.zero_crossing_rate(y=samples)

In [100]:
def get_sr_vector(data, fs=22050):
    """Return librosa's spectral roll-off of the clip, cast to float32.

    Args:
        data: Audio samples of one clip.
        fs: Sample rate of `data`. The default of 22050 reproduces the
            previous behaviour, where no rate was passed and librosa used
            its own default. Pass the clip's real rate to get roll-off
            frequencies on the same scale as `get_spectral_vector`.
    """
    samples = np.float32(data)
    return librosa.feature.spectral_rolloff(y=samples, sr=fs)

In [101]:
def get_stats(pitch_vector):
    """Reduce a feature track to eight summary statistics.

    Args:
        pitch_vector: Sequence or array of values. Despite the name it is
            also used for the spectral, RMS, roll-off and zero-crossing
            tracks.

    Returns:
        [mean, median, min, max, variance, mean of the first difference,
        min of the first difference, max of the first difference].
        The difference is taken along the last axis. A track with fewer
        than two samples has no difference, so those three entries are
        0.0. An empty track yields eight zeros instead of raising.
    """
    values = np.asarray(pitch_vector)
    if values.size == 0:
        return [0.0] * 8

    mean = np.mean(values)
    median = np.median(values)
    low = np.min(values)
    high = np.max(values)
    variance = np.var(values)

    # Derivative (first difference).
    derivative = np.diff(values)
    if derivative.size == 0:
        # np.min / np.max raise on an empty array.
        d_mean = d_min = d_max = 0.0
    else:
        d_mean = np.mean(derivative)
        d_min = np.min(derivative)
        d_max = np.max(derivative)
    return [mean, median, low, high, variance, d_mean, d_min, d_max]

In [102]:
def get_all_features(np_array, fs):
    """Build the full feature matrix for a set of clips.

    Each row is the MFCC summary from `MFCC_algorithm` followed by the
    eight `get_stats` values of five tracks, in this order: pitch,
    spectral centroid, RMS, spectral roll-off, zero-crossing rate.

    Args:
        np_array: Array of clips, indexed along the first axis.
        fs: Sample rate used for the MFCC, pitch and spectral-centroid
            features.

    Returns:
        2-D array with one row per clip.
    """
    mfcc_rows = MFCC_algorithm(np_array, fs)
    print("MFCC DONE", end=" ")
    print("dimensions are "+str([len(mfcc_rows), len(mfcc_rows[0])]))

    pitch_rows, spectral_rows, rms_rows, rolloff_rows, zero_rows = [], [], [], [], []

    total = np_array.shape[0]  # for progress bar
    for done, clip in enumerate(np_array):
        progress(done, total, status="Calculating stats")
        pitch_rows.append(get_stats(get_pitch_vector(clip, fs)))
        spectral_rows.append(get_stats(get_spectral_vector(clip, fs)))
        rms_rows.append(get_stats(get_rms_vector(clip)))
        rolloff_rows.append(get_stats(get_sr_vector(clip)))
        zero_rows.append(get_stats(get_zero_vector(clip)))

    report = (
        ("MFCC dimensions:", mfcc_rows),
        ("Pitch dimensions:", pitch_rows),
        ("Spectral dimensions:", spectral_rows),
        ("RMS dimensions:", rms_rows),
        ("SR dimensions:", rolloff_rows),
        ("Zero dimensions:", zero_rows),
    )
    for heading, rows in report:
        print(heading + str([len(rows), len(rows[0])]))

    return np.concatenate(
        (mfcc_rows, pitch_rows, spectral_rows, rms_rows, rolloff_rows, zero_rows),
        axis=1,
    )

# Ready Dataset and output
Put all of the extracted features into `X` and the classifications into `y`, then split them into training, validation, and test sets.

In [103]:
def x_y_split(x, y, random_state=None):
    """Split features and labels into training, validation and test sets.

    20% of the samples are held out, and that hold-out is split in half
    into validation and test sets. The shapes and the per-class counts of
    the training and test labels are printed.

    Args:
        x: Feature matrix, one row per sample.
        y: One-hot label matrix with the same number of rows as `x`.
        random_state: Seed forwarded to both `train_test_split` calls.
            The default None keeps the previous unseeded behaviour; pass
            an int for a reproducible split.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=0.2, random_state=random_state)
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state)

    num_labels = y_train.shape[1]
    print("x train shape: " +str(X_train.shape))
    print("y train shape: " +str(y_train.shape))
    print("x test shape: " +str(X_test.shape))
    print("y test shape: " +str(y_test.shape))
    print("x validation shape: " +str(X_val.shape))
    print("y validation shape: " +str(y_val.shape))
    for i in range(num_labels):
        print("y_train for emotion "+str(i)+": "+ str(np.sum(y_train[:,i])))
    for i in range(num_labels):
        print("y_test for emotion "+str(i)+": "+ str(np.sum(y_test[:,i])))
    return X_train, X_val, X_test, y_train, y_val, y_test

In [104]:
#features, y = data_extract_all()

(12416, 12)
----------------------------------------------

Extracting LDC....
Pitch dimensions:[1379, 8]
Spectral dimensions:[1379, 8]
RMS dimensions:[1379, 8]
SR dimensions:[1379, 8]
Zero dimensions:[1379, 8]
Feature Vector Shape: (1379, 131)
Features shape: (1379, 131)
Done extracting LDC
----------------------------------------------

Extracting CREMA....
Pitch dimensions:[7441, 8]
Spectral dimensions:[7441, 8]
RMS dimensions:[7441, 8]
SR dimensions:[7441, 8]
Zero dimensions:[7441, 8]
Feature Vector Shape: (7441, 131)
Features shape: 1379
Done extracting CREMA

----------------------------------------------

Extracting TESS....
Pitch dimensions:[2400, 8]
Spectral dimensions:[2400, 8]
RMS dimensions:[2400, 8]
SR dimensions:[2400, 8]
Zero dimensions:[2400, 8]
Feature Vector Shape: (2400, 131)
Features shape: 8820
Done extracting TESS

----------------------------------------------

Extracting RAVDESS....
Pitch dimensions:[1196, 8]
Spectral dimensions:[1196, 8]
RMS dimensions:[1196, 8

In [105]:
#features.shape, y.shape

((12416, 131), (12416, 12))

In [107]:
#np.save("../../most_of_the_data", (y,features, 0))

#X_train, X_val, X_test, y_train, y_val, y_test = x_y_split(features, y)

x train shape: (9932, 131)
y train shape: (9932, 12)
x test shape: (1242, 131)
y test shape: (1242, 12)
x validation shape: (1242, 131)
y validation shape: (1242, 12)
y_train for emotion 0: 711.0
y_train for emotion 1: 667.0
y_train for emotion 2: 665.0
y_train for emotion 3: 718.0
y_train for emotion 4: 615.0
y_train for emotion 5: 723.0
y_train for emotion 6: 1005.0
y_train for emotion 7: 943.0
y_train for emotion 8: 939.0
y_train for emotion 9: 1023.0
y_train for emotion 10: 863.0
y_train for emotion 11: 1060.0
y_test for emotion 0: 94.0
y_test for emotion 1: 77.0
y_test for emotion 2: 81.0
y_test for emotion 3: 97.0
y_test for emotion 4: 69.0
y_test for emotion 5: 89.0
y_test for emotion 6: 124.0
y_test for emotion 7: 136.0
y_test for emotion 8: 118.0
y_test for emotion 9: 121.0
y_test for emotion 10: 104.0
y_test for emotion 11: 132.0


In [108]:
#np.save("../../splitdata", (X_train, X_val, X_test, y_train, y_val, y_test))

In [109]:
#X_train, X_val, X_test, y_train, y_val, y_test = np.load("../../splitdata.npy", allow_pickle=True)
