In [3]:
Crema = 'CREMA-D/AudioWAV/'

In [4]:
import os
import sklearn
import pandas as pd
crema_directory_list = os.listdir(Crema)

file_emotion = []
file_path = []

for file in crema_directory_list:
    # storing file paths
    file_path.append(Crema + file)
    # storing file emotions
    part=file.split('_')
    if part[2] == 'SAD':    
        file_emotion.append('sad')
    elif part[2] == 'ANG':
        file_emotion.append('angry')
    elif part[2] == 'DIS':
        file_emotion.append('disgust')
    elif part[2] == 'FEA':
        file_emotion.append('fear')
    elif part[2] == 'HAP':
        file_emotion.append('happy')
    elif part[2] == 'NEU':
        file_emotion.append('neutral')
    else:
        file_emotion.append('Unknown')
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Crema_df = pd.concat([emotion_df, path_df], axis=1)
Crema_df.head()

Unnamed: 0,Emotions,Path
0,fear,CREMA-D/AudioWAV/1025_TIE_FEA_XX.wav
1,disgust,CREMA-D/AudioWAV/1054_MTI_DIS_XX.wav
2,disgust,CREMA-D/AudioWAV/1052_IOM_DIS_XX.wav
3,sad,CREMA-D/AudioWAV/1067_ITH_SAD_XX.wav
4,neutral,CREMA-D/AudioWAV/1091_ITH_NEU_XX.wav


In [5]:
data_path = Crema_df
data_path.to_csv("data_path.csv",index=False)
data_path.head()


Unnamed: 0,Emotions,Path
0,fear,CREMA-D/AudioWAV/1025_TIE_FEA_XX.wav
1,disgust,CREMA-D/AudioWAV/1054_MTI_DIS_XX.wav
2,disgust,CREMA-D/AudioWAV/1052_IOM_DIS_XX.wav
3,sad,CREMA-D/AudioWAV/1067_ITH_SAD_XX.wav
4,neutral,CREMA-D/AudioWAV/1091_ITH_NEU_XX.wav


In [6]:
import numpy as np
import librosa
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
path = np.array(data_path.Path)[1]
data, sample_rate = librosa.load(path)

In [7]:
def extract_features(data):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) # stacking vertically
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch)
    result = np.vstack((result, res3)) # stacking vertically
    
    return result

In [8]:
data_path

Unnamed: 0,Emotions,Path
0,fear,CREMA-D/AudioWAV/1025_TIE_FEA_XX.wav
1,disgust,CREMA-D/AudioWAV/1054_MTI_DIS_XX.wav
2,disgust,CREMA-D/AudioWAV/1052_IOM_DIS_XX.wav
3,sad,CREMA-D/AudioWAV/1067_ITH_SAD_XX.wav
4,neutral,CREMA-D/AudioWAV/1091_ITH_NEU_XX.wav
...,...,...
7437,happy,CREMA-D/AudioWAV/1090_TAI_HAP_XX.wav
7438,angry,CREMA-D/AudioWAV/1061_IOM_ANG_XX.wav
7439,disgust,CREMA-D/AudioWAV/1067_ITS_DIS_XX.wav
7440,fear,CREMA-D/AudioWAV/1055_MTI_FEA_XX.wav


In [9]:
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        Y.append(emotion)

  return librosa.effects.time_stretch(data, rate)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)
  return f(*args, **kwargs)


In [10]:
len(X), len(Y), data_path.Path.shape

(22326, 22326, (7442,))

In [11]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.05358,0.637844,0.606111,0.571997,0.630106,0.71443,0.675026,0.665222,0.719204,0.826163,...,3.242615e-10,2.971604e-10,2.766176e-10,2.603836e-10,2.475797e-10,2.377858e-10,2.303247e-10,2.252069e-10,2.220143e-10,fear
1,0.054719,0.648945,0.618575,0.585555,0.643181,0.723304,0.677569,0.665227,0.717826,0.825301,...,2.597903e-06,2.547018e-06,2.722617e-06,2.717672e-06,2.615127e-06,2.598697e-06,2.703754e-06,2.725612e-06,2.528838e-06,fear
2,0.059451,0.583235,0.640867,0.584112,0.56725,0.634756,0.718933,0.701798,0.665793,0.731998,...,1.885746e-10,1.724273e-10,1.642871e-10,1.507513e-10,1.506967e-10,1.413461e-10,1.223686e-10,1.028102e-10,9.267615e-11,fear
3,0.059764,0.452337,0.392475,0.459169,0.471746,0.485209,0.50753,0.684371,0.797795,0.724068,...,3.943509e-08,3.484132e-08,3.144174e-08,2.88397e-08,2.684491e-08,2.534668e-08,2.423224e-08,2.347231e-08,2.300906e-08,disgust
4,0.141958,0.533265,0.539109,0.597065,0.602417,0.61296,0.629087,0.760494,0.794806,0.695523,...,0.0008666924,0.0008913741,0.0008474859,0.0008858735,0.0008761011,0.0008360007,0.0009038528,0.0008289524,0.0007953483,disgust


In [12]:
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

In [13]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()


In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((16744, 162), (16744, 6), (5582, 162), (5582, 6))

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((16744, 162), (16744, 6), (5582, 162), (5582, 6))

In [16]:
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((16744, 162, 1), (16744, 6), (5582, 162, 1), (5582, 6))

In [17]:
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.models import Sequential
model=Sequential()

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=6, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()
"""import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
model = tf.keras.Sequential()
model.add(LSTM(256, input_shape=(x_train.shape[1], 1), return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(64))
model.add(Dropout(0.2))

model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=6, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()"""

2023-01-06 23:07:52.282121: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-06 23:07:52.404038: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-06 23:07:52.406236: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-06 23:07:52.406246: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 162, 256)          1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 81, 256)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 81, 256)           327936    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 41, 256)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 41, 128)           163968    
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 21, 128)          0

2023-01-06 23:07:54.808715: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-06 23:07:54.808877: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-06 23:07:54.808928: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-01-06 23:07:54.808959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-01-06 23:07:54.823785: W tensorflow/c

"import tensorflow as tf\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\nfrom tensorflow.keras.layers import LSTM\nfrom tensorflow.keras.layers import Dense\nfrom tensorflow.keras.layers import Dropout\nmodel = tf.keras.Sequential()\nmodel.add(LSTM(256, input_shape=(x_train.shape[1], 1), return_sequences=True))\nmodel.add(Dropout(0.2))\n\nmodel.add(LSTM(128, return_sequences=True))\nmodel.add(Dropout(0.2))\n\nmodel.add(LSTM(64))\nmodel.add(Dropout(0.2))\n\nmodel.add(Dense(units=32, activation='relu'))\nmodel.add(Dropout(0.3))\n\nmodel.add(Dense(units=6, activation='softmax'))\nmodel.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n\nmodel.summary()"

In [18]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
history=model.fit(x_train, y_train, batch_size=64, epochs=150, validation_data=(x_test, y_test), callbacks=[rlrp])

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [24]:
import pyaudio
import wave
while True:
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 2
    WAVE_OUTPUT_FILENAME = "output.wav"

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    print("* recording")

    frames = []

    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("* done recording")
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    audioFile = get_features(WAVE_OUTPUT_FILENAME)
    print(model.predict(audioFile))

* recording
* done recording


  return librosa.effects.time_stretch(data, rate)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)


[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
* recording
* done recording
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
* recording
* done recording
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
* recording
* done recording
[[7.1105357e-09 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [1.4282307e-25 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]]
* recording
* done recording
[[9.9619460e-01 0.0000000e+00 3.8053473e-03 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [5.0556429e-25 0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00]
 [2.4077317e-05 0.0000000e+00 9.9997592e-01 3.0770352e-27 0.0000000e+00
  0.0000000e+00]]
* recording
* done recording
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
* recording
* done recording


KeyboardInterrupt: 

In [27]:
thing = get_features('CREMA-D/AudioWAV/1001_IEO_HAP_HI.wav')
model.predict(thing)




  return librosa.effects.time_stretch(data, rate)
  return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)


array([[0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0000000e+00, 6.7680508e-36, 0.0000000e+00, 2.0294972e-12,
        0.0000000e+00, 0.0000000e+00],
       [8.0735727e-38, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [32]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
pred_test = model.predict(x_test)
y_pred = encoder.inverse_transform(pred_test)

y_test = encoder.inverse_transform(y_test)
df = pd.DataFrame(columns=['Predicted Labels', 'Actual Labels'])
df['Predicted Labels'] = y_pred.flatten()
df['Actual Labels'] = y_test.flatten()



ValueError: could not convert string to float: 'angry'

In [33]:
X = Features.iloc[: ,:-1].values
"""
3=angry

"""

In [37]:
Crema_df['Emotions'].head()

0       fear
1    disgust
2    disgust
3        sad
4    neutral
Name: Emotions, dtype: object

In [38]:
Y[0:5]

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])