In [4]:
import os
import numpy as np

In [5]:
import pathlib
allPaths = []
dataPath = pathlib.Path('Data')
for i in dataPath.rglob('*.wav'):
    allPaths.append(i)

In [6]:
def pathToLabel(path):
    labels = ['angry','disgust','fear','happy','neutral','surprise','sad']
    for label in labels:
        if (label in str(path).lower()): return label
    return None

In [7]:
labels = {
    'angry':0,
    'disgust':1,
    'fear':2,
    'happy':3,
    'neutral':4,
    'surprise':5,
    'sad':6
}
def labelToInt(label):
    return labels[label]

In [8]:
labeledPaths = np.array(list(map(lambda x: pathToLabel(x),allPaths)))

In [9]:
y = np.array(list(map(lambda x: labelToInt(x),labeledPaths)))

In [10]:
import librosa
data, sampling_rate = librosa.load(allPaths[0])
data2, sr2 = librosa.load(allPaths[1])

In [11]:
x = []
for i in range(len(allPaths)):
    try:
        a, sr = librosa.load(allPaths[i])
        feat = librosa.feature.melspectrogram(y=a,sr=sr)
        x.append(feat)
    except:
        y = np.delete(y,i)
        print('Path '+str(allPaths[i])+' not found.')
x

[array([[4.20931960e-03, 1.87859889e-02, 4.52385284e-02, ...,
         4.12477255e-02, 1.48166921e-02, 5.78817911e-03],
        [1.48720713e-03, 2.85470183e-03, 2.65270728e-03, ...,
         2.02998752e-03, 1.49130251e-03, 6.23799220e-04],
        [7.70022307e-05, 1.35702925e-04, 3.96254400e-05, ...,
         3.86430656e-05, 2.98951672e-05, 5.02339171e-05],
        ...,
        [6.46549907e-08, 3.66798145e-06, 5.41412483e-05, ...,
         3.60555669e-06, 8.80300661e-07, 1.01577996e-07],
        [3.30214682e-08, 1.78459618e-06, 4.07043990e-05, ...,
         1.30374065e-06, 2.32321611e-07, 5.94244902e-08],
        [2.53028043e-09, 9.24478130e-08, 2.95018685e-06, ...,
         1.94059524e-07, 2.73151812e-08, 7.75217668e-09]], dtype=float32),
 array([[1.2747131e-02, 1.9700620e-02, 4.0703744e-02, ..., 5.8466140e-03,
         9.5014777e-03, 1.2387074e-02],
        [1.3828035e-03, 1.7736690e-03, 4.3622463e-04, ..., 4.5823230e-04,
         1.2417263e-03, 2.4469320e-03],
        [8.0050624e-05

In [12]:
cnt = 0
for i in range(len(x)):
    cnt+=len(x[i][0])
print(cnt/len(x))

89.01821428571428


In [13]:
mx = 0
for i in range(len(x)):
    if (len(x[i][0])>mx): mx = len(x[i][0])
print(mx)

129


In [14]:
length = 100
def pad(x):
    newX = []
    for row in x:
        if (len(row)>=length): newX.append(row[0:length])
        else: newX.append(np.concatenate((row,[0]*(length-len(row)))))
    return np.array(newX)

In [15]:
padX = np.array(list(map(lambda a: pad(a),x)))

In [16]:
padX[0].shape

(128, 100)

In [17]:
padX.shape

(2800, 128, 100)

In [18]:
from sklearn.model_selection import train_test_split
# Using the first and last as an example
xTrain, xTest, yTrain, yTest = train_test_split(padX[1:len(padX)-1],y[1:len(y)-1],test_size=.2,random_state=50)
xTest, xVal, yTest, yVal = train_test_split(xTest,yTest,test_size=.5)

In [30]:
import tensorflow as tf
from tensorflow.keras import layers
model = tf.keras.Sequential()
model.add(layers.Input((128,100)))

model.add(layers.Conv1D(128,3))
model.add(layers.Conv1D(128,3))
model.add(layers.MaxPooling1D())

model.add(layers.Conv1D(64,3))
model.add(layers.Conv1D(64,3))
model.add(layers.MaxPooling1D())

model.add(layers.Bidirectional(layers.LSTM(64)))
model.add(layers.Dropout(.2))

model.add(layers.Dense(32))
model.add(layers.Dense(32))
model.add(layers.Dense(7,activation="softmax"))
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_20 (Conv1D)          (None, 126, 128)          38528     
                                                                 
 conv1d_21 (Conv1D)          (None, 124, 128)          49280     
                                                                 
 max_pooling1d_9 (MaxPoolin  (None, 62, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_22 (Conv1D)          (None, 60, 64)            24640     
                                                                 
 conv1d_23 (Conv1D)          (None, 58, 64)            12352     
                                                                 
 max_pooling1d_10 (MaxPooli  (None, 29, 64)            0         
 ng1D)                                                

In [31]:
model.fit(xTrain,yTrain,epochs=5,validation_data=(xVal,yVal))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x246c52d5300>

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
testPreds = model.predict(xTest)
maxedTestPreds = np.array(list(map(lambda x: np.argmax(x),testPreds)))
print("Accuracy "+str(accuracy_score(maxedTestPreds,yTest)))
print("Disgust and suprise get mixed up; happy and suprise get mixed up; which is expected")
confusion_matrix(yTest,maxedTestPreds)

Accuracy 0.9857142857142858
Disgust and suprise get mixed up; happy and suprise get mixed up; which is expected


array([[35,  0,  0,  1,  0,  0,  0],
       [ 0, 37,  0,  0,  0,  1,  0],
       [ 0,  0, 39,  0,  0,  0,  0],
       [ 0,  0,  0, 44,  0,  0,  0],
       [ 0,  0,  0,  0, 36,  0,  0],
       [ 0,  1,  0,  0,  0, 38,  0],
       [ 0,  0,  0,  0,  0,  1, 47]], dtype=int64)

In [33]:
labels

{'angry': 0,
 'disgust': 1,
 'fear': 2,
 'happy': 3,
 'neutral': 4,
 'surprise': 5,
 'sad': 6}

In [34]:
preds = model.predict(np.array([padX[0],padX[-1]]))
argMaxedPreds = [np.argmax(preds[0]),np.argmax(preds[1])]
print("Predicted Angry and Sad from data outside the train and val sets")
argMaxedPreds

Predicted Angry and Sad from data outside the train and val sets


[0, 6]

In [35]:
from IPython.display import Audio
print("Predicted Angry")
Audio(allPaths[0])

Predicted Angry


In [36]:
print("Predicted Sad")
Audio(allPaths[-1])

Predicted Sad
