# Speech Emotion Recognition 

emotions: [happy, sad, neutral, fear, angry, disgust]

Datasets:
* Crowd-sourced Emotional Multimodal Actors Dataset (Crema-D)
* Ryerson Audio-Visual Database of Emotional Speech and Song (Ravdess)
* Surrey Audio-Visual Expressed Emotion (Savee)
* Toronto Emotional Speech Set (Tess)

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,random_split
from torchvision.models import googlenet, resnet18, resnet50
import torchvision.transforms as transforms
import torch.nn.functional as F
from torchvision.transforms import functional as TF
import sklearn
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

In [None]:
import opendatasets as od
od.download(
    "https://www.kaggle.com/datasets/ejlok1/cremad")
od.download(
    "https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio")
od.download(
    "https://www.kaggle.com/datasets/ejlok1/surrey-audiovisual-expressed-emotion-savee")
od.download(
    "https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess")

## fix random seed

In [2]:
import random

seed = 20236
deterministic = True

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
if deterministic:
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## read data

In [3]:
RAVDESS = "test/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
CREMA = "test/cremad/AudioWAV/"
TESS = "test/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
SAVEE = "test/surrey-audiovisual-expressed-emotion-savee/ALL/"

In [4]:
ravdess_dir_lis = os.listdir(RAVDESS)
path_list = []
gender_list = []
emotion_list = []

emotion_dic = {
    '03' : 'happy',
    '01' : 'neutral',
    '04' : 'sad',
    '05' : 'angry',
    '06' : 'fear',
    '07' : 'disgust',
}

for directory in ravdess_dir_lis:
    actor_files = os.listdir(os.path.join(RAVDESS, directory))
    for audio_file in actor_files: 
        part = audio_file.split('.')[0]
        key = part.split('-')[2]
        if key in emotion_dic:
            gender_code = int(part.split('-')[6])
            path_list.append(f"{RAVDESS}{directory}/{audio_file}")
            gender_list.append('female' if gender_code & 1 == 0 else 'male')
            emotion_list.append(emotion_dic[key])
            
ravdess_df = pd.concat([
    pd.DataFrame(path_list, columns=['path']),
    pd.DataFrame(emotion_list, columns=['emotion'])
], axis=1)

ravdess_df.head()

Unnamed: 0,path,emotion
0,test/ravdess-emotional-speech-audio/audio_spee...,neutral
1,test/ravdess-emotional-speech-audio/audio_spee...,neutral
2,test/ravdess-emotional-speech-audio/audio_spee...,neutral
3,test/ravdess-emotional-speech-audio/audio_spee...,neutral
4,test/ravdess-emotional-speech-audio/audio_spee...,happy


In [5]:
crema_dir_list = os.listdir(CREMA)
path_list = []
gender_list = []
emotion_list = []

emotion_dic = {
    'HAP' : 'happy',
    'NEU' : 'neutral',
    'SAD' : 'sad',
    'ANG' : 'angry',
    'FEA' : 'fear',
    'DIS' : 'disgust',
}

female_id_list = [
    '1002', '1003', '1004', '1006', '1007', '1008', '1009', '1010', '1012', '1013', '1018', 
    '1020', '1021', '1024', '1025', '1028', '1029', '1030', '1037', '1043', '1046', '1047', 
    '1049', '1052', '1053', '1054', '1055', '1056', '1058', '1060', '1061', '1063', '1072', 
    '1073', '1074', '1075', '1076', '1078', '1079', '1082', '1084', '1089', '1091',
]

for audio_file in crema_dir_list:
    part = audio_file.split('_')
    key = part[2]
    if key in emotion_dic and part[3] == 'HI.wav':
        path_list.append(f"{CREMA}{audio_file}")
        gender_list.append('female' if part[0] in female_id_list else 'male')
        emotion_list.append(emotion_dic[key])

crema_df = pd.concat([
    pd.DataFrame(path_list, columns=['path']),
    pd.DataFrame(emotion_list, columns=['emotion'])
], axis=1)

crema_df.head()

Unnamed: 0,path,emotion
0,test/cremad/AudioWAV/1001_IEO_ANG_HI.wav,angry
1,test/cremad/AudioWAV/1001_IEO_DIS_HI.wav,disgust
2,test/cremad/AudioWAV/1001_IEO_FEA_HI.wav,fear
3,test/cremad/AudioWAV/1001_IEO_HAP_HI.wav,happy
4,test/cremad/AudioWAV/1001_IEO_SAD_HI.wav,sad


In [6]:
tess_dir_list = os.listdir(TESS)
path_list = []
gender_list = []
emotion_list = [] 

emotion_dic = {
    'happy'   : 'happy',
    'neutral' : 'neutral',
    'sad'     : 'sad',
    'Sad'     : 'sad',
    'angry'   : 'angry',
    'fear'    : 'fear',
    'disgust'  : 'disgust',
}

for directory in tess_dir_list:
    audio_files = os.listdir(os.path.join(TESS, directory))
    for audio_file in audio_files:
        part = audio_file.split('.')[0]
        key = part.split('_')[2]
        if key in emotion_dic:
            path_list.append(f"{TESS}{directory}/{audio_file}") 
            gender_list.append('female') # female only dataset
            emotion_list.append(emotion_dic[key])
            
tess_df = pd.concat([
    pd.DataFrame(path_list, columns=['path']),
    pd.DataFrame(emotion_list, columns=['emotion'])
], axis=1)

tess_df.head()

Unnamed: 0,path,emotion
0,test/toronto-emotional-speech-set-tess/tess to...,angry
1,test/toronto-emotional-speech-set-tess/tess to...,angry
2,test/toronto-emotional-speech-set-tess/tess to...,angry
3,test/toronto-emotional-speech-set-tess/tess to...,angry
4,test/toronto-emotional-speech-set-tess/tess to...,angry


In [7]:
savee_dir_list = os.listdir(SAVEE)
path_list = []
gender_list = []
emotion_list = []

emotion_dic = {
    'h'  : 'happy',
    'n'  : 'neutral',
    'sa' : 'sad',
    'a'  : 'angry',
    'f'  : 'fear',
    'd'  : 'disgust'
}

for audio_file in savee_dir_list:
    part = audio_file.split('_')[1]
    key = part[:-6]
    if key in emotion_dic:
        path_list.append(f"{SAVEE}{audio_file}")
        gender_list.append('male') # male only dataset
        emotion_list.append(emotion_dic[key])
        
savee_df = pd.concat([
    pd.DataFrame(path_list, columns=['path']),
    pd.DataFrame(emotion_list, columns=['emotion'])
], axis=1)

savee_df.head()

Unnamed: 0,path,emotion
0,test/surrey-audiovisual-expressed-emotion-save...,angry
1,test/surrey-audiovisual-expressed-emotion-save...,angry
2,test/surrey-audiovisual-expressed-emotion-save...,angry
3,test/surrey-audiovisual-expressed-emotion-save...,angry
4,test/surrey-audiovisual-expressed-emotion-save...,angry


In [8]:
df = pd.concat([
    ravdess_df, 
    crema_df, 
    tess_df, 
    savee_df
], axis=0)
df.head()

Unnamed: 0,path,emotion
0,test/ravdess-emotional-speech-audio/audio_spee...,neutral
1,test/ravdess-emotional-speech-audio/audio_spee...,neutral
2,test/ravdess-emotional-speech-audio/audio_spee...,neutral
3,test/ravdess-emotional-speech-audio/audio_spee...,neutral
4,test/ravdess-emotional-speech-audio/audio_spee...,happy


In [9]:
import librosa
from pydub import AudioSegment, effects

In [10]:
def preprocess_audio(path):
    _, sr = librosa.load(path)
    raw_audio = AudioSegment.from_file(path)
    
    samples = np.array(raw_audio.get_array_of_samples(), dtype='float32')
    trimmed, _ = librosa.effects.trim(samples, top_db=25)
    if (len(trimmed)>180000):
        padded = trimmed[:180000]
    else:
        padded = np.pad(trimmed, (0, 180000-len(trimmed)), 'constant')
    return padded, sr

In [11]:
emotion_dic = {
    'neutral' : 0,
    'happy'   : 1,
    'sad'     : 2, 
    'angry'   : 3, 
    'fear'    : 4, 
    'disgust' : 5
}

def encode(label):
    return emotion_dic.get(label)

In [12]:
zcr_list = []
rms_list = []
mfccs_list = []
emotion_list = []

FRAME_LENGTH = 2048
HOP_LENGTH = 512

for row in df.itertuples(index=False):
    try: 
        y, sr = preprocess_audio(row.path)
        zcr = librosa.feature.zero_crossing_rate(y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
        rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=HOP_LENGTH)
        zcr_list.append(zcr)
        rms_list.append(rms)
        mfccs_list.append(mfccs)

        emotion_list.append(encode(row.emotion))
    except:
        print(f"Failed for path: {row.path}")

In [13]:
X = np.concatenate((
    np.swapaxes(zcr_list, 1, 2), 
    np.swapaxes(rms_list, 1, 2), 
    np.swapaxes(mfccs_list, 1, 2)), 
    axis=2
)
X = X.astype('float32')

y = np.asarray(emotion_list)

## LSTM train code

In [14]:
class EmotionDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.emotions = torch.Tensor(labels).to(torch.int64)

    def __len__(self):
        return len(self.emotions)

    def __getitem__(self, idx):
        inputs = self.inputs[idx]
        labels = self.emotions[idx]
        return inputs, labels

In [15]:
class EmotionLSTM(nn.Module):
    def __init__(self, input_shape, hidden_size, n_class):
        super(EmotionLSTM, self).__init__()
        self.lstm1 = nn.LSTM(input_size=input_shape[1], hidden_size=hidden_size, batch_first=True, dropout = 0.3)
        self.lstm2 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True, dropout=0.3)
        self.lstm3 = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size//2, batch_first=True, dropout=0.3)
        self.fc = nn.Linear(hidden_size//2, n_class)

    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        x = x[:, -1, :] 
        x = self.fc(x)
        return x

In [18]:
model = EmotionLSTM(X.shape[1:3],512,6)

## resnet train code

In [None]:
class EmotionDataset(Dataset):
    def __init__(self, images, labels):
        self.images = []
        self.emotions = torch.Tensor(labels).to(torch.int64)
        transform = transforms.Compose([
            transforms.Resize((224,224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
        ])

        for i in tqdm(range(len(images))):
            
            # MFCC를 이미지로 변환하여 크기를 조정합니다.
            image = Image.fromarray(images[i],"RGB")
            image = transform(image)
            self.images.append(image)
            
        self.len = len(images)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        return image, label

In [None]:
model = resnet18(pretrained=True)
num_features = model.fc.in_features
num_classes = 6
model.fc = nn.Linear(num_features, num_classes) 

## Train

In [16]:
dataset = EmotionDataset(X,y)
dataset_size = len(dataset)
train_size = int(dataset_size * 0.8)
validation_size = int(dataset_size * 0.1)
test_size = dataset_size - train_size - validation_size

train_dataset, validation_dataset, test_dataset = random_split(dataset, [train_size, validation_size, test_size])

In [17]:
dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
validation_loader = DataLoader(validation_dataset, batch_size=4, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True, drop_last=True)

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy}%')
    return accuracy

In [22]:
num_epochs = 200

In [23]:
accuracy_list = []
for epoch in range(num_epochs):
    running_loss = 0.0
    loop = tqdm(dataloader, total=len(dataloader), leave=True)
    model.train()
    
    for images, labels in loop:
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
        loop.set_postfix(loss=running_loss / (len(dataloader)))
    accuracy_list.append(test(model,test_loader))
    torch.save(model.state_dict(), "result/model_LSTM_512_{}.pth".format(epoch))
    
print('Training finished!')

Epoch [1/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.21it/s, loss=1.57]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 33.09it/s]


Test Accuracy: 32.407407407407405%


Epoch [2/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:17<00:00, 12.63it/s, loss=1.45]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.77it/s]


Test Accuracy: 33.7962962962963%


Epoch [3/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:15<00:00, 13.64it/s, loss=1.49]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.91it/s]


Test Accuracy: 40.0462962962963%


Epoch [4/200]: 100%|███████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.49it/s, loss=1.4]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.86it/s]


Test Accuracy: 37.26851851851852%


Epoch [5/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.18it/s, loss=1.39]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.37it/s]


Test Accuracy: 41.43518518518518%


Epoch [6/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:15<00:00, 13.78it/s, loss=1.35]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.94it/s]


Test Accuracy: 39.583333333333336%


Epoch [7/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:16<00:00, 12.77it/s, loss=1.32]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.06it/s]


Test Accuracy: 45.833333333333336%


Epoch [8/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.12it/s, loss=1.23]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 34.43it/s]


Test Accuracy: 41.43518518518518%


Epoch [9/200]: 100%|███████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.13it/s, loss=1.2]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.78it/s]


Test Accuracy: 43.98148148148148%


Epoch [10/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:15<00:00, 14.01it/s, loss=1.21]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.03it/s]


Test Accuracy: 45.370370370370374%


Epoch [11/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:15<00:00, 14.16it/s, loss=1.16]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.47it/s]


Test Accuracy: 48.611111111111114%


Epoch [12/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.28it/s, loss=1.16]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.47it/s]


Test Accuracy: 45.601851851851855%


Epoch [13/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:15<00:00, 14.34it/s, loss=1.15]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.39it/s]


Test Accuracy: 47.4537037037037%


Epoch [14/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:15<00:00, 13.91it/s, loss=1.2]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 37.21it/s]


Test Accuracy: 44.675925925925924%


Epoch [15/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:14<00:00, 14.58it/s, loss=1.19]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:03<00:00, 35.19it/s]


Test Accuracy: 50.925925925925924%


Epoch [16/200]: 100%|██████████████████████████████████████████████████████| 216/216 [00:15<00:00, 13.71it/s, loss=1.3]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 37.07it/s]


Test Accuracy: 47.916666666666664%


Epoch [17/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:14<00:00, 14.67it/s, loss=1.16]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 37.17it/s]


Test Accuracy: 53.47222222222222%


Epoch [18/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:15<00:00, 14.13it/s, loss=1.18]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.05it/s]


Test Accuracy: 47.916666666666664%


Epoch [19/200]: 100%|█████████████████████████████████████████████████████| 216/216 [00:16<00:00, 13.21it/s, loss=1.11]
100%|████████████████████████████████████████████████████████████████████████████████| 108/108 [00:02<00:00, 36.42it/s]


Test Accuracy: 56.48148148148148%


Epoch [20/200]:  87%|█████████████████████████████████████████████▎      | 188/216 [00:14<00:02, 12.83it/s, loss=0.926]


KeyboardInterrupt: 