In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import pandas as pd
import numpy as np
import librosa
import os

In [2]:
class EmotionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(EmotionModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        x = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out, _ = self.lstm(x, (h0, c0))
        
        out, _ = pad_packed_sequence(out, batch_first=True)
        out = self.fc(out[:, -1, :])
        return out

In [11]:
class EmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc, max_len=None):
        file_list = pd.read_csv(csv)
        self.mfccs = []
        self.emotions = []
        self.max_len = 0
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list)), desc="Loading Data"):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, sr=None)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            if max_len:
                mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), 'constant', constant_values=(0))
            self.max_len = max(self.max_len, mfcc.shape[1])
            mfcc = torch.from_numpy(mfcc.astype(np.float32))
            self.mfccs.append(mfcc)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
        self.len = len(file_list)
        self.n_mfcc = n_mfcc

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        mfcc = self.mfccs[idx]
        label = self.emotions[idx]
        return mfcc, label

In [4]:
n_mfcc = 13
batch_size = 16

In [16]:
def collate_fn(batch):
    mfccs = [item[0].T for item in batch]
    labels = [item[1] for item in batch]
    
    # Padding
    mfccs = pad_sequence(mfccs, batch_first=True)
    
    return mfccs, torch.tensor(labels)

In [12]:
train_dataset = EmotionDataset("datasets/emotion_train.csv", n_mfcc)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)

100%|████████████████████████████████████████████████████████████████████████████| 35179/35179 [11:56<00:00, 49.13it/s]


In [13]:
test_dataset = EmotionDataset("datasets/emotion_test.csv", n_mfcc)
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                          shuffle=False)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [03:00<00:00, 48.59it/s]


In [17]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          collate_fn=collate_fn,
                          shuffle=True)

In [18]:
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                         collate_fn=collate_fn,
                          shuffle=False)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
n_mfcc = 13
hidden_size = 512
num_layers = 2
num_classes = 7
learning_rate = 0.01
num_epochs = 30

In [27]:
model = EmotionModel(n_mfcc, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [28]:
model.train()
for epoch in range(num_epochs):
    # Initialize tqdm with total number of batches in an epoch
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (mfccs, labels) in progress_bar:
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(mfccs, [mfccs.shape[1]] * len(mfccs))
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar description
        progress_bar.set_description(f"Epoch [{epoch+1}/{num_epochs}] Loss: {loss.item():.4f}")

Epoch [1/30] Loss: 2.1395:  86%|███████████████████████████████████████████▊       | 1889/2199 [04:29<00:44,  7.01it/s]


KeyboardInterrupt: 

In [10]:
model

EmotionModel(
  (lstm): LSTM(13, 512, num_layers=2, batch_first=True)
  (fc): Linear(in_features=512, out_features=7, bias=True)
)

In [2]:
class EmotionDataset(Dataset):
    def __init__(self, csv_file, audio_dir, max_pad_len):
        self.emotion_frame = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.max_pad_len = max_pad_len
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}
        self.mfccs = []
        self.emotions = []
        self.load_data()

    def load_data(self):
        for idx in tqdm(range(len(self.emotion_frame))):
            file_path = os.path.join(self.audio_dir, self.emotion_frame.iloc[idx, 1]) + ".wav"
            y, sr = librosa.load(file_path, duration=3)  # 음성 파일 로드 (3초로 제한)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)
            pad_width = self.max_pad_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
            mfccs = mfccs.transpose()
            emotion = self.emotion_frame.iloc[idx, 3]
            emotion = self.emotion_to_int[emotion]

            self.mfccs.append(torch.tensor(mfccs, dtype=torch.float32))
            self.emotions.append(torch.tensor(emotion, dtype=torch.long))

    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        mfccs = self.mfccs[idx]
        emotion = self.emotions[idx]

        return {'mfccs': mfccs, 'emotion': emotion}

In [3]:
# 데이터셋 클래스 정의
class EmotionDataset(Dataset):
    def __init__(self, csv_file, audio_dir, max_pad_len):
        self.emotion_frame = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.max_pad_len = max_pad_len
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3, 
                           'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        file_path = os.path.join(self.audio_dir, self.emotion_frame.iloc[idx, 1])+".wav"
        y, sr = librosa.load(file_path, duration=3)  # 음성 파일 로드 (3초로 제한)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=2048, hop_length=512)
        pad_width = self.max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        mfccs = mfccs.transpose()
        emotion = self.emotion_frame.iloc[idx, 3]
        emotion = self.emotion_to_int[emotion]
        
        mfccs = torch.tensor(mfccs, dtype=torch.float32)
        emotion = torch.tensor(emotion, dtype=torch.long)

        return {'mfccs': mfccs, 'emotion': emotion}

In [3]:
# 모델 클래스 정의
class EmotionRecognitionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(EmotionRecognitionModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [4]:
# 하이퍼파라미터 설정
batch_size = 32
input_size = 13  # MFCC 특징의 크기
hidden_size = 128
num_layers = 2
num_classes = 9
learning_rate = 0.001
num_epochs = 10

In [6]:
# 데이터셋과 데이터 로더 생성
train_csv_file = 'datasets/emotion_train.csv'
audio_dir = 'datasets/emotion_audio_data/'
max_pad_len = 1000  # 패딩 길이
train_dataset = EmotionDataset(train_csv_file, audio_dir, max_pad_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# 모델 초기화
model = EmotionRecognitionModel(input_size, hidden_size, num_layers, num_classes)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 손실 함수와 옵티마이저 정의
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 모델 학습
for epoch in range(num_epochs*10):
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader)
    model.train()
    for i, sample in enumerate(progress_bar):
        inputs, labels = sample['mfccs'].to(device), sample['emotion'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        progress_bar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / (i+1):.4f}')

100%|████████████████████████████████████████████████████████████████████████████| 35179/35179 [48:41<00:00, 12.04it/s]
Epoch [1/10], Loss: 1.7434: 100%|██████████████████████████████████████████████████| 1100/1100 [00:20<00:00, 53.01it/s]
Epoch [2/10], Loss: 1.7358: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.49it/s]
Epoch [3/10], Loss: 1.7361: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.36it/s]
Epoch [4/10], Loss: 1.7344: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.34it/s]
Epoch [5/10], Loss: 1.7342: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.41it/s]
Epoch [6/10], Loss: 1.7339: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.34it/s]
Epoch [7/10], Loss: 1.7342: 100%|██████████████████████████████████████████████████| 1100/1100 [00:17<00:00, 62.37it/s]
Epoch [8/10], Loss: 1.7338: 100%|███████

In [7]:
import os
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import librosa

class EmotionDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None, max_pad_len=1057):
        self.emotion_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3, 
                           'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}
        self.max_pad_len = max_pad_len
        
    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        wav_name = os.path.join(self.root_dir,
                                self.emotion_frame.iloc[idx, 1])+".wav"
        y, sr = librosa.load(wav_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        pad_width = max(self.max_pad_len - mfccs.shape[1], 0)
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        mfccs = torch.tensor(mfccs)


        mfccs = mfccs.transpose()  # Transpose the matrix to align the input size

        emotion = self.emotion_frame.iloc[idx, 3]
        emotion = self.emotion_to_int[emotion]
        emotion = torch.tensor(emotion)

        sample = {'mfccs': mfccs, 'emotion': emotion}

        return sample
    
def collate_fn(batch):
    mfccs = [item['mfccs'] for item in batch]
    emotions = [item['emotion'] for item in batch]
    mfccs_padded = pad_sequence(mfccs, batch_first=True)
    emotions = torch.stack(emotions)
    return {'mfccs': mfccs_padded, 'emotion': emotions}


In [8]:
import torch.nn as nn

class EmotionModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(EmotionModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim

        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_().to(x.device)


        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :]) 
        return out


In [9]:
from torch.utils.data import DataLoader
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EmotionModel(input_dim=40, hidden_dim=256, layer_dim=2, output_dim=7).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

emotion_dataset = EmotionDataset(csv_file='datasets/emotion_train.csv', root_dir='datasets/emotion_audio_data/')
train_loader = DataLoader(emotion_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

model.train()
for epoch in range(10):
    print(f'Epoch {epoch+1}')
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader)
    for i, sample in enumerate(progress_bar):
        inputs, labels = sample['mfccs'].to(device), sample['emotion'].to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_description(f'Loss: {loss.item()}')

    print(f'Epoch {epoch+1} Loss: {epoch_loss/len(train_loader)}')


Epoch 1


  0%|                                                                                         | 0/2199 [00:00<?, ?it/s]


TypeError: transpose() received an invalid combination of arguments - got (), but expected one of:
 * (int dim0, int dim1)
 * (name dim0, name dim1)


In [69]:
label_index = {}
label_index["anger"] = 0
label_index["angry"] = 0
label_index["disgust"] = 1
label_index["fear"] = 2
label_index["happiness"] = 3
label_index["neutral"] = 4
label_index["sad"] = 5 
label_index["sadness"] = 5
label_index["surprise"] = 6

In [70]:
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

import os

class AudioDataset(Dataset):
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        audio_file = os.path.join("datasets/emotion_audio_data/", self.df.iloc[index, 1]+".wav")
        label = torch.tensor(label_index[self.df.iloc[index, 3]])
        audio =  torch.from_numpy(preprocess_audio(audio_file))
        return audio, label, audio.shape[0]


In [71]:
def collate_fn(data):
    data.sort(key=lambda x: x[2], reverse=True)
    audios, labels, lengths = zip(*data)
    audios = pad_sequence(audios, batch_first=True)
    labels = torch.tensor(labels)
    lengths = torch.tensor(lengths)
    return audios, labels, lengths

In [72]:
from torch.utils.data import DataLoader

# Create the dataset
dataset = AudioDataset(csv_file='datasets/emotion_train.csv')
loader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)

In [73]:
from tqdm import tqdm
import torch.optim as optim

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model
model = EmotionRecognitionModel(input_size=40, hidden_size=50, num_layers=2, num_classes=5).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    running_loss = 0.0
    for i, (inputs, labels, lengths) in tqdm(enumerate(loader)):
        inputs = pack_padded_sequence(inputs, lengths, batch_first=True, enforce_sorted=False)
        labels = labels.to(device, dtype=torch.long)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print loss
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(loader)}")

print('Finished Training')


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]


RuntimeError: input must have 2 dimensions, got 1

In [39]:
for i in loader:
    print(i)
    break

datasets/emotion_audio_data/5f04edc9b140144dfcfef53e.wav
tensor(0)
[[-80.       -80.       -80.       ... -42.019623 -40.404594 -42.039497]
 [-80.       -80.       -80.       ... -55.069542 -56.325912 -55.70308 ]
 [-80.       -80.       -80.       ... -57.148438 -58.260925 -62.47307 ]
 ...
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]]
datasets/emotion_audio_data/5f6eab4f111dfd48d40fd27c.wav
tensor(5)
[[-80.       -80.       -80.       ... -25.060246 -24.680363 -24.126091]
 [-80.       -80.       -80.       ... -28.653961 -29.755291 -31.59039 ]
 [-80.       -80.       -80.       ... -33.605335 -34.63547  -36.764503]
 ...
 [-80.       -80.       -80.       ... -67.442665 -67.25614  -65.94096 ]
 [-80.       -80.       -80.       ... -74.91232  -74.276596 -74.04675 ]
 [-80.       -80.       -80.       ... -74.62927  -7

datasets/emotion_audio_data/5efc92f1704f492ee12537b5.wav
tensor(5)
[[-80.       -80.       -80.       ... -61.573643 -56.607048 -54.22828 ]
 [-80.       -80.       -80.       ... -56.21641  -52.40764  -49.5945  ]
 [-80.       -80.       -80.       ... -56.81953  -54.32172  -52.159794]
 ...
 [-80.       -80.       -80.       ... -70.63549  -69.65435  -69.574005]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]]
datasets/emotion_audio_data/5f866c93111dfd48d40fe07a.wav
tensor(3)
[[-80.       -80.       -77.07026  ... -54.536133 -56.085934 -54.92125 ]
 [-80.       -80.       -80.       ... -68.35004  -70.35807  -73.32697 ]
 [-80.       -80.       -80.       ... -65.473595 -66.2165   -73.487526]
 ...
 [-80.       -80.       -80.       ... -78.83131  -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -80.       -80.      ]
 [-80.       -80.       -80.       ... -80.       -8

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray

In [48]:
import scipy