In [3]:
import librosa
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [54]:
df = pd.read_csv("datasets/emotion_test.csv",index_col=0)

In [55]:
df

Unnamed: 0,wav_id,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번감정세기,5번 감정,5번 감정세기,나이,성별
0,5f7940c59e04b149046cc19a,함께 달리는 크루가 있어서 서로 큰 힘이 되어주고 있어.,happiness,happiness,1,happiness,2,happiness,1,happiness,1,happiness,1,46,female
1,5f100cc8b140144dfcff4b66,고등학교 때부터 알던 사이인데 방을 이렇게 더럽게 사용할 줄 몰랐어. 알았으면 룸메...,anger,Angry,2,Angry,2,Sadness,1,Angry,1,Angry,1,46,female
2,5f0a7b09b140144dfcff21a8,친구를 기다린지 한 시간이 넘었어. 너무 짜증이 나!,anger,Angry,2,Angry,2,Angry,1,Angry,1,Angry,2,46,female
3,5fbca5c6576e9378b67acf76,아까 나갔는데 짭새들이 모여있길래 구경했어.,neutral,fear,1,neutral,0,neutral,0,neutral,0,neutral,0,22,female
4,5f8da1949e04b149046cd695,"음악을 한 번 들어볼까, 그러면?",happiness,neutral,0,happiness,1,sadness,1,happiness,1,neutral,0,35,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8788,5f5ccadc3bd6941613f6d194,우리 집 쓰레기통에서 정말 고약시런 냄새가 나기 시작했어!,disgust,disgust,2,disgust,2,disgust,2,disgust,1,sadness,2,48,female
8789,5fb8dfcacb503578af9edd68,짜장면에서 벌레가 나왔어!,disgust,surprise,1,angry,1,neutral,0,angry,1,angry,1,24,male
8790,5f5f82b92e23c7161accd230,헐 나 이벤트 당첨됐어.,happiness,happiness,1,happiness,2,happiness,2,surprise,1,happiness,2,35,female
8791,5e36a893ee8206179943c292,자리가 부족해서 안락사시키는 애들이 많아.,disgust,Neutral,0,Sadness,1,Sadness,1,Disgust,1,Sadness,2,23,female


In [49]:
error = []
for name in tqdm(df['wav_id']):
    try:
        librosa.load("datasets/emotion_audio_data/"+name+".wav",sr=None)
    except:
        error.append(name)

100%|█████████████████████████████████████████████████████████████████████████████| 8798/8798 [00:42<00:00, 209.19it/s]


In [16]:
error # train data

['5e32924e5807b852d9e03894',
 '5e378b4233e9ad176cc9ae53',
 '5e3292655807b852d9e03896',
 '5e4155f3189842034d9f72b9',
 '5e298bdc5807b852d9e01a11',
 '5e33a9d35807b852d9e050f4',
 '5e298b9f5807b852d9e01a0f',
 '5f0dd0aeb140144dfcff3b51',
 '5e3161c65807b852d9e032af',
 '5e2998b85807b852d9e01b02',
 '5e315dca5807b852d9e03275',
 '5e2ad43e5807b852d9e020dc',
 '5e33638b5807b852d9e04aeb',
 '5e298c085807b852d9e01a12']

In [50]:
error # test data

['5e2979c25807b852d9e018d5',
 '5e2ad4145807b852d9e020d9',
 '5e298bc45807b852d9e01a10',
 '5e31622f5807b852d9e032ba',
 '5e3292825807b852d9e0389a']

In [51]:
for name in error:
    df.drop(df[df['wav_id'] == name].index, inplace=True)

In [52]:
df = df.reset_index(drop=True)

In [53]:
df.to_csv("datasets/emotion_test.csv")

In [4]:
def extract_mel_spectrogram(path, n_mels=128):
    y, sr = librosa.load(path, sr=None)
    mel_spectrogram = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_spectrogram).astype(np.float32)

In [5]:
def padding_audio_data(audio_data):
    padded_data = []
    max_length = int(np.percentile([mel_spec.shape[1] for mel_spec in audio_data], 95))
    print(max_length)
    for mel_spec in tqdm(audio_data):
        if mel_spec.shape[1] < max_length:
            padding = np.zeros((128, max_length - mel_spec.shape[1]), dtype=np.float32)
            padded_data.append(np.hstack((mel_spec.astype(np.float32), padding)))
        else:
            padded_data.append(mel_spec[:, :max_length].astype(np.float32))
    return np.array(padded_data)

In [6]:
def padding_audio_data(audio_data, max_length):
    padded_data = []
    print(max_length)
    for mel_spec in tqdm(audio_data):
        if mel_spec.shape[1] < max_length:
            padding = np.zeros((128, max_length - mel_spec.shape[1]), dtype=np.float32)
            padded_data.append(np.hstack((mel_spec.astype(np.float32), padding)))
        else:
            padded_data.append(mel_spec[:, :max_length].astype(np.float32))
    return np.array(padded_data)

In [7]:
class EmotionDataset(Dataset):
    def __init__(self, audio_data, labels):
        self.labels = labels
        self.audio = audio_data
        
    def __len__(self):
        return len(self.audio)

    def __getitem__(self, idx):        
        return self.audio[idx], self.labels[idx]

In [8]:
csv_name = "datasets/emotion_train.csv"
audio_df = pd.read_csv(csv_name)

audio_train = []
for name in tqdm(audio_df['wav_id']):
    audio_path = os.path.join("datasets/emotion_audio_data",name+".wav")
    mel_spectrogram = extract_mel_spectrogram(audio_path)
    audio_train.append(mel_spectrogram)

audio_train = padding_audio_data(audio_train)

 19%|██████████████▏                                                              | 6509/35179 [02:16<09:59, 47.84it/s]


KeyboardInterrupt: 

In [10]:
audio_train

[array([[-61.21541 , -61.21541 , -61.21541 , ..., -25.507645, -22.860245,
         -24.648153],
        [-61.21541 , -61.21541 , -61.21541 , ..., -21.683083, -20.749516,
         -23.087696],
        [-61.21541 , -61.21541 , -61.21541 , ..., -28.363714, -25.314838,
         -25.642792],
        ...,
        [-61.21541 , -61.21541 , -61.21541 , ..., -61.21541 , -61.21541 ,
         -61.21541 ],
        [-61.21541 , -61.21541 , -61.21541 , ..., -61.21541 , -61.21541 ,
         -61.21541 ],
        [-61.21541 , -61.21541 , -61.21541 , ..., -61.21541 , -61.21541 ,
         -61.21541 ]], dtype=float32),
 array([[-86.908775, -86.908775, -86.908775, ..., -32.32954 , -32.806202,
         -28.67709 ],
        [-86.908775, -86.908775, -86.908775, ..., -32.686993, -32.035095,
         -33.02285 ],
        [-86.908775, -86.908775, -86.908775, ..., -37.46769 , -37.784523,
         -40.63982 ],
        ...,
        [-86.908775, -86.908775, -86.908775, ..., -84.986534, -85.31808 ,
         -86.772766

In [9]:
import librosa.display
import matplotlib.pyplot as plt

In [12]:
plt.figure(figsize=(10, 4))
librosa.display.specshow(audio_train[0], y_axis='mel', sr=sr, hop_length=input_stride, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel-Spectrogram')
plt.tight_layout()
plt.savefig('Mel-Spectrogram example.png')
plt.show()

NameError: name 'sr' is not defined

<Figure size 3000x1200 with 0 Axes>

In [23]:
column = '상황'
audio_df.replace("anger","angry")
audio_df.replace("sadness","sad")
labels_data = audio_df['상황'].tolist()
label_encoder = LabelEncoder()
encoded_labels_train = label_encoder.fit_transform(labels_data)

In [56]:
csv_name = "datasets/emotion_test.csv"
audio_df = pd.read_csv(csv_name)

audio_test = []
for name in tqdm(audio_df['wav_id']):
    audio_path = os.path.join("datasets/emotion_audio_data",name+".wav")
    mel_spectrogram = extract_mel_spectrogram(audio_path)
    audio_test.append(mel_spectrogram)

audio_test = padding_audio_data(audio_test,1057)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [03:10<00:00, 46.18it/s]


1057


100%|████████████████████████████████████████████████████████████████████████████| 8793/8793 [00:08<00:00, 1098.36it/s]


In [57]:
column = '상황'
audio_df.replace("anger","angry")
audio_df.replace("sadness","sad")
labels_data = audio_df['상황'].tolist()
encoded_labels_test = label_encoder.transform(labels_data)

In [42]:
class EmotionRecognitionCNN(nn.Module):
    def __init__(self, num_situations):
        super(EmotionRecognitionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.relu3 = nn.ReLU(inplace=True)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        self.conv4 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn4 = nn.BatchNorm2d(128)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d(kernel_size=(4, 4), stride=(4, 4), padding=0)

        #(128,1057) -> (4,33,128)
        self.conv_out_size = 4 * 33 * 128

        self.fc1 = nn.Linear(self.conv_out_size, 1024)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(1024, num_situations)

    def forward(self, x):
        x = self.pool1(self.relu1(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu2(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu3(self.bn3(self.conv3(x))))
        x = self.pool4(self.relu4(self.bn4(self.conv4(x))))
        
        x = x.view(x.size(0), -1)
        x = self.dropout1(self.fc1(x))
        x = self.fc2(x)
        return x

In [40]:
len(audio[0][0])

1057

In [24]:
num_situations = len(np.unique(encoded_labels))

In [43]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionRecognitionCNN(num_situations).to(device)

In [30]:
train_dataset = EmotionDataset(audio_train,encoded_labels_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [58]:
test_dataset = EmotionDataset(audio_test,encoded_labels_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [45]:
import gc

def clear_cuda_memory():
    torch.cuda.empty_cache()
    gc.collect()

clear_cuda_memory()

In [59]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch+1}/{num_epochs}')
    for batch_idx, (inputs, targets) in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs.unsqueeze(1))
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % 10 == 0:
            progress_bar.set_postfix({'Loss': f'{loss.item():.4f}'})
            
    # 모델 평가
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs.unsqueeze(1))
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    print(f'Accuracy: {100 * correct / total:.2f}%')

Epoch 1/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:13<00:00, 14.88it/s, Loss=1.8498]


Accuracy: 27.93%


Epoch 2/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.41it/s, Loss=1.7956]


Accuracy: 30.18%


Epoch 3/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.33it/s, Loss=1.5584]


Accuracy: 30.82%


Epoch 4/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.64it/s, Loss=1.8216]


Accuracy: 34.36%


Epoch 5/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.60it/s, Loss=2.1214]


Accuracy: 33.75%


Epoch 6/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.43it/s, Loss=1.9471]


Accuracy: 37.13%


Epoch 7/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.41it/s, Loss=1.1837]


Accuracy: 36.81%


Epoch 8/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.37it/s, Loss=1.2834]


Accuracy: 37.54%


Epoch 9/50: 100%|█████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.43it/s, Loss=2.0513]


Accuracy: 38.17%


Epoch 10/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:12<00:00, 15.15it/s, Loss=1.0549]


Accuracy: 37.18%


Epoch 11/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:13<00:00, 15.04it/s, Loss=1.2650]


Accuracy: 37.77%


Epoch 12/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:12<00:00, 15.08it/s, Loss=0.8555]


Accuracy: 37.48%


Epoch 13/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:09<00:00, 15.72it/s, Loss=1.5842]


Accuracy: 36.18%


Epoch 14/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.65it/s, Loss=1.3255]


Accuracy: 37.86%


Epoch 15/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.60it/s, Loss=1.2609]


Accuracy: 36.43%


Epoch 16/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.61it/s, Loss=1.1122]


Accuracy: 37.73%


Epoch 17/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.61it/s, Loss=0.6278]


Accuracy: 37.68%


Epoch 18/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.70it/s, Loss=1.2338]


Accuracy: 37.89%


Epoch 19/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.43it/s, Loss=0.6157]


Accuracy: 37.72%


Epoch 20/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.42it/s, Loss=1.3431]


Accuracy: 36.70%


Epoch 21/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.50it/s, Loss=0.8419]


Accuracy: 38.01%


Epoch 22/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.56it/s, Loss=1.3291]


Accuracy: 37.59%


Epoch 23/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:11<00:00, 15.29it/s, Loss=0.9277]


Accuracy: 35.45%


Epoch 24/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:12<00:00, 15.13it/s, Loss=0.3491]


Accuracy: 36.52%


Epoch 25/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.61it/s, Loss=0.2119]


Accuracy: 38.04%


Epoch 26/50: 100%|████████████████████████████████████████████████████| 1100/1100 [01:10<00:00, 15.70it/s, Loss=0.5107]


Accuracy: 36.51%


Epoch 27/50:   5%|██▋                                                   | 55/1100 [00:03<01:07, 15.47it/s, Loss=0.7881]


KeyboardInterrupt: 

In [60]:
# 모델의 state_dict 저장
model_path = 'emotion_CNN_1057_1.pth'
torch.save(model.state_dict(), model_path)

In [None]:
# 모델 인스턴스를 만들고 state_dict 불러오기
model = EmotionRecognitionCNN()  # 동일한 모델 구조를 가진 인스턴스를 생성합니다.
length = 1057
model.load_state_dict(torch.load(model_path))

# 모델을 평가 모드로 설정하기
model.eval()