In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18,resnet50
import torchvision.transforms as transforms
import torch.nn.functional as F
from torchvision.transforms import functional as TF
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

In [2]:
def load_mfcc(file_name, max_len=None):
    y, sr = librosa.load(file_name, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    
    if max_len:
        if mfcc.shape[1] > max_len:
            mfcc = mfcc[:, :max_len]
        else:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])), 'constant', constant_values=(0))
    
    return mfcc

In [25]:
np.random.uniform(0, 1)

0.2208325795663355

In [3]:
class RandomErase(object):
    def __init__(self, p=0.5, s=(10, 20), value=0):
        self.p = p
        self.s = s
        self.value = value

    def __call__(self, img):
        if np.random.uniform(0, 1) < self.p:
            h, w = img.shape[-2:]
            r = np.random.randint(self.s[0], self.s[1])
            x = np.random.randint(0, w - r)
            y = np.random.randint(0, h - r)
            
            img[y:y + r, x:x + r] = self.value

        return img

# define transform
transform = transforms.Compose([
    transforms.ToTensor(),
    RandomErase(p=1.0, s=(10, 20), value=0),
])

In [4]:
sample_rate = librosa.get_samplerate("datasets/emotion_audio_data/5e27fa1c5807b852d9e01586.wav")

print("Sample Rate:", sample_rate)

Sample Rate: 48000


In [2]:
class EmotionDataset(Dataset):
    def __init__(self, csv, n_mels=128):
        file_list = pd.read_csv(csv)
        self.images = []
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
        ])
        
        for i in tqdm(range(len(file_list))):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0)
            
            # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.

            mel_spectrogram = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels)
            mel_spectrogram = librosa.power_to_db(mel_spectrogram).astype(np.float32)
            
            # mel_spec을 이미지로 변환하여 크기를 조정합니다.
            #mel_spectrogram = np.stack([mel_spectrogram] * 3, axis=0) 
            image = Image.fromarray(mel_spectrogram,"RGB") 
            image = transform(image)
            
            self.images.append(image)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
            
        self.len = len(file_list)
        self.n_mfcc = n_mfcc

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        return image, label

In [4]:
class EmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc):
        file_list = pd.read_csv(csv)
        self.images = []
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list))):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0)
            
            # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.
            if len(y) < sr * 3:
                pad_length = sr * 3 - len(y)
                y = np.pad(y, (0, pad_length), mode='constant')

            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            
            # MFCC를 이미지로 변환하여 크기를 조정합니다.
            mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())  # 정규화
            mfcc = Image.fromarray(mfcc)
            mfcc = mfcc.resize((224, 224), resample=Image.BILINEAR)  # 크기 조정
            
            mfcc = torch.from_numpy(np.array(mfcc).astype(np.float32))
            self.images.append(mfcc)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
            
        self.len = len(file_list)
        self.n_mfcc = n_mfcc

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        return image, label


In [5]:
class EmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc):
        file_list = pd.read_csv(csv)
        self.images = []
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list))):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0, sr=16000)
            
            # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.
            if len(y) < sr * 3:
                pad_length = sr * 3 - len(y)
                y = np.pad(y, (0, pad_length), mode='constant')

            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            
            # MFCC를 이미지로 변환하여 크기를 조정합니다.
            mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())  # 정규화
            mfcc = Image.fromarray(mfcc)
            mfcc = mfcc.resize((224, 224), resample=Image.BILINEAR)  # 크기 조정
            mfcc = np.array(mfcc).astype(np.float32)
            mfcc = np.stack([mfcc] * 3, axis=0)  # 채널 수를 3으로 맞춥니다.
            
            mfcc = torch.from_numpy(mfcc)
            self.images.append(mfcc)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
            
        self.len = len(file_list)
        self.n_mfcc = n_mfcc

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        return image, label

In [6]:
class EmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc, transform=None):
        file_list = pd.read_csv(csv)
        self.images = []
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list))):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0, sr=16000)
            
            # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.
            if len(y) < sr * 3:
                pad_length = sr * 3 - len(y)
                y = np.pad(y, (0, pad_length), mode='constant')

            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            
            # MFCC를 이미지로 변환하여 크기를 조정합니다.
            mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())  # 정규화
            mfcc = Image.fromarray(mfcc)
            mfcc = mfcc.resize((224, 224), resample=Image.BILINEAR)  # 크기 조정
            mfcc = np.array(mfcc).astype(np.float32)
            mfcc = np.stack([mfcc] * 3, axis=0)  # 채널 수를 3으로 맞춥니다.
            
            mfcc = torch.from_numpy(mfcc)
            self.images.append(mfcc)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
            
        self.len = len(file_list)
        self.n_mfcc = n_mfcc
        self.transform = transform

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        if self.transform:
            image = self.transform(image)
        return image, label

In [7]:
import numpy as np
import librosa
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
from tqdm.auto import tqdm

class EmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc, train, transform=None):
        file_list = pd.read_csv(csv)
        self.file_list = file_list
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list))):
            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])

        self.len = len(file_list)
        self.n_mfcc = n_mfcc
        self.transform = transform
        self.train = train

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        name = "datasets/emotion_audio_data/{}.wav".format(self.file_list.iloc[idx, 1])
        y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0, sr=16000)
        
        # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.
        if len(y) < sr * 3:
            pad_length = sr * 3 - len(y)
            y = np.pad(y, (0, pad_length), mode='constant')
        
        if (self.train):
            # 시간 왜곡 적용
            y = librosa.effects.time_stretch(y, rate=np.random.uniform(0.8, 1.2))

            # 피치 변조 적용
            y = librosa.effects.pitch_shift(y, sr, n_steps=np.random.uniform(-2, 2))
        
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=self.n_mfcc)

        # MFCC를 이미지로 변환하여 크기를 조정합니다.
        mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())  # 정규화
        mfcc = Image.fromarray(mfcc)
        mfcc = mfcc.resize((224, 224), resample=Image.BILINEAR)  # 크기 조정
        mfcc = np.array(mfcc).astype(np.float32)
        mfcc = np.stack([mfcc] * 3, axis=0)  # 채널 수를 3으로 맞춥니다.
        
        mfcc = torch.from_numpy(mfcc)
        
        label = self.emotions[idx]
        return mfcc, label


In [8]:
n_mfcc = 40

In [9]:
dataset = EmotionDataset(csv='datasets/emotion_train.csv', n_mfcc=n_mfcc, train=True, transform=transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

  0%|          | 0/35179 [00:00<?, ?it/s]

In [10]:
# 테스트 데이터셋 생성
test_dataset = EmotionDataset(csv='datasets/emotion_test.csv', n_mfcc=n_mfcc, train=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

  0%|          | 0/8793 [00:00<?, ?it/s]

In [5]:
dataset = EmotionDataset(csv='datasets/emotion_train.csv', n_mels=128)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

 75%|████████████████████████████████████████████████████████▊                   | 26287/35179 [15:01<05:04, 29.16it/s]


KeyboardInterrupt: 

In [None]:
# 테스트 데이터셋 생성
test_dataset = EmotionDataset(csv='datasets/emotion_test.csv', n_mels=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model = resnet18(pretrained=True)
num_features = model.fc.in_features
num_classes = 7
model.fc = nn.Linear(num_features, num_classes) 

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 60

In [None]:
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy}%')
    return accuracy

In [None]:
for epoch in range(num_epochs):
    running_loss = 0.0
    loop = tqdm(dataloader, total=len(dataloader), leave=True)
    model.train()
    
    for mfccs, labels in loop:
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
        loop.set_postfix(loss=running_loss / (len(dataloader)))

    test(model,test_loader)
    torch.save(model.state_dict(), "model_resnet18_40__augmentation{}.pth".format(epoch))
    
print('Training finished!')

In [50]:
model.train()

for epoch in range(30,30+num_epochs):
    running_loss = 0.0
    loop = tqdm(dataloader, total=len(dataloader), leave=True)
    
    for mfccs, labels in loop:
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
        loop.set_postfix(loss=running_loss / (len(dataloader)))

    torch.save(model.state_dict(), "model_{}.pth".format(epoch))
    
print('Training finished!')

Epoch [31/30]: 100%|███████████████████████████████████████████████████| 2199/2199 [01:32<00:00, 23.89it/s, loss=0.091]
Epoch [32/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:33<00:00, 23.41it/s, loss=0.0851]
Epoch [33/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:35<00:00, 22.98it/s, loss=0.0791]
Epoch [34/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:35<00:00, 22.94it/s, loss=0.0781]
Epoch [35/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:35<00:00, 23.08it/s, loss=0.0758]
Epoch [36/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:36<00:00, 22.87it/s, loss=0.0739]
Epoch [37/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:36<00:00, 22.77it/s, loss=0.0675]
Epoch [38/30]: 100%|██████████████████████████████████████████████████| 2199/2199 [01:35<00:00, 22.91it/s, loss=0.0674]
Epoch [39/30]: 100%|████████████████████

Training finished!


In [16]:
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy}%')
    return accuracy

In [10]:
# 테스트 데이터셋 생성
test_dataset = EmotionDataset(csv='datasets/emotion_test.csv', n_mfcc=n_mfcc)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [05:00<00:00, 29.29it/s]


NameError: name 'batch_size' is not defined

In [11]:
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [59]:
model.load_state_dict(torch.load('./model_61.pth'))

<All keys matched successfully>

In [60]:
# 모델 테스트
accuracy = test(model, test_loader)
print(f'Test Accuracy: {accuracy}%')

100%|████████████████████████████████████████████████████████████████████████████████| 550/550 [00:07<00:00, 77.09it/s]

Test Accuracy: 49.152735130217216%





# resize도 모델에 넣어 버리자 (성능 안좋음)

In [22]:
class ResnetEmotionModel(nn.Module):
    def __init__(self, n_mfcc, num_classes):
        super(ResnetEmotionModel, self).__init__()
        self.n_mfcc = n_mfcc
        self.resize = nn.AdaptiveAvgPool2d((224, 224))
        self.resnet = resnet18(pretrained=True)
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        x = self.resize(x)
        x = self.resnet(x)
        return x

In [4]:
class ResnetEmotionDataset(Dataset):
    def __init__(self, csv, n_mfcc):
        file_list = pd.read_csv(csv)
        self.images = []
        self.emotions = []
        emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}

        for i in tqdm(range(len(file_list))):
            name = "datasets/emotion_audio_data/{}.wav".format(file_list.iloc[i, 1])
            y, sr = librosa.load(name, res_type="kaiser_fast", duration=3.0)
            
            # 데이터 길이가 3초보다 짧은 경우 0으로 패딩합니다.
            if len(y) < sr * 3:
                pad_length = sr * 3 - len(y)
                y = np.pad(y, (0, pad_length), mode='constant')

            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
            
            # MFCC를 이미지로 변환하여 크기를 조정합니다.
            mfcc = (mfcc - mfcc.min()) / (mfcc.max() - mfcc.min())  # 정규화
            mfcc = Image.fromarray(mfcc)
            mfcc = np.array(mfcc).astype(np.float32)
            mfcc = np.stack([mfcc] * 3, axis=0)  # 채널 수를 3으로 맞춥니다.
            
            mfcc = torch.from_numpy(mfcc)
            self.images.append(mfcc)

            emotion = file_list.iloc[i, 3]
            self.emotions.append(emotion_to_int[emotion])
            
        self.len = len(file_list)
        self.n_mfcc = n_mfcc

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.emotions[idx]
        return image, label

In [23]:
# 모델 생성
num_classes = 7
model = ResnetEmotionModel(n_mfcc=13, num_classes=num_classes)

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [19]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
dataset = ResnetEmotionDataset(csv='datasets/emotion_train.csv', n_mfcc=13)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

100%|████████████████████████████████████████████████████████████████████████████| 35179/35179 [17:15<00:00, 33.96it/s]


In [14]:
# 테스트 데이터셋 생성
test_dataset = ResnetEmotionDataset(csv='datasets/emotion_test.csv', n_mfcc=13)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [04:13<00:00, 34.71it/s]


NameError: name 'batch_size' is not defined

In [16]:
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [20]:
num_epochs = 30

In [25]:
for epoch in range(num_epochs):
    running_loss = 0.0
    loop = tqdm(dataloader, total=len(dataloader), leave=True)
    model.train()
    
    for mfccs, labels in loop:
        mfccs = mfccs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(mfccs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
        loop.set_postfix(loss=running_loss / (len(dataloader)))

    test(model,test_loader)
    torch.save(model.state_dict(), "model_resnet18_resize_{}.pth".format(epoch))
    
print('Training finished!')

Epoch [1/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:06<00:00, 33.20it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 113.36it/s]


Test Accuracy: 7.437734561583078%


Epoch [2/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.54it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 112.80it/s]


Test Accuracy: 7.494597975662459%


Epoch [3/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:06<00:00, 32.83it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 113.01it/s]


Test Accuracy: 7.09655407710679%


Epoch [4/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 34.06it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 113.20it/s]


Test Accuracy: 7.3126350506084385%


Epoch [5/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 33.95it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.03it/s]


Test Accuracy: 6.6075287160241105%


Epoch [6/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.71it/s, loss=2.17]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.50it/s]


Test Accuracy: 7.574206755373592%


Epoch [7/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 35.10it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.18it/s]


Test Accuracy: 7.6993062663482315%


Epoch [8/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.87it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 120.08it/s]


Test Accuracy: 6.778118958262254%


Epoch [9/30]: 100%|█████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 34.99it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.12it/s]


Test Accuracy: 7.392243830319573%


Epoch [10/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.89it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 119.02it/s]


Test Accuracy: 7.028317980211532%


Epoch [11/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.39it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.02it/s]


Test Accuracy: 7.051063345843285%


Epoch [12/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.56it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 114.31it/s]


Test Accuracy: 7.540088706925964%


Epoch [13/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:05<00:00, 33.48it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.01it/s]


Test Accuracy: 7.1420448083702945%


Epoch [14/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 34.07it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 115.47it/s]


Test Accuracy: 7.1420448083702945%


Epoch [15/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:05<00:00, 33.64it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.10it/s]


Test Accuracy: 7.085181394290913%


Epoch [16/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:05<00:00, 33.60it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 113.71it/s]


Test Accuracy: 7.107926759922666%


Epoch [17/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 33.94it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 117.25it/s]


Test Accuracy: 7.449107244398954%


Epoch [18/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 33.91it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:05<00:00, 105.50it/s]


Test Accuracy: 7.028317980211532%


Epoch [19/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:04<00:00, 34.20it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 117.52it/s]


Test Accuracy: 7.301262367792562%


Epoch [20/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.76it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 117.92it/s]


Test Accuracy: 6.846355055157511%


Epoch [21/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.71it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.71it/s]


Test Accuracy: 7.27851700216081%


Epoch [22/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 35.11it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.72it/s]


Test Accuracy: 7.596952121005345%


Epoch [23/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.80it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 119.18it/s]


Test Accuracy: 7.164790174002047%


Epoch [24/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 35.02it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.04it/s]


Test Accuracy: 7.46047992721483%


Epoch [25/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 34.93it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.05it/s]


Test Accuracy: 7.187535539633799%


Epoch [26/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 34.96it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.32it/s]


Test Accuracy: 7.051063345843285%


Epoch [27/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:02<00:00, 35.15it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.16it/s]


Test Accuracy: 7.449107244398954%


Epoch [28/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.78it/s, loss=2.17]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 115.47it/s]


Test Accuracy: 7.73342431479586%


Epoch [29/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:05<00:00, 33.32it/s, loss=2.17]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 121.30it/s]


Test Accuracy: 7.187535539633799%


Epoch [30/30]: 100%|████████████████████████████████████████████████████| 2199/2199 [01:03<00:00, 34.70it/s, loss=2.16]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:04<00:00, 122.22it/s]

Test Accuracy: 7.187535539633799%
Training finished!





In [23]:
df = pd.read_csv("datasets/emotion.csv")

In [25]:
emotion_to_int = {'anger': 0, 'angry': 0, 'disgust': 1, 'fear': 2, 'happiness': 3,
                               'neutral': 4, 'sad': 5, 'sadness': 5, 'surprise': 6}
keys = list(emotion_to_int.keys())

In [31]:
for key in keys:
    print(key," : ",len(df[df["상황"]==key]))

anger  :  8372
angry  :  3263
disgust  :  4660
fear  :  4131
happiness  :  4548
neutral  :  3262
sad  :  11152
sadness  :  2848
surprise  :  1755


In [None]:
googlenet(pretrained=True)