In [1]:
import torch
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

# AutoProcessor는 필요 없습니다.

model = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.output.bias', 'classifier.output.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.weight', 'projector.weight', 

In [None]:
model.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
import librosa
import os
from pydub import AudioSegment

In [11]:
class EmotionDataset(Dataset):
    def __init__(self, csv_file, audio_dir):
        self.emotion_frame = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'calm' : 1, 'disgust': 2, 'fear': 3, 'happiness': 4,
                               'neutral': 5, 'sad': 6, 'sadness': 6, 'surprise': 7}
        self.audios = []
        self.emotions = []
        self.load_data()

    def load_data(self):
        for idx in tqdm(range(len(self.emotion_frame))):
            file_path = os.path.join(self.audio_dir, self.emotion_frame.iloc[idx, 1]) + ".wav"
            audio = AudioSegment.from_file(file_path)
            audio = audio.set_frame_rate(16000)
            audio = np.array(audio.get_array_of_samples())
            
            emotion = self.emotion_frame.iloc[idx, 3]
            emotion = self.emotion_to_int[emotion]

            self.audios.append(torch.tensor(audio))
            self.emotions.append(torch.tensor(emotion, dtype=torch.long))

    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        audio = self.audios[idx]
        emotion = self.emotions[idx]

        return {'audio': audio, 'emotion': emotion}

In [13]:
batch_size = 4
num_epochs = 20

In [18]:

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [29]:
i=0
for inputs in train_loader:
    print(inputs["audio"].size())
    i+=1
    if i== 10:
        break

torch.Size([192512, 4])
torch.Size([87382, 4])
torch.Size([193878, 4])
torch.Size([145408, 4])
torch.Size([188416, 4])
torch.Size([207531, 4])
torch.Size([247808, 4])
torch.Size([98304, 4])
torch.Size([114006, 4])
torch.Size([144726, 4])


In [12]:
train_csv_file = 'datasets/emotion_train.csv'
audio_dir = 'datasets/emotion_audio_data/'
train_dataset = EmotionDataset(train_csv_file, audio_dir)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

100%|███████████████████████████████████████████████████████████████████████████| 35179/35179 [05:16<00:00, 111.21it/s]


NameError: name 'batch_size' is not defined

In [15]:
test_csv_file = 'datasets/emotion_test.csv'
test_dataset = EmotionDataset(test_csv_file, audio_dir)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [01:50<00:00, 79.70it/s]


In [3]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # 데이터셋에서 각 오디오 샘플의 길이가 다르므로, 이를 동일한 길이로 패딩해야 합니다.
    # pad_sequence 함수는 이러한 패딩을 수행하는 데 도움이 됩니다.
    inputs = pad_sequence([sample["audio"] for sample in batch])
    labels = torch.tensor([sample["emotion"] for sample in batch])
    return {"audio": inputs, "emotion": labels}

In [24]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = torch.nn.CrossEntropyLoss()

epochs = 10

for epoch in range(epochs):

    model.train()

    for batch in tqdm(train_dataset):
        inputs = batch['audio']
        labels = batch['emotion']
        inputs = inputs.float()
        inputs = inputs.to(device)
        labels = labels.to(device)

        predictions = model(inputs)

        loss = loss_function(predictions, labels)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    model.eval()

    accuracy = model.evaluate(test_dataset)

    print("Epoch {}: Accuracy {}".format(epoch, accuracy))

  0%|                                                                                        | 0/35179 [00:00<?, ?it/s]


RuntimeError: Given groups=1, weight of size [512, 1, 10], expected input[1, 58710, 1] to have 1 channels, but got 58710 channels instead

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 모델 학습
for epoch in range(num_epochs):
    progress_bar = tqdm(train_loader)
    for i, sample in enumerate(progress_bar):
        inputs, labels = sample["audio"].to(device), sample['emotion'].to(device)
        model.fit(inputs, labels)
    progress_bar = tqdm(test_loader)
    loss = 0
    for i, sample in enumerate(progress_bar):
        inputs, labels = sample["audio"].to(device), sample['emotion'].to(device)
        loss = loss + model.evaluate(inputs, labels)
    print("loss :",loss/len(train_dataset))

  0%|                                                                                         | 0/8795 [00:00<?, ?it/s]


AttributeError: 'Wav2Vec2ForSequenceClassification' object has no attribute 'fit'

In [None]:
model.fit(x_train, y_train)

model.evaluate(x_test, y_test)

In [6]:
import librosa

audio_file_path = "datasets/emotion_audio_data/5e2ac3d55807b852d9e01fd6.wav"
audio, sr = librosa.load(audio_file_path, sr=None)
output_logits = model(audio)

# 출력 로그its를 감정으로 변환합니다.
emotions = ["happy", "sad", "angry", "fearful", "surprised", "neutral"]
emotion = emotions[torch.argmax(output_logits)]

# 감정을 출력합니다.
print(emotion)

TypeError: conv1d() received an invalid combination of arguments - got (numpy.ndarray, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: ([31;1mnumpy.ndarray[0m, [31;1mParameter[0m, [31;1mParameter[0m, [31;1mtuple[0m, [31;1mtuple[0m, [31;1mtuple[0m, [32;1mint[0m)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: ([31;1mnumpy.ndarray[0m, [31;1mParameter[0m, [31;1mParameter[0m, [31;1mtuple[0m, [31;1mtuple[0m, [31;1mtuple[0m, [32;1mint[0m)


In [None]:
import os

# Get the list of CSV files
df = pd.read_csv("datasets/emotion.csv")
file_names = df.iloc[:,1]

# Create a dictionary to map the CSV file names to the folder names
file_to_folder_map = {}
for name in file_names:
    # Create the folder for the year and month
    folder_name = f"{year}-{month}"
    os.makedirs(folder_name, exist_ok=True)

    # Map the CSV file name to the folder name
    file_to_folder_map[csv_file] = folder_name

# Iterate over the CSV files and move them to the corresponding folders
for csv_file, folder_name in file_to_folder_map.items():
    os.rename(csv_file, os.path.join(folder_name, csv_file))

## predict

In [36]:
class EmotionDataset(Dataset):
    def __init__(self, csv_file, audio_dir):
        self.emotion_frame = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'calm' : 1, 'disgust': 2, 'fear': 3, 'happiness': 4,
                               'neutral': 5, 'sad': 6, 'sadness': 6, 'surprise': 7}
        self.audios = []
        self.emotions = []
        self.load_data()

    def load_data(self):
        for idx in tqdm(range(len(self.emotion_frame))):
            file_path = os.path.join(self.audio_dir, self.emotion_frame.iloc[idx, 1]) + ".wav"
            
            emotion = self.emotion_frame.iloc[idx, 3]
            emotion = self.emotion_to_int[emotion]

            self.audios.append(file_path)
            self.emotions.append(emotion)

    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        audio = self.audios[idx]
        emotion = self.emotions[idx]

        return {'audio': audio, 'emotion': emotion}

In [30]:
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import numpy as np
from pydub import AudioSegment

# https://github.com/ehcalabres/EMOVoice
# the preprocessor was derived from https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
# processor1 = AutoProcessor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
# ^^^ no preload model available for this model (above), but the `feature_extractor` works in place
model1 = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

def predict_emotion(audio_file):
    if not audio_file:
        # I fetched some samples with known emotions from here: https://www.fesliyanstudios.com/royalty-free-sound-effects-download/poeple-crying-252
        audio_file = 'mp3/dude-crying.mp3'
    sound = AudioSegment.from_file(audio_file)
    sound = sound.set_frame_rate(16000)
    sound_array = np.array(sound.get_array_of_samples())
    # this model is VERY SLOW, so best to pass in small sections that contain 
    # emotional words from the transcript. like 10s or less.
    # how to make sub-chunk  -- this was necessary even with very short audio files 
    # test = torch.tensor(input.input_values.float()[:, :100000])

    input = feature_extractor(
        raw_speech=sound_array,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt")

    result = model1.forward(input.input_values.float())
    # making sense of the result 
    id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }
    interp = dict(zip(id2label.values(), list(round(float(i),4) for i in result[0][0])))
    return interp

Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.weight', 'projector.weight', 

In [39]:
train_csv_file = 'datasets/emotion_train.csv'
audio_dir = 'datasets/emotion_audio_data/'
train_dataset = EmotionDataset(train_csv_file, audio_dir)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

100%|█████████████████████████████████████████████████████████████████████████| 35179/35179 [00:00<00:00, 35287.14it/s]


In [45]:
emotion_to_int = {'angry': 0, 'calm' : 1, 'disgust': 2, 'fearful': 3, 'happy': 4,
                               'neutral': 5, 'sad': 6, 'surprised': 7}

In [46]:
for inputs in train_loader:
    name = inputs["audio"]
    label = inputs["emotion"]
    
    predict = []
    for file in name:
        result = predict_emotion(file)
        predict.append(emotion_to_int[max(result, key=result.get)])
    print(predict,label)
    break

[0, 5, 7, 5] tensor([4, 7, 0, 6])


In [42]:
predict[0]

{'angry': -0.0418,
 'calm': 0.004,
 'disgust': 0.0157,
 'fearful': 0.0401,
 'happy': -0.0276,
 'neutral': 0.0548,
 'sad': -0.0251,
 'surprised': 0.0568}

In [31]:
predict_emotion("datasets/emotion_audio_data/5e258fd1305bcf3ad153a6a4.wav")

{'angry': 0.0017,
 'calm': 0.0284,
 'disgust': -0.0384,
 'fearful': 0.02,
 'happy': -0.046,
 'neutral': 0.0779,
 'sad': -0.1076,
 'surprised': 0.0252}

## 재작성

In [3]:
class EmotionDataset(Dataset):
    def __init__(self, csv_file, audio_dir):
        self.emotion_frame = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.emotion_to_int = {'anger': 0, 'angry': 0, 'calm' : 1, 'disgust': 2, 'fear': 3, 'happiness': 4,
                               'neutral': 5, 'sad': 6, 'sadness': 6, 'surprise': 7}
        self.audios = []
        self.emotions = []
        self.load_data()

    def load_data(self):
        for idx in tqdm(range(len(self.emotion_frame))):
            file_path = os.path.join(self.audio_dir, self.emotion_frame.iloc[idx, 1]) + ".wav"
            sound = AudioSegment.from_file(file_path)
            sound = sound.set_frame_rate(16000)
            sound_array = np.array(sound.get_array_of_samples())
            # this model is VERY SLOW, so best to pass in small sections that contain 
            # emotional words from the transcript. like 10s or less.
            # how to make sub-chunk  -- this was necessary even with very short audio files 
            # test = torch.tensor(input.input_values.float()[:, :100000])

            input = feature_extractor(
                raw_speech=sound_array,
                sampling_rate=16000,
                padding=True,
                return_tensors="pt")
            input = input.input_values.float()
            
            emotion = self.emotion_frame.iloc[idx, 3]
            emotion = self.emotion_to_int[emotion]

            self.audios.append(input)
            self.emotions.append(emotion)

    def __len__(self):
        return len(self.emotion_frame)

    def __getitem__(self, idx):
        audio = self.audios[idx]
        emotion = self.emotions[idx]

        return {'audio': audio, 'emotion': emotion}

In [4]:
train_csv_file = 'datasets/emotion_train.csv'
audio_dir = 'datasets/emotion_audio_data/'
train_dataset = EmotionDataset(train_csv_file, audio_dir)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

100%|████████████████████████████████████████████████████████████████████████████| 35179/35179 [09:27<00:00, 62.01it/s]


In [5]:
test_csv_file = 'datasets/emotion_test.csv'
test_dataset = EmotionDataset(test_csv_file, audio_dir)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

100%|██████████████████████████████████████████████████████████████████████████████| 8793/8793 [03:41<00:00, 39.71it/s]


In [7]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

In [None]:
result = model1.forward(input.input_values.float())

In [6]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_function = torch.nn.CrossEntropyLoss()

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,)

In [8]:
epoches = 30

for epoch in range(epoches):
    model.train()
    for batch in tqdm(train_loader):
        inputs = batch["audio"][0].to(device)
        labels = batch["emotion"].to(device)
        
        optimizer.zero_grad()
        predictions = model(inputs)
        predictions = predictions[:2][0]
        loss = loss_function(predictions,labels)
        
        loss.backward()
        optimizer.step()
    
    #print("Epoch {}: Accuracy {}".format(epoch, accuracy))

 42%|██████████████████████████████▍                                         | 14860/35179 [1:06:22<1:30:45,  3.73it/s]


RuntimeError: CUDA out of memory. Tried to allocate 514.00 MiB (GPU 0; 8.00 GiB total capacity; 6.40 GiB already allocated; 0 bytes free; 6.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
(tensor([[-0.0589,  0.0159, -0.0422,  0.0575, -0.0213, -0.0528, -0.0304,  0.0539]],
       device='cuda:0', grad_fn=<AddmmBackward0>),)