In [None]:
import torch as t
import librosa
import openl3
from os import listdir
import pandas as pd
from os.path import isfile, join
t.cuda.set_device(8)
import numpy as np
from tqdm import tqdm

In [None]:
audio_dir = "/raid/amana/lavish_multi_model/emotion_detection/data/raw_audio"

In [None]:
def get_dataset_from_me1256(audio_file):
    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type="env",embedding_size= 512)
    waveform, sample_rate = librosa.load(audio_file)
    waveform = t.tensor(waveform)
    waveform = waveform.mean(dim=0, keepdim=True)
    waveform /= waveform.abs().max()
    with t.no_grad():
        embeddings, timestamps = openl3.get_audio_embedding(
            waveform.numpy(), model=model, hop_size=0.1,sr=16000
        )
    return embeddings

In [None]:
onlyfiles = [f for f in listdir(audio_dir) if isfile(join(audio_dir, f))]
data = pd.read_csv("/raid/amana/lavish_multi_model/emotion_detection/data/text_data.csv")
data['filename'] = [f'dia{a}_utt{b}' for a,b in zip(data['Dialogue_ID'],data['Utterance_ID'])]
labels = list(data['Emotion'])

In [None]:
# dataset = []

# for a,b in zip(onlyfiles,labels):
#     x = get_dataset_from_me1256(audio_dir+"/"+a)
#     dataset.append([x,b])

In [6]:
dataset = np.load('/raid/amana/lavish_multi_model/emotion_detection/dataset_mel256.npy',allow_pickle=True)

In [10]:
len(dataset)

1109

In [11]:

# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# from torch.utils.data import DataLoader,random_split

# train_size = int(0.85 * len(dataset))
# test_size = len(dataset) - train_size

# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# test_dataloader = DataLoader(test_dataset, batch_size=1,shuffle=False)

In [13]:
dataset[:,1]

array([6, 0, 5, ..., 6, 6, 6], dtype=object)

In [14]:
X_train,X_test,Y_train,Y_test = train_test_split(dataset[:,0],dataset[:,1],test_size=0.2,random_state=42)

In [27]:
import torch.nn as nn
import torch.nn.functional as F

class EmotionMLP(nn.Module):
    def softmax(self,x):
        e_x = t.exp(x - t.max(x))
        return e_x / e_x.sum()
    
    def __init__(self, num_classes):
        super(EmotionMLP, self).__init__()
        self.fc1 = nn.Linear(512,16)
        self.fc2 = nn.Linear(16,num_classes)
    
    def forward(self, x):
        x = x.view(-1, 512)
        x = t.relu(self.fc1(x))
        # x = F.softmax(x)
        return x

In [28]:
import warnings
warnings.filterwarnings("ignore")
device = t.device('cuda' if t.cuda.is_available() else 'cpu')

In [29]:
num_classes = 7
model = EmotionMLP(num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.0005)

In [30]:
def train(model,X_train,Y_train,optimizer,criterion,num_epochs,device):
    loss_arr = []
    for epoch in tqdm(range(num_epochs)):
        # model.train()
        total_loss = 0
        for x,y in zip(X_train,Y_train):
            x = t.tensor(x)
            # Forward pass
            outputs = model(x.to(device))
            loss = criterion(outputs,t.tensor([y]).to(device)).to(device)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        loss_arr.append(total_loss/len(X_train))
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {total_loss/len(X_train)}')
    return loss_arr

num_epochs = 20
lossarr = train(model,X_train,Y_train,optimizer,criterion,num_epochs,device)

  5%|▌         | 1/20 [00:07<02:14,  7.05s/it]

Epoch 1/20, Training Loss: 2.7742611243732886


 10%|█         | 2/20 [00:14<02:13,  7.40s/it]

Epoch 2/20, Training Loss: 2.7725887298583984


 15%|█▌        | 3/20 [00:22<02:06,  7.44s/it]

Epoch 3/20, Training Loss: 2.7725887298583984


 20%|██        | 4/20 [00:30<02:01,  7.61s/it]

Epoch 4/20, Training Loss: 2.7725887298583984


 25%|██▌       | 5/20 [00:39<02:04,  8.30s/it]

Epoch 5/20, Training Loss: 2.7725887298583984


 30%|███       | 6/20 [00:47<01:53,  8.08s/it]

Epoch 6/20, Training Loss: 2.7725887298583984


 35%|███▌      | 7/20 [00:55<01:47,  8.27s/it]

Epoch 7/20, Training Loss: 2.7725887298583984


 40%|████      | 8/20 [01:03<01:37,  8.10s/it]

Epoch 8/20, Training Loss: 2.7725887298583984


 45%|████▌     | 9/20 [01:11<01:27,  7.96s/it]

Epoch 9/20, Training Loss: 2.7725887298583984


 50%|█████     | 10/20 [01:19<01:20,  8.09s/it]

Epoch 10/20, Training Loss: 2.7725887298583984


 55%|█████▌    | 11/20 [01:27<01:11,  7.93s/it]

Epoch 11/20, Training Loss: 2.7725887298583984


 60%|██████    | 12/20 [01:35<01:03,  7.99s/it]

Epoch 12/20, Training Loss: 2.7725887298583984


 65%|██████▌   | 13/20 [01:44<00:58,  8.35s/it]

Epoch 13/20, Training Loss: 2.7725887298583984


 70%|███████   | 14/20 [01:53<00:51,  8.65s/it]

Epoch 14/20, Training Loss: 2.7725887298583984


 75%|███████▌  | 15/20 [02:01<00:42,  8.43s/it]

Epoch 15/20, Training Loss: 2.7725887298583984


 80%|████████  | 16/20 [02:10<00:33,  8.37s/it]

Epoch 16/20, Training Loss: 2.7725887298583984


 85%|████████▌ | 17/20 [02:17<00:24,  8.17s/it]

Epoch 17/20, Training Loss: 2.7725887298583984


 90%|█████████ | 18/20 [02:25<00:16,  8.01s/it]

Epoch 18/20, Training Loss: 2.7725887298583984


 95%|█████████▌| 19/20 [02:32<00:07,  7.88s/it]

Epoch 19/20, Training Loss: 2.7725887298583984


100%|██████████| 20/20 [02:40<00:00,  8.02s/it]

Epoch 20/20, Training Loss: 2.7725887298583984





In [32]:
def test(model,X_test,Y_test,device):
    acc = 0
    def accuracy(y_true, y_pred):
        eq = t.eq(y_true, y_pred).int()
        return sum(eq)/len(eq)

    with t.no_grad():
        model.eval()
        for x,y in zip(X_test,Y_test):
            outputs = model(t.tensor(x).to(device))
            outputs1 = outputs.detach().cpu()
            outputs1 = outputs1.argmax(dim=1)
            acc += accuracy(t.tensor([y]),outputs1)
        print(f"accuracy: {(acc/len(X_test))*100: 0.2f}%")
        
test(model,X_test,Y_test,device)

accuracy:  12.61%
