In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from torchmetrics.classification import MultilabelF1Score
from torchvision import models
from tqdm import tqdm
from transformers import (
    AutoModelForAudioClassification,
    AutoModelForSequenceClassification,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
)

In [None]:

warnings.filterwarnings("ignore")

# Data Preprocessing

In [None]:
# Emotion label names; a label's position in this list is its class index.
genres = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]
# Lookup table: class index -> label name.
mapping = dict(enumerate(genres))
mapping

In [None]:
# TODO: no dataset has been found yet -- load it into `df` here before running the cells below.

# Generate Text From Dataset

In [None]:
# Single checkpoint name shared by the speech-to-text processor and model,
# so the two cannot drift apart.
WHISPER_CHECKPOINT = "openai/whisper-tiny"

processor_gen_text = AutoProcessor.from_pretrained(WHISPER_CHECKPOINT)
model_gen_text = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_CHECKPOINT)

In [None]:
def generate_text(df: pd.DataFrame, model: AutoModelForSpeechSeq2Seq, processor: AutoProcessor, device) -> pd.DataFrame:
    """Transcribe the "audio" column of ``df`` into a "text" column.

    Args:
        df: Frame with an "audio" column. Each cell is passed to ``processor``
            unchanged (presumably a raw waveform -- TODO confirm against the
            dataset; no sampling rate is forwarded).
        model: Speech-to-text model. It is moved to ``device`` and switched to
            eval mode.
        processor: Callable that converts audio into model inputs and decodes
            generated token ids back to a string.
        device: Torch device on which inference runs.

    Returns:
        The same ``df`` object (modified in place) with a "text" column holding
        one transcription per row.
    """
    # Use the model that was passed in, not the notebook-level global.
    model.to(device)
    model.eval()

    transcripts = []
    # Inference only: a single no_grad context covers the whole loop.
    with torch.no_grad():
        # Iterate over the column values so any index (not just 0..n-1) works.
        for audio in tqdm(df["audio"]):
            features = processor(audio, return_tensors="pt")
            # Speech processors such as Whisper's expose the encoded audio as
            # `input_features`; `input_ids` only exists for text inputs.
            input_features = features.input_features.to(device)
            generated = model.generate(input_features, max_length=100)
            transcripts.append(processor.decode(generated[0], skip_special_tokens=True))

    df["text"] = transcripts
    return df

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Model Implementation

### Sub-Model

In [None]:
# Hub checkpoints of the two pretrained sub-models; each name is shared by the
# model and its matching tokenizer / processor.
TEXT_CHECKPOINT = "joeddav/distilbert-base-uncased-go-emotions-student"
SPEECH_CHECKPOINT = "pollner/distilhubert-finetuned-ravdess"

tokenizer_text = AutoTokenizer.from_pretrained(TEXT_CHECKPOINT)
model_text = AutoModelForSequenceClassification.from_pretrained(TEXT_CHECKPOINT)

processor_speech = AutoProcessor.from_pretrained(SPEECH_CHECKPOINT)
model_speech = AutoModelForAudioClassification.from_pretrained(SPEECH_CHECKPOINT)

# `device` was already selected in the earlier device cell; it is reused here
# rather than defined a second time.
model_text.to(device)
model_speech.to(device)
device

### DMAF

In [None]:
class Multimodel(torch.nn.Module):
    """Late-fusion classifier that sums linear projections of two logit vectors.

    The text and speech sub-models each produce classification logits. Each
    logit vector is passed through its own linear layer into a shared output
    space, and the two projections are added element-wise.

    Args:
        model_text: Module called as ``model_text(input_ids, attention_mask)``
            whose result exposes ``.logits``.
        model_speech: Module called as ``model_speech(input_values, attention_mask)``
            whose result exposes ``.logits``.
        num_labels: Width of the fused output. Defaults to the width of the
            text head.
    """

    # Width the layers were originally hard-coded to; used only when a
    # sub-model does not expose ``config.num_labels``.
    _FALLBACK_WIDTH = 27

    def __init__(self, model_text, model_speech, num_labels=None):
        super().__init__()
        self.model_text = model_text
        self.model_speech = model_speech

        # The two heads need not have the same number of classes, so each
        # projection gets its input width from its own sub-model.
        text_width = self._logit_width(model_text)
        speech_width = self._logit_width(model_speech)
        fused_width = text_width if num_labels is None else num_labels

        self.fc1 = torch.nn.Linear(text_width, fused_width)
        self.fc2 = torch.nn.Linear(speech_width, fused_width)

    @classmethod
    def _logit_width(cls, submodel):
        """Return ``submodel.config.num_labels``, or the fallback width if absent."""
        config = getattr(submodel, "config", None)
        return getattr(config, "num_labels", cls._FALLBACK_WIDTH)

    def forward(self, text_input_ids, text_attention_mask, speech_input_ids, speech_attention_mask):
        """Return the fused logits, one row per example."""
        text_output = self.model_text(text_input_ids, text_attention_mask)
        speech_output = self.model_speech(speech_input_ids, speech_attention_mask)

        text_projection = self.fc1(text_output.logits)
        speech_projection = self.fc2(speech_output.logits)

        return text_projection + speech_projection

# Dataset and Dataloader

In [None]:
class Sunset(torch.utils.data.Dataset):
    """Dataset that pairs the "text" and "audio" columns of a DataFrame.

    Args:
        df: Frame with "text" and "audio" columns (plus the label columns when
            ``label_cols`` is given).
        tokenizer_text: Callable applied to the text cell; its result must
            expose ``input_ids`` and ``attention_mask``.
        processor_speech: Callable applied to the audio cell; its result must
            expose ``input_values`` and ``attention_mask``.
        max_len1: ``max_length`` passed to ``tokenizer_text``.
        max_len2: ``max_length`` passed to ``processor_speech``
            (NOTE(review): for raw-waveform processors this is presumably a
            sample count, so 32 would be very short -- confirm).
        device: Device every returned tensor is moved to.
        label_cols: Optional list of label column names. When given, each item
            also carries an "emotion" float tensor built from those columns.
    """

    def __init__(self, df,
                tokenizer_text, processor_speech,
                max_len1=32, max_len2=32,
                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
                label_cols=None):
        self.df = df
        self.tokenizer_text = tokenizer_text
        self.processor_speech = processor_speech
        self.max_len1 = max_len1
        self.max_len2 = max_len2
        self.device = device
        self.label_cols = list(label_cols) if label_cols is not None else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels when the frame has a default index.
        row = self.df.iloc[idx]

        text = self.tokenizer_text(row["text"], return_tensors="pt", max_length=self.max_len1, padding="max_length", truncation=True)
        speech = self.processor_speech(row["audio"], return_tensors="pt", max_length=self.max_len2, padding="max_length", truncation=True)

        # return_tensors="pt" adds a leading batch axis of size 1. Drop it so
        # the DataLoader's collation yields (batch, length), not (batch, 1, length).
        item = {
            "text_input_ids": text.input_ids.squeeze(0).to(self.device),
            "text_attention_mask": text.attention_mask.squeeze(0).to(self.device),
            "speech_input_ids": speech.input_values.squeeze(0).to(self.device),
            "speech_attention_mask": speech.attention_mask.squeeze(0).to(self.device)
        }

        if self.label_cols is not None:
            labels = row[self.label_cols].to_numpy(dtype="float32")
            item["emotion"] = torch.tensor(labels, device=self.device)

        return item

In [None]:
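# TODO: enable once `df` is loaded. `device` must be passed by keyword: the
# fourth positional parameter of Sunset is `max_len1`, not `device`.
#trainset = Sunset(df, tokenizer_text, processor_speech, device=device)
#testset = Sunset(df, tokenizer_text, processor_speech, device=device)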

In [None]:
#trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
#testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False)

# Train

### GPU and Model

In [None]:
# Assemble the fusion network and place it on the selected device in one step
# (Module.to returns the module itself).
model = Multimodel(model_text, model_speech).to(device)
device

In [None]:
def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy of raw logits against targets."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every parameter of the fusion model.
LEARNING_RATE = 2e-5
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Train and Validation

In [None]:
# Per-epoch training curves, appended to by train().
history_loss = []
history_f1 = []
def train(epoch):
    """Run one training epoch over ``trainloader`` and record its metrics.

    Each batch must provide the four model inputs plus an "emotion" tensor of
    multi-hot labels with one column per entry in ``genres``. The mean batch
    loss and the macro F1 of the epoch are printed and appended to
    ``history_loss`` / ``history_f1``.

    Args:
        epoch: Epoch number, used only in the printed summary.
    """
    model.train()
    # One F1 score per label in `genres`, averaged without weighting.
    f1 = MultilabelF1Score(num_labels=len(genres), threshold=0.5, average='macro')
    f1.to(device)

    running_loss = 0.0
    batch_count = 0
    for data in tqdm(trainloader):
        # .to(device) is a no-op when the dataset already placed them there.
        text_input_ids = data["text_input_ids"].to(device)
        text_attention_mask = data["text_attention_mask"].to(device)
        speech_input_ids = data["speech_input_ids"].to(device)
        speech_attention_mask = data["speech_attention_mask"].to(device)
        # Labels of this batch only, so they line up row-for-row with the outputs.
        emotion = data['emotion'].to(device)

        optimizer.zero_grad()
        outputs = model(text_input_ids, text_attention_mask, speech_input_ids, speech_attention_mask)

        # The loss needs float targets; the metric needs integer targets.
        loss = loss_fn(outputs, emotion.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batch_count += 1
        f1.update(outputs.detach().sigmoid(), emotion.int())

    if batch_count:
        epoch_loss = running_loss / batch_count
        epoch_f1 = f1.compute().item()
    else:
        # Empty loader: nothing was measured, so record NaN instead of failing.
        epoch_loss = float("nan")
        epoch_f1 = float("nan")

    print(f'Epoch: {epoch}, Train Loss: {epoch_loss}, Train F1: {epoch_f1}')
    history_loss.append(epoch_loss)
    history_f1.append(epoch_f1)

In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 16
for epoch in range(NUM_EPOCHS):
    train(epoch)

In [None]:
# Persist the model's state dict (not the module object itself).
trained_weights = model.state_dict()
torch.save(trained_weights, 'multimodel.pt')

In [None]:
# Draw both training curves on one explicit Axes instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.plot(history_loss)
ax.plot(history_f1)
ax.set_title('Model Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['loss', 'F1-Macro'], loc='upper left')
plt.show()

In [None]:
#Validation
def test(testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return the macro F1 score.

    Args:
        testing_loader: Iterable of batches, each providing the four model
            inputs plus an "emotion" tensor of multi-hot labels with one column
            per entry in ``genres``.

    Returns:
        The macro-averaged multilabel F1 over all batches, as a Python float.
    """
    model.eval()
    # One F1 score per label in `genres`, averaged without weighting.
    f1 = MultilabelF1Score(num_labels=len(genres), threshold=0.5, average='macro')
    f1.to(device)

    # Pure inference: no gradients are needed anywhere in the loop.
    with torch.no_grad():
        for data in tqdm(testing_loader):
            # .to(device) is a no-op when the dataset already placed them there.
            text_input_ids = data["text_input_ids"].to(device)
            text_attention_mask = data["text_attention_mask"].to(device)
            speech_input_ids = data["speech_input_ids"].to(device)
            speech_attention_mask = data["speech_attention_mask"].to(device)
            emotion = data['emotion'].to(device)

            outputs = model(text_input_ids, text_attention_mask, speech_input_ids, speech_attention_mask)

            # The metric expects integer targets.
            f1.update(outputs.sigmoid(), emotion.int())

    score = f1.compute().item()
    print(f'Test F1: {score}')
    return score