# Import

In [1]:
!pip install torcheval

Collecting torcheval
  Obtaining dependency information for torcheval from https://files.pythonhosted.org/packages/e4/de/e7abc784b00de9d05999657d29187f1f7a3406ed10ecaf164de06482608f/torcheval-0.0.7-py3-none-any.whl.metadata
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [2]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertTokenizer, BertModel, AutoModel, AutoProcessor
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch, torchaudio, torchtext
from torcheval.metrics.functional import multiclass_f1_score
import torch.nn as nn
import os
import gc
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')



Using device: cuda


# Load df

In [3]:
try:
    df_path = '/kaggle/input/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = '/kaggle/input/MM-USElecDeb60to16/audio_clips'
    save_path = '/kaggle/working/'
    df = pd.read_csv(df_path, index_col=0)
except FileNotFoundError:
    df_path = 'multimodal-dataset/files/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = 'multimodal-dataset/files/MM-USElecDeb60to16/audio_clips'
    save_path = 'multimodal-dataset/files'
    df = pd.read_csv(df_path, index_col=0)
# drop rows where audio length is 0
df = df[df['NewBegin'] != df['NewEnd']]

train_df_complete = df[df['Set'] == 'TRAIN']
val_df_complete = df[df['Set'] == 'VALIDATION']
test_df_complete = df[df['Set'] == 'TEST']

DATASET_RATIO = 1

train_df = train_df_complete.iloc[:int(DATASET_RATIO * len(train_df_complete))]
val_df = val_df_complete.iloc[:int(DATASET_RATIO * len(val_df_complete))]
test_df = test_df_complete.iloc[:int(DATASET_RATIO * len(test_df_complete))]

In [4]:
train_df.head()

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,...,Speaker,SpeakerType,Set,Date,Year,Name,MainTag,NewBegin,NewEnd,idClip
0,"CHENEY: Gwen, I want to thank you, and I want ...",1,30_2004,0,0,2101,2221,,"{""O"": 27}",O,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O,126.52,131.08,clip_0
1,"It's a very important event, and they've done ...",1,30_2004,1,1,2221,2304,,"{""O"": 19}",O,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O,131.08,134.4,clip_1
2,It's important to look at all of our developme...,1,30_2004,2,2,2304,2418,,"{""O"": 23}",O,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O,134.4,140.56,clip_2
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,140.56,158.92,clip_3
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,158.92,172.92,clip_4


In [5]:
len(train_df), len(test_df), len(val_df)

(12419, 7465, 6897)

## Distribution of classes over train df

In [6]:
num_claim = len(train_df[train_df['Component'] == 'Claim'])
num_premise = len(train_df[train_df['Component'] == 'Premise'])
num_other = len(train_df[train_df['Component'] == 'O'])

print(f'Total Claim: {num_claim}: {num_claim*100/len(train_df):.2f}%')
print(f'Total Premise: {num_premise}: {num_premise*100/len(train_df):.2f}%')
print(f'Total Other: {num_other}: {num_other*100/len(train_df):.2f}%')

Total Claim: 5029: 40.49%
Total Premise: 4426: 35.64%
Total Other: 2964: 23.87%


Unbalanced dataset

# Train and evaluation Loop

In [7]:
class BestModel:
    """
        Class to keep track of the best performing model on validation set during training
    """
    def __init__(self):
        self.best_validation_loss = float('Infinity')
        self.best_state_dict = None
    def __call__(self, model, loss):
        if loss < self.best_validation_loss:
            self.best_validation_loss = loss
            self.best_state_dict = model.state_dict()

def evaluate(model, val_loader, loss_fn, best_model_tracker):
    model.eval()
    valid_loss = 0.0
    num_correct = 0 
    num_examples = 0
    tot_pred, tot_targ = torch.LongTensor().to(device), torch.LongTensor().to(device)
    for batch in val_loader:
        texts, audio_features, audio_attention, targets = batch
        audio_features = audio_features.to(device)
        audio_attention = audio_attention.to(device)
        targets = targets.to(device)
        output = model(texts,audio_features,audio_attention)
        loss = loss_fn(output, targets)
        valid_loss += loss.detach()
        predicted_labels = torch.argmax(output, dim=-1)
        tot_targ = torch.cat((tot_targ, targets))
        tot_pred = torch.cat((tot_pred, predicted_labels))            
        correct = torch.eq(predicted_labels, targets).view(-1)
        num_correct += torch.sum(correct).item()
        num_examples += correct.shape[0]
    best_model_tracker(model, valid_loss)
    valid_loss = valid_loss.cpu().item()
    valid_loss /= len(val_loader.dataset)
    accuracy = num_correct/num_examples
    f1 = multiclass_f1_score(tot_pred, tot_targ, num_classes=3, average="macro")
    return valid_loss, accuracy, f1

            
def train(model, optimizer, loss_fn, train_loader, val_loader, epochs=10, device="cuda"):
    best_model_tracker = BestModel()
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        model.train()

        for batch in train_loader:
            optimizer.zero_grad()
            texts, audio_features, audio_attention, targets = batch
            audio_features = audio_features.to(device)
            audio_attention = audio_attention.to(device)
            targets = targets.to(device)
            output = model(texts,audio_features,audio_attention)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.detach()
        training_loss = training_loss.cpu().item()
        training_loss /= len(train_loader.dataset)
        valid_loss, accuracy, f1 = evaluate(model, val_loader, loss_fn, best_model_tracker)
        print(f'Epoch: {epoch}, Training Loss: {training_loss:.4f}, Validation Loss: {valid_loss:.4f}, accuracy = {accuracy:.4f}, F1={f1:.4f}')
    model.load_state_dict(best_model_tracker.best_state_dict)    

# Dataset Creation

In [8]:
text_model_card = 'bert-base-uncased'
audio_model_card = 'facebook/wav2vec2-base-960h'

tokenizer = BertTokenizer.from_pretrained(text_model_card)
embedder = BertModel.from_pretrained(text_model_card).to(device)

for params in embedder.parameters():
    params.requires_grad = False

label_2_id = {
    'Claim': 0,
    'Premise': 1,
    'O': 2
}

DOWNSAMPLE_FACTOR = 1/5

class MM_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, audio_dir, sample_rate):
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate

        self.audio_processor = AutoProcessor.from_pretrained(audio_model_card)
        self.audio_model = AutoModel.from_pretrained(audio_model_card).to(device)

        self.dataset = []

        # Iterate over df
        for _, row in tqdm(df.iterrows()):
            path = os.path.join(self.audio_dir, f"{row['Document']}/{row['idClip']}.wav")
            if os.path.exists(path):
                # obtain audio WAV2VEC features
                audio, sampling_rate = torchaudio.load(path)
                if sampling_rate != self.sample_rate:
                    audio = torchaudio.functional.resample(audio, sample_rate, self.sample_rate)
                    audio = torch.mean(audio, dim=0, keepdim=True)
                with torch.inference_mode():
                    input_values = self.audio_processor(audio, sampling_rate=self.sample_rate).input_values[0]
                    input_values = torch.tensor(input_values).to(device)
                    audio_model_output = self.audio_model(input_values)
                    audio_features = audio_model_output.last_hidden_state[0].unsqueeze(0)
                    audio_features = torch.nn.functional.interpolate(audio_features.permute(0,2,1), scale_factor=DOWNSAMPLE_FACTOR, mode='linear')
                    audio_features = audio_features.permute(0,2,1)[0]
                    audio_features = audio_features.cpu()
                
                text = row['Text']

                self.dataset.append((text, audio_features, label_2_id[row['Component']]))
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
try:
    train_dataset = torch.load(f'{save_path}/train_dataset.pkl')
    test_dataset = torch.load(f'{save_path}/test_dataset.pkl')
    val_dataset = torch.load(f'{save_path}/val_dataset.pkl')
    print('Restored datasets from memory')
except:
    print('Creating new datasets')
    train_dataset = MM_Dataset(train_df, audio_path, 16_000)
    test_dataset = MM_Dataset(test_df, audio_path, 16_000)
    val_dataset = MM_Dataset(val_df, audio_path, 16_000)
    try:
        torch.save(train_dataset, f'{save_path}/train_dataset.pkl')
        torch.save(test_dataset, f'{save_path}/test_dataset.pkl')
        torch.save(val_dataset, f'{save_path}/val_dataset.pkl')
    except:
        print('Erorr in saving datasets')

Creating new datasets


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
12419it [20:30, 10.10it/s]


Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
7465it [12:28,  9.97it/s]


Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
6897it [11:38,  9.87it/s]


## Dataloader creation

In [10]:
def create_dataloader(dataset, batch_size):
    def pack_fn(batch):
        texts = [x[0] for x in batch]
        audio_features = [x[1] for x in batch]
        labels = torch.tensor([x[2] for x in batch])
        
        # pad audio features
        audio_features = pad_sequence(audio_features, batch_first=True, padding_value=float('-inf'))

        audio_features_attention_mask = audio_features[:, :, 0] != float('-inf')
        
        audio_features[(audio_features == float('-inf'))] = 0

        return texts, audio_features, audio_features_attention_mask, labels

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=pack_fn)
    return dataloader

In [11]:
train_dataloader = create_dataloader(train_dataset, 8)
val_dataloader = create_dataloader(val_dataset, 8)
test_dataloader = create_dataloader(test_dataset, 8)

In [12]:
#del early_fusion
gc.collect()

0

# Positional Encoding

In [13]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dual_modality=False, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.dual_modality = dual_modality
        self.pe = self.pe.to(device)

    def forward(self, x, is_first=True):
        if self.dual_modality:
            modality = torch.ones((x.shape[0], x.shape[1], 4), dtype=torch.float32).to(device) * (0 if is_first else 1)
            x = x + self.pe[:x.size(0)]
            x = self.dropout(x)        
            return torch.cat((x, modality), axis=-1)
        else:
            x = x + self.pe[:x.size(0)]
            return self.dropout(x)

# Multimodal-Transformer Model

In [14]:
class MultiModalTransformer(nn.Module):
    def __init__(self, tokenizer, embedder, transformer, head):
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=True)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.transformer = transformer
        self.head = head

    def forward(self, texts, audio_features, audio_attentions):
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][0]
        text_features = self.pos_encoder(text_features, is_first=True)
        text_attentions = tokenizer_output.attention_mask
        
        audio_features = self.pos_encoder(audio_features, is_first=False)
        
        concatenated_features = torch.cat((text_features, audio_features), dim=1)
        concatenated_attentions = torch.cat((text_attentions, audio_attentions.float()), dim=1)
        
        # padding mask is 1 where there is padding (i.e. where attention is 0) and 0 otherwise
        concatenated_padding_mask = ~concatenated_attentions.to(torch.bool)
        
        # compute a full attention mask of size [seq_len, seq_len]
        full_attention_mask = torch.zeros((concatenated_features.shape[1], concatenated_features.shape[1]), dtype=torch.bool).to(device)
                
        transformer_output = self.transformer(src=concatenated_features,  mask=full_attention_mask, src_key_padding_mask=concatenated_padding_mask)
        transformer_output_sum = (transformer_output * concatenated_attentions.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / concatenated_attentions.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

multimodal_transformer_layer = nn.TransformerEncoderLayer(d_model=772, nhead=4, dim_feedforward=512, batch_first=True).to(device)
multimodal_transformer_encoder = nn.TransformerEncoder(multimodal_transformer_layer, num_layers=4).to(device)

multimodal_transformer_head = nn.Sequential(
    nn.Linear(772, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

multimodal_transformer = MultiModalTransformer(tokenizer, embedder, multimodal_transformer_encoder, multimodal_transformer_head).to(device)

multimodal_optimizer = torch.optim.Adam(multimodal_transformer.parameters(), lr=1e-4)
multimodal_criterion = nn.CrossEntropyLoss()

train(multimodal_transformer, multimodal_optimizer, multimodal_criterion, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [03:18<29:42, 198.02s/it]

Epoch: 0, Training Loss: 0.1327, Validation Loss: 0.1375, accuracy = 0.4145, F1=0.2113


 20%|██        | 2/10 [06:35<26:21, 197.67s/it]

Epoch: 1, Training Loss: 0.1317, Validation Loss: 0.1365, accuracy = 0.4083, F1=0.1933


 30%|███       | 3/10 [09:52<23:02, 197.57s/it]

Epoch: 2, Training Loss: 0.1346, Validation Loss: 0.1352, accuracy = 0.4083, F1=0.1933


 40%|████      | 4/10 [13:11<19:47, 198.00s/it]

Epoch: 3, Training Loss: 0.1336, Validation Loss: 0.1397, accuracy = 0.4083, F1=0.1933


 50%|█████     | 5/10 [16:28<16:28, 197.78s/it]

Epoch: 4, Training Loss: 0.1342, Validation Loss: 0.1356, accuracy = 0.4083, F1=0.1933


 60%|██████    | 6/10 [19:46<13:10, 197.71s/it]

Epoch: 5, Training Loss: 0.1340, Validation Loss: 0.1400, accuracy = 0.4083, F1=0.1933


 70%|███████   | 7/10 [23:04<09:52, 197.66s/it]

Epoch: 6, Training Loss: 0.1344, Validation Loss: 0.1350, accuracy = 0.4083, F1=0.1933


 80%|████████  | 8/10 [26:21<06:35, 197.70s/it]

Epoch: 7, Training Loss: 0.1336, Validation Loss: 0.1363, accuracy = 0.4083, F1=0.1933


 90%|█████████ | 9/10 [29:40<03:17, 197.99s/it]

Epoch: 8, Training Loss: 0.1346, Validation Loss: 0.1354, accuracy = 0.4083, F1=0.1933


100%|██████████| 10/10 [32:59<00:00, 197.97s/it]

Epoch: 9, Training Loss: 0.1345, Validation Loss: 0.1352, accuracy = 0.4083, F1=0.1933





# Ensembling-Fusion Model

## Text-Only and Audio-Only Models 

In [15]:
class TextModel(nn.Module):
    def __init__(self, tokenizer, embedder, head):
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.head = head
    def forward(self, texts, audio_features, audio_attention):
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['last_hidden_state']
        text_features = self.pos_encoder(text_features)        
        text_features_sum = (text_features * tokenizer_output.attention_mask.unsqueeze(-1)).sum(axis=1)
        text_features_pooled = text_features_sum / tokenizer_output.attention_mask.sum(axis=1).unsqueeze(-1)
        return self.head(text_features_pooled)
    
class AudioModel(nn.Module):        
    def __init__(self, transformer, head):
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.transformer = transformer
        self.head = head
        
    def forward(self, texts, audio_features, audio_attention):
        padding_mask = ~audio_attention.to(torch.bool)
        audio_features = self.pos_encoder(audio_features)
        full_attention_mask = torch.zeros((audio_features.shape[1],audio_features.shape[1]), dtype=torch.bool).to(device)
        transformer_output = self.transformer(src=audio_features, mask=full_attention_mask, src_key_padding_mask=padding_mask)
        
        # pooling transformer output
        transformer_output_sum = (transformer_output * audio_attention.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / audio_attention.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

## Ensembling Model

In [16]:
 class EnsemblingFusion(nn.Module):
    def __init__(self, text_model, audio_model):
        super().__init__()
        self.text_model = text_model
        self.audio_model = audio_model
        self.weight = torch.nn.Parameter(torch.tensor(0.0))
        
    def forward(self, texts, audio_features, audio_attentions):
        text_logits = self.text_model(texts, audio_features, audio_attentions)
        audio_logits = self.audio_model(texts, audio_features, audio_attentions)
        
        text_probabilities = torch.nn.functional.softmax(text_logits)
        audio_probabilities = torch.nn.functional.softmax(audio_logits)
        
        coefficient = (torch.tanh(self.weight) + 1) / 2
        
        coefficient = coefficient*0.4 + 0.3
        
        return coefficient*text_probabilities + (1-coefficient)*audio_probabilities
    
ensembling_text_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

ensembling_audio_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

ensembling_transformer_layer = nn.TransformerEncoderLayer(d_model=768, nhead=4, dim_feedforward=512, batch_first=True).to(device)
ensembling_transformer_encoder = nn.TransformerEncoder(ensembling_transformer_layer, num_layers=4).to(device)

ensembling_text_model = TextModel(tokenizer, embedder, ensembling_text_head)
ensembling_audio_model = AudioModel(ensembling_transformer_encoder, ensembling_audio_head)

ensembling_fusion = EnsemblingFusion(ensembling_text_model, ensembling_audio_model)

ensembling_optimizer = torch.optim.Adam(ensembling_fusion.parameters(), lr=1e-4)

def custom_loss(outputs, targets):
    return torch.nn.functional.nll_loss(torch.log(outputs), targets, reduction='mean')

train(ensembling_fusion, ensembling_optimizer, custom_loss, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [02:53<26:04, 173.79s/it]

Epoch: 0, Training Loss: 0.1213, Validation Loss: 0.1348, accuracy = 0.4412, F1=0.2789


 20%|██        | 2/10 [05:47<23:09, 173.70s/it]

Epoch: 1, Training Loss: 0.1117, Validation Loss: 0.1298, accuracy = 0.4908, F1=0.3918


 30%|███       | 3/10 [08:41<20:15, 173.70s/it]

Epoch: 2, Training Loss: 0.1093, Validation Loss: 0.1256, accuracy = 0.5186, F1=0.4468


 40%|████      | 4/10 [11:34<17:22, 173.76s/it]

Epoch: 3, Training Loss: 0.1080, Validation Loss: 0.1246, accuracy = 0.5239, F1=0.4554


 50%|█████     | 5/10 [14:28<14:29, 173.82s/it]

Epoch: 4, Training Loss: 0.1068, Validation Loss: 0.1251, accuracy = 0.5233, F1=0.4532


 60%|██████    | 6/10 [17:22<11:35, 173.85s/it]

Epoch: 5, Training Loss: 0.1062, Validation Loss: 0.1250, accuracy = 0.5256, F1=0.4571


 70%|███████   | 7/10 [20:16<08:41, 173.88s/it]

Epoch: 6, Training Loss: 0.1058, Validation Loss: 0.1217, accuracy = 0.5440, F1=0.4898


 80%|████████  | 8/10 [23:10<05:47, 173.81s/it]

Epoch: 7, Training Loss: 0.1054, Validation Loss: 0.1215, accuracy = 0.5460, F1=0.4928


 90%|█████████ | 9/10 [26:05<02:54, 174.28s/it]

Epoch: 8, Training Loss: 0.1049, Validation Loss: 0.1226, accuracy = 0.5427, F1=0.4859


100%|██████████| 10/10 [29:00<00:00, 174.04s/it]

Epoch: 9, Training Loss: 0.1047, Validation Loss: 0.1230, accuracy = 0.5395, F1=0.4806





# Text-Only

In [19]:
text_only_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

text_only = TextModel(tokenizer, embedder, text_only_head)

text_only_optimizer = torch.optim.Adam(text_only.parameters(), lr=1e-4)
text_only_criterion = nn.CrossEntropyLoss()

train(text_only, text_only_optimizer, text_only_criterion, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [01:17<11:38, 77.61s/it]

Epoch: 0, Training Loss: 0.1167, Validation Loss: 0.1342, accuracy = 0.4773, F1=0.3622


 20%|██        | 2/10 [02:35<10:22, 77.83s/it]

Epoch: 1, Training Loss: 0.1067, Validation Loss: 0.1343, accuracy = 0.5049, F1=0.4173


 30%|███       | 3/10 [03:53<09:05, 77.91s/it]

Epoch: 2, Training Loss: 0.1045, Validation Loss: 0.1333, accuracy = 0.5133, F1=0.4336


 40%|████      | 4/10 [05:11<07:47, 77.91s/it]

Epoch: 3, Training Loss: 0.1034, Validation Loss: 0.1321, accuracy = 0.5202, F1=0.4463


 50%|█████     | 5/10 [06:29<06:29, 77.94s/it]

Epoch: 4, Training Loss: 0.1026, Validation Loss: 0.1317, accuracy = 0.5239, F1=0.4526


 60%|██████    | 6/10 [07:47<05:11, 77.89s/it]

Epoch: 5, Training Loss: 0.1021, Validation Loss: 0.1315, accuracy = 0.5270, F1=0.4579


 70%|███████   | 7/10 [09:05<03:53, 77.97s/it]

Epoch: 6, Training Loss: 0.1014, Validation Loss: 0.1320, accuracy = 0.5275, F1=0.4578


 80%|████████  | 8/10 [10:23<02:35, 77.95s/it]

Epoch: 7, Training Loss: 0.1010, Validation Loss: 0.1331, accuracy = 0.5265, F1=0.4564


 90%|█████████ | 9/10 [11:41<01:18, 78.04s/it]

Epoch: 8, Training Loss: 0.1009, Validation Loss: 0.1316, accuracy = 0.5301, F1=0.4619


100%|██████████| 10/10 [12:59<00:00, 77.93s/it]

Epoch: 9, Training Loss: 0.1002, Validation Loss: 0.1306, accuracy = 0.5375, F1=0.4756





# Unaligned Multimodal Model

In [18]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ffn)
        self.w_2 = nn.Linear(d_ffn, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))

class CrossModalAttentionBlock(nn.Module):
    def __init__(self, embedding_dim, d_ffn):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.layer_norm = nn.LayerNorm(self.embedding_dim)
        self.mh_attention = nn.MultiheadAttention(self.embedding_dim, 4, 0.1, batch_first=True)
        self.pointwise_ff = PositionwiseFeedForward(self.embedding_dim, d_ffn=self.d_ffn)
    
    def forward(self, elem_a, elem_b, attn_mask):
        elem_a = self.layer_norm(elem_a)
        elem_b = self.layer_norm(elem_b)
        attn_mask = attn_mask.to(torch.float32)
        
        mh_out, _ = self.mh_attention(elem_a, elem_b, elem_b, key_padding_mask=attn_mask, need_weights=False)
        add_out = mh_out + elem_a
        
        add_out_norm = self.layer_norm(add_out)
        out_ffn = self.pointwise_ff(add_out_norm)
        out = out_ffn + add_out
        return out
    
class UnalignedMultimodalModel(nn.Module):
    def __init__(self, embedding_dim, d_ffn, n_blocks, head):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.n_blocks = n_blocks
        self.head = head
        self.text_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.audio_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.pos_encoder = PositionalEncoding(embedding_dim, dual_modality=False)
    
    def forward(self, texts, audio_features, audio_attentions):
        tokenizer_output = tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][0]
        text_features = self.pos_encoder(text_features)
        text_attentions = tokenizer_output.attention_mask
        
        audio_features = self.pos_encoder(audio_features)
        
        text_crossmodal_out = text_features
        for cm_block in self.text_crossmodal_blocks:
            text_crossmodal_out = cm_block(text_crossmodal_out, audio_features, audio_attentions)
        
        audio_crossmodal_out = audio_features
        for cm_block in self.audio_crossmodal_blocks:
            audio_crossmodal_out = cm_block(audio_crossmodal_out, text_features, text_attentions)

        text_crossmodal_out_mean = torch.mean(text_crossmodal_out, dim=1)
        audio_crossmodal_out_mean = torch.mean(audio_crossmodal_out, dim=1)
        
        text_audio = torch.cat((text_crossmodal_out_mean, audio_crossmodal_out_mean), dim=-1)
        
        return self.head(text_audio)
        
unaligned_head = nn.Sequential(
    nn.Linear(768*2, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

unaligned_mm_model = UnalignedMultimodalModel(768, 100, 4, unaligned_head).to(device)

unaligned_optimizer = torch.optim.Adam(unaligned_mm_model.parameters(), lr=1e-4)
unaligned_criterion = nn.CrossEntropyLoss()

train(unaligned_mm_model, unaligned_optimizer, unaligned_criterion, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [02:39<23:51, 159.06s/it]

Epoch: 0, Training Loss: 0.1194, Validation Loss: 0.1337, accuracy = 0.5137, F1=0.4411


 20%|██        | 2/10 [05:17<21:09, 158.74s/it]

Epoch: 1, Training Loss: 0.1075, Validation Loss: 0.1352, accuracy = 0.5297, F1=0.4624


 30%|███       | 3/10 [07:56<18:30, 158.67s/it]

Epoch: 2, Training Loss: 0.1018, Validation Loss: 0.1321, accuracy = 0.5379, F1=0.4810


 40%|████      | 4/10 [10:35<15:52, 158.82s/it]

Epoch: 3, Training Loss: 0.0987, Validation Loss: 0.1336, accuracy = 0.5327, F1=0.4717


 50%|█████     | 5/10 [13:14<13:14, 158.93s/it]

Epoch: 4, Training Loss: 0.0946, Validation Loss: 0.1352, accuracy = 0.5482, F1=0.5015


 60%|██████    | 6/10 [15:52<10:35, 158.76s/it]

Epoch: 5, Training Loss: 0.0915, Validation Loss: 0.1386, accuracy = 0.5379, F1=0.4863


 70%|███████   | 7/10 [18:31<07:56, 158.92s/it]

Epoch: 6, Training Loss: 0.0887, Validation Loss: 0.1436, accuracy = 0.5378, F1=0.4879


 80%|████████  | 8/10 [21:10<05:17, 158.79s/it]

Epoch: 7, Training Loss: 0.0866, Validation Loss: 0.1408, accuracy = 0.5453, F1=0.4991


 90%|█████████ | 9/10 [23:49<02:38, 158.90s/it]

Epoch: 8, Training Loss: 0.0846, Validation Loss: 0.1528, accuracy = 0.5363, F1=0.4843


100%|██████████| 10/10 [26:28<00:00, 158.82s/it]

Epoch: 9, Training Loss: 0.0823, Validation Loss: 0.1494, accuracy = 0.5398, F1=0.4922



