# Import

In [1]:
!pip install torcheval

Collecting torcheval
  Obtaining dependency information for torcheval from https://files.pythonhosted.org/packages/e4/de/e7abc784b00de9d05999657d29187f1f7a3406ed10ecaf164de06482608f/torcheval-0.0.7-py3-none-any.whl.metadata
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [2]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertTokenizer, BertModel, AutoModel, AutoProcessor
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch, torchaudio, torchtext
from torcheval.metrics.functional import multiclass_f1_score
import torch.nn as nn
import os
import gc
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

MODEL_NUM_LABELS = 3
REMOVE_OTHER = True
OTHER_LABEL = 'O'
    
if REMOVE_OTHER:
    MODEL_NUM_LABELS = 2



Using device: cuda


# Load df

In [3]:
try:
    df_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/audio_clips'
    save_path = '/kaggle/input/mm-dataset-subsampling/'
    df = pd.read_csv(df_path, index_col=0)
except FileNotFoundError:
    df_path = 'multimodal-dataset/files/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = 'multimodal-dataset/files/MM-USElecDeb60to16/audio_clips'
    save_path = 'multimodal-dataset/files'
    df = pd.read_csv(df_path, index_col=0)
    
# drop rows where audio length is 0
df = df[df['NewBegin'] != df['NewEnd']]
if REMOVE_OTHER:
    # drop rows where Component is 'Other'
    df = df[df['Component'] != OTHER_LABEL]

train_df_complete = df[df['Set'] == 'TRAIN']
val_df_complete = df[df['Set'] == 'VALIDATION']
test_df_complete = df[df['Set'] == 'TEST']

DATASET_RATIO = 1

train_df = train_df_complete.iloc[:int(DATASET_RATIO * len(train_df_complete))]
val_df = val_df_complete.iloc[:int(DATASET_RATIO * len(val_df_complete))]
test_df = test_df_complete.iloc[:int(DATASET_RATIO * len(test_df_complete))]

In [4]:
train_df.head()

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,...,Speaker,SpeakerType,Set,Date,Year,Name,MainTag,NewBegin,NewEnd,idClip
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,140.56,158.92,clip_3
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,158.92,172.92,clip_4
9,What we did in Iraq was exactly the right thin...,1,30_2004,9,9,3861,3916,,"{""Claim"": 12, ""O"": 1}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,224.08,226.88,clip_9
10,"If I had it to recommend all over again, I wou...",1,30_2004,10,10,3916,4010,,"{""Premise"": 19, ""O"": 1}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Premise,226.88,231.56,clip_10
11,The world is far safer today because Saddam Hu...,1,30_2004,11,11,4010,4112,,"{""Claim"": 6, ""O"": 2, ""Premise"": 13}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,231.56,237.56,clip_11


In [5]:
len(train_df), len(test_df), len(val_df)

(9455, 5908, 5201)

## Distribution of classes over train df

In [6]:
num_claim = len(train_df[train_df['Component'] == 'Claim'])
print(f'Total Claim: {num_claim}: {num_claim*100/len(train_df):.2f}%')

num_premise = len(train_df[train_df['Component'] == 'Premise'])
print(f'Total Premise: {num_premise}: {num_premise*100/len(train_df):.2f}%')

if not REMOVE_OTHER:
    num_other = len(train_df[train_df['Component'] == 'O'])
    print(f'Total Other: {num_other}: {num_other*100/len(train_df):.2f}%')

Total Claim: 5029: 53.19%
Total Premise: 4426: 46.81%


Unbalanced dataset

# Train and evaluation Loop

In [171]:
ce_loss = nn.CrossEntropyLoss()

class BestModel:
    """
        Class to keep track of the best performing model on validation set during training
    """
    def __init__(self):
        self.best_validation_loss = float('Infinity')
        self.best_state_dict = None
    def __call__(self, model, loss):
        if loss < self.best_validation_loss:
            self.best_validation_loss = loss
            self.best_state_dict = model.state_dict()

def evaluate(model, val_loader, loss_fn):
    model.eval()
    valid_loss = 0.0
    num_correct = 0 
    num_examples = 0
    tot_pred, tot_targ = torch.LongTensor().to(device), torch.LongTensor().to(device)
    for batch in val_loader:
        texts, audio_features, audio_attention, targets = batch
        audio_features = audio_features.to(device)
        audio_attention = audio_attention.to(device)
        targets = targets.to(device)
        output = model(texts,audio_features,audio_attention)
        # print("out",output)
        # print("targets",targets)
        loss = loss_fn(output, targets)
        valid_loss += loss.detach()
        
        # if label O is still in the dataset we remove it from the outputs
        # since it's a binary task
        if not REMOVE_OTHER:
            not_other = targets != 2
            output = output[not_other]
            targets = targets[not_other]
        
        predicted_labels = torch.argmax(output, dim=-1)
        tot_targ = torch.cat((tot_targ, targets))
        tot_pred = torch.cat((tot_pred, predicted_labels))            
        correct = torch.eq(predicted_labels, targets).view(-1)
        num_correct += torch.sum(correct).item()
        num_examples += correct.shape[0]
    valid_loss = valid_loss.cpu().item()
    valid_loss /= len(val_loader.dataset)
    accuracy = num_correct/num_examples
    f1 = multiclass_f1_score(tot_pred, tot_targ, num_classes=2, average="macro")
    return valid_loss, accuracy, f1

            
def train(model, loss_fn, train_loader, val_loader, epochs=10, device="cuda", lr=1e-3, lr_decay_factor=0.1, lr_decay_patience=3, weight_decay=1e-5):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) 
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_decay_factor, patience=lr_decay_patience, verbose=True)
    best_model_tracker = BestModel()
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        model.train()

        for batch in train_loader:
            optimizer.zero_grad()
            texts, audio_features, audio_attention, targets = batch
            audio_features = audio_features.to(device)
            audio_attention = audio_attention.to(device)
            targets = targets.to(device)
            output = model(texts,audio_features,audio_attention)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.detach()
        training_loss = training_loss.cpu().item()
        training_loss /= len(train_loader.dataset)
        valid_loss, accuracy, f1 = evaluate(model, val_loader, loss_fn)
        best_model_tracker(model, valid_loss)
        scheduler.step(valid_loss)
        print(f'Epoch: {epoch}, Training Loss: {training_loss:.4f}, Validation Loss: {valid_loss:.4f}, accuracy = {accuracy:.4f}, F1={f1:.4f}')
    model.load_state_dict(best_model_tracker.best_state_dict)    

# Dataset Creation

In [8]:
text_model_card = 'bert-base-uncased'
audio_model_card = 'facebook/wav2vec2-base-960h'

tokenizer = BertTokenizer.from_pretrained(text_model_card)
embedder = BertModel.from_pretrained(text_model_card).to(device)

for params in embedder.parameters():
    params.requires_grad = False

label_2_id = {
    'Claim': 0,
    'Premise': 1,
    'O': 2
}

DOWNSAMPLE_FACTOR = 1/5

class MM_Dataset(torch.utils.data.Dataset):
    def __init__(self, df, audio_dir, sample_rate):
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate

        self.audio_processor = AutoProcessor.from_pretrained(audio_model_card)
        self.audio_model = AutoModel.from_pretrained(audio_model_card).to(device)

        self.dataset = []

        # Iterate over df
        for _, row in tqdm(df.iterrows()):
            path = os.path.join(self.audio_dir, f"{row['Document']}/{row['idClip']}.wav")
            if os.path.exists(path):
                # obtain audio WAV2VEC features
                audio, sampling_rate = torchaudio.load(path)
                if sampling_rate != self.sample_rate:
                    audio = torchaudio.functional.resample(audio, sample_rate, self.sample_rate)
                    audio = torch.mean(audio, dim=0, keepdim=True)
                with torch.inference_mode():
                    input_values = self.audio_processor(audio, sampling_rate=self.sample_rate).input_values[0]
                    input_values = torch.tensor(input_values).to(device)
                    audio_model_output = self.audio_model(input_values)
                    audio_features = audio_model_output.last_hidden_state[0].unsqueeze(0)
                    audio_features = torch.nn.functional.interpolate(audio_features.permute(0,2,1), scale_factor=DOWNSAMPLE_FACTOR, mode='linear')
                    audio_features = audio_features.permute(0,2,1)[0]
                    audio_features = audio_features.cpu()
                
                text = row['Text']

                self.dataset.append((text, audio_features, label_2_id[row['Component']]))
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
try:
    train_dataset = torch.load(f'{save_path}/train_dataset.pkl')
    test_dataset = torch.load(f'{save_path}/test_dataset.pkl')
    val_dataset = torch.load(f'{save_path}/val_dataset.pkl')
    if REMOVE_OTHER:
        train_dataset = list(filter(lambda x: x[2] != 2, train_dataset))
        test_dataset = list(filter(lambda x: x[2] != 2, test_dataset))
        val_dataset = list(filter(lambda x: x[2] != 2, val_dataset))
    print('Restored datasets from memory')
except:
    print('Creating new datasets')
    train_dataset = MM_Dataset(train_df, audio_path, 16_000)
    test_dataset = MM_Dataset(test_df, audio_path, 16_000)
    val_dataset = MM_Dataset(val_df, audio_path, 16_000)

Restored datasets from memory


## Dataloader creation

In [10]:
def create_dataloader(dataset, batch_size):
    def pack_fn(batch):
        texts = [x[0] for x in batch]
        audio_features = [x[1] for x in batch]
        labels = torch.tensor([x[2] for x in batch])
        
        # pad audio features
        audio_features = pad_sequence(audio_features, batch_first=True, padding_value=float('-inf'))

        audio_features_attention_mask = audio_features[:, :, 0] != float('-inf')
        
        audio_features[(audio_features == float('-inf'))] = 0

        return texts, audio_features, audio_features_attention_mask, labels

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=pack_fn)
    return dataloader

In [11]:
BATCH_SIZE = 12

train_dataloader = create_dataloader(train_dataset, BATCH_SIZE)
val_dataloader = create_dataloader(val_dataset, BATCH_SIZE)
test_dataloader = create_dataloader(test_dataset, BATCH_SIZE)

In [12]:
gc.collect()

18

In [13]:
def number_parameters(model):
    """
        Computes the number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Positional Encoding

In [89]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dual_modality=False, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.dual_modality = dual_modality
        self.pe = self.pe.to(device)

    def forward(self, x, is_first=True):
        if self.dual_modality:
            modality = torch.ones((x.shape[0], x.shape[1], 4), dtype=torch.float32).to(device) * (0 if is_first else 1)
            x = x + self.pe[:,:x.size(1)]
            x = self.dropout(x)
            return torch.cat((x, modality), axis=-1)
        else:
            x = x + self.pe[:,:x.size(1)]
            return self.dropout(x)

# 1 - Multimodal-Transformer

In [182]:
class MultiModalTransformer(nn.Module):
    def __init__(self, tokenizer, embedder, transformer, head):
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.transformer = transformer
        self.head = head

    def forward(self, texts, audio_features, audio_attentions):
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][9]
        
        #text_features = self.pos_encoder(text_features, is_first=True)
        text_attentions = tokenizer_output.attention_mask
        #audio_features = self.pos_encoder(audio_features, is_first=False)
        
        test = torch.nn.functional.interpolate(audio_features.permute(0,2,1), scale_factor=text_features.size(1)/audio_features.size(1), mode='linear')
        test = test.permute(0,2,1)
        
        concatenated_features = torch.cat((text_features, test), dim=1)
        concatenated_attentions = torch.cat((text_attentions, audio_attentions.float()), dim=1)
        
        # padding mask is 1 where there is padding (i.e. where attention is 0) and 0 otherwise
        concatenated_padding_mask = ~(concatenated_attentions.to(torch.bool))
        
        # compute a full attention mask of size [seq_len, seq_len]
        full_attention_mask = torch.zeros((concatenated_features.shape[1], concatenated_features.shape[1]), dtype=torch.float32).to(device)
        
        
        seq_len = concatenated_features.shape[1]
        rel_pos_enc = np.fromfunction(lambda i, j: torch.sin(torch.tensor(math.pi * (i / (seq_len - 1) * (j + 1)))), (seq_len, seq_len)) 

        rel_pos_enc = torch.from_numpy(rel_pos_enc).float().to(device)
                
        transformer_output = self.transformer(src=concatenated_features, mask=rel_pos_enc)#, src_key_padding_mask=concatenated_padding_mask)
        #print("to",transformer_output)
        #transformer_output_sum = (transformer_output * concatenated_attentions.unsqueeze(-1)).sum(axis=1)
        #transformer_output_pooled = transformer_output_sum / concatenated_attentions.sum(axis=1).unsqueeze(-1)
        #print("to shape",transformer_output.shape)
        #print("to_p ", torch.mean(torch.abs(transformer_output_pooled),dim=1))
        transformer_output_pooled = torch.mean(transformer_output, dim=1)
        
        return self.head(transformer_output_pooled)

In [16]:
del multimodal_criterion 
del multimodal_optimizer
del multimodal_transformer_layer
del multimodal_transformer_encoder
del multimodal_transformer_head
del multimodal_transformer

gc.collect()

NameError: name 'multimodal_criterion' is not defined

In [185]:
multimodal_transformer_layer = nn.TransformerEncoderLayer(d_model=768, nhead=2, batch_first=True).to(device)
multimodal_transformer_encoder = nn.TransformerEncoder(multimodal_transformer_layer, num_layers=1).to(device)

multimodal_transformer_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

multimodal_transformer = MultiModalTransformer(tokenizer, embedder, multimodal_transformer_encoder, multimodal_transformer_head).to(device)

multimodal_optimizer = torch.optim.Adam(multimodal_transformer.parameters(), lr=1e-3)
multimodal_criterion = nn.CrossEntropyLoss()

train(multimodal_transformer, multimodal_criterion, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [04:21<39:14, 261.56s/it]

Epoch: 0, Training Loss: 0.0510, Validation Loss: 0.0690, accuracy = 0.6383, F1=0.5874


 20%|██        | 2/10 [08:44<34:59, 262.39s/it]

Epoch: 1, Training Loss: 0.0487, Validation Loss: 0.0553, accuracy = 0.6439, F1=0.5912


 30%|███       | 3/10 [13:11<30:52, 264.57s/it]

Epoch: 2, Training Loss: 0.0460, Validation Loss: 0.0569, accuracy = 0.6549, F1=0.6138


 40%|████      | 4/10 [17:39<26:34, 265.83s/it]

Epoch: 3, Training Loss: 0.0453, Validation Loss: 0.0576, accuracy = 0.6843, F1=0.6607


 50%|█████     | 5/10 [22:06<22:10, 266.20s/it]

Epoch: 4, Training Loss: 0.0452, Validation Loss: 0.0598, accuracy = 0.6818, F1=0.6608


 60%|██████    | 6/10 [26:33<17:45, 266.41s/it]

Epoch 00006: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 5, Training Loss: 0.0448, Validation Loss: 0.0607, accuracy = 0.6660, F1=0.6296


 70%|███████   | 7/10 [31:00<13:20, 266.71s/it]

Epoch: 6, Training Loss: 0.0448, Validation Loss: 0.0526, accuracy = 0.7045, F1=0.6996


 80%|████████  | 8/10 [35:27<08:53, 266.72s/it]

Epoch: 7, Training Loss: 0.0439, Validation Loss: 0.0523, accuracy = 0.7029, F1=0.6987


 90%|█████████ | 9/10 [39:52<04:26, 266.15s/it]

Epoch: 8, Training Loss: 0.0435, Validation Loss: 0.0527, accuracy = 0.7004, F1=0.6945


100%|██████████| 10/10 [44:21<00:00, 266.13s/it]

Epoch: 9, Training Loss: 0.0432, Validation Loss: 0.0538, accuracy = 0.7037, F1=0.6951





In [186]:
test_loss, acc, f1 = evaluate(multimodal_transformer, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

Results on Test Set: 
Test loss: 0.055005217534751606	Accuracy: 0.6858496953283684	F1: 0.6796482801437378


# 2 - Ensembling-Fusion

## Text-Only and Audio-Only Models 

In [16]:
class TextModel(nn.Module):
    def __init__(self, tokenizer, embedder, head):
        super().__init__()
        #self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.head = head
    def forward(self, texts, audio_features, audio_attention):
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['last_hidden_state']
        text_features_sum = (text_features * tokenizer_output.attention_mask.unsqueeze(-1)).sum(axis=1)
        text_features_pooled = text_features_sum / tokenizer_output.attention_mask.sum(axis=1).unsqueeze(-1)
        return self.head(text_features_pooled)
    
class AudioModel(nn.Module):        
    def __init__(self, transformer, head):
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.transformer = transformer
        self.head = head
        
    def forward(self, texts, audio_features, audio_attention):
        padding_mask = ~audio_attention.to(torch.bool)
        audio_features = self.pos_encoder(audio_features)
        full_attention_mask = torch.zeros((audio_features.shape[1],audio_features.shape[1]), dtype=torch.bool).to(device)
        transformer_output = self.transformer(src=audio_features, mask=full_attention_mask, src_key_padding_mask=padding_mask)
        
        # pooling transformer output
        transformer_output_sum = (transformer_output * audio_attention.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / audio_attention.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

## Ensembling Model

In [17]:
 class EnsemblingFusion(nn.Module):
    def __init__(self, text_model, audio_model):
        super().__init__()
        self.text_model = text_model
        self.audio_model = audio_model
        self.weight = torch.nn.Parameter(torch.tensor(0.0))
        
    def forward(self, texts, audio_features, audio_attentions):
        text_logits = self.text_model(texts, audio_features, audio_attentions)
        audio_logits = self.audio_model(texts, audio_features, audio_attentions)
        
        text_probabilities = torch.nn.functional.softmax(text_logits)
        audio_probabilities = torch.nn.functional.softmax(audio_logits)
        
        coefficient = (torch.tanh(self.weight) + 1) / 2
        
        coefficient = coefficient*0.4 + 0.3
        
        return coefficient*text_probabilities + (1-coefficient)*audio_probabilities

In [19]:
# TRAINING OF ENSEMBLING
ensembling_text_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

ensembling_audio_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 3)
).to(device)

ensembling_transformer_layer = nn.TransformerEncoderLayer(d_model=768, nhead=4, dim_feedforward=512, batch_first=True).to(device)
ensembling_transformer_encoder = nn.TransformerEncoder(ensembling_transformer_layer, num_layers=4).to(device)

ensembling_text_model = TextModel(tokenizer, embedder, ensembling_text_head)
ensembling_audio_model = AudioModel(ensembling_transformer_encoder, ensembling_audio_head)

ensembling_fusion = EnsemblingFusion(ensembling_text_model, ensembling_audio_model).to(device)

print(f'#Params: {number_parameters(ensembling_fusion)}')
def custom_loss(outputs, targets):
    return torch.nn.functional.nll_loss(torch.log(outputs), targets, reduction='mean')

train(ensembling_fusion, custom_loss, train_dataloader, val_dataloader, epochs=10, device=device)

#Params: 13007879


 10%|█         | 1/10 [02:16<20:29, 136.59s/it]

Epoch: 0, Training Loss: 0.0773, Validation Loss: 0.0924, accuracy = 0.5789, F1=0.4541


 20%|██        | 2/10 [04:33<18:12, 136.54s/it]

Epoch: 1, Training Loss: 0.0737, Validation Loss: 0.0880, accuracy = 0.6043, F1=0.5121


 30%|███       | 3/10 [06:49<15:55, 136.50s/it]

Epoch: 2, Training Loss: 0.0726, Validation Loss: 0.0890, accuracy = 0.6047, F1=0.5120


 40%|████      | 4/10 [09:06<13:39, 136.55s/it]

Epoch: 3, Training Loss: 0.0721, Validation Loss: 0.0863, accuracy = 0.6145, F1=0.5347


 50%|█████     | 5/10 [11:22<11:23, 136.60s/it]

Epoch: 4, Training Loss: 0.0715, Validation Loss: 0.0869, accuracy = 0.6164, F1=0.5395


 60%|██████    | 6/10 [13:39<09:06, 136.65s/it]

Epoch: 5, Training Loss: 0.0713, Validation Loss: 0.0889, accuracy = 0.6064, F1=0.5174


 70%|███████   | 7/10 [15:56<06:50, 136.69s/it]

Epoch: 6, Training Loss: 0.0709, Validation Loss: 0.0866, accuracy = 0.6166, F1=0.5400


 80%|████████  | 8/10 [18:13<04:33, 136.85s/it]

Epoch 00008: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 7, Training Loss: 0.0706, Validation Loss: 0.0882, accuracy = 0.6095, F1=0.5221


 90%|█████████ | 9/10 [20:30<02:16, 136.92s/it]

Epoch: 8, Training Loss: 0.0713, Validation Loss: 0.0791, accuracy = 0.6570, F1=0.6196


100%|██████████| 10/10 [22:47<00:00, 136.76s/it]

Epoch: 9, Training Loss: 0.0705, Validation Loss: 0.0793, accuracy = 0.6551, F1=0.6157





In [21]:
train(ensembling_fusion, custom_loss, train_dataloader, val_dataloader, epochs=10, device=device)

 10%|█         | 1/10 [02:17<20:33, 137.06s/it]

Epoch: 0, Training Loss: 0.0705, Validation Loss: 0.0856, accuracy = 0.6214, F1=0.5508


 20%|██        | 2/10 [04:33<18:15, 136.94s/it]

Epoch: 1, Training Loss: 0.0705, Validation Loss: 0.0836, accuracy = 0.6331, F1=0.5715


 30%|███       | 3/10 [06:50<15:58, 136.94s/it]

Epoch: 2, Training Loss: 0.0706, Validation Loss: 0.0869, accuracy = 0.6160, F1=0.5365


 40%|████      | 4/10 [09:07<13:41, 136.96s/it]

Epoch: 3, Training Loss: 0.0704, Validation Loss: 0.0876, accuracy = 0.6097, F1=0.5251


 50%|█████     | 5/10 [11:24<11:24, 136.99s/it]

Epoch: 4, Training Loss: 0.0706, Validation Loss: 0.0936, accuracy = 0.6022, F1=0.5065


 60%|██████    | 6/10 [13:42<09:08, 137.06s/it]

Epoch 00006: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 5, Training Loss: 0.0705, Validation Loss: 0.0852, accuracy = 0.6226, F1=0.5523


 70%|███████   | 7/10 [15:59<06:51, 137.03s/it]

Epoch: 6, Training Loss: 0.0709, Validation Loss: 0.0795, accuracy = 0.6560, F1=0.6185


 80%|████████  | 8/10 [18:16<04:34, 137.01s/it]

Epoch: 7, Training Loss: 0.0707, Validation Loss: 0.0801, accuracy = 0.6518, F1=0.6089


 90%|█████████ | 9/10 [20:33<02:17, 137.01s/it]

Epoch: 8, Training Loss: 0.0705, Validation Loss: 0.0802, accuracy = 0.6505, F1=0.6058


100%|██████████| 10/10 [22:50<00:00, 137.00s/it]

Epoch: 9, Training Loss: 0.0704, Validation Loss: 0.0796, accuracy = 0.6539, F1=0.6132





In [22]:
test_loss, acc, f1 = evaluate(ensembling_fusion, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

Results on Test Set: 
Test loss: 0.11293967833935448	Accuracy: 0.6420108327691266	F1: 0.6087323427200317


In [23]:
ensembling_fusion.weight

Parameter containing:
tensor(0.8591, device='cuda:0', requires_grad=True)

# 3 - Text-Only

Text-only trained on the first two classes ignoring 'Other' class

In [21]:
text_only_head = nn.Sequential(
    nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

text_only = TextModel(tokenizer, embedder, text_only_head)

train(text_only, ce_loss, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1 = evaluate(text_only, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

  5%|▌         | 1/20 [01:16<24:18, 76.75s/it]

Epoch: 0, Training Loss: 0.1090, Validation Loss: 0.1326, accuracy = 0.5922, F1=0.4865


 10%|█         | 2/20 [02:34<23:11, 77.30s/it]

Epoch: 1, Training Loss: 0.1057, Validation Loss: 0.1317, accuracy = 0.5974, F1=0.4951


 15%|█▌        | 3/20 [03:50<21:45, 76.77s/it]

Epoch: 2, Training Loss: 0.1043, Validation Loss: 0.1297, accuracy = 0.6097, F1=0.5240


 20%|██        | 4/20 [05:06<20:25, 76.56s/it]

Epoch: 3, Training Loss: 0.1037, Validation Loss: 0.1295, accuracy = 0.6145, F1=0.5329


 25%|██▌       | 5/20 [06:23<19:08, 76.55s/it]

Epoch: 4, Training Loss: 0.1032, Validation Loss: 0.1324, accuracy = 0.6064, F1=0.5148


 30%|███       | 6/20 [07:40<17:53, 76.67s/it]

Epoch: 5, Training Loss: 0.1030, Validation Loss: 0.1283, accuracy = 0.6133, F1=0.5335


 35%|███▌      | 7/20 [08:56<16:34, 76.53s/it]

Epoch: 6, Training Loss: 0.1022, Validation Loss: 0.1309, accuracy = 0.6141, F1=0.5306


 40%|████      | 8/20 [10:12<15:18, 76.51s/it]

Epoch: 7, Training Loss: 0.1020, Validation Loss: 0.1297, accuracy = 0.6087, F1=0.5225


 45%|████▌     | 9/20 [11:28<13:59, 76.28s/it]

Epoch: 8, Training Loss: 0.1019, Validation Loss: 0.1281, accuracy = 0.6208, F1=0.5459


 50%|█████     | 10/20 [12:44<12:40, 76.05s/it]

Epoch: 9, Training Loss: 0.1019, Validation Loss: 0.1325, accuracy = 0.6110, F1=0.5257


 55%|█████▌    | 11/20 [13:59<11:22, 75.86s/it]

Epoch: 10, Training Loss: 0.1014, Validation Loss: 0.1293, accuracy = 0.6197, F1=0.5407


 60%|██████    | 12/20 [15:16<10:08, 76.10s/it]

Epoch: 11, Training Loss: 0.1016, Validation Loss: 0.1295, accuracy = 0.6176, F1=0.5400


 65%|██████▌   | 13/20 [16:32<08:53, 76.17s/it]

Epoch 00013: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 12, Training Loss: 0.1015, Validation Loss: 0.1289, accuracy = 0.6201, F1=0.5450


 70%|███████   | 14/20 [17:49<07:37, 76.32s/it]

Epoch: 13, Training Loss: 0.1011, Validation Loss: 0.1162, accuracy = 0.6464, F1=0.5952


 75%|███████▌  | 15/20 [19:06<06:22, 76.51s/it]

Epoch: 14, Training Loss: 0.1009, Validation Loss: 0.1152, accuracy = 0.6455, F1=0.5953


 80%|████████  | 16/20 [20:22<05:05, 76.31s/it]

Epoch: 15, Training Loss: 0.1007, Validation Loss: 0.1150, accuracy = 0.6464, F1=0.5970


 85%|████████▌ | 17/20 [21:38<03:49, 76.38s/it]

Epoch: 16, Training Loss: 0.1008, Validation Loss: 0.1167, accuracy = 0.6412, F1=0.5855


 90%|█████████ | 18/20 [22:55<02:32, 76.42s/it]

Epoch: 17, Training Loss: 0.1001, Validation Loss: 0.1157, accuracy = 0.6447, F1=0.5916


 95%|█████████▌| 19/20 [24:11<01:16, 76.43s/it]

Epoch: 18, Training Loss: 0.1007, Validation Loss: 0.1154, accuracy = 0.6445, F1=0.5931


100%|██████████| 20/20 [25:28<00:00, 76.40s/it]

Epoch: 19, Training Loss: 0.1004, Validation Loss: 0.1144, accuracy = 0.6506, F1=0.6050





Results on Test Set: 
Test loss: 0.11366133590871987	Accuracy: 0.6352403520649966	F1: 0.5967629551887512


# 4 - Unaligned Multimodal Model

In [25]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ffn)
        self.w_2 = nn.Linear(d_ffn, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu()))

class CrossModalAttentionBlock(nn.Module):
    def __init__(self, embedding_dim, d_ffn):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.layer_norm = nn.LayerNorm(self.embedding_dim)
        self.mh_attention = nn.MultiheadAttention(self.embedding_dim, 4, 0.1, batch_first=True)
        self.pointwise_ff = PositionwiseFeedForward(self.embedding_dim, d_ffn=self.d_ffn)
    
    def forward(self, elem_a, elem_b, attn_mask):
        elem_a = self.layer_norm(elem_a)
        elem_b = self.layer_norm(elem_b)
        attn_mask = attn_mask.to(torch.float32)
        
        mh_out, _ = self.mh_attention(elem_a, elem_b, elem_b, key_padding_mask=attn_mask, need_weights=False)
        add_out = mh_out + elem_a
        
        add_out_norm = self.layer_norm(add_out)
        out_ffn = self.pointwise_ff(add_out_norm)
        out = out_ffn + add_out
        return out
    
class UnalignedMultimodalModel(nn.Module):
    def __init__(self, embedding_dim, d_ffn, n_blocks, head):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.n_blocks = n_blocks
        self.head = head
        self.text_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.audio_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.pos_encoder = PositionalEncoding(embedding_dim, dual_modality=False)
    
    def forward(self, texts, audio_features, audio_attentions):
        tokenizer_output = tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][0]
        text_features = self.pos_encoder(text_features)
        text_attentions = tokenizer_output.attention_mask
        
        audio_features = self.pos_encoder(audio_features)
        
        text_crossmodal_out = text_features
        for cm_block in self.text_crossmodal_blocks:
            text_crossmodal_out = cm_block(text_crossmodal_out, audio_features, audio_attentions)
        
        audio_crossmodal_out = audio_features
        for cm_block in self.audio_crossmodal_blocks:
            audio_crossmodal_out = cm_block(audio_crossmodal_out, text_features, text_attentions)

        text_crossmodal_out_mean = torch.mean(text_crossmodal_out, dim=1)
        audio_crossmodal_out_mean = torch.mean(audio_crossmodal_out, dim=1)
        
        text_audio = torch.cat((text_crossmodal_out_mean, audio_crossmodal_out_mean), dim=-1)
        
        return self.head(text_audio)

In [26]:
# TRAINING OF UNALIGNED-MODEL
unaligned_head = nn.Sequential(
    nn.Linear(768*2, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

unaligned_mm_model = UnalignedMultimodalModel(768, 100, 4, unaligned_head).to(device)

train(unaligned_mm_model, ce_loss, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1 = evaluate(unaligned_mm_model, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

  5%|▌         | 1/20 [02:05<39:49, 125.74s/it]

Epoch: 0, Training Loss: 0.0864, Validation Loss: 0.0948, accuracy = 0.5414, F1=0.3513


 10%|█         | 2/20 [04:11<37:48, 126.04s/it]

Epoch: 1, Training Loss: 0.0852, Validation Loss: 0.0904, accuracy = 0.5414, F1=0.3513


 15%|█▌        | 3/20 [06:18<35:45, 126.19s/it]

Epoch: 2, Training Loss: 0.0851, Validation Loss: 0.0890, accuracy = 0.5414, F1=0.3513


 20%|██        | 4/20 [08:24<33:40, 126.30s/it]

Epoch: 3, Training Loss: 0.0852, Validation Loss: 0.0896, accuracy = 0.5414, F1=0.3513


 25%|██▌       | 5/20 [10:31<31:39, 126.61s/it]

Epoch: 4, Training Loss: 0.0851, Validation Loss: 0.0910, accuracy = 0.5414, F1=0.3513


 30%|███       | 6/20 [12:38<29:33, 126.65s/it]

Epoch: 5, Training Loss: 0.0848, Validation Loss: 0.0957, accuracy = 0.5414, F1=0.3513


 35%|███▌      | 7/20 [14:45<27:26, 126.63s/it]

Epoch 00007: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 6, Training Loss: 0.0849, Validation Loss: 0.0928, accuracy = 0.5414, F1=0.3513


 40%|████      | 8/20 [16:52<25:20, 126.72s/it]

Epoch: 7, Training Loss: 0.0866, Validation Loss: 0.0863, accuracy = 0.5414, F1=0.3513


 45%|████▌     | 9/20 [19:00<23:18, 127.10s/it]

Epoch: 8, Training Loss: 0.0864, Validation Loss: 0.0863, accuracy = 0.5414, F1=0.3513


 50%|█████     | 10/20 [21:07<21:11, 127.17s/it]

Epoch: 9, Training Loss: 0.0863, Validation Loss: 0.0865, accuracy = 0.5414, F1=0.3513


 55%|█████▌    | 11/20 [23:13<19:02, 126.94s/it]

Epoch: 10, Training Loss: 0.0861, Validation Loss: 0.0866, accuracy = 0.5414, F1=0.3513


 60%|██████    | 12/20 [25:21<16:56, 127.07s/it]

Epoch 00012: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 11, Training Loss: 0.0860, Validation Loss: 0.0868, accuracy = 0.5414, F1=0.3513


 65%|██████▌   | 13/20 [27:28<14:50, 127.15s/it]

Epoch: 12, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 70%|███████   | 14/20 [29:36<12:43, 127.23s/it]

Epoch: 13, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 75%|███████▌  | 15/20 [31:43<10:36, 127.26s/it]

Epoch: 14, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 80%|████████  | 16/20 [33:50<08:29, 127.31s/it]

Epoch 00016: reducing learning rate of group 0 to 1.0000e-06.
Epoch: 15, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 85%|████████▌ | 17/20 [35:58<06:21, 127.31s/it]

Epoch: 16, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 90%|█████████ | 18/20 [38:04<04:14, 127.07s/it]

Epoch: 17, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


 95%|█████████▌| 19/20 [40:11<02:06, 126.95s/it]

Epoch: 18, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513


100%|██████████| 20/20 [42:17<00:00, 126.89s/it]

Epoch 00020: reducing learning rate of group 0 to 1.0000e-07.
Epoch: 19, Training Loss: 0.0864, Validation Loss: 0.0864, accuracy = 0.5414, F1=0.3513





Results on Test Set: 
Test loss: 0.08669511902227382	Accuracy: 0.5140487474610698	F1: 0.3395192623138428
