# Import

In [1]:
!pip install torcheval

Collecting torcheval
  Obtaining dependency information for torcheval from https://files.pythonhosted.org/packages/e4/de/e7abc784b00de9d05999657d29187f1f7a3406ed10ecaf164de06482608f/torcheval-0.0.7-py3-none-any.whl.metadata
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [30]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import transformers
import copy
import torch, torchaudio, torchtext
import torch.nn.functional as F
import torch.nn as nn
import warnings
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torcheval.metrics.functional import multiclass_f1_score
from transformers import BertTokenizer, BertModel, AutoModel, AutoProcessor
from tqdm import tqdm

try:
    from CustomTransformer import CustomEncoder, PositionalEncoding, LayerNorm
except:
    from customtransformer import CustomEncoder, PositionalEncoding, LayerNorm
    
warnings.filterwarnings('ignore')

### Constants

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

MODEL_NUM_LABELS = 3
REMOVE_OTHER = True
OTHER_LABEL = 'O'
    
if REMOVE_OTHER:
    MODEL_NUM_LABELS = 2

EMBEDDING_DIM = 768
BATCH_SIZE = 12

Using device: cuda


# Load df

In [4]:
try:
    # Try to load from Kaggle
    df_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/audio_clips'
    save_path = '/kaggle/input/mm-dataset-subsampling/'
    df = pd.read_csv(df_path, index_col=0)
except FileNotFoundError:
    # Try to load from local
    df_path = 'multimodal-dataset/files/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = 'multimodal-dataset/files/MM-USElecDeb60to16/audio_clips'
    save_path = 'multimodal-dataset/files'
    df = pd.read_csv(df_path, index_col=0)
    
# drop rows where audio length is 0
df = df[df['NewBegin'] != df['NewEnd']]
if REMOVE_OTHER:
    # drop rows where Component is 'Other'
    df = df[df['Component'] != OTHER_LABEL]

# train, val, test split
train_df_complete = df[df['Set'] == 'TRAIN']
val_df_complete = df[df['Set'] == 'VALIDATION']
test_df_complete = df[df['Set'] == 'TEST']

# subsample datasets for memory reasons
DATASET_RATIO = 1
train_df = train_df_complete.iloc[:int(DATASET_RATIO * len(train_df_complete))]
val_df = val_df_complete.iloc[:int(DATASET_RATIO * len(val_df_complete))]
test_df = test_df_complete.iloc[:int(DATASET_RATIO * len(test_df_complete))]

In [5]:
train_df.head()

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,...,Speaker,SpeakerType,Set,Date,Year,Name,MainTag,NewBegin,NewEnd,idClip
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,140.56,158.92,clip_3
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,158.92,172.92,clip_4
9,What we did in Iraq was exactly the right thin...,1,30_2004,9,9,3861,3916,,"{""Claim"": 12, ""O"": 1}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,224.08,226.88,clip_9
10,"If I had it to recommend all over again, I wou...",1,30_2004,10,10,3916,4010,,"{""Premise"": 19, ""O"": 1}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Premise,226.88,231.56,clip_10
11,The world is far safer today because Saddam Hu...,1,30_2004,11,11,4010,4112,,"{""Claim"": 6, ""O"": 2, ""Premise"": 13}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,231.56,237.56,clip_11


In [6]:
len(train_df), len(test_df), len(val_df)

(9455, 5908, 5201)

## Distribution of classes over train df

In [7]:
num_claim = len(train_df[train_df['Component'] == 'Claim'])
print(f'Total Claim: {num_claim}: {num_claim*100/len(train_df):.2f}%')

num_premise = len(train_df[train_df['Component'] == 'Premise'])
print(f'Total Premise: {num_premise}: {num_premise*100/len(train_df):.2f}%')

if not REMOVE_OTHER:
    num_other = len(train_df[train_df['Component'] == 'O'])
    print(f'Total Other: {num_other}: {num_other*100/len(train_df):.2f}%')

Total Claim: 5029: 53.19%
Total Premise: 4426: 46.81%


Classes are not balanced, but not too bad either.

# Train and evaluation Loop

In [31]:
ce_loss = nn.CrossEntropyLoss()

class BestModel:
    """
    Class to keep track of the best performing model on validation set during training
    """
    def __init__(self):
        self.best_validation_loss = float('Infinity')
        self.best_state_dict = None
    def __call__(self, model, loss):
        if loss < self.best_validation_loss:
            self.best_validation_loss = loss
            self.best_state_dict = copy.deepcopy(model.state_dict())

def evaluate(model, data_loader, loss_fn, debug=False):
    """
    Evaluate the model on the set passed
    Args:
        model: model to evaluate
        data_loader: DataLoader object
        loss_fn: loss function to use
    """
    model.eval()
    valid_loss = 0.0
    num_correct = 0 
    num_examples = 0
    tot_pred, tot_targ = torch.LongTensor().to(device), torch.LongTensor().to(device)
    for batch in data_loader:
        texts, audio_features, audio_attention, targets = batch
        audio_features = audio_features.to(device)
        audio_attention = audio_attention.to(device)
        targets = targets.to(device)
        output = model(texts,audio_features,audio_attention)
        if debug:
            print("OUTPUT",output)
            print("TARGETS", targets)
        loss = loss_fn(output, targets)
        valid_loss += loss.detach()
        
        # if label O is still in the dataset we remove it from the outputs
        # since it's a binary task
        if not REMOVE_OTHER:
            not_other = targets != 2
            output = output[not_other]
            targets = targets[not_other]
        
        predicted_labels = torch.argmax(output[:, :2], dim=-1)
        tot_targ = torch.cat((tot_targ, targets))
        tot_pred = torch.cat((tot_pred, predicted_labels))            
        correct = torch.eq(predicted_labels, targets).view(-1)
        num_correct += torch.sum(correct).item()
        num_examples += correct.shape[0]
    valid_loss = valid_loss.cpu().item()
    valid_loss /= len(data_loader.dataset)
    accuracy = num_correct/num_examples
    f1 = multiclass_f1_score(tot_pred, tot_targ, num_classes=2, average="macro")
    return valid_loss, accuracy, f1, tot_pred, tot_targ

            
def train(model, loss_fn, train_loader, val_loader, epochs=10, device="cuda", lr=1e-3, lr_decay_factor=0.1, lr_decay_patience=3, weight_decay=1e-5, verbose=True, debug=False):
    """
    Train the model on the train set and evaluate on the validation set with the given parameters
    Args:
        model: model to train
        loss_fn: loss function to use
        train_loader: DataLoader object for train set
        val_loader: DataLoader object for validation set
        epochs: number of epochs
        device: device to use
        lr: initial learning rate
        lr_decay_factor: factor to decay learning rate
        lr_decay_patience: patience for learning rate decay
        weight_decay: weight decay
    """
    # set up optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) 
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_decay_factor, patience=lr_decay_patience, verbose=True)
    best_model_tracker = BestModel()
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            texts, audio_features, audio_attention, targets = batch
            audio_features = audio_features.to(device)
            audio_attention = audio_attention.to(device)
            targets = targets.to(device)
            output = model(texts,audio_features,audio_attention)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.detach()
        training_loss = training_loss.cpu().item()
        training_loss /= len(train_loader.dataset)
        valid_loss, accuracy, f1, _, _ = evaluate(model, val_loader, loss_fn, debug)
        best_model_tracker(model, valid_loss)
        scheduler.step(valid_loss)
        if verbose:
            print(f'Epoch: {epoch}, Training Loss: {training_loss:.4f}, Validation Loss: {valid_loss:.4f}, accuracy = {accuracy:.4f}, F1={f1:.4f}')
    model.load_state_dict(best_model_tracker.best_state_dict) 
    return model

# Dataset Creation

In [9]:
# set up tokenizer and model
text_model_card = 'bert-base-uncased'
audio_model_card = 'facebook/wav2vec2-base-960h'

tokenizer = BertTokenizer.from_pretrained(text_model_card)
embedder = BertModel.from_pretrained(text_model_card).to(device)

# freeze bert layers
for params in embedder.parameters():
    params.requires_grad = False

label_2_id = {
    'Claim': 0,
    'Premise': 1,
    'O': 2
}

# Downsample audio features to 1/5 of the original size to fit in memory
DOWNSAMPLE_FACTOR = 1/5

class MM_Dataset(torch.utils.data.Dataset):
    """
    Dataset class for multimodal dataset
    """
    def __init__(self, df, audio_dir, sample_rate):
        """
        Args:
            df: dataframe containing the dataset
            audio_dir: directory containing the audio clips
            sample_rate: sample rate to use for audio clips
        """
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate

        self.audio_processor = AutoProcessor.from_pretrained(audio_model_card)
        self.audio_model = AutoModel.from_pretrained(audio_model_card).to(device)

        self.dataset = []

        # Iterate over df
        for _, row in tqdm(df.iterrows()):
            path = os.path.join(self.audio_dir, f"{row['Document']}/{row['idClip']}.wav")
            if os.path.exists(path):
                # obtain audio WAV2VEC features
                audio, sampling_rate = torchaudio.load(path)
                # resample audio if necessary
                if sampling_rate != self.sample_rate:
                    audio = torchaudio.functional.resample(audio, sample_rate, self.sample_rate)
                    # mean pooling over channels
                    audio = torch.mean(audio, dim=0, keepdim=True)
                with torch.inference_mode():
                    # run audio through model
                    input_values = self.audio_processor(audio, sampling_rate=self.sample_rate).input_values[0]
                    input_values = torch.tensor(input_values).to(device)
                    audio_model_output = self.audio_model(input_values)
                    audio_features = audio_model_output.last_hidden_state[0].unsqueeze(0)
                    # downsample audio features
                    audio_features = torch.nn.functional.interpolate(audio_features.permute(0,2,1), scale_factor=DOWNSAMPLE_FACTOR, mode='linear')
                    audio_features = audio_features.permute(0,2,1)[0]
                    audio_features = audio_features.cpu()
                
                text = row['Text']

                self.dataset.append((text, audio_features, label_2_id[row['Component']]))
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
try:
    train_dataset = torch.load(f'{save_path}/train_dataset.pkl')
    test_dataset = torch.load(f'{save_path}/test_dataset.pkl')
    val_dataset = torch.load(f'{save_path}/val_dataset.pkl')
    if REMOVE_OTHER:
        train_dataset = list(filter(lambda x: x[2] != 2, train_dataset))
        test_dataset = list(filter(lambda x: x[2] != 2, test_dataset))
        val_dataset = list(filter(lambda x: x[2] != 2, val_dataset))
    print('Restored datasets from memory')
except:
    print('Creating new datasets')
    train_dataset = MM_Dataset(train_df, audio_path, 16_000)
    test_dataset = MM_Dataset(test_df, audio_path, 16_000)
    val_dataset = MM_Dataset(val_df, audio_path, 16_000)

Restored datasets from memory


## Dataloader creation

In [11]:
def create_dataloader(dataset, batch_size):
    """
    Create a DataLoader object from the given dataset with the given batch size
    Args:
        dataset: dataset to use
        batch_size: batch size to use
    """
    def pack_fn(batch):
        """
        Function to pad the audio features and create the attention mask
        """
        texts = [x[0] for x in batch]
        audio_features = [x[1] for x in batch]
        labels = torch.tensor([x[2] for x in batch])
        
        # pad audio features
        audio_features = pad_sequence(audio_features, batch_first=True, padding_value=float('-inf'))
        audio_features_attention_mask = audio_features[:, :, 0] != float('-inf')
        audio_features[(audio_features == float('-inf'))] = 0
        return texts, audio_features, audio_features_attention_mask, labels

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pack_fn)
    return dataloader

In [12]:
train_dataloader = create_dataloader(train_dataset, BATCH_SIZE)
val_dataloader = create_dataloader(val_dataset, BATCH_SIZE)
test_dataloader = create_dataloader(test_dataset, BATCH_SIZE)

In [13]:
gc.collect()

18

In [14]:
def number_parameters(model):
    """
    Computes the number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# 0-A Text-Only

In [15]:
class TextModel(nn.Module):
    """
    Class for the text-only model
    """
    def __init__(self, tokenizer, embedder, head):
        """
        Args:
            tokenizer: tokenizer to use
            embedder: embedder to use
            head: head to use
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.head = head
    def forward(self, texts, audio_features, audio_attention):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['last_hidden_state']

        # pooling transformer output
        text_features_sum = (text_features * tokenizer_output.attention_mask.unsqueeze(-1)).sum(axis=1)
        text_features_pooled = text_features_sum / tokenizer_output.attention_mask.sum(axis=1).unsqueeze(-1)
        return self.head(text_features_pooled)

# 0-B Audio-Only

In [16]:
class AudioModel(nn.Module):        
    """
    Class for the audio-only model
    """
    def __init__(self, transformer, head):
        """
        Args:
            transformer: transformer to use
            head: head to use
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(EMBEDDING_DIM, dual_modality=False)
        self.transformer = transformer
        self.head = head
        self.ln = LayerNorm(EMBEDDING_DIM)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, texts, audio_features, audio_attention):
        global hard_debug
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        padding_mask = ~audio_attention.to(torch.bool)        
        full_attention_mask = torch.zeros((audio_features.shape[1],audio_features.shape[1]), dtype=torch.bool).to(device)
        
        audio_features = self.pos_encoder(audio_features)
        
        transformer_output = self.transformer(audio_features, mask=full_attention_mask, src_key_padding_mask=padding_mask)
        
        # Dropout and LayerNorm to help training phase
        transformer_output = self.dropout(transformer_output)
        transformer_output = self.ln(audio_features + transformer_output)

        transformer_output_sum = (transformer_output * audio_attention.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / audio_attention.sum(axis=1).unsqueeze(-1)

        return self.head(transformer_output_pooled)

# 1 - Multimodal-Transformer

In [22]:
class MultiModalTransformer(nn.Module):
    """
    Class for the multimodal transformer model
    """
    def __init__(self, tokenizer, embedder, transformer, head):
        """
        Args:
            tokenizer: tokenizer to use
            embedder: embedder to use
            transformer: transformer to use
            head: head to use
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(EMBEDDING_DIM, dual_modality=False)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.transformer = transformer
        self.head = head

    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][9]
        text_attentions = tokenizer_output.attention_mask

        concatenated_attentions = torch.cat((text_attentions, audio_attentions.float()), dim=1)
        
        audio_features = self.pos_encoder(audio_features)
        
        concatenated_features = torch.cat((text_features, audio_features), dim=1)

        transformer_output = self.transformer(concatenated_features, text_attentions, audio_attentions)

        # pooling of transformer output        
        transformer_output_sum = (transformer_output * concatenated_attentions.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / concatenated_attentions.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

# 2 - Ensembling-Fusion

In [18]:
 class EnsemblingFusion(nn.Module):
    """
    Class for the ensembling model
    """
    def __init__(self, text_model, audio_model):
        """
        Args:
            text_model: text model to use
            audio_model: audio model to use
        """
        super().__init__()
        self.text_model = text_model
        self.audio_model = audio_model
        # weight to balance the two models
        self.weight = torch.nn.Parameter(torch.tensor(0.0))
        
    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        text_logits = self.text_model(texts, audio_features, audio_attentions)
        audio_logits = self.audio_model(texts, audio_features, audio_attentions)
        
        text_probabilities = torch.nn.functional.softmax(text_logits)
        audio_probabilities = torch.nn.functional.softmax(audio_logits)
        
        # coefficient to balance the two models based on weight learned
        # (tanh + 1) / 2 to have values in [0,1]
        coefficient = (torch.tanh(self.weight) + 1) / 2
        # next step is to have values in [0.3,0.7] to avoid too much imbalance
        coefficient = coefficient*0.4 + 0.3
        
        return coefficient*text_probabilities + (1-coefficient)*audio_probabilities

# 3 - Unaligned Multimodal Model

In [28]:
class UnalignedPositionwiseFeedForward(nn.Module):
    """
    Class for the positionwise feed forward layer
    """
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        """
        Args:
            d_model: dimension of the model
            d_ffn: dimension of the feed forward layer
            dropout: dropout to use
        """
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ffn)
        self.w_2 = nn.Linear(d_ffn, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass of the model
        Args:
            x: input to use
        """
        return self.w_2(self.dropout(self.w_1(x).relu()))

class CrossModalAttentionBlock(nn.Module):
    """
    Class for the cross modal attention block
    """
    def __init__(self, embedding_dim, d_ffn):
        """
        Args:
            embedding_dim: dimension of the embedding
            d_ffn: dimension of the feed forward layer
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.layer_norm = nn.LayerNorm(self.embedding_dim)
        self.mh_attention = nn.MultiheadAttention(self.embedding_dim, 4, 0.1, batch_first=True)
        self.pointwise_ff = UnalignedPositionwiseFeedForward(self.embedding_dim, d_ffn=self.d_ffn)
    
    def forward(self, elem_a, elem_b, attn_mask):
        """
        Forward pass of the model
        Args:
            elem_a: elements of the modality A
            elem_b: elements of the modality B
            attn_mask: attention mask to use
        """
        elem_a = self.layer_norm(elem_a)
        elem_b = self.layer_norm(elem_b)
        attn_mask = attn_mask.to(torch.float32)
        
        # cross modal attention with elem_a as query and elem_b as key and value
        mh_out, _ = self.mh_attention(elem_a, elem_b, elem_b, key_padding_mask=attn_mask, need_weights=False)
        # residual connection
        add_out = mh_out + elem_a
        
        add_out_norm = self.layer_norm(add_out)
        out_ffn = self.pointwise_ff(add_out_norm)
        out = out_ffn + add_out
        return out
    
class UnalignedMultimodalModel(nn.Module):
    """
    Class for the unaligned multimodal model
    """
    def __init__(self, embedding_dim, d_ffn, n_blocks, head):
        """
        Args:
            embedding_dim: dimension of the embedding
            d_ffn: dimension of the feed forward layer
            n_blocks: number of blocks to use
            head: head to use
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.n_blocks = n_blocks
        self.head = head
        self.text_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.audio_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.pos_encoder = PositionalEncoding(embedding_dim, dual_modality=False)
    
    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][9]
        text_features = self.pos_encoder(text_features)
        text_attentions = tokenizer_output.attention_mask
        
        audio_features = self.pos_encoder(audio_features)
        
        # cross modal attention blocks for text
        # using audio features as key and value and text features as query
        text_crossmodal_out = text_features
        for cm_block in self.text_crossmodal_blocks:
            text_crossmodal_out = cm_block(text_crossmodal_out, audio_features, audio_attentions)
        
        # cross modal attention blocks for audio
        # using text features as key and value and audio features as query
        audio_crossmodal_out = audio_features
        for cm_block in self.audio_crossmodal_blocks:
            audio_crossmodal_out = cm_block(audio_crossmodal_out, text_features, text_attentions)

        # pooling of transformer output
        text_crossmodal_out_mean = torch.mean(text_crossmodal_out, dim=1)
        audio_crossmodal_out_mean = torch.mean(audio_crossmodal_out, dim=1)
        
        # concatenate text and audio features
        text_audio = torch.cat((text_crossmodal_out_mean, audio_crossmodal_out_mean), dim=-1)
        
        return self.head(text_audio)

# Training of the models

In [27]:
def create_models():
    """
    Creates all the models
    """
    ###################################################################################### -- TEXT MODEL --

    text_only_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    text_only = TextModel(tokenizer, embedder, text_only_head)

    ###################################################################################### -- AUDIO MODEL --
    
    audio_only_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    audio_only_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=8, dim_feedforward=100, batch_first=True).to(device)
    audio_only_transformer_encoder = nn.TransformerEncoder(audio_only_transformer_layer, num_layers=1).to(device)
    audio_only = AudioModel(audio_only_transformer_encoder, audio_only_head).to(device)

    ###################################################################################### -- MULTIMODAL MODEL --
    
    multimodal_encoder = CustomEncoder(d_model=EMBEDDING_DIM, ffn_hidden=2048, n_head=4, n_layers=1, drop_prob=0.1)
    multimodal_transformer_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    multimodal_transformer = MultiModalTransformer(tokenizer, embedder, multimodal_encoder, multimodal_transformer_head).to(device)

    ###################################################################################### -- ENSEMBLING MODEL --

    ensembling_text_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    ensembling_audio_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    ensembling_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=4, dim_feedforward=2048, batch_first=True).to(device)
    ensembling_transformer_encoder = nn.TransformerEncoder(ensembling_transformer_layer, num_layers=1).to(device)
    ensembling_text_model = TextModel(tokenizer, embedder, ensembling_text_head)
    ensembling_audio_model = AudioModel(ensembling_transformer_encoder, ensembling_audio_head)
    ensembling_fusion = EnsemblingFusion(ensembling_text_model, ensembling_audio_model).to(device)

    ###################################################################################### -- UNALIGNED MODEL --

    unaligned_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM*2, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    unaligned_mm_model = UnalignedMultimodalModel(embedding_dim=EMBEDDING_DIM, d_ffn=2048, n_blocks=4, head=unaligned_head).to(device)
    
    ######################################################################################-
    
    model_names = [ 'text_only', 'audio_only', 'multimodal', 'ensembling', 'unaligned'] # 'text_only', 'audio_only', 
    models = [  text_only, audio_only, multimodal_transformer, ensembling_fusion, unaligned_mm_model ] #text_only, audio_only,
    
    return model_names, models

In [21]:
SEEDS = [1]#, 42, 69, 420, 666]

val_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

test_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

EPOCHS = 8
INITIAL_LR = 1e-3
WEIGHT_DECAY = 1e-3
LR_DECAY_FACTOR = 1e-1
LR_DECAY_PATIENCE = 3
VERBOSE_TRAIN = True

for seed in SEEDS:
    print(f'{f"TRAINING WITH SEED {seed}":=^65}')
    print()
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    model_names, models = create_models()
    
    # TO select a sinle model:
    # models = [models[4]]
    # model_names = [model_names[4]]

    while models:
        model = models[0]
        model_name = model_names[0]
        torch.manual_seed(seed)
        np.random.seed(seed)
        print(f'{f"Training model {model_name}":_^65}')
        
        loss = ce_loss
        def custom_loss(outputs, targets):
            return torch.nn.functional.nll_loss(torch.log(outputs), targets, reduction='mean')
        
        if model_name == 'ensembling':
            loss = custom_loss
            WEIGHT_DECAY = 1e-3
            INITIAL_LR = 1e-4
        
        if model_name == 'audio_only':
            WEIGHT_DECAY = 1e-3
            INITIAL_LR = 1e-4
            
        train(
            model,
            loss,
            train_dataloader,
            val_dataloader,
            epochs=EPOCHS,
            device=device,
            lr=INITIAL_LR,
            lr_decay_factor=LR_DECAY_FACTOR,
            lr_decay_patience=LR_DECAY_PATIENCE,
            weight_decay=WEIGHT_DECAY,
            verbose=VERBOSE_TRAIN,
            debug = False
        )

        _, val_acc, val_f1, val_pred, val_targ = evaluate(model, val_dataloader, loss)
        _, test_acc, test_f1, test_pred, test_targ = evaluate(model, test_dataloader, loss)
        if VERBOSE_TRAIN:
            print(f'[VAL] Model: {model_name} - acc: {val_acc:.4f} - f1: {val_f1:.4f}')
            print(f'[TEST] Model: {model_name} - acc: {test_acc:.4f} - f1: {test_f1:.4f}')
            print()
        val_results[model_name].append({
            'acc': val_acc,
            'f1': val_f1,
            'pred': val_pred,
            'targ': val_targ
        })
        test_results[model_name].append({
            'acc': test_acc,
            'f1': test_f1,
            'pred': test_pred,
            'targ': test_targ
        })
        
        del model
        del models[0]
        del model_names[0]
        gc.collect()


____________________Training model text_only_____________________


 10%|█         | 1/10 [01:00<09:05, 60.65s/it]

Epoch: 0, Training Loss: 0.0493, Validation Loss: 0.0517, accuracy = 0.6845, F1=0.6724


 20%|██        | 2/10 [01:59<07:57, 59.70s/it]

Epoch: 1, Training Loss: 0.0468, Validation Loss: 0.0507, accuracy = 0.6776, F1=0.6689


 30%|███       | 3/10 [02:58<06:55, 59.30s/it]

Epoch: 2, Training Loss: 0.0461, Validation Loss: 0.0510, accuracy = 0.6874, F1=0.6840


 40%|████      | 4/10 [03:57<05:55, 59.17s/it]

Epoch: 3, Training Loss: 0.0451, Validation Loss: 0.0497, accuracy = 0.6853, F1=0.6783


 50%|█████     | 5/10 [04:55<04:54, 58.89s/it]

Epoch: 4, Training Loss: 0.0444, Validation Loss: 0.0503, accuracy = 0.6829, F1=0.6690


 60%|██████    | 6/10 [05:54<03:55, 58.79s/it]

Epoch: 5, Training Loss: 0.0434, Validation Loss: 0.0517, accuracy = 0.6839, F1=0.6821


 70%|███████   | 7/10 [06:52<02:56, 58.69s/it]

Epoch: 6, Training Loss: 0.0426, Validation Loss: 0.0495, accuracy = 0.6914, F1=0.6906


 80%|████████  | 8/10 [07:51<01:57, 58.57s/it]

Epoch: 7, Training Loss: 0.0419, Validation Loss: 0.0512, accuracy = 0.6876, F1=0.6800


 90%|█████████ | 9/10 [08:49<00:58, 58.50s/it]

Epoch: 8, Training Loss: 0.0406, Validation Loss: 0.0522, accuracy = 0.6843, F1=0.6795


100%|██████████| 10/10 [09:48<00:00, 58.81s/it]

Epoch: 9, Training Loss: 0.0394, Validation Loss: 0.0514, accuracy = 0.6847, F1=0.6820





[VAL] Model: text_only - acc: 0.6847 - f1: 0.6820
[TEST] Model: text_only - acc: 0.6767 - f1: 0.6752

____________________Training model audio_only____________________


 10%|█         | 1/10 [00:39<05:55, 39.48s/it]

Epoch: 0, Training Loss: 0.0579, Validation Loss: 0.0575, accuracy = 0.5416, F1=0.4908


 20%|██        | 2/10 [01:18<05:15, 39.39s/it]

Epoch: 1, Training Loss: 0.0575, Validation Loss: 0.0578, accuracy = 0.5261, F1=0.5171


 30%|███       | 3/10 [01:57<04:34, 39.27s/it]

Epoch: 2, Training Loss: 0.0575, Validation Loss: 0.0575, accuracy = 0.5459, F1=0.4808


 40%|████      | 4/10 [02:37<03:56, 39.36s/it]

Epoch: 3, Training Loss: 0.0574, Validation Loss: 0.0576, accuracy = 0.5378, F1=0.5087


 50%|█████     | 5/10 [03:16<03:16, 39.28s/it]

Epoch: 4, Training Loss: 0.0574, Validation Loss: 0.0576, accuracy = 0.5437, F1=0.3795


 60%|██████    | 6/10 [03:55<02:37, 39.28s/it]

Epoch: 5, Training Loss: 0.0574, Validation Loss: 0.0577, accuracy = 0.5257, F1=0.5191


 70%|███████   | 7/10 [04:34<01:57, 39.19s/it]

Epoch: 6, Training Loss: 0.0575, Validation Loss: 0.0575, accuracy = 0.5414, F1=0.4535


 80%|████████  | 8/10 [05:14<01:18, 39.19s/it]

Epoch: 7, Training Loss: 0.0574, Validation Loss: 0.0575, accuracy = 0.5418, F1=0.4532


 90%|█████████ | 9/10 [05:53<00:39, 39.21s/it]

Epoch: 8, Training Loss: 0.0574, Validation Loss: 0.0575, accuracy = 0.5426, F1=0.4738


100%|██████████| 10/10 [06:32<00:00, 39.27s/it]

Epoch: 9, Training Loss: 0.0574, Validation Loss: 0.0575, accuracy = 0.5428, F1=0.4741





[VAL] Model: audio_only - acc: 0.5428 - f1: 0.4741
[TEST] Model: audio_only - acc: 0.5508 - f1: 0.4975

____________________Training model multimodal____________________


 10%|█         | 1/10 [01:34<14:07, 94.21s/it]

Epoch: 0, Training Loss: 0.0509, Validation Loss: 0.0503, accuracy = 0.6901, F1=0.6831


 20%|██        | 2/10 [03:08<12:34, 94.27s/it]

Epoch: 1, Training Loss: 0.0463, Validation Loss: 0.0509, accuracy = 0.6747, F1=0.6746


 30%|███       | 3/10 [04:42<10:59, 94.15s/it]

Epoch: 2, Training Loss: 0.0446, Validation Loss: 0.0514, accuracy = 0.6787, F1=0.6762


 40%|████      | 4/10 [06:17<09:26, 94.46s/it]

Epoch: 3, Training Loss: 0.0426, Validation Loss: 0.0509, accuracy = 0.6835, F1=0.6813


 50%|█████     | 5/10 [07:51<07:51, 94.38s/it]

Epoch 00005: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 4, Training Loss: 0.0410, Validation Loss: 0.0525, accuracy = 0.6758, F1=0.6709


 60%|██████    | 6/10 [09:26<06:17, 94.40s/it]

Epoch: 5, Training Loss: 0.0363, Validation Loss: 0.0565, accuracy = 0.6726, F1=0.6703


 70%|███████   | 7/10 [11:00<04:43, 94.42s/it]

Epoch: 6, Training Loss: 0.0354, Validation Loss: 0.0564, accuracy = 0.6672, F1=0.6625


 80%|████████  | 8/10 [12:34<03:08, 94.27s/it]

Epoch: 7, Training Loss: 0.0344, Validation Loss: 0.0573, accuracy = 0.6679, F1=0.6646


 90%|█████████ | 9/10 [14:08<01:34, 94.20s/it]

Epoch 00009: reducing learning rate of group 0 to 1.0000e-06.
Epoch: 8, Training Loss: 0.0338, Validation Loss: 0.0587, accuracy = 0.6704, F1=0.6672


100%|██████████| 10/10 [15:43<00:00, 94.32s/it]

Epoch: 9, Training Loss: 0.0329, Validation Loss: 0.0592, accuracy = 0.6699, F1=0.6667





[VAL] Model: multimodal - acc: 0.6699 - f1: 0.6667
[TEST] Model: multimodal - acc: 0.6562 - f1: 0.6547

____________________Training model ensembling____________________


 10%|█         | 1/10 [01:29<13:24, 89.41s/it]

Epoch: 0, Training Loss: 0.0521, Validation Loss: 0.0512, accuracy = 0.6785, F1=0.6691


 20%|██        | 2/10 [02:58<11:56, 89.51s/it]

Epoch: 1, Training Loss: 0.0487, Validation Loss: 0.0506, accuracy = 0.6789, F1=0.6771


 30%|███       | 3/10 [04:28<10:25, 89.29s/it]

Epoch: 2, Training Loss: 0.0482, Validation Loss: 0.0503, accuracy = 0.6841, F1=0.6800


 40%|████      | 4/10 [05:58<08:58, 89.74s/it]

Epoch: 3, Training Loss: 0.0475, Validation Loss: 0.0501, accuracy = 0.6845, F1=0.6801


 50%|█████     | 5/10 [07:27<07:27, 89.59s/it]

Epoch: 4, Training Loss: 0.0472, Validation Loss: 0.0503, accuracy = 0.6818, F1=0.6729


 60%|██████    | 6/10 [08:57<05:58, 89.66s/it]

Epoch: 5, Training Loss: 0.0469, Validation Loss: 0.0502, accuracy = 0.6835, F1=0.6767


 70%|███████   | 7/10 [10:27<04:28, 89.64s/it]

Epoch: 6, Training Loss: 0.0469, Validation Loss: 0.0507, accuracy = 0.6783, F1=0.6782


 80%|████████  | 8/10 [11:56<02:59, 89.53s/it]

Epoch: 7, Training Loss: 0.0467, Validation Loss: 0.0498, accuracy = 0.6856, F1=0.6829


 90%|█████████ | 9/10 [13:25<01:29, 89.48s/it]

Epoch: 8, Training Loss: 0.0464, Validation Loss: 0.0501, accuracy = 0.6885, F1=0.6818


100%|██████████| 10/10 [14:55<00:00, 89.51s/it]

Epoch: 9, Training Loss: 0.0462, Validation Loss: 0.0500, accuracy = 0.6849, F1=0.6825





[VAL] Model: ensembling - acc: 0.6849 - f1: 0.6825
[TEST] Model: ensembling - acc: 0.6848 - f1: 0.6840

____________________Training model unaligned_____________________


 10%|█         | 1/10 [03:13<29:00, 193.41s/it]

Epoch: 0, Training Loss: 0.0538, Validation Loss: 0.0516, accuracy = 0.6735, F1=0.6669


 20%|██        | 2/10 [06:27<25:48, 193.59s/it]

Epoch: 1, Training Loss: 0.0486, Validation Loss: 0.0502, accuracy = 0.6851, F1=0.6842


 30%|███       | 3/10 [09:39<22:31, 193.01s/it]

Epoch: 2, Training Loss: 0.0472, Validation Loss: 0.0519, accuracy = 0.6803, F1=0.6729


 40%|████      | 4/10 [12:54<19:23, 193.92s/it]

Epoch: 3, Training Loss: 0.0459, Validation Loss: 0.0517, accuracy = 0.6812, F1=0.6808


 50%|█████     | 5/10 [16:07<16:08, 193.66s/it]

Epoch: 4, Training Loss: 0.0455, Validation Loss: 0.0520, accuracy = 0.6701, F1=0.6494


 60%|██████    | 6/10 [19:22<12:55, 193.80s/it]

Epoch 00006: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 5, Training Loss: 0.0445, Validation Loss: 0.0521, accuracy = 0.6799, F1=0.6784


 70%|███████   | 7/10 [22:35<09:41, 193.70s/it]

Epoch: 6, Training Loss: 0.0416, Validation Loss: 0.0526, accuracy = 0.6791, F1=0.6753


 80%|████████  | 8/10 [25:49<06:27, 193.64s/it]

Epoch: 7, Training Loss: 0.0412, Validation Loss: 0.0524, accuracy = 0.6797, F1=0.6754


 90%|█████████ | 9/10 [29:02<03:13, 193.54s/it]

Epoch: 8, Training Loss: 0.0408, Validation Loss: 0.0529, accuracy = 0.6795, F1=0.6770


100%|██████████| 10/10 [32:15<00:00, 193.59s/it]

Epoch 00010: reducing learning rate of group 0 to 1.0000e-06.
Epoch: 9, Training Loss: 0.0404, Validation Loss: 0.0533, accuracy = 0.6789, F1=0.6760





[VAL] Model: unaligned - acc: 0.6787 - f1: 0.6758
[TEST] Model: unaligned - acc: 0.6732 - f1: 0.6720



In [29]:
SEEDS = [1]#, 42, 69, 420, 666]

val_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

test_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

EPOCHS = 8
INITIAL_LR = 1e-3
WEIGHT_DECAY = 1e-3 # 1e-5
LR_DECAY_FACTOR = 1e-1
LR_DECAY_PATIENCE = 3
VERBOSE_TRAIN = True

for seed in SEEDS:
    print(f'{f"TRAINING WITH SEED {seed}":=^65}')
    print()
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    model_names, models = create_models()
    
    # TO select a sinle model:
    models = [models[-1]]
    model_names = [model_names[-1]]

    while models:
        model = models[0]
        model_name = model_names[0]
        torch.manual_seed(seed)
        np.random.seed(seed)
        print(f'{f"Training model {model_name}":_^65}')
        
        loss = ce_loss
        def custom_loss(outputs, targets):
            return torch.nn.functional.nll_loss(torch.log(outputs), targets, reduction='mean')
        
        if model_name == 'ensembling':
            loss = custom_loss
            WEIGHT_DECAY = 1e-3
            INITIAL_LR = 1e-4
        
        if model_name == 'audio_only':
            WEIGHT_DECAY = 1e-3
            INITIAL_LR = 1e-4
            
        model = train(
            model,
            loss,
            train_dataloader,
            val_dataloader,
            epochs=EPOCHS,
            device=device,
            lr=INITIAL_LR,
            lr_decay_factor=LR_DECAY_FACTOR,
            lr_decay_patience=LR_DECAY_PATIENCE,
            weight_decay=WEIGHT_DECAY,
            verbose=VERBOSE_TRAIN,
            debug = False
        )

        _, val_acc, val_f1, val_pred, val_targ = evaluate(model, val_dataloader, loss)
        _, test_acc, test_f1, test_pred, test_targ = evaluate(model, test_dataloader, loss)
        if VERBOSE_TRAIN:
            print(f'[VAL] Model: {model_name} - acc: {val_acc:.4f} - f1: {val_f1:.4f}')
            print(f'[TEST] Model: {model_name} - acc: {test_acc:.4f} - f1: {test_f1:.4f}')
            print()
        val_results[model_name].append({
            'acc': val_acc,
            'f1': val_f1,
            'pred': val_pred,
            'targ': val_targ
        })
        test_results[model_name].append({
            'acc': test_acc,
            'f1': test_f1,
            'pred': test_pred,
            'targ': test_targ
        })
        
        del model
        del models[0]
        del model_names[0]
        gc.collect()


____________________Training model unaligned_____________________


 12%|█▎        | 1/8 [03:14<22:41, 194.57s/it]

Epoch: 0, Training Loss: 0.0533, Validation Loss: 0.0532, accuracy = 0.6862, F1=0.6819


 25%|██▌       | 2/8 [06:30<19:30, 195.08s/it]

Epoch: 1, Training Loss: 0.0495, Validation Loss: 0.0490, accuracy = 0.6903, F1=0.6810


 38%|███▊      | 3/8 [09:43<16:12, 194.44s/it]

Epoch: 2, Training Loss: 0.0468, Validation Loss: 0.0494, accuracy = 0.6945, F1=0.6936


 50%|█████     | 4/8 [13:00<13:00, 195.22s/it]

Epoch: 3, Training Loss: 0.0460, Validation Loss: 0.0476, accuracy = 0.7035, F1=0.7002


 62%|██████▎   | 5/8 [16:14<09:45, 195.03s/it]

Epoch: 4, Training Loss: 0.0460, Validation Loss: 0.0521, accuracy = 0.7052, F1=0.7033


 75%|███████▌  | 6/8 [19:30<06:30, 195.18s/it]

Epoch: 5, Training Loss: 0.0453, Validation Loss: 0.0499, accuracy = 0.7064, F1=0.7043


 88%|████████▊ | 7/8 [22:45<03:15, 195.15s/it]

Epoch: 6, Training Loss: 0.0443, Validation Loss: 0.0475, accuracy = 0.7089, F1=0.7069


100%|██████████| 8/8 [25:59<00:00, 194.96s/it]

Epoch: 7, Training Loss: 0.0445, Validation Loss: 0.0485, accuracy = 0.6983, F1=0.6983





[VAL] Model: unaligned - acc: 0.6999 - f1: 0.6998
[TEST] Model: unaligned - acc: 0.7051 - f1: 0.7050



# Error Analysis