# Import

In [1]:
!pip install torcheval

Collecting torcheval
  Obtaining dependency information for torcheval from https://files.pythonhosted.org/packages/e4/de/e7abc784b00de9d05999657d29187f1f7a3406ed10ecaf164de06482608f/torcheval-0.0.7-py3-none-any.whl.metadata
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [2]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
import transformers
import torch, torchaudio, torchtext
import torch.nn.functional as F
import torch.nn as nn
import warnings
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torcheval.metrics.functional import multiclass_f1_score
from transformers import BertTokenizer, BertModel, AutoModel, AutoProcessor
from tqdm import tqdm
from CustomTransformer import CustomEncoder
warnings.filterwarnings('ignore')



Using device: cuda


### Constants

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

MODEL_NUM_LABELS = 3
REMOVE_OTHER = True
OTHER_LABEL = 'O'
    
if REMOVE_OTHER:
    MODEL_NUM_LABELS = 2

EMBEDDING_DIM = 768
BATCH_SIZE = 8

# Load df

In [3]:
try:
    # Try to load from Kaggle
    df_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = '/kaggle/input/multimodal-argument-mining/MM-USElecDeb60to16/audio_clips'
    save_path = '/kaggle/input/mm-dataset-subsampling/'
    df = pd.read_csv(df_path, index_col=0)
except FileNotFoundError:
    # Try to load from local
    df_path = 'multimodal-dataset/files/MM-USElecDeb60to16/MM-USElecDeb60to16.csv'
    audio_path = 'multimodal-dataset/files/MM-USElecDeb60to16/audio_clips'
    save_path = 'multimodal-dataset/files'
    df = pd.read_csv(df_path, index_col=0)
    
# drop rows where audio length is 0
df = df[df['NewBegin'] != df['NewEnd']]
if REMOVE_OTHER:
    # drop rows where Component is 'Other'
    df = df[df['Component'] != OTHER_LABEL]

# train, val, test split
train_df_complete = df[df['Set'] == 'TRAIN']
val_df_complete = df[df['Set'] == 'VALIDATION']
test_df_complete = df[df['Set'] == 'TEST']

# subsample datasets for memory reasons
DATASET_RATIO = 1
train_df = train_df_complete.iloc[:int(DATASET_RATIO * len(train_df_complete))]
val_df = val_df_complete.iloc[:int(DATASET_RATIO * len(val_df_complete))]
test_df = test_df_complete.iloc[:int(DATASET_RATIO * len(test_df_complete))]

In [4]:
train_df.head()

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,...,Speaker,SpeakerType,Set,Date,Year,Name,MainTag,NewBegin,NewEnd,idClip
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,140.56,158.92,clip_3
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,158.92,172.92,clip_4
9,What we did in Iraq was exactly the right thin...,1,30_2004,9,9,3861,3916,,"{""Claim"": 12, ""O"": 1}",Claim,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim,224.08,226.88,clip_9
10,"If I had it to recommend all over again, I wou...",1,30_2004,10,10,3916,4010,,"{""Premise"": 19, ""O"": 1}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Premise,226.88,231.56,clip_10
11,The world is far safer today because Saddam Hu...,1,30_2004,11,11,4010,4112,,"{""Claim"": 6, ""O"": 2, ""Premise"": 13}",Premise,...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed,231.56,237.56,clip_11


In [5]:
len(train_df), len(test_df), len(val_df)

(9455, 5908, 5201)

## Distribution of classes over train df

In [6]:
num_claim = len(train_df[train_df['Component'] == 'Claim'])
print(f'Total Claim: {num_claim}: {num_claim*100/len(train_df):.2f}%')

num_premise = len(train_df[train_df['Component'] == 'Premise'])
print(f'Total Premise: {num_premise}: {num_premise*100/len(train_df):.2f}%')

if not REMOVE_OTHER:
    num_other = len(train_df[train_df['Component'] == 'O'])
    print(f'Total Other: {num_other}: {num_other*100/len(train_df):.2f}%')

Total Claim: 5029: 53.19%
Total Premise: 4426: 46.81%


Classes are not balanced, but not too bad either.

# Train and evaluation Loop

In [7]:
ce_loss = nn.CrossEntropyLoss()

class BestModel:
    """
    Class to keep track of the best performing model on validation set during training
    """
    def __init__(self):
        self.best_validation_loss = float('Infinity')
        self.best_state_dict = None
    def __call__(self, model, loss):
        if loss < self.best_validation_loss:
            self.best_validation_loss = loss
            self.best_state_dict = model.state_dict()

def evaluate(model, data_loader, loss_fn):
    """
    Evaluate the model on the set passed
    Args:
        model: model to evaluate
        data_loader: DataLoader object
        loss_fn: loss function to use
    """
    model.eval()
    valid_loss = 0.0
    num_correct = 0 
    num_examples = 0
    tot_pred, tot_targ = torch.LongTensor().to(device), torch.LongTensor().to(device)
    for batch in data_loader:
        texts, audio_features, audio_attention, targets = batch
        audio_features = audio_features.to(device)
        audio_attention = audio_attention.to(device)
        targets = targets.to(device)
        output = model(texts,audio_features,audio_attention)
        loss = loss_fn(output, targets)
        valid_loss += loss.detach()
        
        # if label O is still in the dataset we remove it from the outputs
        # since it's a binary task
        if not REMOVE_OTHER:
            not_other = targets != 2
            output = output[not_other]
            targets = targets[not_other]
        
        predicted_labels = torch.argmax(output[:, :2], dim=-1)
        tot_targ = torch.cat((tot_targ, targets))
        tot_pred = torch.cat((tot_pred, predicted_labels))            
        correct = torch.eq(predicted_labels, targets).view(-1)
        num_correct += torch.sum(correct).item()
        num_examples += correct.shape[0]
    valid_loss = valid_loss.cpu().item()
    valid_loss /= len(data_loader.dataset)
    accuracy = num_correct/num_examples
    f1 = multiclass_f1_score(tot_pred, tot_targ, num_classes=2, average="macro")
    return valid_loss, accuracy, f1, tot_pred, tot_targ

            
def train(model, loss_fn, train_loader, val_loader, epochs=10, device="cuda", lr=1e-3, lr_decay_factor=0.1, lr_decay_patience=3, weight_decay=1e-5, verbose=True):
    """
    Train the model on the train set and evaluate on the validation set with the given parameters
    Args:
        model: model to train
        loss_fn: loss function to use
        train_loader: DataLoader object for train set
        val_loader: DataLoader object for validation set
        epochs: number of epochs
        device: device to use
        lr: initial learning rate
        lr_decay_factor: factor to decay learning rate
        lr_decay_patience: patience for learning rate decay
        weight_decay: weight decay
    """
    # set up optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay) 
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_decay_factor, patience=lr_decay_patience, verbose=True)
    best_model_tracker = BestModel()
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            texts, audio_features, audio_attention, targets = batch
            audio_features = audio_features.to(device)
            audio_attention = audio_attention.to(device)
            targets = targets.to(device)
            output = model(texts,audio_features,audio_attention)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.detach()
        training_loss = training_loss.cpu().item()
        training_loss /= len(train_loader.dataset)
        valid_loss, accuracy, f1, _, _ = evaluate(model, val_loader, loss_fn)
        best_model_tracker(model, valid_loss)
        scheduler.step(valid_loss)
        if verbose:
            print(f'Epoch: {epoch}, Training Loss: {training_loss:.4f}, Validation Loss: {valid_loss:.4f}, accuracy = {accuracy:.4f}, F1={f1:.4f}')
    model.load_state_dict(best_model_tracker.best_state_dict)    

# Dataset Creation

In [8]:
# set up tokenizer and model
text_model_card = 'bert-base-uncased'
audio_model_card = 'facebook/wav2vec2-base-960h'

tokenizer = BertTokenizer.from_pretrained(text_model_card)
embedder = BertModel.from_pretrained(text_model_card).to(device)

# freeze bert layers
for params in embedder.parameters():
    params.requires_grad = False

label_2_id = {
    'Claim': 0,
    'Premise': 1,
    'O': 2
}

# Downsample audio features to 1/5 of the original size to fit in memory
DOWNSAMPLE_FACTOR = 1/5

class MM_Dataset(torch.utils.data.Dataset):
    """
    Dataset class for multimodal dataset
    """
    def __init__(self, df, audio_dir, sample_rate):
        """
        Args:
            df: dataframe containing the dataset
            audio_dir: directory containing the audio clips
            sample_rate: sample rate to use for audio clips
        """
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate

        self.audio_processor = AutoProcessor.from_pretrained(audio_model_card)
        self.audio_model = AutoModel.from_pretrained(audio_model_card).to(device)

        self.dataset = []

        # Iterate over df
        for _, row in tqdm(df.iterrows()):
            path = os.path.join(self.audio_dir, f"{row['Document']}/{row['idClip']}.wav")
            if os.path.exists(path):
                # obtain audio WAV2VEC features
                audio, sampling_rate = torchaudio.load(path)
                # resample audio if necessary
                if sampling_rate != self.sample_rate:
                    audio = torchaudio.functional.resample(audio, sample_rate, self.sample_rate)
                    # mean pooling over channels
                    audio = torch.mean(audio, dim=0, keepdim=True)
                with torch.inference_mode():
                    # run audio through model
                    input_values = self.audio_processor(audio, sampling_rate=self.sample_rate).input_values[0]
                    input_values = torch.tensor(input_values).to(device)
                    audio_model_output = self.audio_model(input_values)
                    audio_features = audio_model_output.last_hidden_state[0].unsqueeze(0)
                    # downsample audio features
                    audio_features = torch.nn.functional.interpolate(audio_features.permute(0,2,1), scale_factor=DOWNSAMPLE_FACTOR, mode='linear')
                    audio_features = audio_features.permute(0,2,1)[0]
                    audio_features = audio_features.cpu()
                
                text = row['Text']

                self.dataset.append((text, audio_features, label_2_id[row['Component']]))
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
try:
    train_dataset = torch.load(f'{save_path}/train_dataset.pkl')
    test_dataset = torch.load(f'{save_path}/test_dataset.pkl')
    val_dataset = torch.load(f'{save_path}/val_dataset.pkl')
    if REMOVE_OTHER:
        train_dataset = list(filter(lambda x: x[2] != 2, train_dataset))
        test_dataset = list(filter(lambda x: x[2] != 2, test_dataset))
        val_dataset = list(filter(lambda x: x[2] != 2, val_dataset))
    print('Restored datasets from memory')
except:
    print('Creating new datasets')
    train_dataset = MM_Dataset(train_df, audio_path, 16_000)
    test_dataset = MM_Dataset(test_df, audio_path, 16_000)
    val_dataset = MM_Dataset(val_df, audio_path, 16_000)

Restored datasets from memory


## Dataloader creation

In [10]:
def create_dataloader(dataset, batch_size):
    """
    Create a DataLoader object from the given dataset with the given batch size
    Args:
        dataset: dataset to use
        batch_size: batch size to use
    """
    def pack_fn(batch):
        """
        Function to pad the audio features and create the attention mask
        """
        texts = [x[0] for x in batch]
        audio_features = [x[1] for x in batch]
        labels = torch.tensor([x[2] for x in batch])
        
        # pad audio features
        audio_features = pad_sequence(audio_features, batch_first=True, padding_value=float('-inf'))
        audio_features_attention_mask = audio_features[:, :, 0] != float('-inf')
        audio_features[(audio_features == float('-inf'))] = 0
        return texts, audio_features, audio_features_attention_mask, labels

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pack_fn)
    return dataloader

In [11]:
train_dataloader = create_dataloader(train_dataset, BATCH_SIZE)
val_dataloader = create_dataloader(val_dataset, BATCH_SIZE)
test_dataloader = create_dataloader(test_dataset, BATCH_SIZE)

In [12]:
gc.collect()

18

In [13]:
def number_parameters(model):
    """
    Computes the number of trainable parameters in the model
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# 0-A Text-Only

In [None]:
class TextModel(nn.Module):
    """
    Class for the text-only model
    """
    def __init__(self, tokenizer, embedder, head):
        """
        Args:
            tokenizer: tokenizer to use
            embedder: embedder to use
            head: head to use
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.head = head
    def forward(self, texts, audio_features, audio_attention):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['last_hidden_state']

        # pooling transformer output
        text_features_sum = (text_features * tokenizer_output.attention_mask.unsqueeze(-1)).sum(axis=1)
        text_features_pooled = text_features_sum / tokenizer_output.attention_mask.sum(axis=1).unsqueeze(-1)
        return self.head(text_features_pooled)

In [None]:
"""
Baseline model using only text
"""

text_only_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

text_only = TextModel(tokenizer, embedder, text_only_head)

train(text_only, ce_loss, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1, _, _ = evaluate(text_only, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

# 0-B Audio-Only

In [None]:
class AudioModel(nn.Module):        
    """
    Class for the audio-only model
    """
    def __init__(self, transformer, head):
        """
        Args:
            transformer: transformer to use
            head: head to use
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(768, dual_modality=False)
        self.transformer = transformer
        self.head = head
        
    def forward(self, texts, audio_features, audio_attention):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        audio_attention = torch.ones_like(audio_attention)
        padding_mask = ~audio_attention.to(torch.bool)
        #full_attention_mask = torch.zeros((audio_features.shape[1],audio_features.shape[1]), dtype=torch.bool).to(device)
        audio_features = self.pos_encoder(audio_features)
        # TODO: look carefully at this part of the code
        transformer_output = self.transformer(src=audio_features, is_causal=True)
        
        # pooling transformer output
        transformer_output_sum = (transformer_output * audio_attention.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / audio_attention.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

In [None]:
"""
Baseline model using only audio
"""
audio_only_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

audio_only_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=4, dim_feedforward=512, batch_first=True).to(device)
audio_only_transformer_encoder = nn.TransformerEncoder(audio_only_transformer_layer, num_layers=4).to(device)

audio_only = AudioModel(audio_only_transformer_encoder, audio_only_head).to(device)
print(f'#Params: {number_parameters(audio_only)}')

train(audio_only, ce_loss, train_dataloader, val_dataloader, epochs=20, device=device, lr=1e-3)

test_loss, acc, f1, _, _ = evaluate(audio_only, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

# 1 - Multimodal-Transformer

In [21]:
class MultiModalTransformer(nn.Module):
    """
    Class for the multimodal transformer model
    """
    def __init__(self, tokenizer, embedder, transformer, head):
        """
        Args:
            tokenizer: tokenizer to use
            embedder: embedder to use
            transformer: transformer to use
            head: head to use
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(EMBEDDING_DIM, dual_modality=False)
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.transformer = transformer
        self.head = head

    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = self.embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][0]
        text_attentions = tokenizer_output.attention_mask

        concatenated_attentions = torch.cat((text_attentions, audio_attentions.float()), dim=1)
        
        audio_features = self.pos_encoder(audio_features)
        
        concatenated_features = torch.cat((text_features, audio_features), dim=1)

        transformer_output = self.transformer(concatenated_features, text_attentions, audio_attentions)

        # pooling of transformer output        
        transformer_output_sum = (transformer_output * concatenated_attentions.unsqueeze(-1)).sum(axis=1)
        transformer_output_pooled = transformer_output_sum / concatenated_attentions.sum(axis=1).unsqueeze(-1)
        return self.head(transformer_output_pooled)

In [22]:
# TRAINING OF MULTIMODAL TRANSFORMER
multimodal_encoder = CustomEncoder(d_model=EMBEDDING_DIM, ffn_hidden=2048, n_head=4, n_layers=1, drop_prob=0.1)

multimodal_transformer_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

multimodal_transformer = MultiModalTransformer(tokenizer, embedder, multimodal_encoder, multimodal_transformer_head).to(device)

multimodal_optimizer = torch.optim.Adam(multimodal_transformer.parameters(), lr=1e-3)
multimodal_criterion = nn.CrossEntropyLoss()

print(f'#parameters: {number_parameters(multimodal_transformer)}')

train(multimodal_transformer, multimodal_criterion, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1, _, _ = evaluate(multimodal_transformer, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

#parameters: 5711362


  5%|▌         | 1/20 [01:33<29:34, 93.39s/it]

Epoch: 0, Training Loss: 0.0861, Validation Loss: 0.0836, accuracy = 0.5816, F1=0.5798


 10%|█         | 2/20 [03:05<27:47, 92.66s/it]

Epoch: 1, Training Loss: 0.0815, Validation Loss: 0.0803, accuracy = 0.6406, F1=0.6393


 15%|█▌        | 3/20 [04:38<26:13, 92.59s/it]

Epoch: 2, Training Loss: 0.0767, Validation Loss: 0.0818, accuracy = 0.6543, F1=0.6508


 20%|██        | 4/20 [06:10<24:41, 92.58s/it]

Epoch: 3, Training Loss: 0.0743, Validation Loss: 0.0762, accuracy = 0.6685, F1=0.6640


 25%|██▌       | 5/20 [07:42<23:06, 92.41s/it]

Epoch: 4, Training Loss: 0.0720, Validation Loss: 0.0753, accuracy = 0.6728, F1=0.6710


 30%|███       | 6/20 [09:14<21:29, 92.11s/it]

Epoch: 5, Training Loss: 0.0704, Validation Loss: 0.0785, accuracy = 0.6683, F1=0.6666


 35%|███▌      | 7/20 [10:45<19:54, 91.90s/it]

Epoch: 6, Training Loss: 0.0688, Validation Loss: 0.0767, accuracy = 0.6674, F1=0.6565


 40%|████      | 8/20 [12:16<18:19, 91.65s/it]

Epoch: 7, Training Loss: 0.0670, Validation Loss: 0.0767, accuracy = 0.6718, F1=0.6698


 45%|████▌     | 9/20 [13:48<16:48, 91.72s/it]

Epoch 00009: reducing learning rate of group 0 to 1.0000e-04.
Epoch: 8, Training Loss: 0.0655, Validation Loss: 0.0771, accuracy = 0.6737, F1=0.6654


 50%|█████     | 10/20 [15:20<15:18, 91.84s/it]

Epoch: 9, Training Loss: 0.0626, Validation Loss: 0.0784, accuracy = 0.6774, F1=0.6734


 55%|█████▌    | 11/20 [16:51<13:44, 91.59s/it]

Epoch: 10, Training Loss: 0.0604, Validation Loss: 0.0806, accuracy = 0.6806, F1=0.6770


 60%|██████    | 12/20 [18:22<12:11, 91.43s/it]

Epoch: 11, Training Loss: 0.0597, Validation Loss: 0.0806, accuracy = 0.6795, F1=0.6772


 65%|██████▌   | 13/20 [19:54<10:39, 91.38s/it]

Epoch 00013: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 12, Training Loss: 0.0586, Validation Loss: 0.0821, accuracy = 0.6770, F1=0.6752


 70%|███████   | 14/20 [21:25<09:08, 91.33s/it]

Epoch: 13, Training Loss: 0.0579, Validation Loss: 0.0827, accuracy = 0.6781, F1=0.6759


 75%|███████▌  | 15/20 [22:56<07:36, 91.35s/it]

Epoch: 14, Training Loss: 0.0574, Validation Loss: 0.0829, accuracy = 0.6804, F1=0.6780


 80%|████████  | 16/20 [24:27<06:05, 91.29s/it]

Epoch: 15, Training Loss: 0.0573, Validation Loss: 0.0835, accuracy = 0.6795, F1=0.6771


 85%|████████▌ | 17/20 [25:59<04:33, 91.27s/it]

Epoch 00017: reducing learning rate of group 0 to 1.0000e-06.
Epoch: 16, Training Loss: 0.0573, Validation Loss: 0.0831, accuracy = 0.6797, F1=0.6772


 90%|█████████ | 18/20 [27:30<03:02, 91.17s/it]

Epoch: 17, Training Loss: 0.0572, Validation Loss: 0.0836, accuracy = 0.6799, F1=0.6774


 95%|█████████▌| 19/20 [29:01<01:31, 91.22s/it]

Epoch: 18, Training Loss: 0.0574, Validation Loss: 0.0831, accuracy = 0.6801, F1=0.6776


100%|██████████| 20/20 [30:34<00:00, 91.72s/it]

Epoch: 19, Training Loss: 0.0571, Validation Loss: 0.0832, accuracy = 0.6797, F1=0.6772





Results on Test Set: 
Test loss: 0.08729851786639937	Accuracy: 0.6541976980365606	F1: 0.6528137922286987


# 2 - Ensembling-Fusion

In [None]:
 class EnsemblingFusion(nn.Module):
    """
    Class for the ensembling model
    """
    def __init__(self, text_model, audio_model):
        """
        Args:
            text_model: text model to use
            audio_model: audio model to use
        """
        super().__init__()
        self.text_model = text_model
        self.audio_model = audio_model
        # weight to balance the two models
        self.weight = torch.nn.Parameter(torch.tensor(0.0))
        
    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        text_logits = self.text_model(texts, audio_features, audio_attentions)
        audio_logits = self.audio_model(texts, audio_features, audio_attentions)
        
        text_probabilities = torch.nn.functional.softmax(text_logits)
        audio_probabilities = torch.nn.functional.softmax(audio_logits)
        
        # coefficient to balance the two models based on weight learned
        # (tanh + 1) / 2 to have values in [0,1]
        coefficient = (torch.tanh(self.weight) + 1) / 2
        # next step is to have values in [0.3,0.7] to avoid too much imbalance
        coefficient = coefficient*0.4 + 0.3
        
        return coefficient*text_probabilities + (1-coefficient)*audio_probabilities

In [None]:
# TRAINING OF ENSEMBLING
ensembling_text_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

ensembling_audio_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

ensembling_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=4, dim_feedforward=512, batch_first=True).to(device)
ensembling_transformer_encoder = nn.TransformerEncoder(ensembling_transformer_layer, num_layers=4).to(device)

ensembling_text_model = TextModel(tokenizer, embedder, ensembling_text_head)
ensembling_audio_model = AudioModel(ensembling_transformer_encoder, ensembling_audio_head)

ensembling_fusion = EnsemblingFusion(ensembling_text_model, ensembling_audio_model).to(device)

print(f'#Params: {number_parameters(ensembling_fusion)}')
def custom_loss(outputs, targets):
    return torch.nn.functional.nll_loss(torch.log(outputs), targets, reduction='mean')

train(ensembling_fusion, custom_loss, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1, _, _ = evaluate(ensembling_fusion, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

# 3 - Unaligned Multimodal Model

In [None]:
class UnalignedPositionwiseFeedForward(nn.Module):
    """
    Class for the positionwise feed forward layer
    """
    def __init__(self, d_model: int, d_ffn: int, dropout: float = 0.1):
        """
        Args:
            d_model: dimension of the model
            d_ffn: dimension of the feed forward layer
            dropout: dropout to use
        """
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ffn)
        self.w_2 = nn.Linear(d_ffn, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Forward pass of the model
        Args:
            x: input to use
        """
        return self.w_2(self.dropout(self.w_1(x).relu()))

class CrossModalAttentionBlock(nn.Module):
    """
    Class for the cross modal attention block
    """
    def __init__(self, embedding_dim, d_ffn):
        """
        Args:
            embedding_dim: dimension of the embedding
            d_ffn: dimension of the feed forward layer
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.layer_norm = nn.LayerNorm(self.embedding_dim)
        self.mh_attention = nn.MultiheadAttention(self.embedding_dim, 4, 0.1, batch_first=True)
        self.pointwise_ff = UnalignedPositionwiseFeedForward(self.embedding_dim, d_ffn=self.d_ffn)
    
    def forward(self, elem_a, elem_b, attn_mask):
        """
        Forward pass of the model
        Args:
            elem_a: elements of the modality A
            elem_b: elements of the modality B
            attn_mask: attention mask to use
        """
        elem_a = self.layer_norm(elem_a)
        elem_b = self.layer_norm(elem_b)
        attn_mask = attn_mask.to(torch.float32)
        
        # cross modal attention with elem_a as query and elem_b as key and value
        mh_out, _ = self.mh_attention(elem_a, elem_b, elem_b, key_padding_mask=attn_mask, need_weights=False)
        # residual connection
        add_out = mh_out + elem_a
        
        add_out_norm = self.layer_norm(add_out)
        out_ffn = self.pointwise_ff(add_out_norm)
        out = out_ffn + add_out
        return out
    
class UnalignedMultimodalModel(nn.Module):
    """
    Class for the unaligned multimodal model
    """
    def __init__(self, embedding_dim, d_ffn, n_blocks, head):
        """
        Args:
            embedding_dim: dimension of the embedding
            d_ffn: dimension of the feed forward layer
            n_blocks: number of blocks to use
            head: head to use
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.d_ffn = d_ffn
        self.n_blocks = n_blocks
        self.head = head
        self.text_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.audio_crossmodal_blocks = nn.ModuleList([
            CrossModalAttentionBlock(self.embedding_dim, self.d_ffn) for _ in range(self.n_blocks)
        ])
        self.pos_encoder = PositionalEncoding(embedding_dim, dual_modality=False)
    
    def forward(self, texts, audio_features, audio_attentions):
        """
        Forward pass of the model
        Args:
            texts: texts to use
            audio_features: audio features to use
            audio_attentions: audio attentions to use
        """
        tokenizer_output = tokenizer(texts, return_tensors='pt', padding=True, truncation=False).to(device)
        embedder_output = embedder(**tokenizer_output, output_hidden_states=True)
        text_features = embedder_output['hidden_states'][0]
        text_features = self.pos_encoder(text_features)
        text_attentions = tokenizer_output.attention_mask
        
        audio_features = self.pos_encoder(audio_features)
        
        # cross modal attention blocks for text
        # using audio features as key and value and text features as query
        text_crossmodal_out = text_features
        for cm_block in self.text_crossmodal_blocks:
            text_crossmodal_out = cm_block(text_crossmodal_out, audio_features, audio_attentions)
        
        # cross modal attention blocks for audio
        # using text features as key and value and audio features as query
        audio_crossmodal_out = audio_features
        for cm_block in self.audio_crossmodal_blocks:
            audio_crossmodal_out = cm_block(audio_crossmodal_out, text_features, text_attentions)

        # pooling of transformer output
        text_crossmodal_out_mean = torch.mean(text_crossmodal_out, dim=1)
        audio_crossmodal_out_mean = torch.mean(audio_crossmodal_out, dim=1)
        
        # concatenate text and audio features
        text_audio = torch.cat((text_crossmodal_out_mean, audio_crossmodal_out_mean), dim=-1)
        
        return self.head(text_audio)

In [None]:
# TRAINING OF UNALIGNED-MODEL
unaligned_head = nn.Sequential(
    nn.Linear(EMBEDDING_DIM*2, 256),
    nn.ReLU(),
    nn.Linear(256, MODEL_NUM_LABELS)
).to(device)

unaligned_mm_model = UnalignedMultimodalModel(embedding_dim=EMBEDDING_DIM, d_ffn=100, n_blocks=4, head=unaligned_head).to(device)

train(unaligned_mm_model, ce_loss, train_dataloader, val_dataloader, epochs=20, device=device)

test_loss, acc, f1, _, _ = evaluate(unaligned_mm_model, test_dataloader, ce_loss)
print('Results on Test Set: ')
print(f'Test loss: {test_loss}\tAccuracy: {acc}\tF1: {f1}')

# Training of the models

In [None]:
def create_models(){
    """
    Creates all the models
    """
    # creating text-only model
    text_only_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    text_only = TextModel(tokenizer, embedder, text_only_head)

    # creating audio-only model
    audio_only_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    audio_only_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=4, dim_feedforward=512, batch_first=True).to(device)
    audio_only_transformer_encoder = nn.TransformerEncoder(audio_only_transformer_layer, num_layers=4).to(device)
    audio_only = AudioModel(audio_only_transformer_encoder, audio_only_head).to(device)

    # creating multimodal model
    multimodal_encoder = CustomEncoder(d_model=EMBEDDING_DIM, ffn_hidden=2048, n_head=4, n_layers=1, drop_prob=0.1)
    multimodal_transformer_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    multimodal_transformer = MultiModalTransformer(tokenizer, embedder, multimodal_encoder, multimodal_transformer_head).to(device)

    # creating ensembling model
    ensembling_text_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    ensembling_audio_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    ensembling_transformer_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_DIM, nhead=4, dim_feedforward=512, batch_first=True).to(device)
    ensembling_transformer_encoder = nn.TransformerEncoder(ensembling_transformer_layer, num_layers=4).to(device)
    ensembling_text_model = TextModel(tokenizer, embedder, ensembling_text_head)
    ensembling_audio_model = AudioModel(ensembling_transformer_encoder, ensembling_audio_head)
    ensembling_fusion = EnsemblingFusion(ensembling_text_model, ensembling_audio_model).to(device)

    # creating unaligned multimodal model
    unaligned_head = nn.Sequential(
        nn.Linear(EMBEDDING_DIM*2, 256),
        nn.ReLU(),
        nn.Linear(256, MODEL_NUM_LABELS)
    ).to(device)
    unaligned_mm_model = UnalignedMultimodalModel(embedding_dim=EMBEDDING_DIM, d_ffn=100, n_blocks=4, head=unaligned_head).to(device)
    return {
        'text_only': text_only,
        'audio_only': audio_only,
        'multimodal': multimodal_transformer,
        'ensembling': ensembling_fusion,
        'unaligned': unaligned_mm_model
    }
}

In [None]:
SEEDS = [0, 42, 69, 420, 666]

val_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

test_results = {
    'text_only': [],
    'audio_only': [],
    'multimodal': [],
    'ensembling': [],
    'unaligned': []
}

EPOCHS = 20
INITIAL_LR = 1e-3
WEIGHT_DECAY = 1e-5
LR_DECAY_FACTOR = 1e-1
LR_DECAY_PATIENCE = 3
DROPOUT = 0.1
VERBOSE_TRAIN = True

for seed in SEEDS:
    print(f'{f"TRAINING WITH SEED {seed}":=^65}')
    print()
    torch.manual_seed(seed)
    np.random.seed(seed)
    # TODO: copilot suggested this 2 lines, check if they are useful
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    
    models = create_models()

    for model_name, model in models.items():
        torch.manual_seed(seed)
        np.random.seed(seed)
        print(f'{f"Training model {model_name}":_^65}')
        loss = ce_loss
        if model_name == 'ensembling':
            loss = custom_loss
        train(
            model,
            loss,
            train_dataloader,
            val_dataloader,
            epochs=EPOCHS,
            device=device,
            lr=INITIAL_LR,
            lr_decay_factor=LR_DECAY_FACTOR,
            lr_decay_patience=LR_DECAY_PATIENCE,
            weight_decay=WEIGHT_DECAY,
            verbose=VERBOSE_TRAIN
        )
        _, val_acc, val_f1, val_pred, val_targ = evaluate(model, val_dataloader, loss)
        _, test_acc, test_f1, test_pred, test_targ = evaluate(model, test_dataloader, loss)
        if VERBOSE_TRAIN:
            print(f'[VAL] Model: {model_name} - acc: {val_acc:.4f} - f1: {val_f1:.4f}')
            print(f'[TEST] Model: {model_name} - acc: {test_acc:.4f} - f1: {test_f1:.4f}')
        val_results[model_name].append({
            'acc': val_acc,
            'f1': val_f1,
            'pred': val_pred,
            'targ': val_targ
        })
        test_results[model_name].append({
            'acc': test_acc,
            'f1': test_f1,
            'pred': test_pred,
            'targ': test_targ
        })

# Error Analysis