In [1]:
import torch
import torch.nn as nn
from torch.nn.functional import normalize
import torch.nn.functional as F

In [2]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from torch.optim.lr_scheduler import ExponentialLR
from sklearn.model_selection import train_test_split

import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

In [3]:
import torch.optim as optim
import numpy as np
from sklearn.metrics import classification_report, f1_score, accuracy_score

In [4]:
# Check for GPU availability
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [5]:
'''
Sample-Weighted Focal Contrastive (SWFC) Loss:
1. Divide training samples into positive and negative pairs to maximize
inter-class distances while minimizing intra-class distances;
2. Assign more importance to hard-to-classify positive pairs;
3. Assign more importance to minority classes.
'''
class SampleWeightedFocalContrastiveLoss(nn.Module):

    def __init__(self, temp_param, focus_param, sample_weight_param, dataset, class_counts, device):
        '''
        temp_param: control the strength of penalty on hard negative samples;
        focus_param: forces the model to concentrate on hard-to-classify samples;
        sample_weight_param: control the strength of penalty on minority classes;
        dataset: MELD or IEMOCAP.
        device: cpu or cuda.
        '''
        super().__init__()

        self.temp_param = temp_param
        self.focus_param = focus_param
        self.sample_weight_param = sample_weight_param
        self.dataset = dataset
        self.class_counts = class_counts
        self.device = device

        if self.dataset == 'MELD':
            self.num_classes = 7
        elif self.dataset == 'IEMOCAP':
            self.num_classes = 6
        else:
            raise ValueError('Please choose either MELD or IEMOCAP')

        self.class_weights = self.get_sample_weights()


    '''
    Use dot-product to measure the similarity between feature pairs.
    '''
    def dot_product_similarity(self, current_features, feature_sets):
        similarity = torch.sum(current_features * feature_sets, dim = -1)
        similarity_probs = torch.softmax(similarity / self.temp_param, dim = 0)

        return similarity_probs


    '''
    Calculate the loss contributed from positive pairs.
    '''
    def positive_pairs_loss(self, similarity_probs):
        pos_pairs_loss = torch.mean(torch.log(similarity_probs) * ((1 - similarity_probs)**self.focus_param), dim = 0)

        return pos_pairs_loss


    '''
    Assign more importance to minority classes.
    '''
    def get_sample_weights(self):
        total_counts = torch.sum(self.class_counts, dim = -1)
        class_weights = (total_counts / self.class_counts)**self.sample_weight_param
        class_weights = normalize(class_weights, dim = -1, p = 1.0)

        return class_weights


    def forward(self, features, labels):
        self.num_samples = labels.shape[0]
        self.feature_dim = features.shape[-1]

        features = normalize(features, dim = -1)  # normalization helps smooth the learning process

        batch_sample_weights = torch.FloatTensor([self.class_weights[label] for label in labels]).to(self.device)

        total_loss = 0.0
        for i in range(self.num_samples):
            current_feature = features[i]
            current_label = labels[i]
            feature_sets = torch.cat((features[:i], features[i + 1:]), dim = 0)
            label_sets = torch.cat((labels[:i], labels[i + 1:]), dim = 0)
            expand_current_features = current_feature.expand(self.num_samples - 1, self.feature_dim).to(self.device)
            similarity_probs = self.dot_product_similarity(expand_current_features, feature_sets)
            pos_similarity_probs = similarity_probs[label_sets == current_label]  # positive pairs with the same label
            if len(pos_similarity_probs) > 0:
                pos_pairs_loss = self.positive_pairs_loss(pos_similarity_probs)
                weighted_pos_pairs_loss = pos_pairs_loss * batch_sample_weights[i]
                total_loss += weighted_pos_pairs_loss

        loss = - total_loss / self.num_samples

        return loss

In [6]:
'''
Maximize the correlations across multimodal-fused features
extracted from MultiAttn through Soft-HGR loss.
'''
class SoftHGRLoss(nn.Module):

    def __init__(self):
        super().__init__()


    '''
    Calculate the inner products between feature mappings.
    '''
    def feature_mapping(self, feature_X, feature_Y):
        feature_mapping_X_Y = torch.mean(torch.sum(feature_X * feature_Y, dim = -1), dim = 0)

        return feature_mapping_X_Y


    '''
    Calculate the inner products between feature covariances.
    '''
    def feature_covariance(self, feature_X, feature_Y):
        cov_feature_X = torch.cov(feature_X)
        cov_feature_Y = torch.cov(feature_Y)
        # We empirically find that scaling the feature covariance by a factor of 1 / num_samples
        # leads to enhanced training stability and improvements in model performances.
        feature_covariance_X_Y = torch.trace(torch.matmul(cov_feature_X, cov_feature_Y)) / self.num_samples
        return feature_covariance_X_Y


    def forward(self, f_t, f_a, f_v):
        self.num_samples = f_t.shape[0]

        all_features = [f_t, f_a, f_v]
        total_loss = 0.0
        for i in range(len(all_features) - 1):
            for j in range(i + 1, len(all_features)):
                feature_mapping_i_j = self.feature_mapping(all_features[i], all_features[j])
                feature_covariance_i_j = self.feature_covariance(all_features[i], all_features[j])
                soft_hgr_loss_i_j = feature_mapping_i_j - feature_covariance_i_j / 2
                total_loss += soft_hgr_loss_i_j

        loss = - total_loss / self.num_samples

        return loss

In [7]:
'''
2-layer MLP with ReLU activation.
'''
class MLP(nn.Module):

    def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate):
        super().__init__()

        self.linear_1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear_2 = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, x):
        return self.dropout(self.linear_2(self.relu(self.linear_1(x))))

In [8]:
'''
Bidirectional cross-attention layers.
'''
class BidirectionalCrossAttention(nn.Module):

    def __init__(self, model_dim, Q_dim, K_dim, V_dim):
        super().__init__()

        self.query_matrix = nn.Linear(model_dim, Q_dim)
        self.key_matrix = nn.Linear(model_dim, K_dim)
        self.value_matrix = nn.Linear(model_dim, V_dim)


    def bidirectional_scaled_dot_product_attention(self, Q, K, V):
        score = torch.bmm(Q, K.transpose(-1, -2))
        scaled_score = score / (K.shape[-1]**0.5)
        attention = torch.bmm(F.softmax(scaled_score, dim = -1), V)

        return attention


    def forward(self, query, key, value):
        Q = self.query_matrix(query)
        K = self.key_matrix(key)
        V = self.value_matrix(value)
        attention = self.bidirectional_scaled_dot_product_attention(Q, K, V)

        return attention



'''
Multi-head bidirectional cross-attention layers.
'''
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, model_dim, Q_dim, K_dim, V_dim):
        super().__init__()

        self.num_heads = num_heads
        self.attention_heads = nn.ModuleList(
            [BidirectionalCrossAttention(model_dim, Q_dim, K_dim, V_dim) for _ in range(self.num_heads)]
        )
        self.projection_matrix = nn.Linear(num_heads * V_dim, model_dim)


    def forward(self, query, key, value):
        heads = [self.attention_heads[i](query, key, value) for i in range(self.num_heads)]
        multihead_attention = self.projection_matrix(torch.cat(heads, dim = -1))

        return multihead_attention



'''
A feed-forward network, which operates as a key-value memory.
'''
class Feedforward(nn.Module):

    def __init__(self, model_dim, hidden_dim, dropout_rate):
        super().__init__()

        self.linear_W1 = nn.Linear(model_dim, hidden_dim)
        self.linear_W2 = nn.Linear(hidden_dim, model_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, x):
        return self.dropout(self.linear_W2(self.relu(self.linear_W1(x))))



'''
Residual connection to smooth the learning process.
'''
class AddNorm(nn.Module):

    def __init__(self, model_dim, dropout_rate):
        super().__init__()

        self.layer_norm = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, x, sublayer):
        output = self.layer_norm(x + self.dropout(sublayer(x)))

        return output



'''
MultiAttn is a multimodal fusion model which aims to capture the complicated interactions and
dependencies across textual, audio and visual modalities through bidirectional cross-attention layers.
MultiAttn is made up of three sub-components:
1. MultiAttn_text: integrate the textual modality with audio and visual information;
2. MultiAttn_audio: incorporate the audio modality with textual and visual information;
3. MultiAttn_visual: fuse the visual modality with textual and visual cues.
'''
class MultiAttnLayer(nn.Module):

    def __init__(self, num_heads, model_dim, hidden_dim, dropout_rate):
        super().__init__()

        Q_dim = K_dim = V_dim = model_dim // num_heads
        self.attn_1 = MultiHeadAttention(num_heads, model_dim, Q_dim, K_dim, V_dim)
        self.add_norm_1 = AddNorm(model_dim, dropout_rate)
        self.attn_2 = MultiHeadAttention(num_heads, model_dim, Q_dim, K_dim, V_dim)
        self.add_norm_2 = AddNorm(model_dim, dropout_rate)
        self.ff = Feedforward(model_dim, hidden_dim, dropout_rate)
        self.add_norm_3 = AddNorm(model_dim, dropout_rate)


    def forward(self, query_modality, modality_A, modality_B):
        attn_output_1 = self.add_norm_1(query_modality, lambda query_modality: self.attn_1(query_modality, modality_A, modality_A))
        attn_output_2 = self.add_norm_2(attn_output_1, lambda attn_output_1: self.attn_2(attn_output_1, modality_B, modality_B))
        ff_output = self.add_norm_3(attn_output_2, self.ff)

        return ff_output



'''
Stacks of MultiAttn layers.
'''
class MultiAttn(nn.Module):

    def __init__(self, num_layers, model_dim, num_heads, hidden_dim, dropout_rate):
        super().__init__()

        self.multiattn_layers = nn.ModuleList([
            MultiAttnLayer(num_heads, model_dim, hidden_dim, dropout_rate) for _ in range(num_layers)])


    def forward(self, query_modality, modality_A, modality_B):
        for multiattn_layer in self.multiattn_layers:
            query_modality = multiattn_layer(query_modality, modality_A, modality_B)

        return query_modality



class MultiAttnModel(nn.Module):

    def __init__(self, num_layers, model_dim, num_heads, hidden_dim, dropout_rate):
        super().__init__()

        self.multiattn_text = MultiAttn(num_layers, model_dim, num_heads, hidden_dim, dropout_rate)
        self.multiattn_audio = MultiAttn(num_layers, model_dim, num_heads, hidden_dim, dropout_rate)
        self.multiattn_visual = MultiAttn(num_layers, model_dim, num_heads, hidden_dim, dropout_rate)


    def forward(self, text_features, audio_features, visual_features):
        f_t = self.multiattn_text(text_features, audio_features, visual_features)
        f_a = self.multiattn_audio(audio_features, text_features, visual_features)
        f_v = self.multiattn_visual(visual_features, text_features, audio_features)

        return f_t, f_a, f_v

In [9]:
class FakeBERT(nn.Module):
    def __init__(
        self,
        device,
        roberta_model_path='roberta-base',
        num_classes=1,
        inductor=True
        ):
        super(FakeBERT, self).__init__()

        # Load pre-trained RoBERTa model
        self.roberta = RobertaModel.from_pretrained(roberta_model_path).to(device=device)
        if (inductor):
          self.roberta = torch.compile(self.roberta, backend="inductor")
        self.tokenizer = RobertaTokenizer.from_pretrained(roberta_model_path)

        # CNN
        self.conv1d_p1 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=5).to(device=device)
        self.conv1d_p2 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=4).to(device=device)
        self.conv1d_p3 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=3).to(device=device)
        self.conv1d_s1 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5).to(device=device)
        self.conv1d_s2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5).to(device=device)

        # Pooling
        self.max_pool_p1 = nn.MaxPool1d(kernel_size=5).to(device=device)
        self.max_pool_p2 = nn.MaxPool1d(kernel_size=5).to(device=device)
        self.max_pool_p3 = nn.MaxPool1d(kernel_size=5).to(device=device)
        self.max_pool_s1 = nn.MaxPool1d(kernel_size=5).to(device=device)
        self.max_pool_s2 = nn.MaxPool1d(kernel_size=10).to(device=device)

        # Fully connected layers
        self.linear1 = nn.Linear(640, 128).to(device=device)
        self.linear2 = nn.Linear(128, num_classes).to(device=device)
        self.sigmoid = nn.Sigmoid().to(device=device)

    def forward(self, x):
        # Tokenize and encode the sentences
        tokenized_sentences = self.tokenizer(x, truncation=True, padding='max_length', return_tensors='pt').to(device=device)
        # print('tokenized_sentences', tokenized_sentences.shape)

        # Forward pass to get embeddings
        with torch.no_grad():
            # Get RoBERTa embeddings
            model_output = self.roberta(**tokenized_sentences)

        # Extract embeddings from the output
        embeddings = model_output.last_hidden_state
        # print('embeddings', embeddings.shape)

        output_p1 = self.max_pool_p1(F.relu(self.conv1d_p1(embeddings.permute(0, 2, 1))))
        output_p2 = self.max_pool_p2(F.relu(self.conv1d_p2(embeddings.permute(0, 2, 1))))
        output_p3 = self.max_pool_p3(F.relu(self.conv1d_p3(embeddings.permute(0, 2, 1))))
        output_s = torch.cat((output_p1, output_p2, output_p3), dim=2)
        output_s1 = F.relu(self.conv1d_s1(output_s))
        output_s1 = self.max_pool_s1(output_s1)
        output_s2 = F.relu(self.conv1d_s2(output_s1))
        output_s2 = self.max_pool_s2(output_s2)
        output_s2 = output_s2.permute(0, 2, 1)
        output_f = output_s2.reshape(output_s2.size(0), -1)
        output_l1 = torch.relu(self.linear1(output_f))
        output_l2 = self.linear2(output_l1)
        output = self.sigmoid(output_l2)

        return output, output_l1

In [10]:
# Initialize the model
fakebert = FakeBERT(
    device,
    inductor=False
)

sentences = ["I love this product!"]

outputs, before_sigmoid = fakebert(sentences)

print(outputs.shape)
print(before_sigmoid.shape)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

torch.Size([1, 1])
torch.Size([1, 128])


In [11]:
torch.save(fakebert.state_dict(), '/content/fakebert-sentiment.pth')
torch.save(fakebert.state_dict(), '/content/fakebert-contract.pth')
torch.save(fakebert.state_dict(), '/content/fakebert-transaction.pth')

In [15]:
'''
MultiEMO consists of three key components: unimodal context modeling, multimodal fusion, and emotion classification.
'''
class MultiEMO(nn.Module):

    def __init__(self, device, multi_attn_flag=True, hidden_dim=1024, dropout=0, num_layers=6,
                 model_dim=128, num_heads=4, n_classes=1):
        super().__init__()

        self.multi_attn_flag = multi_attn_flag

        # TODO
        self.social_sentiment = FakeBERT(device=device, inductor=False)
        self.social_sentiment.load_state_dict(torch.load('/content/fakebert-sentiment.pth'))
        self.smart_contract = FakeBERT(device=device, inductor=False)
        self.smart_contract.load_state_dict(torch.load('/content/fakebert-contract.pth'))
        self.transaction = FakeBERT(device=device, inductor=False)
        self.transaction.load_state_dict(torch.load('/content/fakebert-transaction.pth'))



        self.multiattn = MultiAttnModel(num_layers, model_dim, num_heads, hidden_dim, dropout)

        self.fc = nn.Linear(model_dim * 3, model_dim)

        # if self.dataset == 'MELD':
        #     self.mlp = MLP(model_dim, model_dim * 2, n_classes, dropout)
        # elif self.dataset == 'IEMOCAP':
        #     self.mlp = MLP(model_dim, model_dim, n_classes, dropout)
        self.mlp = MLP(model_dim, model_dim, n_classes, dropout)


    def forward(self, texts, contracts, transactions):
        _, sentiment_features = self.social_sentiment(texts)
        sentiment_features = sentiment_features.unsqueeze(1).repeat(1, 10, 1)
        print('sentiment_features', sentiment_features.shape)
        _, contract_features = self.smart_contract(contracts)
        contract_features = contract_features.unsqueeze(1).repeat(1, 10, 1)
        print('contract_features', contract_features.shape)
        _, transaction_features = self.transaction(transactions)
        transaction_features = transaction_features.unsqueeze(1).repeat(1, 10, 1)
        print('transaction_features', transaction_features.shape)

        sentiment_features = sentiment_features.transpose(0, 1)
        print('sentiment_features', sentiment_features.shape)
        contract_features = contract_features.transpose(0, 1)
        print('contract_features', contract_features.shape)
        transaction_features = transaction_features.transpose(0, 1)
        print('transaction_features', transaction_features.shape)

        if self.multi_attn_flag == True:
            fused_sentiment_features, fused_contract_features, fused_transaction_features = self.multiattn(sentiment_features, contract_features, transaction_features)
        else:
            fused_sentiment_features, fused_contract_features, fused_transaction_features = sentiment_features, contract_features, transaction_features

        print('fused_sentiment_features', fused_sentiment_features.shape)
        print('fused_contract_features', fused_contract_features.shape)
        print('fused_transaction_features', fused_transaction_features.shape)
        fused_sentiment_features = fused_sentiment_features.reshape(-1, fused_sentiment_features.shape[-1])
        # fused_text_features = fused_text_features[padded_labels != -1]
        fused_contract_features = fused_contract_features.reshape(-1, fused_contract_features.shape[-1])
        # fused_audio_features = fused_audio_features[padded_labels != -1]
        fused_transaction_features = fused_transaction_features.reshape(-1, fused_transaction_features.shape[-1])
        # fused_visual_features = fused_visual_features[padded_labels != -1]

        fused_features = torch.cat((fused_sentiment_features, fused_contract_features, fused_transaction_features), dim = -1)
        fc_outputs = self.fc(fused_features)
        mlp_outputs = self.mlp(fc_outputs)

        return fused_sentiment_features, fused_contract_features, fused_transaction_features, fc_outputs, mlp_outputs

In [13]:
# Initialize the model
multiemo = MultiEMO(
    device=device
)

sentences = ["I love this product!"]

fused_text_features, fused_audio_features, fused_visual_features, fc_outputs, mlp_outputs = multiemo(sentences, sentences, sentences)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentiment_features torch.Size([1, 1, 128])
contract_features torch.Size([1, 1, 128])
transaction_features torch.Size([1, 1, 128])
fused_sentiment_features torch.Size([1, 1, 128])
fused_contract_features torch.Size([1, 1, 128])
fused_transaction_features torch.Size([1, 1, 128])


In [14]:
print("fused_text_features shape:", fused_text_features.shape)
print("fused_audio_features shape:", fused_audio_features.shape)
print("fused_visual_features shape:", fused_visual_features.shape)
print("fc_outputs shape:", fc_outputs.shape)
print("mlp_outputs shape:", mlp_outputs.shape)

fused_text_features shape: torch.Size([1, 128])
fused_audio_features shape: torch.Size([1, 128])
fused_visual_features shape: torch.Size([1, 128])
fc_outputs shape: torch.Size([1, 128])
mlp_outputs shape: torch.Size([1, 1])


In [16]:
# Initialize the model
multiemo = MultiEMO(
    device=device
)

sentences = ["I love this product!"]

fused_text_features, fused_audio_features, fused_visual_features, fc_outputs, mlp_outputs = multiemo(sentences, sentences, sentences)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentiment_features torch.Size([1, 10, 128])
contract_features torch.Size([1, 10, 128])
transaction_features torch.Size([1, 10, 128])
sentiment_features torch.Size([10, 1, 128])
contract_features torch.Size([10, 1, 128])
transaction_features torch.Size([10, 1, 128])
fused_sentiment_features torch.Size([10, 1, 128])
fused_contract_features torch.Size([10, 1, 128])
fused_transaction_features torch.Size([10, 1, 128])


In [18]:
print("fused_text_features shape:", fused_text_features.shape)
print("fused_audio_features shape:", fused_audio_features.shape)
print("fused_visual_features shape:", fused_visual_features.shape)
print("fc_outputs shape:", fc_outputs.shape)
print("mlp_outputs shape:", mlp_outputs.shape)

fused_text_features shape: torch.Size([10, 128])
fused_audio_features shape: torch.Size([10, 128])
fused_visual_features shape: torch.Size([10, 128])
fc_outputs shape: torch.Size([10, 128])
mlp_outputs shape: torch.Size([10, 1])


In [35]:
sentences = ["I love this product!", "It's terrible.", "Awesome experience.", "Worst ever.", "Great job!", "Awful.", "Excellent service.", "Hate it.", "Fantastic!", "Disappointing."]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

x_train, x_valid, y_train, y_valid = train_test_split(sentences, labels, test_size=0.2, stratify=labels, random_state=2023)

# Create DataLoader for training and validation sets
batch_size = 128

train_dataset = list(zip(x_train, y_train))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = list(zip(x_valid, y_valid))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

num_classes = 2

def get_class_counts():
    class_counts = torch.zeros(num_classes).to(device)

    for inputs, labels in train_loader:
        class_counts += torch.bincount(labels, minlength = num_classes)

    return class_counts


# Initialize the model
SWFC_loss_param = 0.4
HGR_loss_param = 0.4
CE_loss_param = 0.2
sample_weight_param = 1.1
focus_param = 2.4
temp_param = 0.8
learning_rate = 0.0001
weight_decay = 0.00001
dataset = 'MELD'
class_counts = get_class_counts()


multiemo = MultiEMO(
    device=device
)

SWFCLoss = SampleWeightedFocalContrastiveLoss(
    temp_param,
    focus_param,
    sample_weight_param,
    dataset,
    class_counts,
    device
    )
HGR_loss = SoftHGRLoss()
# CE_loss = nn.CrossEntropyLoss()
CELoss = nn.BCELoss().to(device=device)  # Binary Cross-Entropy Loss for binary classification

optimizer = optim.Adam(multiemo.parameters(), lr = learning_rate, weight_decay = weight_decay)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.95, patience = 10, threshold = 1e-6, verbose = True)
sigmoid = nn.Sigmoid().to(device=device)

# Training loop
num_epochs = 5

for epoch in range(num_epochs):
    multiemo.train()
    total_loss = 0.0
    total_SWFC_loss, total_HGR_loss, total_CE_loss = 0.0, 0.0, 0.0
    all_labels, all_preds = [], []

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        fused_text_features, fused_audio_features, fused_visual_features, fc_outputs, mlp_outputs = multiemo(inputs, inputs, inputs)

        soft_HGR_loss = HGR_loss(fused_text_features, fused_audio_features, fused_visual_features).to(device=device)
        # print('soft_HGR_loss', soft_HGR_loss)
        # print('labels', labels)
        # print('fc_outputs', fc_outputs)
        SWFC_loss = SWFCLoss(fc_outputs, labels).to(device=device)
        # print('SWFC_loss', SWFC_loss)
        # print('mlp_outputs', mlp_outputs)
        output = sigmoid(mlp_outputs)
        # print('output', output)
        CE_loss = CELoss(output, torch.as_tensor(labels, dtype=torch.float32).unsqueeze(1).to(device=device))
        # print('CE_loss', CE_loss)

        loss = soft_HGR_loss * HGR_loss_param + SWFC_loss * SWFC_loss_param + CE_loss * CE_loss_param

        total_loss += loss.item()

        total_HGR_loss += soft_HGR_loss.item()
        total_SWFC_loss += SWFC_loss.item()
        total_CE_loss += CE_loss.item()

        loss.backward()
        optimizer.step()

        # preds = torch.argmax(mlp_outputs, dim = -1)
        preds = (output > 0.5).int()
        # print('preds', preds)
        # print('labels', labels)
        all_labels.append(labels.cpu().numpy())
        all_preds.append(preds.view(-1).cpu().numpy())

    all_labels = np.concatenate(all_labels)
    all_preds = np.concatenate(all_preds)
    avg_f1 = round(f1_score(all_labels, all_preds, average = 'weighted', zero_division=1) * 100, 4)
    print('avg_f1', avg_f1)
    avg_acc = round(accuracy_score(all_labels, all_preds) * 100, 4)
    print('avg_acc', avg_acc)
    report = classification_report(all_labels, all_preds, digits = 4, zero_division=1)
    print('report', report)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentiment_features torch.Size([8, 1, 128])
contract_features torch.Size([8, 1, 128])
transaction_features torch.Size([8, 1, 128])
fused_sentiment_features torch.Size([8, 1, 128])
fused_contract_features torch.Size([8, 1, 128])
fused_transaction_features torch.Size([8, 1, 128])
avg_f1 33.3333
avg_acc 50.0
report               precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667         4
           1     1.0000    0.0000    0.0000         4

    accuracy                         0.5000         8
   macro avg     0.7500    0.5000    0.3333         8
weighted avg     0.7500    0.5000    0.3333         8

sentiment_features torch.Size([8, 1, 128])
contract_features torch.Size([8, 1, 128])
transaction_features torch.Size([8, 1, 128])
fused_sentiment_features torch.Size([8, 1, 128])
fused_contract_features torch.Size([8, 1, 128])
fused_transaction_features torch.Size([8, 1, 128])
avg_f1 33.3333
avg_acc 50.0
report               precision    recall  f1-score   s