### Import libraries

In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
# Switch every visible GPU to on-demand memory growth.
# Iterating an empty device list is a no-op, so no separate emptiness check
# is needed; a RuntimeError from TensorFlow is reported, not propagated.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention, GlobalAveragePooling1D, Layer, Add, Concatenate
from transformers import TFXLMRobertaModel

from tensorflow.keras.optimizers import Adam
from sklearn.utils import class_weight

- Load the train labels
- Load the test labels
- Load the text and audio data

### Text model &rarr; Transformer

In [None]:
# Load the pretrained "xlm-roberta-base" checkpoint as a TensorFlow model.
# This single module-level instance is the encoder called inside
# create_roberta_text_model().
roberta_model = TFXLMRobertaModel.from_pretrained("xlm-roberta-base")  # another option: xlm-roberta-large

In [None]:
def transformer_block(embeddings, num_heads, dff, dropout_rate=0.3):
    """Run ``embeddings`` through one self-attention + feed-forward block.

    Each of the two sub-layers (multi-head self-attention, then a two-layer
    GELU feed-forward network) is followed by dropout, a residual addition
    and layer normalisation applied to the sum.

    Args:
        embeddings: Symbolic tensor; its last-axis size is used as the
            attention ``key_dim`` and as the feed-forward output width.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.

    Returns:
        The normalised output of the feed-forward sub-layer; its last axis
        has the same size as that of ``embeddings``.
    """
    width = embeddings.shape[-1]

    # --- Self-attention sub-layer (query and value are both `embeddings`).
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=width)
    attended = attention(embeddings, embeddings)
    attended = Dropout(dropout_rate)(attended)
    attention_block = LayerNormalization(epsilon=1e-6)(embeddings + attended)

    # --- Position-wise feed-forward sub-layer: expand to dff, project back.
    expand = Dense(dff, activation=tf.nn.gelu)
    hidden = expand(attention_block)
    project = Dense(width)
    projected = project(hidden)
    projected = Dropout(dropout_rate)(projected)

    return LayerNormalization(epsilon=1e-6)(attention_block + projected)

In [None]:
# Fixed number of token ids per example fed to the text branch.
MAX_LENGTH = 252


def create_roberta_text_model():
    """Build the text branch: RoBERTa -> transformer block -> mean pooling.

    Returns:
        A ``tf.keras.Model`` mapping an int32 tensor of shape
        ``(batch, MAX_LENGTH)`` to one pooled vector per example.
    """
    token_ids = Input(shape=(MAX_LENGTH,), dtype=tf.int32)

    # Element 0 of the encoder output is taken as the per-token embeddings
    # (presumably the last hidden state - TODO confirm).
    # NOTE(review): only token ids are passed to the encoder; if inputs are
    # padded up to MAX_LENGTH, padding positions are presumably attended to
    # and included in the average below - confirm whether a mask is needed.
    token_embeddings = roberta_model(token_ids)[0]

    encoded = transformer_block(token_embeddings, num_heads=6, dff=512)

    # Collapse the sequence axis by averaging.
    pooled = GlobalAveragePooling1D()(encoded)

    return tf.keras.Model(inputs=token_ids, outputs=pooled)


text_model = create_roberta_text_model()

### Audio model &rarr; MLP-Mixer

In [None]:
# Per-example shape of the audio features, read from axes 1 and 2 of the
# training array (axis 0 is presumably the sample axis - TODO confirm).
# Used as the Input shape of the MLP-Mixer model.
audio_input_shape = (train_audio_features.shape[1], train_audio_features.shape[2])

class MixerLayer(Layer):
    """MLP-Mixer style block: a token-mixing MLP followed by a channel-mixing MLP.

    The layer expects a rank-3 input ``(batch, tokens, channels)`` (the
    transposes in ``call`` swap axes 1 and 2) and returns a tensor of the
    same shape.

    Args:
        tokens_mlp_dim: Hidden width of the token-mixing MLP.
        channels_mlp_dim: Hidden width of the channel-mixing MLP.
        dropout_rate: Dropout rate applied to the block output.
        **kwargs: Standard Keras ``Layer`` keyword arguments (``name``,
            ``dtype``, ...).
    """

    def __init__(self, tokens_mlp_dim, channels_mlp_dim, dropout_rate, **kwargs):
        # Forward the standard Layer kwargs so the layer can be named and
        # re-created from its config.
        super().__init__(**kwargs)
        self.tokens_mlp_dim = tokens_mlp_dim
        self.channels_mlp_dim = channels_mlp_dim
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        """Create the sub-layers; output widths are taken from ``input_shape``."""
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        # Token-mixing MLP: acts on the token axis (after a transpose) and
        # projects back to the original number of tokens, input_shape[1].
        self.dense1 = Dense(self.tokens_mlp_dim, activation=tf.nn.gelu)
        self.dense2 = Dense(input_shape[1], activation=tf.nn.gelu)
        # Channel-mixing MLP: acts on the channel axis and projects back to
        # the original number of channels, input_shape[2].
        self.dense3 = Dense(self.channels_mlp_dim, activation=tf.nn.gelu)
        self.dense4 = Dense(input_shape[2], activation=tf.nn.gelu)
        self.dropout = Dropout(self.dropout_rate)

    def call(self, inputs, training=None):
        """Apply token mixing, channel mixing and dropout.

        Args:
            inputs: Rank-3 tensor ``(batch, tokens, channels)``.
            training: Whether dropout is active; ``None`` lets Keras decide
                from the calling context.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # NOTE(review): the token-mixing skip connection starts from the
        # normalised tensor rather than the raw input, both MLP output
        # projections apply GELU, and dropout is applied after the final
        # residual sum. The reference MLP-Mixer formulation differs on these
        # points; kept as-is to preserve existing behaviour - confirm intent.

        # Token mixing: transpose so Dense mixes along the token axis.
        x = self.layer_norm1(inputs)
        x_t = tf.transpose(x, perm=[0, 2, 1])
        x_t = self.dense1(x_t)
        x_t = self.dense2(x_t)
        x_t = tf.transpose(x_t, perm=[0, 2, 1])
        # Plain addition instead of instantiating a new Add layer per call.
        x = x + x_t

        # Channel mixing
        y = self.layer_norm2(x)
        y = self.dense3(y)
        y = self.dense4(y)
        y = x + y
        y = self.dropout(y, training=training)

        return y

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({
            "tokens_mlp_dim": self.tokens_mlp_dim,
            "channels_mlp_dim": self.channels_mlp_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

In [None]:
# Audio branch: twelve identical Mixer blocks stacked on the audio input,
# followed by averaging over axis 1 to give one vector per example.
# Layers are created in the same order as they appear in the model.
mlp_mixer_model = Sequential(
    [Input(shape=audio_input_shape)]
    + [
        MixerLayer(tokens_mlp_dim=32, channels_mlp_dim=32, dropout_rate=0.5)
        for _ in range(12)
    ]
    + [GlobalAveragePooling1D()]
)

### Fusion module - Cross modality

In [None]:
class CrossAttention(Layer):
    """Dot-product attention of a query tensor over a second (key/value) tensor.

    Call the layer with a two-element list ``[query, key]``. Both inputs are
    linearly projected to ``units`` features; keys and values are separate
    projections of the same ``key`` input.

    Inputs may be rank-3 ``(batch, length, features)`` or rank-2
    ``(batch, features)``. A rank-2 input is treated as a sequence of
    length 1, so attention is always computed within each example and never
    across the batch axis. The output has the rank of ``query`` and
    ``units`` features on its last axis.

    Note: when ``key`` is a single vector per example, the softmax runs over
    one position and equals 1, so the output reduces to the value projection
    of ``key``; the query/key projections then do not influence the result.

    Args:
        units: Size of the query, key and value projections.
        **kwargs: Standard Keras ``Layer`` keyword arguments.
    """

    def __init__(self, units, **kwargs):
        super(CrossAttention, self).__init__(**kwargs)
        self.units = units
        self.dense_query = Dense(units)
        self.dense_key = Dense(units)
        self.dense_value = Dense(units)
        self.softmax = tf.keras.layers.Softmax(axis=-1)

    def call(self, inputs):
        """Attend from ``inputs[0]`` (query) over ``inputs[1]`` (key/value source)."""
        query, source = inputs

        # With rank-2 inputs, matmul(query, key^T) would be (batch, batch):
        # every example would attend to the other examples in its batch and
        # predictions would depend on batch composition. Adding a length-1
        # sequence axis keeps the matmul batched per example.
        query_is_vector = len(query.shape) == 2
        if query_is_vector:
            query = tf.expand_dims(query, axis=1)
        if len(source.shape) == 2:
            source = tf.expand_dims(source, axis=1)

        projected_query = self.dense_query(query)
        projected_key = self.dense_key(source)
        # Values come from the raw source, not from the already-projected key.
        value = self.dense_value(source)

        score = tf.matmul(projected_query, projected_key, transpose_b=True)
        alignment = self.softmax(score)
        context = tf.matmul(alignment, value)

        if query_is_vector:
            # Drop the temporary sequence axis: back to (batch, units).
            context = tf.squeeze(context, axis=1)
        return context

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super(CrossAttention, self).get_config()
        config.update({"units": self.units})
        return config

In [None]:
class CombinedModelWithText(tf.keras.Model):
    """Two-branch fusion classifier over text and audio inputs.

    Each input is encoded by its own sub-model, the two encodings are passed
    through a pair of cross-attention layers (text querying audio, and audio
    querying text), the results are concatenated and fed to a GELU dense
    layer, dropout and a single-unit sigmoid classifier.

    Args:
        text_model: Keras model applied to the text input.
        mlp_mixer_model: Keras model applied to the audio input.
    """

    def __init__(self, text_model, mlp_mixer_model):
        super(CombinedModelWithText, self).__init__()
        self.text_model = text_model
        self.mlp_mixer_model = mlp_mixer_model
        self.cross_attention_text = CrossAttention(64)
        self.cross_attention_audio = CrossAttention(64)
        # Created once here instead of on every forward pass.
        self.concat = Concatenate()
        self.fclayer = Dense(128, activation=tf.nn.gelu)
        self.dropout = Dropout(0.5)
        self.classifier = Dense(1, activation='sigmoid')

    def call(self, inputs, training=None):
        """Compute the sigmoid output for a ``(text_input, audio_input)`` pair.

        Args:
            inputs: Two-element sequence ``(text_input, audio_input)``.
            training: Forwarded to both sub-models and to dropout; ``None``
                lets Keras decide from the calling context.

        Returns:
            Output of the single-unit sigmoid classifier.
        """
        text_input, audio_input = inputs
        text_output = self.text_model(text_input, training=training)
        mlp_mixer_output = self.mlp_mixer_model(audio_input, training=training)

        # First element is the query, second the key/value source.
        cross_attention_text_output = self.cross_attention_text([text_output, mlp_mixer_output])
        cross_attention_audio_output = self.cross_attention_audio([mlp_mixer_output, text_output])

        concatenated_output = self.concat([cross_attention_text_output, cross_attention_audio_output])
        x = self.fclayer(concatenated_output)
        x = self.dropout(x, training=training)
        x = self.classifier(x)

        return x

# Create the combined model: wraps the text branch (text_model) and the audio
# branch (mlp_mixer_model) in a single two-input classifier.
fusion_model = CombinedModelWithText(text_model, mlp_mixer_model)

In [None]:
learning_rate = 2e-6  # Change this value to adjust the learning rate
# `lr` is a deprecated alias that newer Keras versions reject; use the
# canonical `learning_rate` keyword.
optimizer = Adam(learning_rate=learning_rate)

fusion_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Stop when the validation loss has not improved for 40 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=40)

# Balanced per-CLASS weights. compute_sample_weight returns one weight per
# sample, so indexing its result with [0] and [1] would pick the weights of
# the first two samples rather than those of classes 0 and 1.
label_vector = np.ravel(train_labels)
class_labels = np.unique(label_vector)
class_weights_list = class_weight.compute_class_weight(
    class_weight='balanced', classes=class_labels, y=label_vector)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights_list)
}

fusion_model.fit((train_text_features, train_audio_features), train_labels, epochs=300,
                 validation_split=0.2, batch_size=2,
                 callbacks=[early_stopping], class_weight=class_weights)