## Model 1 - ERC (Emotion Recognition in conversation)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# torch.Size([16, 10, 24, 768])
# torch.Size([16, 1, 24, 768])
# torch.Size([16, 24, 768])
# torch.Size([16, 24, 128])
# torch.Size([16, 24, 128])
# torch.Size([16, 24, 6])

# Final Output shape: torch.Size([16, 24, 6])


class CNN(nn.Module):
    def __init__(self, in_channels, mid_channels, out_channels):
        super(CNN, self).__init__()
        """
        CNN Module
        Input shape: (batch_size, in_channels, width, length)
        Output shape: (batch_size, out_channels, width, length)
        """
        self.conv1 = nn.Conv2d(in_channels, mid_channels, kernel_size=(1, 1))
        self.conv2 = nn.Conv2d(mid_channels, out_channels, kernel_size=(1, 1))

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.conv2(x)
        return x


class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTM, self).__init__()
        """
        BiLSTM Module
        Input shape: (batch_size, seq_length, input_size)
        Output shape: (batch_size, seq_length, hidden_size * 2)
        """
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        out, _ = self.bilstm(x)
        return out

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        """
        MLP Module
        Input shape: (batch_size, seq_length, input_size)
        Output shape: (batch_size, seq_length, output_size)
        """
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class ERC_CNN(nn.Module):
    def __init__(self, cnn, bilstm, mlp):
        super(ERC_CNN, self).__init__()
        self.cnn = cnn
        self.bilstm = bilstm
        self.mlp = mlp

    def forward(self, x):
        # CNN
        x = self.cnn(x)  # Output shape: (batch_size, out_channels, width, length)

        # BiLSTM
        x = x.squeeze(1)  # Remove the singleton dimension
        x = self.bilstm(x)  # Output shape: (batch_size, seq_length, hidden_size * 2)

        # MLP
        x = self.mlp(x)  # Output shape: (batch_size, seq_length, output_size)

        return x

# Define parameters
batch_size = 16
speakers = 10  # max_speakers
num_utterances = 24  # dialogue_length
embedding_size = 768

cnn_mid_channels = 3
cnn_out_channels = 1

hidden_lstm = 64
layers_lstm = 1

inputs_mlp = hidden_lstm * 2
hidden_mlp = 64
output_mlp = number_of_emotions = 6


# Create individual components
cnn = CNN(speakers, cnn_mid_channels, cnn_out_channels)
bilstm = BiLSTM(embedding_size, hidden_lstm, layers_lstm)
mlp = MLP(inputs_mlp, hidden_mlp, output_mlp)

# Create the combined model
model = ERC_CNN(cnn, bilstm, mlp)

# Generate random input tensor
input_tensor = torch.randn(batch_size, speakers, num_utterances, embedding_size)

# Forward pass through the model
output = model(input_tensor)  # Add batch dimension

print("Final Output shape:", output.shape)


Final Output shape: torch.Size([16, 24, 6])


### CNN

In [None]:
import torch
import torch.nn as nn

s = 10  # max_speakers
n = 24  # dialogue_length
input = torch.randn(s, n, 768)

# Define the parameters for the convolution operation
mid_channels = 3
out_channels = 1  # Number of output channels
kernel_size = (1, 1)  # Kernel size (height, width)
stride = 1

# Define the convolutional layer
conv3d_1 = nn.Conv2d(in_channels=s, out_channels=mid_channels, kernel_size=kernel_size)
conv3d_2 = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size)

# Perform convolution
mid = conv3d_1(input)
output = conv3d_2(mid)

print(input.shape)
print(mid.shape)
print(output.shape)

# Ensure the output has the desired shape
print("Output shape:", output.shape)  # Output shape should be 1*n*768

torch.Size([10, 24, 768])
torch.Size([3, 24, 768])
torch.Size([1, 24, 768])
Output shape: torch.Size([1, 24, 768])


### Attention

In [None]:
import torch
import torch.nn as nn

#! add speaker embedding maybe self attention on sparse

class SelfAttention(nn.Module):
    def __init__(self, embedding_size, num_heads):
        super(SelfAttention, self).__init__()
        self.num_heads = num_heads
        self.multihead_attn = nn.MultiheadAttention(embedding_size, num_heads)

    def forward(self, utterance_embeddings):
        # Reshape utterance embeddings to (seq_len, batch_size, embedding_dim)
        # utterance_embeddings = utterance_embeddings.unsqueeze(0)  # Add a dimension for seq_len
        output, _ = self.multihead_attn(utterance_embeddings, utterance_embeddings, utterance_embeddings)
        return output

# Example usage
# num_utterances = 24  # Number of utterances
# # Generate random utterance embeddings
# utterance_embeddings = torch.randn(1, num_utterances, embedding_size)

s = 10  # max_speakers
n = 24  # dialogue_length
num_heads = 8  # Number of attention heads
embedding_size = 768  # Size of each embedding
input = torch.randn(s, n, embedding_size)


# Define self-attention layer
attention_layer = SelfAttention(embedding_size=embedding_size, num_heads=num_heads)

# Apply self-attention
context_embeddings = attention_layer(input)

# Check the shape of the output tensor
print(context_embeddings.shape)  # Output shape: (n, embedding_dim)

torch.Size([10, 24, 768])


### Bi-LSTM

In [None]:
import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Bidirectional LSTM layer
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # 2 for bidirectional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM : (h_n, c_n): A tuple containing the final hidden state $h_n$ and the final cell state $c_n$ of the LSTM,
        out, _ = self.bilstm(x, (h0, c0))  # out shape: (batch_size, seq_length, hidden_size * 2)

        # Concatenate the outputs from both directions
        out = torch.cat((out[:, :, :self.hidden_size], out[:, :, self.hidden_size:]), dim=2)

        return out

# Define input size, hidden size, and number of layers
input_size = 768
hidden_size = 64
num_layers = 1

n = 24

# Create BiLSTM instance
bilstm = BiLSTM(input_size, hidden_size, num_layers)

# Generate random input tensor
input_tensor = torch.randn(1, n, 768)  # Batch size 24, input size 768
input_tensor = output

# Forward pass through BiLSTM
output = bilstm(input_tensor)  # Add batch dimension

print("Output shape:", output.shape)  # Output shape should be [1, 24, 128] (24 vectors each of size 128)

Output shape: torch.Size([1, 24, 128])


### MLP + Softmax

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmotionMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(EmotionMLP, self).__init__()
        self.relu = nn.ReLU()
        self.fc_1 = nn.Linear(input_size, hidden_size)
        self.fc_2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc_1(x)
        x = self.relu(x)
        x = self.fc_2(x)
        return x


# Define input size, hidden size, and output size for the MLP
input_size = 128
hidden_size = 64
output_size = 6

n = 24

# Create a shared MLP instance
shared_mlp = EmotionMLP(input_size, hidden_size, output_size)

# Generate random input tensor
input_tensor = torch.randn(1, n, 128)  # Number of utterances: n = 24, Utterance embedding size: 128
input_tensor = output

# Apply the shared MLP sequentially 24 times
outputs = []
for i in range(n):
    output = shared_mlp(input_tensor[:, i, :])  # Feed each 128-dimensional vector
    output = F.softmax(output, dim=1)  # Apply softmax along dimension 1
    outputs.append(output.unsqueeze(1))  # Add a singleton dimension for concatenation later

# Concatenate the outputs along the second dimension to get 24 6-dimensional outputs
final_output = torch.cat(outputs, dim=1).squeeze(0)

print("Final output shape:", final_output)  # Output shape should be [24, 6]

Final output shape: tensor([[0.1715, 0.1687, 0.1636, 0.1813, 0.1558, 0.1591],
        [0.1742, 0.1655, 0.1609, 0.1858, 0.1506, 0.1629],
        [0.1669, 0.1639, 0.1596, 0.1933, 0.1518, 0.1645],
        [0.1666, 0.1675, 0.1603, 0.1886, 0.1520, 0.1649],
        [0.1680, 0.1675, 0.1578, 0.1851, 0.1542, 0.1673],
        [0.1658, 0.1724, 0.1589, 0.1861, 0.1524, 0.1644],
        [0.1625, 0.1651, 0.1624, 0.1926, 0.1533, 0.1641],
        [0.1688, 0.1619, 0.1610, 0.1936, 0.1523, 0.1623],
        [0.1638, 0.1631, 0.1607, 0.1951, 0.1531, 0.1642],
        [0.1660, 0.1607, 0.1563, 0.1992, 0.1557, 0.1621],
        [0.1657, 0.1638, 0.1566, 0.1942, 0.1537, 0.1660],
        [0.1719, 0.1619, 0.1600, 0.1913, 0.1499, 0.1650],
        [0.1709, 0.1608, 0.1612, 0.1933, 0.1481, 0.1656],
        [0.1703, 0.1632, 0.1563, 0.1931, 0.1525, 0.1646],
        [0.1698, 0.1602, 0.1571, 0.1909, 0.1551, 0.1669],
        [0.1723, 0.1632, 0.1571, 0.1896, 0.1539, 0.1639],
        [0.1723, 0.1621, 0.1610, 0.1868, 0.1519, 0.1

In [None]:
# import torch
# import torch.nn as nn

# s = 10  # max_speakers
# n = 24  # dialogue_length
# input = torch.randn(s, n, 768)

# # Define the parameters for the convolution operation
# mid_channels = 3
# out_channels = 1  # Number of output channels
# kernel_size = (1, 1, 1)  # Kernel size (depth, height, width)
# stride = 1

# # Define the convolutional layer
# conv3d_1 = nn.Conv3d(in_channels=s, out_channels=mid_channels, kernel_size=kernel_size)
# conv3d_2 = nn.Conv3d(in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel_size)


# # Reshape input to (batch_size, in_channels, depth, height, width)
# input_ = input.unsqueeze(0).unsqueeze(2)  # Adding batch and depth dimensions

# # Perform convolution
# mid = conv3d_1(input_)
# output = conv3d_2(mid)

# print(input.shape)
# print(input_.shape)
# print(mid.shape)
# print(output.shape)

# # Ensure the output has the desired shape
# output = output.squeeze(0).squeeze(1)  # Remove batch and depth dimensions
# print("Output shape:", output.shape)  # Output shape should be 1*n*768

torch.Size([10, 24, 768])
torch.Size([1, 10, 1, 24, 768])
torch.Size([1, 3, 1, 24, 768])
torch.Size([1, 1, 1, 24, 768])
Output shape: torch.Size([1, 24, 768])


### Attention

In [None]:
import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    def __init__(self, input_size, num_heads):
        super(SelfAttention, self).__init__()
        self.num_heads = num_heads
        self.multihead_attn = nn.MultiheadAttention(input_size, num_heads)

    def forward(self, sentence_embeddings):
        output, _ = self.multihead_attn(sentence_embeddings, sentence_embeddings, sentence_embeddings)
        return output

sentence_embeddings = torch.randn(20, 10, 768)  # Shape: (seq_len, batch_size, embedding_dim)

# Define self-attention layer
num_heads = 1  # Number of attention heads
attention_layer = SelfAttention(input_size=768, num_heads=num_heads)

# Apply self-attention
context_embeddings = attention_layer(sentence_embeddings)

# Check the shape of the output tensor
print(context_embeddings.shape)

torch.Size([10, 768])


In [None]:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    def __init__(self, embedding_size, num_heads):
        super(SelfAttention, self).__init__()
        self.num_heads = num_heads
        self.multihead_attn = nn.MultiheadAttention(embedding_size, num_heads)

    def forward(self, utterance_embeddings):
        # Reshape utterance embeddings to (seq_len, batch_size, embedding_dim)
        utterance_embeddings = utterance_embeddings.unsqueeze(0)  # Add a dimension for seq_len
        output, _ = self.multihead_attn(utterance_embeddings, utterance_embeddings, utterance_embeddings)
        return output.squeeze(0)  # Remove the added dimension

# Example usage
num_utterances = 10  # Number of utterances
embedding_size = 768  # Size of each embedding
num_heads = 8  # Number of attention heads

# Generate random utterance embeddings
utterance_embeddings = torch.randn(num_utterances, embedding_size)

# Define self-attention layer
attention_layer = SelfAttention(embedding_size=embedding_size, num_heads=num_heads)

# Apply self-attention
context_embeddings = attention_layer(utterance_embeddings)

# Check the shape of the output tensor
print(context_embeddings.shape)  # Output shape: (n, embedding_dim)

torch.Size([10, 768])
