In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert self.head_dim * heads == embed_size, "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, value, key, query):
        N = query.shape[0]
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        value = value.reshape(N, value_len, self.heads, self.head_dim)
        key = key.reshape(N, key_len, self.heads, self.head_dim)
        query = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(value)
        keys = self.keys(key)
        queries = self.queries(query)

        # Calculate the attention scores
        attention = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        attention = F.softmax(attention / (self.embed_size ** (1 / 2)), dim=-1)

        # Apply the attention scores to the values
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        # Apply a final linear layer
        out = self.fc_out(out)
        return out

class ModalAttention(nn.Module):
    def __init__(self, embed_size, heads, modalities):
        super(ModalAttention, self).__init__()
        self.modalities = modalities
        self.attention = MultiHeadAttention(embed_size, heads)
        self.fc = nn.Linear(len(modalities) * embed_size, 1)

    def forward(self, fusion, *modalities):
        attention_weights = []
        for modality in modalities:
            attention_output = self.attention(modality, modality, fusion)
            attention_weights.append(torch.mean(attention_output, dim=1))

        # Concatenate the attention weights and pass through a linear layer to get the final weight coefficient
        concatenated_weights = torch.cat(attention_weights, dim=1)
        weight_coefficient = self.fc(concatenated_weights)
        return weight_coefficient

# Example usage
embed_size = 128
heads = 4
modalities = ['text', 'video', 'audio']

# Create random tensors for fusion and modalities
fusion = torch.randn((1, 10, embed_size))  # (batch_size, sequence_length, embed_size)
text = torch.randn((1, 10, embed_size))
video = torch.randn((1, 10, embed_size))
audio = torch.randn((1, 10, embed_size))

# Initialize the ModalAttention module
modal_attention = ModalAttention(embed_size, heads, modalities)

# Compute the weight coefficient
weight_coefficient = modal_attention(fusion, text, video, audio)
print("Weight Coefficient:", weight_coefficient)


Weight Coefficient: tensor([[-0.0098]], grad_fn=<AddmmBackward0>)


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ModalAttention(nn.Module):
    def __init__(self, embed_dim):
        super(ModalAttention, self).__init__()
        self.embed_dim = embed_dim
        self.query = nn.Parameter(torch.randn(embed_dim))
        self.key = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, fusion, text, video, audio):
        # 计算每个模态的键向量
        key_text = self.key(text)
        key_video = self.key(video)
        key_audio = self.key(audio)

        # 计算每个模态与融合模态之间的相似度
        sim_text = F.cosine_similarity(self.query, key_text, dim=-1)
        sim_video = F.cosine_similarity(self.query, key_video, dim=-1)
        sim_audio = F.cosine_similarity(self.query, key_audio, dim=-1)

        # 将相似度转换为权重
        weights = F.softmax(torch.stack([sim_text, sim_video, sim_audio]), dim=0)

        return weights

# 示例使用
embed_dim = 64
batch_size = 32

model = ModalAttention(embed_dim=embed_dim)
fusion = torch.rand(batch_size, embed_dim)
text = torch.rand(batch_size, embed_dim)
video = torch.rand(batch_size, embed_dim)
audio = torch.rand(batch_size, embed_dim)

weights = model(fusion, text, video, audio)

print("Weights:", weights)


Weights: tensor([[0.3436, 0.3216, 0.3404, 0.3017, 0.3263, 0.3533, 0.3364, 0.3165, 0.3373,
         0.3369, 0.3503, 0.3557, 0.3588, 0.3330, 0.3438, 0.3302, 0.3115, 0.3377,
         0.3180, 0.3148, 0.3471, 0.3427, 0.3404, 0.3195, 0.3345, 0.3108, 0.3255,
         0.3080, 0.3442, 0.3329, 0.3338, 0.3211],
        [0.3216, 0.3352, 0.3430, 0.3357, 0.3457, 0.3154, 0.3447, 0.3612, 0.3333,
         0.3548, 0.3216, 0.3224, 0.3232, 0.3321, 0.3064, 0.3527, 0.3576, 0.3256,
         0.3411, 0.3407, 0.3162, 0.3099, 0.2915, 0.3318, 0.3333, 0.3377, 0.3436,
         0.3419, 0.3044, 0.3463, 0.3319, 0.3288],
        [0.3348, 0.3433, 0.3165, 0.3626, 0.3280, 0.3313, 0.3189, 0.3222, 0.3294,
         0.3084, 0.3281, 0.3220, 0.3179, 0.3350, 0.3498, 0.3171, 0.3309, 0.3367,
         0.3409, 0.3445, 0.3367, 0.3474, 0.3681, 0.3487, 0.3322, 0.3515, 0.3309,
         0.3500, 0.3514, 0.3209, 0.3344, 0.3501]], grad_fn=<SoftmaxBackward0>)


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ModalAttention(nn.Module):
    def __init__(self, embed_dim):
        super(ModalAttention, self).__init__()
        self.embed_dim = embed_dim
        self.query = nn.Parameter(torch.randn(embed_dim))
        self.key = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, fusion, text, video, audio):
        # 计算每个模态的键向量
        key_text = self.key(text)
        key_video = self.key(video)
        key_audio = self.key(audio)

        # 计算每个模态与融合模态之间的相似度
        sim_text = F.cosine_similarity(self.query, key_text, dim=-1)
        sim_video = F.cosine_similarity(self.query, key_video, dim=-1)
        sim_audio = F.cosine_similarity(self.query, key_audio, dim=-1)

        # 对相似度进行归一化处理，得到每个模态的权重
        total_sim = sim_text + sim_video + sim_audio
        weight_text = sim_text / total_sim
        weight_video = sim_video / total_sim
        weight_audio = sim_audio / total_sim

        return weight_text, weight_video, weight_audio

# 示例使用
embed_dim = 64
batch_size = 32

model = ModalAttention(embed_dim=embed_dim)
fusion = torch.rand(batch_size, embed_dim)
text = torch.rand(batch_size, embed_dim)
video = torch.rand(batch_size, embed_dim)
audio = torch.rand(batch_size, embed_dim)

weight_text, weight_video, weight_audio = model(fusion, text, video, audio)

print("Weight (text):", weight_text)
print("Weight (video):", weight_video)
print("Weight (audio):", weight_audio)



Weight (text): tensor([0.4532, 0.3623, 0.2640, 0.2711, 0.2826, 0.4319, 0.0107, 0.6332, 0.2323,
        0.2703, 0.4177, 0.3813, 0.2058, 0.4034, 0.5191, 0.2613, 0.2060, 0.6612,
        0.1712, 0.2346, 0.2904, 0.5253, 0.3491, 0.5234, 0.2375, 0.3362, 0.3451,
        0.1626, 0.2903, 0.0746, 0.2338, 0.4558], grad_fn=<DivBackward0>)
Weight (video): tensor([0.3646, 0.2566, 0.1637, 0.3269, 0.2450, 0.3317, 0.6042, 0.2875, 0.3048,
        0.3187, 0.2695, 0.5047, 0.4971, 0.2498, 0.2946, 0.4898, 0.4071, 0.1398,
        0.3240, 0.5675, 0.3254, 0.2062, 0.2386, 0.1410, 0.1482, 0.4671, 0.2906,
        0.4586, 0.4693, 0.3450, 0.3958, 0.3312], grad_fn=<DivBackward0>)
Weight (audio): tensor([0.1822, 0.3811, 0.5723, 0.4020, 0.4724, 0.2364, 0.3851, 0.0793, 0.4629,
        0.4110, 0.3128, 0.1141, 0.2971, 0.3468, 0.1863, 0.2490, 0.3869, 0.1990,
        0.5048, 0.1979, 0.3842, 0.2685, 0.4123, 0.3356, 0.6143, 0.1967, 0.3643,
        0.3788, 0.2404, 0.5804, 0.3704, 0.2131], grad_fn=<DivBackward0>)


In [27]:
import torch
import torch.nn as nn
from torch.nn import MultiheadAttention

class AdaptiveModalityAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(AdaptiveModalityAttention, self).__init__()
        self.multihead_attention = MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.gate = nn.Sequential(
            nn.Linear(30, embed_size),
            nn.ReLU(),
            nn.Linear(embed_size, 3),  # 3 modalities
            nn.Softmax(dim=-1)
        )

    def forward(self, fusion, text, video, audio):
        # Transpose inputs to match the expected shape for MultiheadAttention
        fusion = fusion.transpose(0, 1)  # (seq_len, batch_size, embed_size)
        text = text.transpose(0, 1)  # (seq_len, batch_size, embed_size)
        video = video.transpose(0, 1)  # (seq_len, batch_size, embed_size)
        audio = audio.transpose(0, 1)  # (seq_len, batch_size, embed_size)

        # Calculate attention weights for each modality
        text_attention, _ = self.multihead_attention(fusion, text, text)
        video_attention, _ = self.multihead_attention(fusion, video, video)
        audio_attention, _ = self.multihead_attention(fusion, audio, audio)
        
        # Concatenate the attention weights across the sequence length dimension
        weights = torch.cat([text_attention, video_attention, audio_attention], dim=0)  # (3 * seq_len, batch_size, embed_size)
        weights = weights.permute(1, 2, 0)  # (batch_size, embed_size, 3 * seq_len)
        
        # Pass the concatenated weights through the gate
        gated_weights = self.gate(weights)  # (batch_size, embed_size, 3)

        # Separate the gated weights and take the mean across the embed_size dimension
        gated_text_weight = torch.mean(gated_weights[:, :, 0], dim=1, keepdim=True)  # (batch_size, 1)
        gated_video_weight = torch.mean(gated_weights[:, :, 1], dim=1, keepdim=True)  # (batch_size, 1)
        gated_audio_weight = torch.mean(gated_weights[:, :, 2], dim=1, keepdim=True)  # (batch_size, 1)

        return gated_text_weight, gated_video_weight, gated_audio_weight

# Example usage
embed_size = 256
heads = 8
adaptive_attention = AdaptiveModalityAttention(embed_size, heads)
fusion = torch.rand(5, 10, embed_size)  # (batch_size, seq_len, embed_size)
text = torch.rand(5, 10, embed_size)  # (batch_size, seq_len, embed_size)
video = torch.rand(5, 10, embed_size)  # (batch_size, seq_len, embed_size)
audio = torch.rand(5, 10, embed_size)  # (batch_size, seq_len, embed_size)

gated_text_weight, gated_video_weight, gated_audio_weight = adaptive_attention(fusion, text, video, audio)
print("Gated Text weight:", gated_text_weight)
print("Gated Video weight:", gated_video_weight)
print("Gated Audio weight:", gated_audio_weight)


Gated Text weight: tensor([[0.3326],
        [0.3327],
        [0.3327],
        [0.3328],
        [0.3328]], grad_fn=<MeanBackward1>)
Gated Video weight: tensor([[0.3303],
        [0.3303],
        [0.3303],
        [0.3299],
        [0.3303]], grad_fn=<MeanBackward1>)
Gated Audio weight: tensor([[0.3372],
        [0.3369],
        [0.3370],
        [0.3372],
        [0.3370]], grad_fn=<MeanBackward1>)


ValueError: too many values to unpack (expected 3)

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ModalAttention(nn.Module):
    def __init__(self, feature_size, num_heads):
        super(ModalAttention, self).__init__()
        self.multi_head_attention = nn.MultiheadAttention(embed_dim=feature_size, num_heads=num_heads, batch_first=True)

    def forward(self, fusion, text, video, audio):
        t_f, text_weights = self.multi_head_attention(fusion, text, text)
        _, video_weights = self.multi_head_attention(fusion, video, video)
        _, audio_weights = self.multi_head_attention(fusion, audio, audio)
        print(f"t_f.size():{t_f.size()}")
        print(f"text_weights:{text_weights.size()}")
        # Summing up the attention weights across the sequence length dimension
        text_weights = text_weights.sum(dim=-2)
        video_weights = video_weights.sum(dim=-2)
        audio_weights = audio_weights.sum(dim=-2)

        # Concatenating the weights and applying softmax to normalize
        weights = torch.cat([text_weights, video_weights, audio_weights], dim=-1)
        normalized_weights = F.softmax(weights, dim=-1)

        # Splitting the normalized weights into separate constants for each modality
        text_constant, video_constant, audio_constant = normalized_weights.split(1, dim=-1)

        return text_constant, video_constant, audio_constant

# Example usage
batch_size = 2
seq_len = 5
feature_size = 3
num_heads = 1

fusion = torch.randn(batch_size, seq_len, feature_size)
text = torch.randn(batch_size, seq_len, feature_size)
video = torch.randn(batch_size, seq_len, feature_size)
audio = torch.randn(batch_size, seq_len, feature_size)

modal_attention = ModalAttention(feature_size, num_heads)
text_constant, video_constant, audio_constant = modal_attention(fusion, text, video, audio)

print(text_constant)
print(video_constant)
print(audio_constant)


t_f.size():torch.Size([2, 5, 3])
text_weights:torch.Size([2, 5, 5])


ValueError: too many values to unpack (expected 3)

In [1]:
import os
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
print(rank,world_size)

KeyError: 'RANK'