In [1]:
batch_size = 16

In [2]:
import pandas as pd
from torch.utils.data import Dataset
import torchaudio
import numpy as np
import torch  # 导入torch
from sklearn.preprocessing import StandardScaler
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
label_to_index = {'0': 0, '1': 1}  # 需要根据实际情况调整

class AudioTextDataset(Dataset):
    def __init__(self, csv_file, tokenizer, audio_processor, max_length=512):
        self.data = pd.read_csv(csv_file, sep="\t")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.audio_processor = audio_processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        label_name = row['label']
        label = torch.tensor(label_name, dtype=torch.long)  # 使用映射转换标签
        text = row['dialogue']
        audio_path = row['new_audio_segments_path']
        visual_path = row['visual_segment_path']

        # 处理视觉特征
        visual_features = np.loadtxt(visual_path, delimiter=',', skiprows=1)[:, 4:].astype(np.float32)  # 转换为 float32
        scaler = StandardScaler()
        # 对视觉特征进行标准化
        visual_features = scaler.fit_transform(visual_features)


        # 处理文本
        text_input = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        text_input = {key: val.squeeze(0) for key, val in text_input.items()}  # 转换为相同的数据类型

        audio_segments_data = pd.read_csv(audio_path, sep="\t")
        audio_values = []
        for _, segment_row in audio_segments_data.iterrows():
            audio_path = segment_row['audio_segment_path']
            waveform, sample_rate = torchaudio.load(audio_path)
            segment_values = self.audio_processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt").input_values
            audio_values.append(segment_values.squeeze(0))  # 移除批次维度

        # 堆叠音频片段
        audio_values = torch.stack(audio_values, dim=0)  # 形状为 [num_segments, sequence_length]


        return {
            'audio_values': audio_values,
            'text_input': text_input,
            'visual_features': visual_features,
            'label': label
        }


In [3]:
from transformers import RobertaTokenizer, Wav2Vec2Processor
from torch.utils.data import DataLoader, random_split
from torch.utils.data.dataset import Subset

tokenizer = RobertaTokenizer.from_pretrained('./transformers/roberta')
wav2vec2processor = Wav2Vec2Processor.from_pretrained("./transformers/wav2vecprocessor")

# 假设你已经有了 AudioTextDataset 的定义
full_dataset = AudioTextDataset('train.csv', tokenizer, wav2vec2processor)

# 定义数据集的大小
total_size = len(full_dataset)
train_size = int(total_size * 0.80)
test_size = int(total_size * 0.05)
valid_size = total_size - train_size - test_size  # 保证总和为total_size

# 随机划分数据集
train_dataset, test_dataset, valid_dataset = random_split(full_dataset, [train_size, test_size, valid_size])

# 创建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# 现在你有三个独立的 DataLoader：train_loader, test_loader, valid_loader


In [4]:
next(enumerate(train_loader))[1]['audio_values'].shape

torch.Size([16, 6, 160000])

In [5]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import Wav2Vec2Model, RobertaModel

In [6]:
class VisualModal(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(VisualModal, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, hidden_dim * 2)  # 第一个全连接层
        self.fc2 = nn.Linear(hidden_dim * 2, output_dim)  # 第二个全连接层
        self.relu = nn.ReLU()  # 激活函数

    def forward(self, visual_input):
        # 处理视觉特征
        visual_output, _ = self.lstm(visual_input)
        visual_output = visual_output[:, -1, :]  # 获取 LSTM 的最后一个时间步的输出
        
        # 通过两个全连接层
        visual_output = self.fc1(visual_output)
        visual_output = self.relu(visual_output)
        visual_output = self.fc2(visual_output)
        return visual_output


In [7]:
class CrossEncoder(nn.Module):
    def __init__(self, feature_dim, num_heads, output_dim):
        super(CrossEncoder, self).__init__()
        # 为每种交互创建一个独立的多头注意力模块
        self.attn_audio_text = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads)
        self.attn_text_audio = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads)

        self.self_attn1 = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads)
        self.self_attn2 = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads)

        
        self.fc = nn.Linear(feature_dim * 6, output_dim)  # 假设拼接后的特征用于一个线性层

    def forward(self, audio_feature, text_feature, visual_feature):
        # 分别使用独立的注意力模块
        attn_output_1, _ = self.attn_audio_text(query=audio_feature.unsqueeze(0), key=text_feature.unsqueeze(0), value=text_feature.unsqueeze(0))
        attn_output_3, _ = self.attn_text_audio(query=text_feature.unsqueeze(0), key=audio_feature.unsqueeze(0), value=audio_feature.unsqueeze(0))

        self_attn_output_1, _ = self.self_attn1(query=attn_output_1, key=attn_output_1, value=attn_output_1)
        self_attn_output_2, _ = self.self_attn2(query=attn_output_3, key=attn_output_3, value=attn_output_3)

        
        return self_attn_output_1.squeeze(0), self_attn_output_2.squeeze(0)

In [8]:
class SecondCrossEncoder(nn.Module):
    def __init__(self, feature_dim, num_heads, hidden_dim, output_dim, dropout_rate):
        super(SecondCrossEncoder, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads)
        self.ffn = FeedForwardNetwork(feature_dim, hidden_dim, output_dim, dropout_rate)

    def forward(self, audio_feature, text_feature, visual_feature, attn_output_1, attn_output_2, attn_output_3, attn_output_4, attn_output_5, attn_output_6):
        # 使用cross attention提取三模态之间的信息
        triple_attn_output_1, _ = self.multihead_attn(query=attn_output_2.unsqueeze(0), key=attn_output_1.unsqueeze(0), value=attn_output_1.unsqueeze(0))
        triple_attn_output_2, _ = self.multihead_attn(query=attn_output_3.unsqueeze(0), key=attn_output_4.unsqueeze(0), value=attn_output_4.unsqueeze(0))
        triple_attn_output_3, _ = self.multihead_attn(query=attn_output_5.unsqueeze(0), key=attn_output_6.unsqueeze(0), value=attn_output_6.unsqueeze(0))

        # 添加残差连接
        triple_attn_output_1 = triple_attn_output_1.squeeze(0) + audio_feature
        triple_attn_output_2 = triple_attn_output_2.squeeze(0) + text_feature
        triple_attn_output_3 = triple_attn_output_3.squeeze(0) + visual_feature

        # 通过前馈神经网络
        triple_attn_output_1 = self.ffn(triple_attn_output_1)
        triple_attn_output_2 = self.ffn(triple_attn_output_2)
        triple_attn_output_3 = self.ffn(triple_attn_output_3)

        return triple_attn_output_1, triple_attn_output_2, triple_attn_output_3

In [9]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout_rate=0.2):
        super(FeedForwardNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 256)
        self.fc3 = nn.Linear(256, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [10]:
class AudioModel(nn.Module):
    def __init__(self, hidden_dim, lstm_output_dim):
        super(AudioModel, self).__init__()
        self.wav2vec = Wav2Vec2Model.from_pretrained("./transformers/wav2vec2")
        self.lstm = nn.LSTM(input_size=self.wav2vec.config.hidden_size, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, lstm_output_dim)

        # 冻结预训练模型的参数
        for param in self.wav2vec.parameters():
            param.requires_grad = False

    def forward(self, input_values):
        # input_values 的形状应该是 [batch_size, num_segments, sequence_length]
        batch_size, num_segments, _ = input_values.shape
        wav2vec_features = []

        for i in range(num_segments):
            segment = input_values[:, i, :]
            with torch.no_grad():
                segment_features = self.wav2vec(input_values=segment).last_hidden_state
            wav2vec_features.append(segment_features)

        # 将所有片段的特征拼接起来
        wav2vec_features = torch.cat(wav2vec_features, dim=1)  # 形状为 [batch_size, num_segments * sequence_length, hidden_size]

        # 通过 LSTM
        lstm_output, _ = self.lstm(wav2vec_features)
        # 获取 LSTM 的最后一步输出
        last_hidden_state = lstm_output[:, -1, :]
        output = self.fc(last_hidden_state)
        return output

In [11]:
hidden_dim = 512
visual_lstm_final_layer_dim = 768
lstm_output_dim = 768
output_dim = 2
visual_feature_dim = 136  # 假设您的视觉特征维度为136
feedforward_hidden_dim = 128  # 假设您想要的 LSTM 隐藏层维度为128
feature_dim = 768  # 假设特征维度是768
num_heads = 8  # 假设使用8个注意力头

audio_model = AudioModel(hidden_dim, lstm_output_dim)

# 加载预训练的文本模型
text_model = RobertaModel.from_pretrained('./transformers/roberta-base', num_labels=2)

visual_model = VisualModal(input_dim=visual_feature_dim, hidden_dim=hidden_dim, output_dim=visual_lstm_final_layer_dim)

feedforward_model1 = FeedForwardNetwork(visual_lstm_final_layer_dim, hidden_dim, feedforward_hidden_dim)
feedforward_model2 = FeedForwardNetwork(visual_lstm_final_layer_dim, hidden_dim, feedforward_hidden_dim)
# 实例化cross_encoder和second_cross_encoder
cross_encoder = CrossEncoder(feature_dim=feature_dim, num_heads=num_heads, output_dim=feature_dim)
# second_cross_encoder = SecondCrossEncoder(feature_dim=feature_dim, num_heads=num_heads, hidden_dim=hidden_dim, output_dim=feature_dim, dropout_rate=0.2)

# # 冻结预训练模型的参数
for param in audio_model.parameters():
    param.requires_grad = False

for param in text_model.parameters():
    param.requires_grad = False

batch = next(iter(train_loader))
audio_batch = batch['audio_values']
text_batch = batch['text_input']
visual_input = batch['visual_features']

with torch.no_grad():
    audio_output = audio_model(input_values=audio_batch)
    text_output = text_model(**text_batch).last_hidden_state
    visual_output = visual_model(visual_input)
    audio_output_dim = audio_output.shape[-1]
    text_output_dim = text_output.shape[-1]    
    visual_output_dim = visual_output.shape[-1]

print(audio_output_dim)
print(text_output_dim)
print(visual_output_dim)

768
768
768


In [12]:
import torch
import torch.nn as nn

class MultimodalFusionModel(nn.Module):
    def __init__(self, audio_model, text_model, visual_model, cross_encoder, audio_output_dim, text_output_dim, visual_output_dim, hidden_dim, output_dim, feedforward_model1, feedforward_model2, dropout_rate=0.3):
        super(MultimodalFusionModel, self).__init__()
        self.audio_model = audio_model
        self.text_model = text_model
        self.visual_model = visual_model
        self.cross_encoder = cross_encoder
        self.feedforward_model1 = feedforward_model1
        self.feedforward_model2 = feedforward_model2
        # 定义全连接层和其他层
        self.fc1 = nn.Linear(512, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.bn2 = nn.BatchNorm1d(hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, hidden_dim // 4)
        self.bn3 = nn.BatchNorm1d(hidden_dim // 4)
        self.fc4 = nn.Linear(hidden_dim // 4, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, audio_input, text_input, visual_input):
        # 获取音频模型和文本模型的最后一层输出
        audio_output = self.audio_model(input_values=audio_input)
        text_output = self.text_model(**text_input).last_hidden_state[:, -1, :]
        visual_output = self.visual_model(visual_input)

        # 进行cross attention和second cross attention
        attn_output_1, attn_output_3 = self.cross_encoder(audio_output, text_output, visual_output)

        output1 = self.feedforward_model1(attn_output_1)
        output2 = self.feedforward_model2(attn_output_3)
    
        # 拼接三个模态的输出
        combined_output = torch.cat((output1, output2), dim=1)
        
        # 通过全连接层和其他层
        x = self.fc1(combined_output)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc4(x)
        return x

fusion_model = MultimodalFusionModel(audio_model, text_model, visual_model, cross_encoder, audio_output_dim, text_output_dim, visual_output_dim, hidden_dim, output_dim, feedforward_model1, feedforward_model2).to(device)


In [13]:
def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_trainable_params = count_trainable_params(fusion_model)
print(f"Number of trainable parameters: {num_trainable_params}")


Number of trainable parameters: 17177730


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
# from transformers import AdamW
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import os
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


# 定义损失函数和优化器
criterion = CrossEntropyLoss()
optimizer = AdamW(fusion_model.parameters(), lr=2e-5)

def train(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    correct_predictions = 0
    l1_lambda = 1e-5
    # 使用tqdm包装dataloader以显示进度条
    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()
        audio_batch = batch['audio_values'].squeeze(1)
        audio_input = audio_batch.to(device)

        text_input = {key: val.to(device) for key, val in batch['text_input'].items()}

        visual_input = batch['visual_features'].to(device)
        labels = batch['label'].to(device)
        outputs = model(audio_input, text_input,visual_input)
        loss = criterion(outputs, labels)
        # 计算L1正则化损失
        l1_norm = sum(p.abs().sum() for p in fusion_model.parameters())
        loss += l1_lambda * l1_norm
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = correct_predictions.double() / len(dataloader.dataset)
    return avg_loss, avg_accuracy
    

def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []  # Collect probabilities for ROC curve

    for batch in tqdm(dataloader, desc="Evaluating"):
        audio_batch = batch['audio_values'].squeeze(1)
        audio_input = audio_batch.to(device)
        text_input = {key: val.to(device) for key, val in batch['text_input'].items()}
        visual_input = batch['visual_features'].to(device)
        labels = batch['label'].to(device)

        with torch.no_grad():
            outputs = model(audio_input, text_input, visual_input)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(outputs.softmax(dim=1)[:, 1].cpu().numpy())  # Assuming binary classification for ROC

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    roc_auc = auc(fpr, tpr)

    return avg_loss, accuracy, precision, recall, f1, fpr, tpr, roc_auc

# 训练和评估循环
num_epochs = 10
best_f1 = 0.0
best_accuracy = 0.0
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []  # Collect probabilities for ROC curve

    for batch in tqdm(dataloader, desc="Evaluating"):
        audio_batch = batch['audio_values'].squeeze(1)
        audio_input = audio_batch.to(device)
        text_input = {key: val.to(device) for key, val in batch['text_input'].items()}
        visual_input = batch['visual_features'].to(device)
        labels = batch['label'].to(device)

        with torch.no_grad():
            outputs = model(audio_input, text_input, visual_input)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probs.extend(outputs.softmax(dim=1)[:, 1].cpu().numpy())  # Assuming binary classification for ROC

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    roc_auc = auc(fpr, tpr)

    return avg_loss, accuracy, precision, recall, f1, fpr, tpr, roc_auc

best_auc = 0.0
for epoch in range(num_epochs):
    train_loss, train_acc = train(fusion_model, train_loader, optimizer, criterion)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')
    
    val_loss, val_acc, val_precision, val_recall, val_f1, fpr, tpr, roc_auc = evaluate(fusion_model, valid_loader, criterion)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1 Score: {val_f1:.4f}, AUC: {roc_auc:.4f}')
    
    # Save the best model and plot ROC curve
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        best_model_path = 'best_model.pth'
        torch.save(fusion_model.state_dict(), best_model_path)
        print(f"Saved new best model with AUC: {best_auc:.4f} to {best_model_path}")

        # Plot ROC curve
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        roc_curve_path = 'roc_curve.png'
        plt.savefig(roc_curve_path)
        plt.close()
        print(f"Saved ROC curve to {roc_curve_path}")



Training: 100%|██████████| 77/77 [01:02<00:00,  1.24it/s]


Epoch 1/10, Train Loss: 137.8402, Train Accuracy: 0.5775


Evaluating: 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]


Epoch 1/10, Validation Loss: 0.7000, Accuracy: 0.3974, Precision: 0.5518, Recall: 0.5171, F1 Score: 0.3480, AUC: 0.6250
Saved new best model with AUC: 0.6250 to best_model.pth
Saved ROC curve to roc_curve.png


Training: 100%|██████████| 77/77 [01:01<00:00,  1.26it/s]


Epoch 2/10, Train Loss: 137.6619, Train Accuracy: 0.6038


Evaluating: 100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


Epoch 2/10, Validation Loss: 0.6314, Accuracy: 0.6376, Precision: 0.4875, Recall: 0.4993, F1 Score: 0.4005, AUC: 0.6498
Saved new best model with AUC: 0.6498 to best_model.pth
Saved ROC curve to roc_curve.png


Training: 100%|██████████| 77/77 [01:02<00:00,  1.24it/s]


Epoch 3/10, Train Loss: 137.4883, Train Accuracy: 0.6185


Evaluating: 100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


Epoch 3/10, Validation Loss: 0.6576, Accuracy: 0.6594, Precision: 0.6181, Recall: 0.5972, F1 Score: 0.5988, AUC: 0.6253


Training: 100%|██████████| 77/77 [01:01<00:00,  1.26it/s]


Epoch 4/10, Train Loss: 137.3400, Train Accuracy: 0.6300


Evaluating: 100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


Epoch 4/10, Validation Loss: 0.6436, Accuracy: 0.6769, Precision: 0.6412, Recall: 0.6135, F1 Score: 0.6163, AUC: 0.6567
Saved new best model with AUC: 0.6567 to best_model.pth
Saved ROC curve to roc_curve.png


Training: 100%|██████████| 77/77 [01:02<00:00,  1.23it/s]


Epoch 5/10, Train Loss: 137.2015, Train Accuracy: 0.6226


Evaluating: 100%|██████████| 15/15 [00:10<00:00,  1.40it/s]


Epoch 5/10, Validation Loss: 0.6557, Accuracy: 0.6725, Precision: 0.6362, Recall: 0.6182, F1 Score: 0.6216, AUC: 0.6560


Training: 100%|██████████| 77/77 [01:00<00:00,  1.27it/s]


Epoch 6/10, Train Loss: 137.0809, Train Accuracy: 0.6358


Evaluating: 100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


Epoch 6/10, Validation Loss: 0.6269, Accuracy: 0.6769, Precision: 0.6482, Recall: 0.5892, F1 Score: 0.5822, AUC: 0.6527


Training: 100%|██████████| 77/77 [01:02<00:00,  1.24it/s]


Epoch 7/10, Train Loss: 136.9725, Train Accuracy: 0.6382


Evaluating: 100%|██████████| 15/15 [00:11<00:00,  1.35it/s]


Epoch 7/10, Validation Loss: 0.6223, Accuracy: 0.6769, Precision: 0.6501, Recall: 0.5865, F1 Score: 0.5776, AUC: 0.6405


Training:  88%|████████▊ | 68/77 [00:55<00:07,  1.26it/s]