In [158]:
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR
from sklearn.metrics import accuracy_score, f1_score
# TensorBoard
from torch.utils.tensorboard import SummaryWriter

In [159]:
torch.cuda.is_available()

True

In [160]:
class MOSIDataset(Dataset):
    def __init__(self, data_path, split='train'):
        # Load the data from the pickle file
        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        
        # Select the appropriate split
        self.vision = data[split]['vision']
        self.text = data[split]['text']
        self.audio = data[split]['audio']
        self.labels = data[split]['labels']
        self.ids = data[split]['id']

        # audio数据中存在坏点需要处理：
        self.audio[self.audio == float('inf')] = 0.0
        self.audio[self.audio == float('-inf')] = 0.0

    def __len__(self):
        # add: 返回数据集长度
        return len(self.labels)

    def __getitem__(self, idx):
        # Extract the features and label for the given index
        vision = torch.tensor(self.vision[idx], dtype=torch.float32)
        text = torch.tensor(self.text[idx], dtype=torch.float32)
        audio = torch.tensor(self.audio[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32).squeeze()

        return vision, text, audio, label

In [161]:
class MultimodalSentimentAnalysisModel(nn.Module):
    def __init__(self):
        super(MultimodalSentimentAnalysisModel, self).__init__()
 
        self.vision_norm = nn.LayerNorm(35)
        self.text_norm = nn.LayerNorm(300)
        self.audio_norm = nn.LayerNorm(74)
        
        self.vision_fc = nn.Linear(35, 128)
        self.text_fc = nn.Linear(300, 128)
        self.audio_fc = nn.Linear(74, 128)
        
        # 定义vision_lstm, text_lstm 和 audio_lstm和融合层mm_lstm. 要求hidden_size=128, num_layers=1, dropout=0.1, batch_first=True
        # add
        self.vision_lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, dropout=0.1, batch_first=True)
        self.text_lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, dropout=0.1, batch_first=True)
        self.audio_lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, dropout=0.1, batch_first=True)
        self.mm_lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, dropout=0.1, batch_first=True)
        
        # Define a fully connected layer for fusion
        self.fc = nn.Linear(128, 1)
        
    def forward(self, vision, text, audio):

        # apply layernorm 
        # add
        vision = self.vision_norm(vision)
        text = self.text_norm(text)
        audio = self.audio_norm(audio)
        
        # Process each modality
        vision = F.relu(self.vision_fc(vision))
        text = F.relu(self.text_fc(text))
        audio = F.relu(self.audio_fc(audio))
        
        # LSTM for temporal processing
        output_v, (vision_h, _) = self.vision_lstm(vision)
        output_t, (text_h, _) = self.text_lstm(text)
        output_a, (audio_h, _) = self.audio_lstm(audio)

        # 对单模态的LSTM输出进行直接相加得到feature
        # add: feature
        feature = (vision_h[-1] + text_h[-1] + audio_h[-1]).unsqueeze(1)
        _, (fusion_tensor, _) = self.mm_lstm(feature)

        # Concatenate the final hidden states
        output = self.fc(fusion_tensor[-1])
        
        # Apply sigmoid to constrain output to (0, 1)
        output = torch.sigmoid(output)
        # Scale and shift to range (-3, 3)
        output = output * 6 - 3

        return output

In [162]:
def eval_mosi_regression(y_pred, y_true, exclude_zero=False):
    test_preds = y_pred.view(-1).cpu().detach().numpy()
    test_truth = y_true.view(-1).cpu().detach().numpy()

    test_preds_a7 = np.clip(test_preds, a_min=-3., a_max=3.)
    test_truth_a7 = np.clip(test_truth, a_min=-3., a_max=3.)
    test_preds_a5 = np.clip(test_preds, a_min=-2., a_max=2.)
    test_truth_a5 = np.clip(test_truth, a_min=-2., a_max=2.)

    mae = np.mean(np.absolute(test_preds - test_truth))
    corr = np.corrcoef(test_preds, test_truth)[0][1]
    mult_a7 = multiclass_acc(test_preds_a7, test_truth_a7)
    mult_a5 = multiclass_acc(test_preds_a5, test_truth_a5)
    
    non_zeros = np.array([i for i, e in enumerate(test_truth) if e != 0])
    non_zeros_binary_truth = (test_truth[non_zeros] > 0)
    non_zeros_binary_preds = (test_preds[non_zeros] > 0)

    non_zeros_acc2 = accuracy_score(non_zeros_binary_preds, non_zeros_binary_truth)
    non_zeros_f1_score = f1_score(non_zeros_binary_preds, non_zeros_binary_truth, average='weighted')
    
    eval_results = {
        "Non0_acc_2":  round(non_zeros_acc2, 4),
        "Non0_F1_score": round(non_zeros_f1_score, 4),
        "Mult_acc_5": round(mult_a5, 4),
        "Mult_acc_7": round(mult_a7, 4),
        "MAE": round(mae, 4),
        "Corr": round(corr, 4)
    }
    return eval_results

def multiclass_acc(y_pred, y_true):
    y_pred = np.round(y_pred)
    y_true = np.round(y_true)

    # Compute the accuracy
    # 注意，这里统计的是总的分类准确率，而不是在每个类别上的准确率。
    acc = (y_pred == y_true).sum() / len(y_true)
    
    return acc


In [163]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, scheduler, device, epochs, writer):
    model.to(device)

    best_corr = 0.
    best_epoch = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for i, (vision, text, audio, labels) in enumerate(train_loader):
            vision, text, audio, labels = vision.to(device), text.to(device), audio.to(device), labels.to(device)

            optimizer.zero_grad()

            # add 模型前向获得输出：
            outputs = model(vision, text, audio)
            # add 计算损失：
            loss = criterion(outputs.view(-1), labels.view(-1))
            # add 反向传播，计算梯度
            loss.backward()
            
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

        # TensorBoard
        writer.add_scalar('Loss/train', running_loss / len(train_loader), epoch)

        val_corr = validate_model(model, valid_loader, criterion, device, epoch, writer)

        if val_corr > best_corr:
            best_corr = val_corr
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pth')

    print(f"Best model saved with val_corr {best_corr} at epoch {best_epoch}.")


In [164]:
def validate_model(model, valid_loader, criterion, device, epoch, writer):
    model.eval()
    valid_loss = 0.0

    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for vision, text, audio, labels in valid_loader:
            vision, text, audio, labels = vision.to(device), text.to(device), audio.to(device), labels.to(device)

            outputs = model(vision, text, audio)
            loss = criterion(outputs.squeeze(), labels.squeeze())

            valid_loss += loss.item()

            all_preds.append(outputs)
            all_labels.append(labels)

    print(f'Validation Loss: {valid_loss/len(valid_loader):.4f}')
    
    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    # add 计算评价指标
    eval_results = eval_mosi_regression(all_preds, all_labels)
    print(eval_results)

    # TensorBoard
    writer.add_scalar('Loss/valid', valid_loss / len(valid_loader), epoch)
    writer.add_scalar('Corr/valid', eval_results["Corr"], epoch)


    return eval_results["Corr"]

In [165]:
def main():
    # TensorBoard
    writer = SummaryWriter(log_dir='./logs/experiment')
    
    # 固定随机数种子，确保实验结果可重复性
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    
    # add 定义损失函数criterion, 使用均方误差损失。可以使用pytorch封装好的函数，也可以根据公式手写：
    criterion = nn.MSELoss()
    
    learning_rate = 1e-3
    epochs = 20
    
    # add: Initialize the model.
    model = MultimodalSentimentAnalysisModel()

    data_path = './mosi_raw.pkl'
    # add 初始化训练集和验证集的数据集类
    train_dataset = MOSIDataset(data_path, split='train')
    valid_dataset = MOSIDataset(data_path, split='valid')
    # add 初始化训练集和验证集的加载器，要求batch_size=16
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

    # Initialize the optimizer and scheduler.
    # add: 使用Adam优化器
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs*len(train_loader))

    # TensorBoard: output model graph
    sample_vision = torch.randn(1, train_dataset[0][0].shape[0], train_dataset[0][0].shape[1]).to(device)
    sample_text = torch.randn(1, train_dataset[0][1].shape[0], train_dataset[0][1].shape[1]).to(device)
    sample_audio = torch.randn(1, train_dataset[0][2].shape[0], train_dataset[0][2].shape[1]).to(device)
    writer.add_graph(model.to(device), (sample_vision, sample_text, sample_audio))

    # add 调用训练函数，注意传入对应参数：
    train_model(model, train_loader, valid_loader, criterion, optimizer, scheduler, device, epochs, writer)

    # 加载最佳epoch参数
    best_model_state = torch.load('best_model.pth')
    model.load_state_dict(best_model_state)

    # 初始化测试集的数据集类和加载器
    test_dataset = MOSIDataset(data_path, split='test')
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    print("\n========== test results: ==========\n")
    validate_model(model, test_loader, criterion, device, epochs, writer)
    writer.close()

In [166]:
if __name__ == "__main__":
    main()

cuda:0




Epoch [1/20], Loss: 1.9543
Validation Loss: 1.8385
{'Non0_acc_2': 0.7264, 'Non0_F1_score': 0.7235, 'Mult_acc_5': np.float64(0.3037), 'Mult_acc_7': np.float64(0.2664), 'MAE': np.float32(1.1248), 'Corr': np.float64(0.527)}
Epoch [2/20], Loss: 1.4019
Validation Loss: 1.7191
{'Non0_acc_2': 0.7065, 'Non0_F1_score': 0.7028, 'Mult_acc_5': np.float64(0.2617), 'Mult_acc_7': np.float64(0.2383), 'MAE': np.float32(1.1114), 'Corr': np.float64(0.6216)}
Epoch [3/20], Loss: 1.0847
Validation Loss: 1.2626
{'Non0_acc_2': 0.791, 'Non0_F1_score': 0.7927, 'Mult_acc_5': np.float64(0.3738), 'Mult_acc_7': np.float64(0.3178), 'MAE': np.float32(0.9177), 'Corr': np.float64(0.7047)}
Epoch [4/20], Loss: 0.9009
Validation Loss: 1.2395
{'Non0_acc_2': 0.7811, 'Non0_F1_score': 0.7851, 'Mult_acc_5': np.float64(0.3925), 'Mult_acc_7': np.float64(0.3178), 'MAE': np.float32(0.9045), 'Corr': np.float64(0.7168)}
Epoch [5/20], Loss: 0.7602
Validation Loss: 1.2491
{'Non0_acc_2': 0.801, 'Non0_F1_score': 0.8006, 'Mult_acc_5': np

  best_model_state = torch.load('best_model.pth')




Validation Loss: 2.2692
{'Non0_acc_2': 0.6768, 'Non0_F1_score': 0.6755, 'Mult_acc_5': np.float64(0.293), 'Mult_acc_7': np.float64(0.2682), 'MAE': np.float32(1.2028), 'Corr': np.float64(0.5149)}
