In [11]:
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR
from sklearn.metrics import accuracy_score, f1_score

In [12]:
torch.cuda.is_available()

True

In [13]:
class MOSIDataset(Dataset):
    def __init__(self, data_path, split='train'):
        # Load the data from the pickle file
        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        
        # Select the appropriate split
        self.vision = data[split]['vision']
        self.text = data[split]['text']
        self.audio = data[split]['audio']
        self.labels = data[split]['labels']
        self.ids = data[split]['id']

        # audio数据中存在坏点需要处理：
        self.audio[self.audio == float('inf')] = 0.0
        self.audio[self.audio == float('-inf')] = 0.0

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Extract the features and label for the given index
        vision = torch.tensor(self.vision[idx], dtype=torch.float32)
        text = torch.tensor(self.text[idx], dtype=torch.float32)
        audio = torch.tensor(self.audio[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.float32).squeeze()

        return vision, text, audio, label

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class MultimodalSentimentAnalysisTransformerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden_dim = 128
        self.num_layers = 1
        self.num_heads = 4

        self.vision_norm = nn.LayerNorm(35)
        self.text_norm = nn.LayerNorm(300)
        self.audio_norm = nn.LayerNorm(74)

        self.vision_fc = nn.Linear(35, self.hidden_dim)
        self.text_fc = nn.Linear(300, self.hidden_dim)
        self.audio_fc = nn.Linear(74, self.hidden_dim)

        self.pe = PositionalEncoding(self.hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hidden_dim, nhead=self.num_heads, dim_feedforward=256, dropout=0.1, batch_first=True)
        self.vision_transformer = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)
        self.text_transformer = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)
        self.audio_transformer = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)
        self.mm_transformer = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)

        self.fc = nn.Linear(self.hidden_dim, 1)

    def forward(self, vision, text, audio):
        vision = self.vision_norm(vision)
        text = self.text_norm(text)
        audio = self.audio_norm(audio)

        vision = self.pe(F.relu(self.vision_fc(vision)))
        text = self.pe(F.relu(self.text_fc(text)))
        audio = self.pe(F.relu(self.audio_fc(audio)))

        vision_feat = self.vision_transformer(vision).mean(dim=1)
        text_feat = self.text_transformer(text).mean(dim=1)
        audio_feat = self.audio_transformer(audio).mean(dim=1)

        fused = vision_feat + text_feat + audio_feat
        fused = fused.unsqueeze(1)

        fusion_tensor = self.mm_transformer(fused).squeeze(1)

        output = self.fc(fusion_tensor)
        output = torch.sigmoid(output)
        output = output * 6 - 3
        return output


In [15]:
def eval_mosi_regression(y_pred, y_true, exclude_zero=False):
    test_preds = y_pred.view(-1).cpu().detach().numpy()
    test_truth = y_true.view(-1).cpu().detach().numpy()

    test_preds_a7 = np.clip(test_preds, a_min=-3., a_max=3.)
    test_truth_a7 = np.clip(test_truth, a_min=-3., a_max=3.)
    test_preds_a5 = np.clip(test_preds, a_min=-2., a_max=2.)
    test_truth_a5 = np.clip(test_truth, a_min=-2., a_max=2.)

    mae = np.mean(np.absolute(test_preds - test_truth))
    corr = np.corrcoef(test_preds, test_truth)[0][1]
    mult_a7 = multiclass_acc(test_preds_a7, test_truth_a7)
    mult_a5 = multiclass_acc(test_preds_a5, test_truth_a5)
    
    non_zeros = np.array([i for i, e in enumerate(test_truth) if e != 0])
    non_zeros_binary_truth = (test_truth[non_zeros] > 0)
    non_zeros_binary_preds = (test_preds[non_zeros] > 0)

    non_zeros_acc2 = accuracy_score(non_zeros_binary_preds, non_zeros_binary_truth)
    non_zeros_f1_score = f1_score(non_zeros_binary_preds, non_zeros_binary_truth, average='weighted')
    
    eval_results = {
        "Non0_acc_2":  round(non_zeros_acc2, 4),
        "Non0_F1_score": round(non_zeros_f1_score, 4),
        "Mult_acc_5": round(mult_a5, 4),
        "Mult_acc_7": round(mult_a7, 4),
        "MAE": round(mae, 4),
        "Corr": round(corr, 4)
    }
    return eval_results

def multiclass_acc(y_pred, y_true):
    y_pred = np.round(y_pred)
    y_true = np.round(y_true)

    acc = (y_pred == y_true).sum() / len(y_true)
    
    return acc


In [16]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, scheduler, device, epochs):
    model.to(device)

    best_corr = 0.
    best_epoch = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for i, (vision, text, audio, labels) in enumerate(train_loader):
            vision, text, audio, labels = vision.to(device), text.to(device), audio.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(vision, text, audio)
            loss = criterion(outputs.view(-1), labels.view(-1))
            loss.backward()
            
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}')

        val_corr = validate_model(model, valid_loader, criterion, device)

        if val_corr > best_corr:
            best_corr = val_corr
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pth')

    print(f"Best model saved with val_corr {best_corr} at epoch {best_epoch}.")


In [17]:
def validate_model(model, valid_loader, criterion, device):
    model.eval()
    valid_loss = 0.0

    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for vision, text, audio, labels in valid_loader:
            vision, text, audio, labels = vision.to(device), text.to(device), audio.to(device), labels.to(device)

            outputs = model(vision, text, audio)
            loss = criterion(outputs.squeeze(), labels.squeeze())

            valid_loss += loss.item()

            all_preds.append(outputs)
            all_labels.append(labels)

    print(f'Validation Loss: {valid_loss/len(valid_loader):.4f}')
    
    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    # add 计算评价指标
    eval_results = eval_mosi_regression(all_preds, all_labels)
    print(eval_results)

    return eval_results["Corr"]

In [18]:
def main():
    
    # 固定随机数种子，确保实验结果可重复性
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    
    # add 定义损失函数criterion, 使用均方误差损失。可以使用pytorch封装好的函数，也可以根据公式手写：
    criterion = nn.MSELoss()
    
    learning_rate = 1e-3
    epochs = 20
    
    # add: Initialize the model.
    model = model = MultimodalSentimentAnalysisTransformerModel()

    data_path = './mosi_raw.pkl'
    # add 初始化训练集和验证集的数据集类
    train_dataset = MOSIDataset(data_path, split='train')
    valid_dataset = MOSIDataset(data_path, split='valid')
    # add 初始化训练集和验证集的加载器，要求batch_size=16
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

    # Initialize the optimizer and scheduler.
    # add: 使用Adam优化器
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs*len(train_loader))

    # add 调用训练函数，注意传入对应参数：
    train_model(model, train_loader, valid_loader, criterion, optimizer, scheduler, device, epochs)

    # 加载最佳epoch参数
    best_model_state = torch.load('best_model.pth')
    model.load_state_dict(best_model_state)

    # 初始化测试集的数据集类和加载器
    test_dataset = MOSIDataset(data_path, split='test')
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    print("\n========== test results: ==========\n")
    validate_model(model, test_loader, criterion, device)

In [19]:
if __name__ == "__main__":
    main()

cuda:0
Epoch [1/20], Loss: 2.5707
Validation Loss: 2.5931
{'Non0_acc_2': 0.607, 'Non0_F1_score': 0.7271, 'Mult_acc_5': np.float64(0.229), 'Mult_acc_7': np.float64(0.229), 'MAE': np.float32(1.391), 'Corr': np.float64(0.0504)}
Epoch [2/20], Loss: 1.7753
Validation Loss: 1.5293
{'Non0_acc_2': 0.7413, 'Non0_F1_score': 0.7383, 'Mult_acc_5': np.float64(0.3131), 'Mult_acc_7': np.float64(0.285), 'MAE': np.float32(1.0338), 'Corr': np.float64(0.636)}
Epoch [3/20], Loss: 1.2868
Validation Loss: 1.3809
{'Non0_acc_2': 0.806, 'Non0_F1_score': 0.8038, 'Mult_acc_5': np.float64(0.4112), 'Mult_acc_7': np.float64(0.3645), 'MAE': np.float32(0.9498), 'Corr': np.float64(0.6897)}
Epoch [4/20], Loss: 1.1111
Validation Loss: 1.2706
{'Non0_acc_2': 0.8358, 'Non0_F1_score': 0.8356, 'Mult_acc_5': np.float64(0.4019), 'Mult_acc_7': np.float64(0.3411), 'MAE': np.float32(0.9208), 'Corr': np.float64(0.709)}
Epoch [5/20], Loss: 0.9892
Validation Loss: 1.2126
{'Non0_acc_2': 0.8259, 'Non0_F1_score': 0.8281, 'Mult_acc_5': 

  best_model_state = torch.load('best_model.pth')




Validation Loss: 1.9641
{'Non0_acc_2': 0.7439, 'Non0_F1_score': 0.7439, 'Mult_acc_5': np.float64(0.3848), 'Mult_acc_7': np.float64(0.3353), 'MAE': np.float32(1.0541), 'Corr': np.float64(0.5593)}
