In [1]:
import json
import pandas as pd
import random
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.cuda.amp import autocast
from tqdm import tqdm

from config import Config
from transformer import *

In [2]:
d_model = 512  # embedding size  词嵌入维度
max_len = 20  # max length of sentences    句子的最大长度
d_ff = 2048  # feedforward neural network dimension  前馈神经网络隐藏层大小
d_k = d_v = 64  # dimension of q、k、v     Q、K、V 维度
n_layers = 6  # number of encoder and decoder layers  编码器、解码器层数
n_headers = 8  # number of heads in multihead attention    注意力头数
p_drop = 0.1  # probability of dropout    Dropout的概率

In [3]:
seed = 2022
val_ratio = 0.2
batch_size = 64
output_size = 6

In [4]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed)

In [5]:
def load_file():
    """加载数据"""
    # 加载数据集
    null = pd.read_csv(Config.Emotion_List[0], header=None, index_col=None)  # null
    like = pd.read_csv(Config.Emotion_List[1], header=None, index_col=None)  # like
    sad = pd.read_csv(Config.Emotion_List[2], header=None, index_col=None)  # sad
    disgust = pd.read_csv(Config.Emotion_List[3], header=None, index_col=None)  # disgust
    anger = pd.read_csv(Config.Emotion_List[4], header=None, index_col=None)  # anger
    happy = pd.read_csv(Config.Emotion_List[5], header=None, index_col=None) # happy


    null = [row[0].split() for row in null.values]
    like = [row[0].split() for row in like.values]
    sad = [row[0].split() for row in sad.values]
    disgust = [row[0].split() for row in disgust.values]
    anger = [row[0].split() for row in anger.values]
    happy = [row[0].split() for row in happy.values]

    # 拼接
    x = null + like + sad + disgust + anger + happy
    print(type(x))
    # null-0 like-1 sad-2 disgust-3 anger-4 happy-5
    y = np.concatenate((np.zeros(len(null), dtype=int), np.ones(len(like), dtype=int),
                        np.ones(len(sad), dtype=int) * 2, np.ones(len(disgust), dtype=int) * 3,
                        np.ones(len(anger), dtype=int) * 4, np.ones(len(happy), dtype=int) * 5))

    return x, y

In [6]:
def load_w2idx():
    with open(Config.w2indx_path, "r", encoding=Config.encoding) as f:
        return json.load(f)

In [7]:
def parse_dataset(combined):
    """将combined中的数据转换为索引表示"""
    data = []
    for sentence in combined:
        new_txt = []
        for word in sentence:
            try:
                new_txt.append(w2indx[word])
            except:
                new_txt.append(0)
        new_txt=torch.Tensor(new_txt[:max_len]).long()
        data.append(new_txt)
    return data

In [8]:
class MYDataset(Dataset):
    def __init__(self, x, y, index_dict):
        self.x = x
        self.y = y
        self.n_symbols = len(index_dict) + 1  # 所有单词的索引数，频数小于10的词语索引为0，所以加1

    def __getitem__(self, idx):
        labels = torch.zeros(6)
        labels[self.y[idx]] = 1

        # print(self.x[idx])
        return self.x[idx], labels

    def __len__(self):
        return len(self.x)

In [9]:
class TransformerClassifier(nn.Module):
    def __init__(self):
        super(TransformerClassifier, self).__init__()
        self.encoder = Encoder()    # [batch, source_len, d_model]
        self.fc = nn.Linear(max_len * d_model, output_size)

    def forward(self, inputs):
        """
        inputs: [batch, source_len]
        """
        outputs, attns = self.encoder(inputs)   # [batch, source_len, d_model]
        outputs = self.fc(outputs.view(outputs.shape[0], -1)) # [batch, output_size]

        return outputs, attns

In [10]:
print("Loading word2index....")
w2indx = load_w2idx()
w2indx['<PAD>'] = 0  # 添加填充字符

print("Loading data....")
x, y = load_file()
x = parse_dataset(x)
x = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=0)

print("get data and labels")
dataset = MYDataset(x, y, w2indx)

val_size = int(len(dataset) * val_ratio)
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size]) # (1735017, 433754)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

Loading word2index....
Loading data....
<class 'list'>
get data and labels


In [11]:
model = TransformerClassifier()
model.cuda()

TransformerClassifier(
  (encoder): Encoder(
    (source_embedding): Embedding(43655, 512)
    (positional_embedding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (encoder_self_attn): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (fc): Linear(in_features=512, out_features=512, bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (ffn): FeedForwardNetwork(
          (ff1): Conv1d(512, 2048, kernel_size=(1,), stride=(1,))
          (ff2): Conv1d(2048, 512, kernel_size=(1,), stride=(1,))
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
      

In [12]:
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss().to('cuda')
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3,
                                      gamma=0.4)  # reduce the learning after 20 epochs by a factor of 10

In [13]:
def train(model, epoch):
    model.train()
    epoch_loss = 0.0
    epoch_acc = 0.0
    with tqdm(train_loader, unit="batch") as tepoch: # 🌟 1. 定义进度条
        for inputs, labels in tepoch:   # 🌟 2. 设置迭代器
            tepoch.set_description(f"Epoch {epoch} train: ") # 🌟 3. 设置开头

            inputs, labels = inputs.cuda(), labels.cuda()

            with autocast():
                # print(inputs.size(), labels.size())
                outputs, _ = model(inputs)  # outputs, attns
                # print(outputs.shape, labels)
                loss = loss_fn(outputs, labels)
                correct = (outputs.argmax(axis=1) == labels.argmax(axis=1)).sum().item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            epoch_acc += correct

        train_acc = epoch_acc / len(train_loader.dataset)
        # print(epoch_acc, len(train_loader.dataset))
        train_loss = epoch_loss / len(train_loader.dataset)

        tepoch.write("Train Epoch: {} Train Loss: {:.6f} Train Acc: {:.6f}".format(epoch, train_loss, train_acc))
        tepoch.close()

        return train_acc, train_loss

In [14]:
def test(model, epoch):
    epoch_loss = 0
    epoch_acc = 0
    model.eval() # set the model to evaluation mode
    with tqdm(val_loader, unit="batch") as vepoch: # 🌟 1. 定义进度条
        with torch.no_grad():
            for inputs, labels in vepoch:   # 🌟 2. 设置迭代器
                vepoch.set_description(f"Epoch {epoch} val: ") # 🌟 3. 设置开头
                inputs, labels = inputs.cuda(), labels.cuda()

                outputs, _ = model(inputs)
                loss = loss_fn(outputs, labels)
                # 计算准确率
                correct = (outputs.argmax(axis=1) == labels.argmax(axis=1)).sum().item()

                epoch_loss += loss.item()
                epoch_acc += correct

        val_acc = epoch_acc / len(val_loader.dataset)
        val_loss = epoch_loss / len(val_loader.dataset)

        vepoch.write("Test Epoch: {} Test Loss {:.6f} Test Accuracy: {:.6f}\n".format(epoch, val_loss, val_acc))
        vepoch.close()

        return val_acc, val_loss

In [15]:
train_accs, train_losses, val_accs, val_losses = [], [], [], []

In [None]:
model_path ="model/transformerencoder.pth"
n_epochs = 15
best_acc = 0.0
for epoch in range(1, n_epochs + 1):
    train_acc, train_loss = train(model, epoch)
    val_acc, val_loss = test(model, epoch)

    train_accs.append(train_acc), train_losses.append(train_loss)
    val_accs.append(val_acc), val_losses.append(val_loss)

    scheduler.step()
    if val_acc >= best_acc:
        best_acc = val_acc
        torch.save({'epoch': epoch, 'state_dict': model.state_dict()}, model_path)
        print('Epoch: {} saving model with Acc {:.3f}'.format(epoch, best_acc))

Epoch 1 train: : 100%|██████████| 27109/27109 [16:38<00:00, 27.14batch/s]


Train Epoch: 1 Train Loss: 0.008170 Train Acc: 0.808385


Epoch 1 val: : 100%|██████████| 6777/6777 [01:37<00:00, 69.67batch/s]


Test Epoch: 1 Test Loss 0.006383 Test Accuracy: 0.851656

Epoch: 1 saving model with Acc 0.852


Epoch 2 train: : 100%|██████████| 27109/27109 [16:23<00:00, 27.55batch/s]


Train Epoch: 2 Train Loss: 0.005873 Train Acc: 0.862706


Epoch 2 val: : 100%|██████████| 6777/6777 [01:38<00:00, 68.93batch/s]


Test Epoch: 2 Test Loss 0.005782 Test Accuracy: 0.864617

Epoch: 2 saving model with Acc 0.865


Epoch 3 train: :  63%|██████▎   | 17045/27109 [10:27<06:03, 27.72batch/s]

In [None]:
int(-1e9)

In [None]:
a = torch.randn(2,3)

In [None]:
 a