## AG_News 新闻主题分类

**所有样本均使用逗号作为分隔符，一共 3 列，分别对应 `类标（1 到 4）`、`标题` 和 `新闻描述`。**

**Transformer** 由两部分组成：**Encoder**（编码器）和 **Decoder**（解码器）。文本分类任务中没有解码这一过程，不需要 **Decoder**，所以本案例只使用 **Encoder** 来构建编码层。

In [1]:
from collections import Counter
from torchtext.vocab import vocab
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
import torch
import torch.nn as nn

## MyTransformer
from torch.nn.init import xavier_uniform_
import torch.nn.functional as F
from torch.nn import Parameter
import torch.nn as nn
import copy

import re
from tqdm import tqdm
import math
import os
import time

import pandas as pd
import numpy as np

In [2]:
# Load the AG News training split; header=None because the CSV has no header row.
data = pd.read_csv('../input/ag-news/ag_news_csv/train.csv',header=None)
# Preview the first rows (notebook cell output).
data.head()

In [3]:
# Display the (rows, columns) shape of the loaded frame (notebook cell output).
data.shape

### 一、数据集构建

**定义 tokenize：**对于英文语料的处理

In [4]:
def my_tokenizer(s):
    """Tokenize the string ``s`` with torchtext's ``basic_english`` tokenizer.

    Args:
        s: Text to split into tokens.

    Returns:
        Whatever the ``basic_english`` tokenizer returns for ``s``.
    """
    split_fn = get_tokenizer('basic_english')
    tokens = split_fn(s)
    return tokens

**定义字符串清理：** 原始语料中有很多奇奇怪怪的字符，因此还需要对其稍微做一点处理。例如：① 只保留字母、数字以及常用标点；② 全部转换为小写字母等。

In [5]:
def clean_str(string):
    """Normalize raw corpus text before tokenization.

    Every character that is not an ASCII letter, a digit, or one of
    ``- ? ! . ,`` is replaced by a single space, then the result is
    lower-cased.

    Args:
        string: Raw text.

    Returns:
        The cleaned, lower-cased text (same length as the input).
    """
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences such as "\-" and "\?", which emit a SyntaxWarning on recent
    # Python versions. The compiled pattern is unchanged.
    return re.sub(r"[^A-Za-z0-9\-\?\!\.\,]", " ", string).lower()

**建立词表：**通过 `torchtext.vocab` 中的 `vocab` 函数来构建词典

In [6]:
def build_vocab(tokenizer, filepath, min_freq, specials=None):
    """Count the tokens of a corpus file and build a vocabulary from them.

    Each line is split on the literal ``","`` separator; the last field,
    minus its final character, is taken as the news description. That text
    is cleaned with ``clean_str``, tokenized, and added to the counts.

    Args:
        tokenizer: Callable that turns a string into a sequence of tokens.
        filepath: Path of the UTF-8 corpus file.
        min_freq: Minimum frequency passed on to ``vocab``.
        specials: Special tokens passed on to ``vocab``; defaults to
            ``['<unk>', '<pad>']`` when ``None``.

    Returns:
        The object returned by ``vocab(...)`` for the collected counts.
    """
    special_tokens = ['<unk>', '<pad>'] if specials is None else specials
    token_counts = Counter()
    with open(filepath, encoding='utf8') as corpus:
        for raw_line in tqdm(corpus):
            # Last CSV field is the description; drop its closing quote.
            description = raw_line.strip().split('","')[-1][:-1]
            token_counts.update(tokenizer(clean_str(description)))
    return vocab(token_counts, min_freq=min_freq, specials=special_tokens)

定义一个类，并在类的初始化过程中根据训练语料完成字典的构建

In [7]:
class LoadSentenceClassificationDataset():
    """Build a vocabulary from the training corpus and serve padded batches.

    The vocabulary is built once in ``__init__``; ``data_process`` converts a
    corpus file into ``(token_ids, label)`` pairs, and
    ``load_train_val_test_data`` wraps those pairs in ``DataLoader`` objects
    that pad every batch through ``generate_batch``.
    """

    def __init__(self, train_file_path=None,  # path of the training corpus
                 tokenizer=None,
                 batch_size=20,
                 min_freq=1,  # tokens rarer than this are dropped from the vocabulary
                 max_sen_len='same'):
        """
        Args:
            train_file_path: Corpus file used to build the vocabulary.
            tokenizer: Callable that turns a string into a sequence of tokens.
            batch_size: Batch size used by the data loaders.
            min_freq: Minimum token frequency forwarded to ``build_vocab``.
            max_sen_len: Padding policy. ``'same'`` pads every batch to the
                longest sample of the training set, ``None`` pads each batch
                to its own longest sample, and an integer is used as the
                minimum padded length.
        """
        # NOTE: the original code reset ``max_sen_len`` to None at this point
        # (a comment line that had lost its '#'), which silently ignored the
        # caller's argument; the argument is now honoured.
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        self.specials = ['<unk>', '<pad>']
        self.vocab = build_vocab(self.tokenizer,
                                 filepath=train_file_path,
                                 min_freq=self.min_freq,
                                 specials=self.specials)
        self.PAD_IDX = self.vocab['<pad>']
        self.UNK_IDX = self.vocab['<unk>']
        self.batch_size = batch_size
        self.max_sen_len = max_sen_len

    def data_process(self, filepath):
        """Convert every line of a corpus file into a token-index tensor.

        Each line is split on the literal ``","`` separator: the first field
        (minus its leading quote) is the 1-based class label and the last
        field (minus its closing quote) is the text. Tokens missing from the
        vocabulary are mapped to the ``<unk>`` index.

        :param filepath: path of the UTF-8 corpus file
        :return: ``(data, max_len)`` where ``data`` is a list of
                 ``(token_ids, label)`` tensor pairs with 0-based labels and
                 ``max_len`` is the length of the longest sample
        """
        # One dict lookup per token replaces the former bare ``except`` plus
        # a linear scan of the vocabulary list.
        stoi = self.vocab.get_stoi()
        unk_idx = self.UNK_IDX

        # Read inside a context manager so the file handle is closed.
        with open(filepath, encoding='utf8') as corpus:
            raw_lines = corpus.readlines()

        data = []
        max_len = 0
        for raw in tqdm(raw_lines, ncols=80):
            fields = raw.rstrip("\n").split('","')
            text, label_str = fields[-1][:-1], fields[0][1:]
            token_ids = [stoi.get(token, unk_idx)
                         for token in self.tokenizer(clean_str(text))]
            tensor_ = torch.tensor(token_ids, dtype=torch.long)
            label = torch.tensor(int(label_str) - 1, dtype=torch.long)
            max_len = max(max_len, tensor_.size(0))
            data.append((tensor_, label))
        return data, max_len

    def load_train_val_test_data(self, train_file_paths, test_file_paths):
        """Build the training and test ``DataLoader`` objects.

        When ``max_sen_len`` is ``'same'`` it is replaced by the length of
        the longest training sample before the loaders are created.

        :return: ``(train_iter, test_iter)``
        """
        train_data, max_sen_len = self.data_process(train_file_paths)
        if self.max_sen_len == 'same':
            self.max_sen_len = max_sen_len
        test_data, _ = self.data_process(test_file_paths)
        train_iter = DataLoader(train_data, batch_size=self.batch_size,
                                shuffle=True, collate_fn=self.generate_batch)
        test_iter = DataLoader(test_data, batch_size=self.batch_size,
                               shuffle=True, collate_fn=self.generate_batch)
        return train_iter, test_iter

    def generate_batch(self, data_batch):
        """Collate function: pad the samples of one batch to a common length.

        :param data_batch: list of ``(token_ids, label)`` pairs
        :return: ``(batch_sentence, batch_label)``; ``batch_sentence`` is
                 laid out as ``[padded_len, batch_size]`` because
                 ``batch_first=False`` is passed to ``pad_sequence``
        """
        batch_sentence, batch_label = [], []
        for (sen, label) in data_batch:
            batch_sentence.append(sen)
            batch_label.append(label)
        batch_sentence = pad_sequence(batch_sentence,
                                      padding_value=self.PAD_IDX,
                                      batch_first=False,
                                      max_len=self.max_sen_len)
        batch_label = torch.tensor(batch_label, dtype=torch.long)
        return batch_sentence, batch_label

由于不同样本对应的序列长度通常各不相同，而将数据输入模型时同一个 batch 内的样本却需要保持相同的长度，因此这里需要对转换成 `Token` 序列后的样本进行 `padding` 处理

In [8]:
def pad_sequence(sequences, batch_first=False, max_len=None, padding_value=0):
    """Stack variable-length tensors into one tensor, padding the short ones.

    Args:
        sequences: Non-empty list of tensors whose first dimension is the
            sequence length; trailing dimensions are taken from the first
            element.
        batch_first: If True the result is ``[batch, length, ...]``,
            otherwise ``[length, batch, ...]``.
        max_len: Minimum padded length. ``None`` pads to the longest
            sequence of this batch. A value smaller than the longest
            sequence is ignored, so nothing is ever truncated.
        padding_value: Value written into the padded positions.

    Returns:
        A tensor with the dtype and device of ``sequences[0]``.
    """
    first = sequences[0]
    trailing_dims = first.size()[1:]
    longest = max(seq.size(0) for seq in sequences)
    target_len = longest if max_len is None else max(max_len, longest)

    count = len(sequences)
    lead_dims = (count, target_len) if batch_first else (target_len, count)
    out_tensor = first.new_empty(lead_dims + tuple(trailing_dims)).fill_(padding_value)

    for idx, seq in enumerate(sequences):
        seq_len = seq.size(0)
        # Slice assignment copies the values into the padded output.
        if batch_first:
            out_tensor[idx, :seq_len, ...] = seq
        else:
            out_tensor[:seq_len, idx, ...] = seq
    return out_tensor

##### **开始数据的处理（演示）**

In [9]:
# path = "../input/ag-news/ag_news_csv/train.csv"
# data_loader = LoadSentenceClassificationDataset(train_file_path=path,tokenizer=my_tokenizer,max_sen_len=None)

# ## data_loader.vocab.get_stoi() 可以对 data_loader 中的 vocab 进行相对应的词汇表操作 https://pytorch.org/text/stable/vocab.html
# ## data_loader.vocab['hello']   ## 对某个字进行编码 str2index

# data, max_len = data_loader.data_process(path)
# train_iter, test_iter = data_loader.load_train_val_test_data(path, path)
# for sample, label in train_iter:
#     print(sample.shape)  # [seq_len,batch_size]

In [10]:
# a = 'ap - southern california s smog-fighting agency went after emissions of the bovine variety friday, adopting the nation s first rules to reduce air pollution from dairy cow manure.'

In [11]:
# b = data_loader.vocab.get_itos()

In [12]:
# 'aspo' in b

In [13]:
# tensor_ = torch.tensor([data_loader.vocab[token] if token in b else 0 for token in my_tokenizer(a) ]  , dtype=torch.long)

In [14]:
# raw_iter = open(config.test_corpus_file_paths,encoding='utf8').readlines()
# data = []
# max_len = 0
# i = 0
# for raw in tqdm(raw_iter, ncols=80):
#     line = raw.rstrip("\n").split('","')
#     s, l = line[-1][:-1], line[0][1:]
#     s = clean_str(s)
#     print(s)
#     i = i+1
#     if i >5:
#         break
#     tensor_ = torch.tensor([data_loader.vocab[token] for token in my_tokenizer(s)], dtype=torch.long)

In [15]:
# train_iter, test_iter = data_loader.load_train_val_test_data(config.train_corpus_file_paths, config.test_corpus_file_paths)

In [16]:
## build_vocab 中做的事：
# counter = Counter()
# with open(path, encoding='utf8') as f:
#     for string_ in tqdm(f):
#         print(string_)
#         string_ = string_.strip().split('","')[-1][:-1]  # 新闻描述
#         print(string_)
#         print(clean_str(string_))
#         print(my_tokenizer(clean_str(string_)))
#         counter.update(my_tokenizer(clean_str(string_)))        
#         print(counter.most_common)
#         break

### 二、定义模型的网络结构

**Embedding：**包含 `token embedding` 和 `PositionalEncoding` 两种

In [17]:
class PositionalEncoding(nn.Module):
    r"""Add fixed sinusoidal position information to token embeddings.

    The encodings have the same dimension as the embeddings so the two can
    be summed. Sine and cosine functions of different frequencies are used:

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})

    where ``pos`` is the position in the sequence and ``i`` indexes the
    embedding dimension pairs.

    Args:
        d_model: the embedding dimension (required); odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the maximum length of an incoming sequence (default=5000).

    Examples:
        #>>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns, so
        # the cosine part is trimmed; the untrimmed assignment used to raise
        # a shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(1)  # [max_len, 1, d_model], broadcast over the batch
        # Buffer: saved with the module and moved by .to(), but not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        :param x: [x_len, batch_size, emb_size]
        :return: [x_len, batch_size, emb_size]
        """
        x = x + self.pe[:x.size(0), :]  # pe slice: [x_len, 1, d_model]
        return self.dropout(x)


class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        """
        :param tokens: shape : [len, batch_size]
        :return: shape: [len, batch_size, emb_size]
        """
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())
        return embedded * scale

**MyTransformer**

In [18]:
# Debug switch: when True, multi_head_attention_forward prints the shapes of its intermediate tensors.
is_print_shape = False

class MyTransformer(nn.Module):
    """Encoder-decoder Transformer assembled from the custom layers of this file.

    :param d_model:             model (embedding) dimension, 512 in the paper;
                                d_k = d_v = d_model / nhead
    :param nhead:               number of attention heads, 8 in the paper
    :param num_encoder_layers:  number of stacked encoder layers (N in the paper, 6)
    :param num_decoder_layers:  number of stacked decoder layers (N in the paper, 6)
    :param dim_feedforward:     hidden size of the feed-forward sub-layer, 2048 in the paper
    :param dropout:             dropout rate, 0.1 in the paper
    """

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 ):
        super().__init__()

        # ---- encoder stack ----
        self.encoder = MyTransformerEncoder(
            MyTransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout),
            num_encoder_layers,
            nn.LayerNorm(d_model),
        )

        # ---- decoder stack ----
        self.decoder = MyTransformerDecoder(
            MyTransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout),
            num_decoder_layers,
            nn.LayerNorm(d_model),
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""
        # Xavier-uniform for every parameter with more than one dimension;
        # one-dimensional parameters (e.g. biases) are left untouched.
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            xavier_uniform_(param)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                memory_mask=None, src_key_padding_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """
        :param src:   [src_len,batch_size,embed_dim]
        :param tgt:  [tgt_len, batch_size, embed_dim]
        :param src_mask:  None
        :param tgt_mask:  [tgt_len, tgt_len]
        :param memory_mask: None
        :param src_key_padding_mask: [batch_size, src_len]
        :param tgt_key_padding_mask: [batch_size, tgt_len]
        :param memory_key_padding_mask:  [batch_size, src_len]
        :return: [tgt_len, batch_size, num_heads * kdim] <==> [tgt_len,batch_size,embed_dim]
        """
        # Encoder output: [src_len, batch_size, embed_dim]
        encoded = self.encoder(src, mask=src_mask,
                               src_key_padding_mask=src_key_padding_mask)
        # Decoder output: [tgt_len, batch_size, embed_dim]
        return self.decoder(tgt=tgt, memory=encoded, tgt_mask=tgt_mask,
                            memory_mask=memory_mask,
                            tgt_key_padding_mask=tgt_key_padding_mask,
                            memory_key_padding_mask=memory_key_padding_mask)

    def generate_square_subsequent_mask(self, sz):
        r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        # Entries strictly above the diagonal are the masked positions.
        above_diagonal = torch.triu(torch.ones(sz, sz), diagonal=1) == 1
        return torch.zeros(sz, sz).masked_fill(above_diagonal, float('-inf'))  # [sz,sz]


class MyTransformerEncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward block, each
    followed by a residual connection and layer normalization.

    :param d_model:         model (embedding) dimension, 512 in the paper;
                            d_k = d_v = d_model / nhead
    :param nhead:           number of attention heads, 8 in the paper
    :param dim_feedforward: hidden size of the feed-forward block, 2048 in the paper
    :param dropout:         dropout rate, 0.1 in the paper
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        # Self-attention sub-layer
        self.self_attn = MyMultiheadAttention(d_model, nhead, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # Position-wise feed-forward sub-layer
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.activation = F.relu

        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """
        :param src: encoder input, [src_len, batch_size, embed_dim]
        :param src_mask: forwarded to the attention module as ``attn_mask``
        :param src_key_padding_mask: forwarded to the attention module as
               ``key_padding_mask``
        :return: [src_len, batch_size, embed_dim]
        """
        # Multi-head self-attention; element 0 of the result is the output.
        attended = self.self_attn(src, src, src,
                                  attn_mask=src_mask,
                                  key_padding_mask=src_key_padding_mask)[0]
        # Residual connection followed by layer norm.
        src = self.norm1(src + self.dropout1(attended))

        # Feed-forward: [.., embed_dim] -> [.., dim_feedforward] -> [.., embed_dim]
        hidden = self.dropout(self.activation(self.linear1(src)))
        projected = self.linear2(hidden)
        return self.norm2(src + self.dropout2(projected))


class MyTransformerEncoder(nn.Module):
    """Stack of ``num_layers`` deep copies of one encoder layer.

    encoder_layer: the layer to clone (contains the multi-head attention)
    num_layers: number of clones, 6 in the paper
    norm: optional normalization applied to the final output
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, mask=None, src_key_padding_mask=None):
        """
        :param src: encoder input, [src_len, batch_size, embed_dim]
        :param mask: passed to every layer as ``src_mask``
        :param src_key_padding_mask: passed unchanged to every layer
        :return: [src_len, batch_size, embed_dim]
        """
        hidden = src
        # Feed the output of each layer into the next one.
        for layer in self.layers:
            hidden = layer(hidden,
                           src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask)
        if self.norm is None:
            return hidden
        return self.norm(hidden)


def _get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones


class MyTransformerDecoderLayer(nn.Module):
    """One decoder layer: self-attention over the target, attention over the
    encoder memory, and a feed-forward block; each sub-layer is followed by
    a residual connection and layer normalization.

    :param d_model:         model (embedding) dimension, 512 in the paper;
                            d_k = d_v = d_model / nhead
    :param nhead:           number of attention heads, 8 in the paper
    :param dim_feedforward: hidden size of the feed-forward block, 2048 in the paper
    :param dropout:         dropout rate, 0.1 in the paper
    """

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        # Attention among the decoder inputs ("Masked Multi-head attention"
        # in the paper's architecture figure).
        self.self_attn = MyMultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)
        # Attention between the decoder and the encoder output (memory).
        self.multihead_attn = MyMultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)

        # Feed-forward block
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = F.relu

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """
        :param tgt: decoder input, [tgt_len, batch_size, embed_dim]
        :param memory: encoder output, [src_len, batch_size, embed_dim]
        :param tgt_mask: attention mask hiding positions after the current one, [tgt_len, tgt_len]
        :param memory_mask: attention mask for the encoder-decoder attention, usually None
        :param tgt_key_padding_mask: padding mask of the decoder input, [batch_size, tgt_len]
        :param memory_key_padding_mask: padding mask of the encoder input, [batch_size, src_len]
        :return: [tgt_len, batch_size, embed_dim]
        """
        # 1) self-attention over the target sequence + residual + norm
        self_attended = self.self_attn(tgt, tgt, tgt,
                                       attn_mask=tgt_mask,
                                       key_padding_mask=tgt_key_padding_mask)[0]
        tgt = self.norm1(tgt + self.dropout1(self_attended))

        # 2) attention of the target over the encoder memory + residual + norm
        cross_attended = self.multihead_attn(tgt, memory, memory,
                                             attn_mask=memory_mask,
                                             key_padding_mask=memory_key_padding_mask)[0]
        tgt = self.norm2(tgt + self.dropout2(cross_attended))

        # 3) two-layer feed-forward block + residual + norm
        hidden = self.dropout(self.activation(self.linear1(tgt)))
        projected = self.linear2(hidden)
        return self.norm3(tgt + self.dropout3(projected))


class MyTransformerDecoder(nn.Module):
    """Stack of ``num_layers`` deep copies of one decoder layer, with an
    optional normalization applied to the final output."""

    def __init__(self, decoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """
        :param tgt: decoder input, [tgt_len, batch_size, embed_dim]
        :param memory: output of the last encoder layer, [src_len, batch_size, embed_dim]
        :param tgt_mask: attention mask hiding positions after the current one, [tgt_len, tgt_len]
        :param memory_mask: attention mask for the encoder-decoder attention, usually None
        :param tgt_key_padding_mask: padding mask of the decoder input, [batch_size, tgt_len]
        :param memory_key_padding_mask: padding mask of the encoder input, [batch_size, src_len]
        :return: [tgt_len, batch_size, embed_dim]
        """
        hidden = tgt
        # Every layer sees the same memory and masks; only ``hidden`` evolves.
        for layer in self.layers:
            hidden = layer(hidden, memory,
                           tgt_mask=tgt_mask,
                           memory_mask=memory_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask,
                           memory_key_padding_mask=memory_key_padding_mask)
        if self.norm is None:
            return hidden
        return self.norm(hidden)


class MyMultiheadAttention(nn.Module):
    r"""Multi-head attention, following the formula on page 5 of the paper:

    .. math::
        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)

    :param embed_dim:   embedding dimension (the ``d_model`` argument used
                        elsewhere in this file), 512 in the paper
    :param num_heads:   number of attention heads (``nhead``), 8 in the paper
    :param dropout:     dropout probability applied to the attention weights
    :param bias:        whether the four linear projections use a bias
    """

    def __init__(self, embed_dim, num_heads, dropout=0., bias=True):
        super().__init__()
        per_head = embed_dim // num_heads  # d_k = d_v in the paper
        self.embed_dim = embed_dim
        self.head_dim = per_head
        self.kdim = per_head
        self.vdim = per_head

        self.num_heads = num_heads
        self.dropout = dropout

        # The paper's constraint d_k = d_v = d_model / n_head.
        assert per_head * num_heads == embed_dim, "embed_dim 除以 num_heads必须为整数"

        # Each projection maps embed_dim -> embed_dim, i.e. the weights of
        # all heads are stacked into one matrix (embed_dim = kdim * num_heads).
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # Output projection that mixes the concatenated heads.
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-uniform initialization for every parameter with more than one dimension."""
        for param in self.parameters():
            if param.dim() <= 1:
                continue
            xavier_uniform_(param)

    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        """
        In the encoder, query, key and value are the same tensor; the same
        holds for the decoder's self-attention. In the encoder-decoder
        attention, key and value are the memory and query is the target.

        :param query: [tgt_len, batch_size, embed_dim]
        :param key:   [src_len, batch_size, embed_dim]
        :param value: [src_len, batch_size, embed_dim]
        :param attn_mask: [tgt_len, src_len] or [num_heads*batch_size, tgt_len, src_len];
               typically used in decoding to hide positions after the current one
        :param key_padding_mask: [batch_size, src_len]
        :return:
            attn_output: [tgt_len, batch_size, embed_dim]
            attn_output_weights: [batch_size, tgt_len, src_len]
        """
        return multi_head_attention_forward(
            query, key, value,
            self.num_heads,
            self.dropout,
            out_proj=self.out_proj,
            training=self.training,
            key_padding_mask=key_padding_mask,
            q_proj=self.q_proj,
            k_proj=self.k_proj,
            v_proj=self.v_proj,
            attn_mask=attn_mask,
        )


def multi_head_attention_forward(query,  # [tgt_len,batch_size, embed_dim]
                                 key,  # [src_len, batch_size, embed_dim]
                                 value,  # [src_len, batch_size, embed_dim]
                                 num_heads,
                                 dropout_p,
                                 out_proj,  # [embed_dim = vdim * num_heads, embed_dim = vdim * num_heads]
                                 training=True,
                                 key_padding_mask=None,  # [batch_size,src_len/tgt_len]
                                 q_proj=None,  # [embed_dim,kdim * num_heads]
                                 k_proj=None,  # [embed_dim, kdim * num_heads]
                                 v_proj=None,  # [embed_dim, vdim * num_heads]
                                 attn_mask=None,  # [tgt_len,src_len] or [num_heads*batch_size,tgt_len, src_len]
                                 ):
    """Compute scaled dot-product attention for all heads at once.

    The inputs are projected with ``q_proj`` / ``k_proj`` / ``v_proj``, split
    into ``num_heads`` heads, combined through a softmax over the scaled
    query-key scores, and finally mixed by ``out_proj``.

    :param query: [tgt_len, batch_size, embed_dim]
    :param key: [src_len, batch_size, embed_dim]
    :param value: [src_len, batch_size, embed_dim]
    :param num_heads: number of heads; embed_dim is split evenly across them
    :param dropout_p: dropout probability applied to the attention weights
    :param out_proj: callable applied to the concatenated head outputs
    :param training: whether dropout is active
    :param key_padding_mask: [batch_size, src_len]; positions where it is set
           get a score of -inf (used with ``masked_fill``, so presumably a
           boolean tensor - confirm against callers)
    :param q_proj, k_proj, v_proj: callables projecting query/key/value;
           despite the ``None`` defaults they are called unconditionally
    :param attn_mask: additive mask, [tgt_len, src_len] or
           [num_heads*batch_size, tgt_len, src_len]
    :return: ``(Z, weights)`` where ``Z`` is [tgt_len, batch_size, embed_dim]
             and ``weights`` are the attention weights averaged over heads,
             [batch_size, tgt_len, src_len]
    :raises RuntimeError: if a 2D or 3D ``attn_mask`` has the wrong size
    """
    q = q_proj(query)
    #  [tgt_len,batch_size, embed_dim] x [embed_dim,kdim * num_heads] = [tgt_len,batch_size,kdim * num_heads]

    k = k_proj(key)
    # [src_len, batch_size, embed_dim] x [embed_dim, kdim * num_heads] = [src_len, batch_size, kdim * num_heads]

    v = v_proj(value)
    # [src_len, batch_size, embed_dim] x [embed_dim, vdim * num_heads] = [src_len, batch_size, vdim * num_heads]
    # Optional debug output, controlled by the module-level is_print_shape flag.
    if is_print_shape:
        print("" + "=" * 80)
        print("进入多头注意力计算:")
        print(
            f"\t 多头num_heads = {num_heads}, d_model={query.size(-1)}, d_k = d_v = d_model/num_heads={query.size(-1) // num_heads}")
        print(f"\t query的shape([tgt_len, batch_size, embed_dim]):{query.shape}")
        print(f"\t  W_q 的shape([embed_dim,kdim * num_heads]):{q_proj.weight.shape}")
        print(f"\t   Q  的shape([tgt_len, batch_size,kdim * num_heads]):{q.shape}")
        print("\t" + "-" * 70)

        print(f"\t  key 的shape([src_len,batch_size, embed_dim]):{key.shape}")
        print(f"\t  W_k 的shape([embed_dim,kdim * num_heads]):{k_proj.weight.shape}")
        print(f"\t   K  的shape([src_len,batch_size,kdim * num_heads]):{k.shape}")
        print("\t" + "-" * 70)

        print(f"\t value的shape([src_len,batch_size, embed_dim]):{value.shape}")
        print(f"\t  W_v 的shape([embed_dim,vdim * num_heads]):{v_proj.weight.shape}")
        print(f"\t   V  的shape([src_len,batch_size,vdim * num_heads]):{v.shape}")
        print("\t" + "-" * 70)
        print("\t ***** 注意，这里的W_q, W_k, W_v是多个head同时进行计算的. 因此，Q,K,V分别也是包含了多个head的q,k,v堆叠起来的结果 *****")

    tgt_len, bsz, embed_dim = query.size()  # [tgt_len,batch_size, embed_dim]
    src_len = key.size(0)
    head_dim = embed_dim // num_heads  # num_heads * head_dim = embed_dim
    scaling = float(head_dim) ** -0.5  # the 1/sqrt(d_k) factor of scaled dot-product attention
    q = q * scaling  # [query_len,batch_size,kdim * num_heads]

    if attn_mask is not None:  # [tgt_len,src_len] or [num_heads*batch_size,tgt_len, src_len]
        if attn_mask.dim() == 2:
            attn_mask = attn_mask.unsqueeze(0)  # [1, tgt_len,src_len]
            if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 2D attn_mask is not correct.')
        elif attn_mask.dim() == 3:
            if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]:
                raise RuntimeError('The size of the 3D attn_mask is not correct.')
        # From here on a 2D or 3D attn_mask is 3D.

    q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    # [batch_size * num_heads,tgt_len,kdim]
    # The projections above handled all heads at once, so the tensors are reshaped here into
    # one slice per (batch, head) pair for the batched products below; dims 0 and 1 are swapped too.
    k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)  # [batch_size * num_heads,src_len,kdim]
    v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)  # [batch_size * num_heads,src_len,vdim]
    attn_output_weights = torch.bmm(q, k.transpose(1, 2))
    # [batch_size * num_heads,tgt_len,kdim] x [batch_size * num_heads, kdim, src_len]
    # =  [batch_size * num_heads, tgt_len, src_len]  the QK^T score matrices of all heads

    if attn_mask is not None:
        attn_output_weights += attn_mask  # [batch_size * num_heads, tgt_len, src_len]

    if key_padding_mask is not None:
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        # reshaped to [batch_size, num_heads, tgt_len, src_len]
        attn_output_weights = attn_output_weights.masked_fill(
            key_padding_mask.unsqueeze(1).unsqueeze(2),  # expand [batch_size,src_len] to [batch_size,1,1,src_len]
            float('-inf'))  # masked keys get zero weight after the softmax
        attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len,
                                                       src_len)  # [batch_size * num_heads, tgt_len, src_len]

    attn_output_weights = F.softmax(attn_output_weights, dim=-1)  # [batch_size * num_heads, tgt_len, src_len]
    attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
    attn_output = torch.bmm(attn_output_weights, v)
    # Z = [batch_size * num_heads, tgt_len, src_len]  x  [batch_size * num_heads,src_len,vdim]
    # = # [batch_size * num_heads,tgt_len,vdim]
    # i.e. the Attention(Q,K,V) results of all heads

    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    # first transposed to [tgt_len, batch_size* num_heads ,kdim]
    # then viewed as [tgt_len,batch_size,num_heads*kdim]
    attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)

    Z = out_proj(attn_output)
    # linear combination of the per-head outputs into Z  [tgt_len,batch_size,embed_dim]
    if is_print_shape:
        print(f"\t 多头注意力中,多头计算结束后的形状（堆叠）为([tgt_len,batch_size,num_heads*kdim]){attn_output.shape}")
        print(f"\t 多头计算结束后，再进行线性变换时的权重W_o的形状为([num_heads*vdim, num_heads*vdim  ]){out_proj.weight.shape}")
        print(f"\t 多头线性变化后的形状为([tgt_len,batch_size,embed_dim]) {Z.shape}")
    return Z, attn_output_weights.sum(dim=1) / num_heads  # average attention weights over heads


**Config：**    基于 `Transformer` 架构的文本分类模型配置类

In [19]:
class Config():
    """Configuration for the Transformer-based text classification model.

    Instantiating the class also creates ``model_save_dir`` if it does not
    exist yet.
    """

    def __init__(self):
        # ---- dataset settings ----
        # Alternative for running as a script instead of a notebook:
#         self.project_dir = os.path.dirname(os.path.abspath(__file__))
#         self.dataset_dir = os.path.join(self.project_dir, 'data')
#         self.train_corpus_file_paths = os.path.join(self.dataset_dir, 'ag_news_csv', 'train.csv')
#         self.test_corpus_file_paths = os.path.join(self.dataset_dir, 'ag_news_csv', 'test.csv')

        self.train_corpus_file_paths = '../input/ag-news/ag_news_csv/train.csv'
        self.test_corpus_file_paths = '../input/ag-news/ag_news_csv/test.csv'

        self.min_freq = 1
        self.max_sen_len = None

        # ---- model settings ----
        self.batch_size = 128
        self.d_model = 512
        self.num_head = 8
        self.num_encoder_layers = 6
        self.num_decoder_layers = 6
        self.dim_feedforward = 512
        self.dim_classification = 256
        self.num_class = 4
        self.dropout = 0.1
        self.concat_type = 'avg'
        self.beta1 = 0.9
        self.beta2 = 0.98
        # The betas follow the Transformer paper, whose epsilon is 10^-9;
        # the former literal ``10e-9`` evaluates to 1e-8.
        self.epsilon = 1e-9
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.epochs = 10
#         self.model_save_dir = os.path.join(self.project_dir, 'cache')
        self.model_save_dir = os.path.join('./', 'cache')

        self.model_save_per_epoch = 2
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.model_save_dir, exist_ok=True)

In [20]:
class ClassificationModel(nn.Module):
    """
    Encoder-only Transformer followed by a small feed-forward classifier.

    Pipeline: token embedding -> positional encoding -> stacked encoder
    layers (with a final LayerNorm) -> pooling over the sequence axis ->
    Linear / Dropout / Linear head producing class logits.
    """

    def __init__(self, vocab_size=None,
                 d_model=512, nhead=8,
                 num_encoder_layers=6,
                 dim_feedforward=2048,
                 dim_classification=64,
                 num_classification=4,
                 dropout=0.1):
        super().__init__()

        # Embedding stage: positional encoding and token lookup.
        self.pos_embedding = PositionalEncoding(d_model=d_model, dropout=dropout)
        self.src_token_embedding = TokenEmbedding(vocab_size, d_model)

        # Encoder stack built from a single layer template plus a closing LayerNorm.
        layer_template = MyTransformerEncoderLayer(d_model, nhead,
                                                   dim_feedforward,
                                                   dropout)
        closing_norm = nn.LayerNorm(d_model)
        self.encoder = MyTransformerEncoder(layer_template,
                                            num_encoder_layers, closing_norm)

        # Classification head: d_model -> dim_classification -> num_classification.
        hidden_proj = nn.Linear(d_model, dim_classification)
        hidden_drop = nn.Dropout(dropout)
        logit_proj = nn.Linear(dim_classification, num_classification)
        self.classifier = nn.Sequential(hidden_proj, hidden_drop, logit_proj)

    def forward(self,
                src,  # [src_len, batch_size]
                src_mask=None,
                src_key_padding_mask=None,  # [batch_size, src_len]
                concat_type='sum'
                ):
        """
        Encode ``src`` and return class logits of shape [batch_size, num_class].

        ``concat_type`` selects how the encoder output is reduced along the
        sequence axis (dim 0): 'sum' adds all positions, 'avg' divides that
        sum by the sequence length, and any other value takes the last
        position.
        """
        # Token ids -> embeddings -> embeddings with positional information.
        embedded = self.pos_embedding(self.src_token_embedding(src))

        # Encoder output: [src_len, batch_size, embed_dim]
        memory = self.encoder(src=embedded,
                              mask=src_mask,
                              src_key_padding_mask=src_key_padding_mask)

        # NOTE(review): every position along dim 0 takes part in the pooling,
        # including positions that src_key_padding_mask marks as padding.
        if concat_type == 'sum':
            pooled = memory.sum(dim=0)
        elif concat_type == 'avg':
            pooled = memory.sum(dim=0) / memory.size(0)
        else:
            pooled = memory[-1]  # last time step

        # [batch_size, embed_dim] -> [batch_size, num_class]
        return self.classifier(pooled)

### **3、定义训练函数**

In [21]:
class CustomSchedule(nn.Module):
    """
    Warm-up learning-rate schedule.

    Each call returns

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    for the current step and then advances the internal step counter by one.
    The counter starts at 1, so the rate grows linearly for the first
    ``warmup_steps`` calls and decays with the inverse square root of the
    step afterwards.

    Args:
        d_model: model width used to scale the rate.
        warmup_steps: number of calls over which the rate ramps up.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        # Kept as a tensor attribute for compatibility with existing readers.
        self.d_model = torch.tensor(d_model, dtype=torch.float32)
        self.warmup_steps = warmup_steps
        self.step = 1.

    def __call__(self):
        """Return the learning rate for the current step as a Python float."""
        decay = self.step ** -0.5
        warmup = self.step * (self.warmup_steps ** -1.5)
        self.step += 1.
        # Convert d_model to a float so the result is a plain number rather
        # than a 0-dim tensor; the value is written straight into
        # optimizer.param_groups[...]['lr'] by the training loop.
        return float(self.d_model) ** -0.5 * min(decay, warmup)


def train_model(config):
    """
    Train the classifier described by ``config`` and checkpoint the best model.

    Steps:
      1. Build the dataset loader and the train / test iterators.
      2. Build the model, Xavier-initialise every parameter with more than
         one dimension, and resume from ``<model_save_dir>/model.pt`` if
         that file exists.
      3. Run ``config.epochs`` epochs with Adam, setting the learning rate
         from ``CustomSchedule`` before every optimizer step.
      4. Every ``config.model_save_per_epoch`` epochs, evaluate on the test
         iterator and save the state dict when the accuracy improves.

    Args:
        config: object exposing the attributes read below (paths, model
            sizes, optimizer settings, device, epochs, save directory).

    Returns:
        None. Progress is printed; the best weights are written to disk.
    """
    data_loader = LoadSentenceClassificationDataset(config.train_corpus_file_paths,
                                                    my_tokenizer,
                                                    batch_size=config.batch_size,
                                                    min_freq=config.min_freq,
                                                    max_sen_len=config.max_sen_len)

    train_iter, test_iter = data_loader.load_train_val_test_data(
        config.train_corpus_file_paths, config.test_corpus_file_paths)

    # Build the model and initialise its weight matrices.
    classification_model = ClassificationModel(vocab_size=len(data_loader.vocab),
                                               d_model=config.d_model,
                                               nhead=config.num_head,
                                               num_encoder_layers=config.num_encoder_layers,
                                               dim_feedforward=config.dim_feedforward,
                                               dim_classification=config.dim_classification,
                                               num_classification=config.num_class,
                                               dropout=config.dropout)

    for p in classification_model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    model_save_path = os.path.join(config.model_save_dir, 'model.pt')
    if os.path.exists(model_save_path):
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine; the model is moved to config.device right after.
        loaded_paras = torch.load(model_save_path, map_location='cpu')
        classification_model.load_state_dict(loaded_paras)
        print("## 成功载入已有模型，进行追加训练......")
    classification_model = classification_model.to(config.device)

    # Loss, learning-rate schedule and optimizer. The optimizer's lr is a
    # placeholder; the real value is written before every step.
    loss_fn = torch.nn.CrossEntropyLoss()
    learning_rate = CustomSchedule(config.d_model)
    optimizer = torch.optim.Adam(classification_model.parameters(),
                                 lr=0.,
                                 betas=(config.beta1, config.beta2),
                                 eps=config.epsilon)
    classification_model.train()
    max_test_acc = 0

    # NOTE(review): config.concat_type is not forwarded to the model, so the
    # model's own default pooling is used for both training and evaluation.
    for epoch in range(config.epochs):
        losses = 0
        start_time = time.time()
        for idx, (sample, label) in enumerate(train_iter):
            sample = sample.to(config.device)  # [src_len, batch_size]
            label = label.to(config.device)

            # True where a position holds the padding index: [batch_size, src_len]
            padding_mask = (sample == data_loader.PAD_IDX).transpose(0, 1)

            logits = classification_model(sample, src_key_padding_mask=padding_mask)  # [batch_size,num_class]
            optimizer.zero_grad()
            loss = loss_fn(logits, label)
            loss.backward()

            # Always hand the optimizer a plain float, whatever the schedule returns.
            lr = float(learning_rate())
            for group in optimizer.param_groups:
                group['lr'] = lr
            optimizer.step()
            losses += loss.item()

            if idx % 10 == 0:
                acc = (logits.argmax(1) == label).float().mean().item()
                print(f"Epoch: {epoch}, Batch[{idx}/{len(train_iter)}], "
                      f"Train loss :{loss.item():.3f}, Train acc: {acc:.3f}")
        end_time = time.time()
        train_loss = losses / len(train_iter)
        print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Epoch time = {(end_time - start_time):.3f}s")

        if (epoch + 1) % config.model_save_per_epoch == 0:
            # Evaluate with the same padding mask convention used in training.
            acc = evaluate(test_iter, classification_model, config.device,
                           pad_idx=data_loader.PAD_IDX)
            improved = acc > max_test_acc
            if improved:
                max_test_acc = acc
            # Printed after the update so the reported maximum includes this epoch.
            print(f"Accuracy on test {acc:.3f}, max acc on test {max_test_acc:.3f}")
            if improved:
                torch.save(classification_model.state_dict(), model_save_path)


def evaluate(data_iter, model, device, pad_idx=None):
    """
    Return the classification accuracy of ``model`` over ``data_iter``.

    Args:
        data_iter: iterable of ``(x, y)`` batches; ``x`` is passed to the
            model and ``y`` holds the target class indices.
        model: module returning logits whose argmax over dim 1 is compared
            with ``y``.
        device: device the batches are moved to.
        pad_idx: optional padding index. When given, a key-padding mask
            ``(x == pad_idx).transpose(0, 1)`` is passed to the model as
            ``src_key_padding_mask``, matching the training loop. When
            ``None`` the model is called with ``x`` only.

    Returns:
        Fraction of correctly classified samples as a float; ``0.0`` when
        ``data_iter`` yields no samples.

    The model is put in eval mode for the duration of the call and restored
    to whatever mode it was in before, even if a batch raises.
    """
    was_training = model.training
    model.eval()
    acc_sum, n = 0.0, 0
    try:
        with torch.no_grad():
            for x, y in data_iter:
                x, y = x.to(device), y.to(device)
                if pad_idx is None:
                    logits = model(x)
                else:
                    padding_mask = (x == pad_idx).transpose(0, 1)
                    logits = model(x, src_key_padding_mask=padding_mask)
                acc_sum += (logits.argmax(1) == y).float().sum().item()
                n += len(y)
    finally:
        model.train(was_training)
    return acc_sum / n if n else 0.0

### **4、开始进行训练**

In [22]:
# Build the run configuration (this also creates the checkpoint directory)
# and start training with it.
config = Config()
train_model(config)