In [4]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler

import torchtext  # 内置的文本处理 pip install torchtext==0.6.0
from torchtext.vocab import GloVe  # 词嵌入表示的库
from torchtext.datasets import IMDB
from torch import nn
import numpy as np
import string
import re

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


# 文本预处理
使用 时间机器.txt 数据集

## 1. 将文本作为字符串加载到内存中

In [54]:
def read_txt(path):
    """读取txt文件并处理，返回文本列表"""
    # 将时间机器数据集加载到文本行的列表中
    with open(path, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
    # 将非字母替换为空格，并全部小写
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

In [10]:
lines = read_txt("../data/时间机器.txt")
len(lines),lines[0],lines[104]

(3557,
 'the project gutenberg ebook of the time machine by h g wells',
 'length breadth thickness and duration but through a natural')

## 2. 将字符串拆分为词元token(如单词和字符)

In [55]:
def tokenize(lines, token='word'):
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('错误：未知词元类型：' + token)

In [44]:
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and']
['most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions']
['whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 're', 'use', 'it', 'under', 'the', 'terms']
['of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at']
['www', 'gutenberg', 'org', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you']
['will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before']
['using', 'this', 'ebook']
[]
['title', 'the', 'time', 'machine']


## 3. 建立词表，将拆分的词元映射到数字索引

In [56]:
import collections
def count_corpus(tokens):
    """统计词元的频率，tokens是1D或2D列表"""
    # 这里使用了短路逻辑，避免tokens[0]越界
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 如果是2D列表，将词元列表展平成1D列表，嵌套循环
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)  # 单词:词频

class Vocab:
    """文本词表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """
        :param tokens: 传入的词元列表
        :param min_freq: 少于该次数的词元丢掉
        :param reserved_tokens: 已知的token
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)  # 统计词频
        # 按出现频率排序，计算性能较好
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # 列表，根据下标idx找到对应的token
        self.idx_to_token = ['<unk>'] + reserved_tokens  # 未知词元<unk>的索引为0
        # 字典，根据token找到对应的idx
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:  # 不在词表中
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """字典，根据token找到对应的idx"""
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """列表，根据下标idx找到对应的token"""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # 未知词元的索引为0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

In [43]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])
for i in [0, 10]:
    print('文本:', tokens[i])
    print('索引:', vocab[tokens[i]])

[('<unk>', 0), ('the', 1), ('and', 2), ('of', 3), ('i', 4), ('a', 5), ('to', 6), ('in', 7), ('was', 8), ('that', 9)]
文本: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']
索引: [1, 53, 44, 314, 3, 1, 19, 46, 33, 1163, 1164, 360]
文本: ['title', 'the', 'time', 'machine']
索引: [2445, 1, 19, 46]


## 4. 将文本转换为数字索引序列，方便模型操作

In [79]:
def load_corpus(max_tokens=-1):
    """返回时光机器数据集的词元索引列表和词表"""
    lines = read_txt("../data/时间机器.txt")
    tokens = tokenize(lines)
    vocab = Vocab(tokens)
    # 因为时光机器数据集中的每个文本行不一定是一个句子或一个段落，所以将所有文本行展平到一个列表中
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:  # 即最大文本数，避免内存溢出
        corpus = corpus[:max_tokens]
    return corpus, vocab

In [80]:
corpus, vocab = load_corpus()
len(corpus), len(vocab)

(36019, 4942)

## 简单示例
1分词 2创建词表 3词嵌入表示

In [81]:
s = 'Life is not easy for any of us.We must work,and above all we must believe in ourselves.We must believe that each one of us is able to do some thing well.And that we must work until we succeed.'
for c in string.punctuation: # 去除标点符号，替换为空格，并全部小写
    s = s.replace(c, ' ').lower()
print(s)
vocab = dict((word, index) for index, word in enumerate(np.unique(s.split())))  # 创建词表
print("词表 ",vocab)
s = [vocab.get(w) for w in s.split()]  # 将s映射为词表表示
print("s映射 ",s)

life is not easy for any of us we must work and above all we must believe in ourselves we must believe that each one of us is able to do some thing well and that we must work until we succeed 
词表  {'able': 0, 'above': 1, 'all': 2, 'and': 3, 'any': 4, 'believe': 5, 'do': 6, 'each': 7, 'easy': 8, 'for': 9, 'in': 10, 'is': 11, 'life': 12, 'must': 13, 'not': 14, 'of': 15, 'one': 16, 'ourselves': 17, 'some': 18, 'succeed': 19, 'that': 20, 'thing': 21, 'to': 22, 'until': 23, 'us': 24, 'we': 25, 'well': 26, 'work': 27}
s映射  [12, 11, 14, 8, 9, 4, 15, 24, 25, 13, 27, 3, 1, 2, 25, 13, 5, 10, 17, 25, 13, 5, 20, 7, 16, 15, 24, 11, 0, 22, 6, 18, 21, 26, 3, 20, 25, 13, 27, 23, 25, 19]


### 独热编码

In [82]:
# 转换为独热编码
b = np.zeros((len(s), len(vocab)))
for index, i in enumerate(s):
    b[index, i] = 1
b[:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

### 词嵌入表示

In [4]:
em = nn.Embedding(len(vocab), 10)  # 将42个单词映射到长度为10的张量
s_em = em(torch.LongTensor(s))
s_em[:5]

tensor([[ 1.0550,  0.4054,  0.5093, -0.9537,  0.4279,  0.6771, -0.4031, -1.0563,
          0.3883, -0.1057],
        [ 0.8376,  2.4439,  0.7858, -0.2589,  0.6345,  0.0563,  1.0418,  0.0950,
          1.4032,  0.4671],
        [ 0.0708,  1.0927,  0.7107, -1.4244,  1.1544,  2.1377, -0.1187,  0.5330,
          1.4930,  0.4909],
        [ 0.9150, -2.1348,  0.6800,  0.6468, -2.0456, -1.7328,  0.2601, -2.1585,
          1.8926,  0.5428],
        [-0.8145, -0.9149,  0.0263, -2.4155,  0.3824, -0.1913, -0.0900, -0.1396,
          0.6837,  1.7483]], grad_fn=<SliceBackward0>)

# 电影评论分类IMDB
电影评论：一个评论，label为消极、积极、未知 为3分类问题
参考 https://suool.net/archives/1d3523b.html
以下代码 适合torchtext 0.06 版本 pip install torchtext==0.6.0
所有数据集都是的子类torchtext.data.Dataset，它们继承自torch.utils.data.Dataset，并且具有split和iters实现的方法。

## 数据预处理

In [2]:
# 1 创建字段  (train.fields查看)
TEXT = torchtext.data.Field(lower=True, fix_length=200, batch_first=True)  # 表示评论，填充为200
LABEL = torchtext.data.Field(sequential=False)  # 表示标签
# 2 加载torchtext内置的IMDB电影评论数据
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL, root=r"../data")
# 3 构建词表 vocab
TEXT.build_vocab(train, max_size=10000, min_freq=10, vectors=None)  # 关注前10000个单词，次数小于10次就扔掉；
LABEL.build_vocab(train)
# 4 加载数据集
train_iter, test_iter = torchtext.data.BucketIterator.splits((train, test), batch_size=4)

In [None]:
TEXT.vocab.freqs  # 查看每个单词出现的频率

In [None]:
TEXT.vocab.stoi  # 词表本身 (长度10002， 包括填充值pad 和 unknown)

In [4]:
# 看看长什么样
def show_text(dataloader):
    b = next(iter(dataloader))
    text, label = b.text, b.label
    print(text.shape)
    print(text[:1])
    print(label.shape)
    print(label)

show_text(train_iter)

torch.Size([4, 200])
tensor([[  10,  205,  707, 2154, 7831,  116,    0, 1833,    6, 1008,  607,  248,
          133,   45,  798,   21,   24,   15, 1016,    0,  145, 1511,  777,   38,
          538,   17, 3150,  669, 1467,   37,    2,  280,  134,    2,    0,   10,
            7,   32,  573,  530,    5,    2,    0,  380,   45,    0,  210,    6,
          325,    6,    0,   65,    0,  709,   15, 1107, 5394,   12,  114,    3,
           56, 1764, 3703, 1329,  221,   16,    2,   84,   12,   14, 1740,    2,
           24, 2590,   51,   19,  997,   68,    0,    4,    0, 1610,   44,    3,
            0,    5,   30,    5,    2, 8598, 6629,    4,    2,  894,   27,    0,
          204, 1013,    2, 1966,    7,  257,   31, 2154, 7428,   96,   44,  173,
          721,    2,  132,    4,  553,   66,    0,   10,   14,    3,   20,   17,
            3,  152,  489,    4,    3, 1319,   36,  262,   43,    6,  395,    2,
          374,    8,    3,    0, 1893,   38, 1672,   44,  267,  296,   18,   23,
       

### 使用预训练的词向量
当在特定领域（例如医学和制造业）工作时，存在大量用于训练词向量的数据，此时预训练的词向量将会非常有用。
当几乎没有数据时，甚至不能有意义地训练词向量时，就可以使用这些在不同的数据语料库（如维基百科、谷歌新闻和Twitter推文）上训练好的词向量。
正确率可能会下降，因为语料库特点不一样；

In [3]:
# 1 创建字段  (train.fields查看)
TEXT = torchtext.data.Field(lower=True, fix_length=200, batch_first=True)  # 表示评论，填充为200
LABEL = torchtext.data.Field(sequential=False)  # 表示标签
# 2 加载torchtext内置的IMDB电影评论数据
train, test = torchtext.datasets.IMDB.splits(TEXT, LABEL, root=r"../data")
# 3 构建词表 vocab
# vectors使用预训练的词向量，使用6B版本的词向量，映射为100维；
TEXT.build_vocab(train, max_size=10000, min_freq=10, vectors=GloVe(name='6B', dim=100, cache=r'../data/.vector_cache'))
LABEL.build_vocab(train)
# 4 加载数据集
train_iter, test_iter = torchtext.data.BucketIterator.splits((train, test), batch_size=4)

In [5]:
TEXT.vocab.vectors  # 查看映射后的词向量

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1915, -0.2686,  0.0245,  ..., -0.4086, -0.5865,  0.0474],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

## 创建模型

In [3]:
class TextNet1(nn.Module):
    def __init__(self):
        super(TextNet1, self).__init__()
        self.em = nn.Embedding(10002, 100)   # batch*200*100  词嵌入表示 10002个单词映射到100维空间
        self.fc1 = nn.Linear(200*100, 1024)
        self.fc2 = nn.Linear(1024, 3)  # 3分类问题

    def forward(self, x):
        x = self.em(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = TextNet1()
model.to(device)

TextNet1(
  (em): Embedding(10002, 100)
  (fc1): Linear(in_features=20000, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=3, bias=True)
)

In [None]:
# 预训练模型
model = TextNet1()
model.em.weight.data = TEXT.vocab.vectors  # 使用预训练的词向量替换em层
model.em.weight.requires_grad = False  # 不再训练
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([ param for param in model.parameters() if param.requires_grad == True],  lr=0.001)

## 训练模型

In [4]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
def fit(epoch, model, trainloader, testloader):
    correct = 0
    total = 0
    running_loss = 0

    model.train()
    for b in trainloader:
        x, y = b.text, b.label  # 一个批次的数据
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            y_pred = torch.argmax(y_pred, dim=1)
            correct += (y_pred == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()
#    exp_lr_scheduler.step()
    epoch_loss = running_loss / len(trainloader.dataset)
    epoch_acc = correct / total


    test_correct = 0
    test_total = 0
    test_running_loss = 0

    model.eval()
    with torch.no_grad():
        for b in testloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            y_pred = torch.argmax(y_pred, dim=1)
            test_correct += (y_pred == y).sum().item()
            test_total += y.size(0)
            test_running_loss += loss.item()

    epoch_test_loss = test_running_loss / len(testloader.dataset)
    epoch_test_acc = test_correct / test_total

    print('epoch: ', epoch,
          'loss： ', round(epoch_loss, 3),
          'accuracy:', round(epoch_acc, 3),
          'test_loss： ', round(epoch_test_loss, 3),
          'test_accuracy:', round(epoch_test_acc, 3)
             )

    return epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc

In [5]:
epochs = 5
train_loss, train_acc, test_loss, test_acc = [], [], [], []
for epoch in range(epochs):
    epoch_loss, epoch_acc, epoch_test_loss, epoch_test_acc = fit(epoch, model, train_iter, test_iter)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    test_loss.append(epoch_test_loss)
    test_acc.append(epoch_test_acc)
## 简单模型显然不好用，过拟合了

epoch:  0 loss：  0.184 accuracy: 0.555 test_loss：  0.158 test_accuracy: 0.66
epoch:  1 loss：  0.123 accuracy: 0.81 test_loss：  0.182 test_accuracy: 0.722
epoch:  2 loss：  0.043 accuracy: 0.938 test_loss：  0.18 test_accuracy: 0.755
epoch:  3 loss：  0.018 accuracy: 0.973 test_loss：  0.236 test_accuracy: 0.759
epoch:  4 loss：  0.01 accuracy: 0.985 test_loss：  0.408 test_accuracy: 0.748


# RNN循环网络
使用IMDB数据集

In [None]:
hidden_size = 300
embeding_dim = 100

## GRUCell

In [None]:
class RNN_Encoder(nn.Module):
    """对评论(序列)依次读取，并输出最后状态，正常模式"""
    def __init__(self, input_dim, hidden_size):
        super(RNN_Encoder, self).__init__()
        # self.rnn = nn.RNNCell(input_dim, hidden_size)  # 内置的简单RNN，效果不好
        self.rnn = nn.GRUCell(input_dim, hidden_size)  # GRUCell效果很好
    def forward(self, inputs):  # inputs代表输入序列，shape=seq*batch*dim=200*batch*100
        bz = inputs.shape[1]  # batch_size
        ht = torch.zeros((bz, hidden_size)).cuda()  # 初始化hidden
        for word in inputs:  # 沿着单词(序列长度)进行展开
            ht = self.rnn(word, ht)
        # ht是整个序列的最终输出
        return ht

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.em = nn.Embedding(10002, embeding_dim)   # 200*batch*100 每次迭代一个单词
        self.rnn = RNN_Encoder(embeding_dim, hidden_size)     # batch*300
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        x = self.em(x)
        x = self.rnn(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = Net()
model.to(device)

## LSTMCell

In [None]:
class RNN_Encoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super(RNN_Encoder, self).__init__()
        self.rnn = nn.LSTMCell(input_dim, hidden_size)
    def forward(self, inputs):
        bz = inputs.shape[1]
        ht = torch.zeros((bz, hidden_size)).cuda()
        ct = torch.zeros((bz, hidden_size)).cuda()  # 这里要多初始化一个
        for word in inputs:
            ht, ct = self.rnn(word, (ht, ct))
        return ht, ct

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.em = nn.Embedding(10002, embeding_dim)
        self.rnn = RNN_Encoder(embeding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        x = self.em(x)
        _, x = self.rnn(x)  # 这里用ct表示整个句子的理解
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = Net()
model.to(device)

## 使用内置的LSTM API

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.em = nn.Embedding(10002, embeding_dim)   # 200*batch*100
        self.rnn = nn.LSTM(embeding_dim, hidden_size)     # batch*300
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, inputs):
        bz = inputs.shape[1]
        h0 = torch.zeros((1, bz, hidden_size)).cuda()
        c0 = torch.zeros((1, bz, hidden_size)).cuda()
        x = self.em(inputs)
        r_o, _ = self.rnn(x, (h0, c0))  # 输出所有的输出
        r_o = r_o[-1]  # 选择最后的输出作为下一层的输入
        x = F.relu(self.fc1(r_o))
        x = self.fc2(x)
        return x

# 注意力机制 Transformer

## 创建模型
其他使用IMDB代码

In [None]:
import math

hidden_size = 300
embeding_dim = 100
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=200):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.em = nn.Embedding(10002, embeding_dim)   # 200*batch*100
        self.pos = PositionalEncoding(embeding_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embeding_dim, nhead=5)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)
        self.fc1 = nn.Linear(200, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, inputs):
        x = self.em(inputs)
        x = self.pos(x)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)
        x = torch.sum(x, dim=-1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x