In [38]:
import math
import copy
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


In [39]:
# nn.Embeding
embed = nn.Embedding(10, 5)  # 10个词 5维度 —— 词表
input = torch.LongTensor([1, 5, 8])  # 三个词在词表中的位置 输入词索引
embed_vector = embed(input)  # 将词索引映射为词向量 —— 词输入作用到词表 这句话每个词的词向量

embed.weight

Parameter containing:
tensor([[-0.4153, -0.9413,  0.8550, -1.3293,  1.6079],
        [ 0.7442, -0.7293,  0.9848,  0.7392, -1.3974],
        [ 0.8280,  0.4740, -0.0798,  0.3997, -1.1896],
        [-1.2262, -1.8494,  1.4618,  0.5002,  0.3382],
        [-0.6768,  1.2837, -1.1490,  0.0199,  1.5975],
        [ 1.2886, -0.4951,  1.5465,  0.5698,  0.6618],
        [ 0.4021, -0.4294,  0.1554, -0.7145,  0.9706],
        [ 2.2023, -0.8830,  0.5127, -0.3469, -0.4431],
        [-1.5755,  3.0294,  0.2419,  1.5847, -0.2715],
        [-0.8835, -1.1849, -0.8417, -0.9277, -0.8115]], requires_grad=True)

In [40]:
embed_vector  # 取出

tensor([[ 0.7442, -0.7293,  0.9848,  0.7392, -1.3974],
        [ 1.2886, -0.4951,  1.5465,  0.5698,  0.6618],
        [-1.5755,  3.0294,  0.2419,  1.5847, -0.2715]],
       grad_fn=<EmbeddingBackward0>)

In [41]:
# nn.Linear
fc = nn.Linear(5, 10)  # 输入维度5 输出维度10
input = torch.randn(3, 5)  # 3个5维的输入
output = fc(input)  # 全连接层映射到10维输出

fc.weight

Parameter containing:
tensor([[ 0.4328, -0.3348,  0.1672, -0.2531,  0.0499],
        [-0.1833,  0.1492, -0.3691,  0.0408, -0.4445],
        [-0.2660, -0.0177, -0.1482, -0.1242,  0.1807],
        [ 0.1188,  0.0344, -0.3487, -0.4196,  0.2209],
        [-0.3169, -0.3924,  0.3597,  0.2004,  0.4252],
        [ 0.3007, -0.3972, -0.0343,  0.2566,  0.3572],
        [-0.1156, -0.3492, -0.2855, -0.2628, -0.0105],
        [ 0.0995, -0.2521,  0.3069,  0.0201, -0.0081],
        [ 0.0108, -0.3763,  0.2136,  0.2867, -0.2415],
        [-0.3875, -0.1295,  0.2180, -0.0164,  0.1174]], requires_grad=True)

In [42]:
output  # 点乘

tensor([[-0.0029,  1.1986, -0.9221,  0.5691, -2.6573, -0.6776,  0.2972, -0.1933,
         -0.0282, -1.2522],
        [-0.9954, -0.4001,  0.5952,  0.4435,  1.1623,  0.5016,  0.7252, -0.0366,
         -0.1845,  0.8951],
        [ 1.1373, -0.7050, -0.2704,  1.3825, -0.7205,  0.1895,  0.9271,  0.4831,
         -0.3922, -0.2671]], grad_fn=<AddmmBackward0>)

## 简单实现

In [43]:
word_to_ix = {'hello': 0, 'world': 1}
embeds = nn.Embedding(2, 5)  # 2个词 5维度 —— 词表
lookup_tensor = torch.tensor([word_to_ix['hello']], dtype=torch.long)  # 输入的词索引
hello_embed = embeds(lookup_tensor)  # 得到词向量

embeds.weight

Parameter containing:
tensor([[ 0.7559,  0.8514, -0.8851, -0.2470, -1.1481],
        [-0.5923,  0.1025, -0.0075,  0.0431,  2.0494]], requires_grad=True)

In [44]:
hello_embed

tensor([[ 0.7559,  0.8514, -0.8851, -0.2470, -1.1481]],
       grad_fn=<EmbeddingBackward0>)

## transformer中的实现

In [45]:
class Embeddings(nn.Module):
    """ d_model为词嵌入维度、vocab为词典大小"""

    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)  # self.lut为词表
        self.d_model = d_model

    def forward(self, x):
        """ 前向传播 """
        return self.lut(x) * math.sqrt(self.d_model)  # 得到词向量  归一化

In [46]:
d_model, vocab = 512, 1000  #  1000个词 512维度
x = torch.tensor([[100, 2, 421, 508], [491, 998, 1, 221]], dtype=torch.long)  # 输入预料的词索引
emb = Embeddings(d_model, vocab)
embr = emb(x)

embr  # torch.Size([2, 4, 512])  2句话 每句4个词 每词512维度

tensor([[[ 27.5474,   0.2073,   3.2926,  ...,   7.0026,  12.8985,  13.7053],
         [ 15.3772,  29.1237,  39.7162,  ...,  13.3172, -12.9345,  -8.1200],
         [ 39.2212, -23.5262,  -3.6106,  ...,  28.7280, -24.4168,  18.7876],
         [  1.6401,  21.7330, -11.0413,  ...,  27.2853, -24.3785, -14.1046]],

        [[ 41.1038,  -5.4731,  24.1499,  ...,  26.1438,   7.1153,   4.9907],
         [-14.9304, -22.4000,  32.7507,  ..., -25.9331, -23.8217, -15.4093],
         [-36.3945,  -4.3041, -28.6258,  ...,   8.7273, -28.3143,  -4.1974],
         [-13.3717,  -0.7670,  10.8779,  ...,  13.5576, -50.2387, -38.9500]]],
       grad_fn=<MulBackward0>)

In [47]:
#已知如下语料 给出其词嵌入
corpus = ["he is an old worker", "english is a useful tool", "the cinema is far away"]

# 需要生成词表
word_list = []
for sentence in corpus:  # sentence为1句话
    for word in sentence.split():  # word为1个词
        word_list.append(word)
# 需要去重 且生成词构成的索引结构
word_dirt = {}
for i in enumerate(set(word_list)):
    word_dirt[i[1]] = i[0]

word_dirt

{'away': 0,
 'useful': 1,
 'an': 2,
 'cinema': 3,
 'old': 4,
 'a': 5,
 'tool': 6,
 'far': 7,
 'english': 8,
 'worker': 9,
 'is': 10,
 'he': 11,
 'the': 12}

In [50]:
# 词嵌入矩阵
embeds = nn.Embedding(len(word_dirt), 5)  # 13个词 5维度
# 输入一句话  tensor([11, 10,  2,  4,  9])
lookup_tensor = torch.tensor([word_dirt[word] for word in corpus[0].split()], dtype=torch.long)
# 得到这句话每个词的词向量
sub_embed = embeds(lookup_tensor)

embeds.weight

Parameter containing:
tensor([[-0.1652,  1.8527, -0.5448, -0.2729, -0.1449],
        [ 1.4996, -1.0888,  0.9264, -0.7588,  0.6251],
        [-1.1544, -0.2262, -0.7112,  0.1644,  0.3335],
        [ 0.0692, -0.6363,  0.0714, -2.1863, -0.9576],
        [-0.4400, -0.4123, -0.1642, -0.1696, -1.1739],
        [ 0.5537, -1.3774, -0.1361, -1.0780,  0.4689],
        [-1.7460,  0.8354,  0.7688, -1.4681, -0.5667],
        [-0.0781,  0.1157,  0.8574,  0.2917, -0.1305],
        [-0.4630,  0.3838,  0.4448, -0.4246, -2.2026],
        [ 0.9852, -0.2545,  0.5236, -1.9448, -0.4065],
        [ 0.3120,  0.7543, -0.1107,  0.2004, -0.5817],
        [-1.6479,  0.3058, -1.3786,  0.5189,  1.1430],
        [-0.6112,  0.0971, -0.3067, -0.5186,  1.2675]], requires_grad=True)

In [51]:
sub_embed

tensor([[-1.6479,  0.3058, -1.3786,  0.5189,  1.1430],
        [ 0.3120,  0.7543, -0.1107,  0.2004, -0.5817],
        [-1.1544, -0.2262, -0.7112,  0.1644,  0.3335],
        [-0.4400, -0.4123, -0.1642, -0.1696, -1.1739],
        [ 0.9852, -0.2545,  0.5236, -1.9448, -0.4065]],
       grad_fn=<EmbeddingBackward0>)

## 把现有预料处理成训练集

In [55]:
# 数据处理
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
torch.manual_seed(1)

test_sentence = """
When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

ngrams = [
    (
        [test_sentence[i - j - 1] for j in range(CONTEXT_SIZE)],
        test_sentence[i]
    )
    for i in range(CONTEXT_SIZE, len(test_sentence))
]

ngrams

[(['forty', 'When'], 'winters'),
 (['winters', 'forty'], 'shall'),
 (['shall', 'winters'], 'besiege'),
 (['besiege', 'shall'], 'thy'),
 (['thy', 'besiege'], 'brow,'),
 (['brow,', 'thy'], 'And'),
 (['And', 'brow,'], 'dig'),
 (['dig', 'And'], 'deep'),
 (['deep', 'dig'], 'trenches'),
 (['trenches', 'deep'], 'in'),
 (['in', 'trenches'], 'thy'),
 (['thy', 'in'], "beauty's"),
 (["beauty's", 'thy'], 'field,'),
 (['field,', "beauty's"], 'Thy'),
 (['Thy', 'field,'], "youth's"),
 (["youth's", 'Thy'], 'proud'),
 (['proud', "youth's"], 'livery'),
 (['livery', 'proud'], 'so'),
 (['so', 'livery'], 'gazed'),
 (['gazed', 'so'], 'on'),
 (['on', 'gazed'], 'now,'),
 (['now,', 'on'], 'Will'),
 (['Will', 'now,'], 'be'),
 (['be', 'Will'], 'a'),
 (['a', 'be'], "totter'd"),
 (["totter'd", 'a'], 'weed'),
 (['weed', "totter'd"], 'of'),
 (['of', 'weed'], 'small'),
 (['small', 'of'], 'worth'),
 (['worth', 'small'], 'held:'),
 (['held:', 'worth'], 'Then'),
 (['Then', 'held:'], 'being'),
 (['being', 'Then'], 'asked

In [56]:
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

word_to_ix

{'And': 0,
 'Will': 1,
 'praise': 2,
 "beauty's": 3,
 'warm': 4,
 "'This": 5,
 "deserv'd": 6,
 'If': 7,
 'be': 8,
 'old,': 9,
 'winters': 10,
 'beauty': 11,
 'dig': 12,
 'blood': 13,
 'of': 14,
 'say,': 15,
 'thy': 16,
 "youth's": 17,
 'made': 18,
 'own': 19,
 'This': 20,
 'when': 21,
 'shall': 22,
 'new': 23,
 'small': 24,
 'livery': 25,
 'Thy': 26,
 'couldst': 27,
 'see': 28,
 'on': 29,
 'shame,': 30,
 'eyes,': 31,
 "feel'st": 32,
 'How': 33,
 'held:': 34,
 'Where': 35,
 'much': 36,
 'trenches': 37,
 'it': 38,
 'Then': 39,
 'field,': 40,
 'my': 41,
 'to': 42,
 'in': 43,
 'the': 44,
 'being': 45,
 'thine!': 46,
 'count,': 47,
 'lies,': 48,
 'by': 49,
 "totter'd": 50,
 'make': 51,
 "excuse,'": 52,
 'worth': 53,
 'besiege': 54,
 'sunken': 55,
 'sum': 56,
 'days;': 57,
 'To': 58,
 'child': 59,
 'cold.': 60,
 'succession': 61,
 'old': 62,
 'use,': 63,
 'where': 64,
 'praise.': 65,
 'within': 66,
 'a': 67,
 'thine': 68,
 'weed': 69,
 'treasure': 70,
 'asked,': 71,
 'thriftless': 72,
 'Prov

In [67]:
# 神经网络模型定义
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        """ 词表长度、词向量维度、上下文相关 """
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimer = optim.SGD(model.parameters(), lr=0.01)

In [68]:
for k in range(100):  # 100次训练
    for i in ngrams:  # 1次训练
        # 输入输出 不能用词要用词表的位置
        input_x = torch.tensor([word_to_ix[i] for i in i[0]], dtype=torch.long)
        input_y = torch.tensor([word_to_ix[i[1]]], dtype=torch.long)
        output0 = model(input_x)

        model.zero_grad()  # 之前的梯度数据清零
        loss = loss_function(output0, input_y)  # 算损失
        loss.backward()  # 损失 反向传播
        optimer.step()  # 更新梯度 优化器
    # 每次训练输出结果
    print(loss)

tensor(5.0125, grad_fn=<NllLossBackward0>)
tensor(4.7472, grad_fn=<NllLossBackward0>)
tensor(4.4900, grad_fn=<NllLossBackward0>)
tensor(4.2347, grad_fn=<NllLossBackward0>)
tensor(3.9782, grad_fn=<NllLossBackward0>)
tensor(3.7082, grad_fn=<NllLossBackward0>)
tensor(3.4209, grad_fn=<NllLossBackward0>)
tensor(3.1165, grad_fn=<NllLossBackward0>)
tensor(2.7959, grad_fn=<NllLossBackward0>)
tensor(2.4601, grad_fn=<NllLossBackward0>)
tensor(2.1117, grad_fn=<NllLossBackward0>)
tensor(1.7604, grad_fn=<NllLossBackward0>)
tensor(1.4246, grad_fn=<NllLossBackward0>)
tensor(1.1231, grad_fn=<NllLossBackward0>)
tensor(0.8734, grad_fn=<NllLossBackward0>)
tensor(0.6767, grad_fn=<NllLossBackward0>)
tensor(0.5336, grad_fn=<NllLossBackward0>)
tensor(0.4285, grad_fn=<NllLossBackward0>)
tensor(0.3510, grad_fn=<NllLossBackward0>)
tensor(0.2938, grad_fn=<NllLossBackward0>)
tensor(0.2495, grad_fn=<NllLossBackward0>)
tensor(0.2150, grad_fn=<NllLossBackward0>)
tensor(0.1876, grad_fn=<NllLossBackward0>)
tensor(0.16

In [69]:
model.embeddings.weight  # 训练好的词嵌入矩阵

Parameter containing:
tensor([[ 5.2145e-02,  7.7348e-01, -6.2138e-01,  1.5694e+00,  8.9320e-01,
         -2.7682e-01,  8.2514e-01, -1.6480e+00,  9.3918e-01,  5.3340e-01],
        [-1.7176e+00, -3.1630e-01,  8.9163e-01, -5.2749e-01,  4.9563e-01,
          1.6146e+00, -2.3427e-01, -2.2426e-01,  1.1302e+00, -1.1990e+00],
        [ 1.3525e+00, -2.3831e-01, -3.1669e-02, -7.8350e-02,  2.7794e-01,
         -6.7688e-01,  1.6782e+00,  1.6762e+00,  1.0122e+00, -5.6545e-01],
        [-1.3831e+00,  2.5403e-01,  7.0112e-01, -1.2497e+00,  2.7104e-01,
         -8.3010e-01, -8.5585e-01, -1.2908e+00, -4.3944e-01,  1.1033e+00],
        [-4.2272e-01, -4.6001e-01,  3.4660e+00, -6.5733e-01, -4.5240e-01,
          1.3041e+00,  3.2442e-01, -2.8194e+00, -3.6490e-01, -1.1798e+00],
        [-1.3210e+00, -2.0738e-01,  8.2298e-01,  6.8098e-01, -5.6163e-02,
         -1.5267e+00,  8.5267e-01,  1.0257e+00, -1.3615e+00, -6.5622e-01],
        [ 3.2922e-01,  9.9916e-01,  1.0737e+00, -1.0013e-01, -5.6900e-01,
          