In [2]:
import torch
import torch.nn as nn

In [2]:
# Vectorized operation
data = torch.arange(1, 36, dtype=torch.float32).reshape(5, 7)
print("Data is:", data)

print("Taking the sum over columns:")
print(data.sum(dim=0))

print("Taking thep sum over rows:")
print(data.sum(dim=1))

print("Taking the sum of all:")
print(data.sum())

print("Taking the stdev over rows:")
print(data.std(dim=1))

Data is: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
Taking the sum over columns:
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
Taking thep sum over rows:
tensor([ 28.,  77., 126., 175., 224.])
Taking the sum of all:
tensor(630.)
Taking the stdev over rows:
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])


In [3]:
data = torch.tensor([[1, 2.2, 9.6],[4, -7.2, 6.3]])
row_avg = data.mean(dim=1)
col_avg = data.mean(dim=0)

print(row_avg.shape)
print(row_avg)

print(col_avg.shape)
print(col_avg)

torch.Size([2])
tensor([4.2667, 1.0333])
torch.Size([3])
tensor([ 2.5000, -2.5000,  7.9500])


In [5]:
x = torch.Tensor([
                  [[1, 2], [3, 4]],
                  [[5, 6], [7, 8]],
                  [[9, 10], [11, 12]]
                 ])
x, x.shape

(tensor([[[ 1.,  2.],
          [ 3.,  4.]],
 
         [[ 5.,  6.],
          [ 7.,  8.]],
 
         [[ 9., 10.],
          [11., 12.]]]),
 torch.Size([3, 2, 2]))

In [6]:
x[0]

tensor([[1., 2.],
        [3., 4.]])

In [9]:
x[:, 0] # shape:(3， 2)

tensor([[ 1.,  2.],
        [ 5.,  6.],
        [ 9., 10.]])

In [10]:
matr = torch.arange(1, 16).view(5, 3)
print(matr)

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])


In [13]:
matr[0] # shape:(3, )

tensor([1, 2, 3])

In [14]:
matr[:, 0]

tensor([ 1,  4,  7, 10, 13])

In [15]:
matr[0:3]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [16]:
matr[:, 0:2]

tensor([[ 1,  2],
        [ 4,  5],
        [ 7,  8],
        [10, 11],
        [13, 14]])

In [17]:
matr[0:3, 0:2]

tensor([[1, 2],
        [4, 5],
        [7, 8]])

In [18]:
matr[0][2]

tensor(3)

In [19]:
matr[0:3, 2]

tensor([3, 6, 9])

In [23]:
matr, matr[0:3][2]

(tensor([[ 1,  2,  3],
         [ 4,  5,  6],
         [ 7,  8,  9],
         [10, 11, 12],
         [13, 14, 15]]),
 tensor([7, 8, 9]))

In [24]:
matr[0:3]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

In [25]:
matr[[0, 2, 4]]

tensor([[ 1,  2,  3],
        [ 7,  8,  9],
        [13, 14, 15]])

In [27]:
x, x[:, 0, 0]

(tensor([[[ 1.,  2.],
          [ 3.,  4.]],
 
         [[ 5.,  6.],
          [ 7.,  8.]],
 
         [[ 9., 10.],
          [11., 12.]]]),
 tensor([1., 5., 9.]))

In [28]:
x[:, :, :]

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

In [32]:
i = torch.tensor([0, 0, 1, 1])
x, x[i], x[[0, 0, 1, 1]]

(tensor([[[ 1.,  2.],
          [ 3.,  4.]],
 
         [[ 5.,  6.],
          [ 7.,  8.]],
 
         [[ 9., 10.],
          [11., 12.]]]),
 tensor([[[True, True],
          [True, True]],
 
         [[True, True],
          [True, True]],
 
         [[True, True],
          [True, True]],
 
         [[True, True],
          [True, True]]]))

In [33]:
i = torch.tensor([1, 2])
j = torch.tensor([0])
x, x[i, j]

(tensor([[[ 1.,  2.],
          [ 3.,  4.]],
 
         [[ 5.,  6.],
          [ 7.,  8.]],
 
         [[ 9., 10.],
          [11., 12.]]]),
 tensor([[ 5.,  6.],
         [ 9., 10.]]))

In [34]:
x[0, 0, 0]

tensor(1.)

In [35]:
x[0, 0, 0].item()   # scalar value

1.0

In [36]:
# Exercise
data = torch.tensor([[1, 2.2, 9.6], [4, -7.2, 6.3]])
first_col = data[:, 0]
first_row = data[0]
first_col, first_row

(tensor([1., 4.]), tensor([1.0000, 2.2000, 9.6000]))

In [38]:
import pprint as pp

In [39]:
x = torch.tensor([2.], requires_grad=True)
pp.pprint(x.grad)

None


In [40]:
y = x * x * 3 # 3x^2
y.backward()
pp.pprint(x.grad)

tensor([12.])


In [41]:
z = x * x * 3
z.backward()
pp.pprint(x.grad)

tensor([24.])


In [42]:
import torch.nn as nn

In [43]:
input = torch.ones(2,3,4)
linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output

tensor([[[ 0.3955, -1.1012],
         [ 0.3955, -1.1012],
         [ 0.3955, -1.1012]],

        [[ 0.3955, -1.1012],
         [ 0.3955, -1.1012],
         [ 0.3955, -1.1012]]], grad_fn=<ViewBackward0>)

In [44]:
list(linear.parameters())   # Ax + b

[Parameter containing:
 tensor([[-0.2948,  0.2432,  0.1184,  0.3072],
         [ 0.3915, -0.3828, -0.2548, -0.3784]], requires_grad=True),
 Parameter containing:
 tensor([ 0.0215, -0.4766], requires_grad=True)]

In [45]:
"""
Data of shape [batch_size, feature_dim] # 4
[batch_size, output_dim] # 2
linear layer of shape (feature_dim, output_dim)
"""

'\nData of shape [batch_size, feature_dim] # 4\n[batch_size, output_dim] # 2\nlinear layer of shape (feature_dim, output_dim)\n'

In [46]:
linear_output

tensor([[[ 0.3955, -1.1012],
         [ 0.3955, -1.1012],
         [ 0.3955, -1.1012]],

        [[ 0.3955, -1.1012],
         [ 0.3955, -1.1012],
         [ 0.3955, -1.1012]]], grad_fn=<ViewBackward0>)

In [47]:
sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.5976, 0.2495],
         [0.5976, 0.2495],
         [0.5976, 0.2495]],

        [[0.5976, 0.2495],
         [0.5976, 0.2495],
         [0.5976, 0.2495]]], grad_fn=<SigmoidBackward0>)

In [48]:
block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)
input = torch.ones(2,3,4)
output = block(input)
output

tensor([[[0.7427, 0.5655],
         [0.7427, 0.5655],
         [0.7427, 0.5655]],

        [[0.7427, 0.5655],
         [0.7427, 0.5655],
         [0.7427, 0.5655]]], grad_fn=<SigmoidBackward0>)

自定义模块
我们可以通过扩展nn.Module类构建属于我们自己的模块，而不是使用预定义好的模块。
举个例子，我们可以用之前介绍过的张量构建一个自己的nn.Linear。
我们也可以构建一个新的，更加复杂的模块，例如一个自定义的神经网络。
你将在后面的作业中练习这些内容。
为了创建一个自定义模块，我们必须要做的第一件事是扩展nn.Module。
我们可以在__init__函数中初始化我们的参数，这通过在一开始调用超类的__init__函数
完成。我们定义的所有属于nn模块对象的类属性被视为在训练中可学习的参数。张量不是参数，
但是当它们封装在nn.Parameter类当中时可以转化为参数。
所有扩展nn.Module的类最好能够实现forward(x)函数，其中x是一个张量。
这是一个在参数传递到我们模块当中时调用的函数，比如在使用model(x)时。

In [52]:
class MultilayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(MultilayerPerceptron, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size, self.input_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        output = self.model(x)
        return output

In [53]:
input = torch.randn(2, 5)
model = MultilayerPerceptron(5, 3)
model(input)

tensor([[0.3805, 0.4069, 0.4430, 0.4944, 0.4828],
        [0.3970, 0.4144, 0.4003, 0.4809, 0.4570]], grad_fn=<SigmoidBackward0>)

In [54]:
list(model.named_parameters())

[('model.0.weight',
  Parameter containing:
  tensor([[-0.3164, -0.3948,  0.0007,  0.4110, -0.1013],
          [ 0.1920, -0.3115, -0.1146, -0.2034, -0.2211],
          [ 0.2331, -0.0947,  0.2937,  0.4448,  0.3240]], requires_grad=True)),
 ('model.0.bias',
  Parameter containing:
  tensor([ 0.1063, -0.1179,  0.3035], requires_grad=True)),
 ('model.2.weight',
  Parameter containing:
  tensor([[ 0.1402,  0.4693, -0.1998],
          [ 0.0039, -0.0693, -0.3902],
          [-0.3732,  0.0601,  0.4037],
          [-0.1194,  0.1520,  0.1026],
          [-0.3807,  0.3686, -0.5746]], requires_grad=True)),
 ('model.2.bias',
  Parameter containing:
  tensor([-0.4371, -0.2789, -0.3306, -0.0484,  0.0755], requires_grad=True))]

In [55]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.3164, -0.3948,  0.0007,  0.4110, -0.1013],
         [ 0.1920, -0.3115, -0.1146, -0.2034, -0.2211],
         [ 0.2331, -0.0947,  0.2937,  0.4448,  0.3240]], requires_grad=True),
 Parameter containing:
 tensor([ 0.1063, -0.1179,  0.3035], requires_grad=True),
 Parameter containing:
 tensor([[ 0.1402,  0.4693, -0.1998],
         [ 0.0039, -0.0693, -0.3902],
         [-0.3732,  0.0601,  0.4037],
         [-0.1194,  0.1520,  0.1026],
         [-0.3807,  0.3686, -0.5746]], requires_grad=True),
 Parameter containing:
 tensor([-0.4371, -0.2789, -0.3306, -0.0484,  0.0755], requires_grad=True)]

In [56]:
import torch.optim as optim

In [58]:
y = torch.ones(10, 5)
x = y + torch.randn_like(y)
x

tensor([[-1.2668,  1.7910,  1.5346, -0.4032,  0.3213],
        [ 0.7083,  1.7907,  2.0176, -0.2192,  0.8725],
        [ 0.6034,  0.8095, -0.0816, -0.5976,  2.6324],
        [ 2.3761,  0.1692,  0.8731,  0.4185,  1.0365],
        [ 0.7275,  1.8190,  1.8786,  0.4107,  1.1052],
        [ 1.6249,  0.1531, -0.2657,  0.7569,  1.3995],
        [-0.1918,  2.6969,  0.8087,  0.8950,  0.1490],
        [ 1.0754,  1.1609,  0.2950,  1.1435,  2.4459],
        [ 0.5117, -0.6529,  0.4466, -0.3124,  0.4715],
        [ 1.0038, -0.4121,  2.8416,  2.2504, -0.0415]])

In [59]:
model = MultilayerPerceptron(5, 3)
adam = optim.Adam(model.parameters(), lr=1e-1)
loss_function = nn.BCELoss()
y_pred = model(x)
loss_function(y_pred, y).item()

0.6932042837142944

In [60]:
n_epoch = 10
for epoch in range(n_epoch):
    adam.zero_grad()
    y_pred = model(x)
    loss = loss_function(y_pred, y)
    print(f"Epoch {epoch}: traing loss: {loss}")
    loss.backward()
    adam.step()

Epoch 0: traing loss: 0.6932042837142944
Epoch 1: traing loss: 0.5639248490333557
Epoch 2: traing loss: 0.448487788438797
Epoch 3: traing loss: 0.34124821424484253
Epoch 4: traing loss: 0.24731726944446564
Epoch 5: traing loss: 0.1686665117740631
Epoch 6: traing loss: 0.10722093284130096
Epoch 7: traing loss: 0.06405559182167053
Epoch 8: traing loss: 0.0371493324637413
Epoch 9: traing loss: 0.021797971799969673


In [61]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.5380, -0.3665, -0.6788, -0.0559, -0.1045],
         [ 0.8991,  1.1448,  1.3504,  0.9494,  1.1555],
         [ 0.3568, -0.0857, -0.2778, -0.0253, -0.3758]], requires_grad=True),
 Parameter containing:
 tensor([-0.1227,  1.4232, -0.3917], requires_grad=True),
 Parameter containing:
 tensor([[-0.1119,  1.3390, -0.4672],
         [ 0.8958,  0.5136, -0.4674],
         [ 0.0234,  1.1834,  0.2619],
         [ 0.8486,  1.3662,  0.2864],
         [ 0.1000,  0.7296, -0.5668]], requires_grad=True),
 Parameter containing:
 tensor([0.6550, 0.4905, 0.4034, 0.9733, 1.2370], requires_grad=True)]

In [62]:
y_pred = model(x)
y_pred

tensor([[0.9986, 0.9398, 0.9963, 0.9991, 0.9884],
        [1.0000, 0.9880, 0.9999, 1.0000, 0.9989],
        [0.9995, 0.9605, 0.9987, 0.9997, 0.9938],
        [0.9999, 0.9790, 0.9997, 0.9999, 0.9975],
        [1.0000, 0.9917, 1.0000, 1.0000, 0.9994],
        [0.9994, 0.9559, 0.9983, 0.9996, 0.9927],
        [0.9999, 0.9782, 0.9997, 0.9999, 0.9974],
        [1.0000, 0.9902, 1.0000, 1.0000, 0.9992],
        [0.9650, 0.8192, 0.9402, 0.9756, 0.9362],
        [1.0000, 0.9889, 0.9999, 1.0000, 0.9990]], grad_fn=<SigmoidBackward0>)

Demo: Word Window Classification

In [1]:
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

In [2]:
# 单词小写化
def preprocess_sentence(sentence):
    return sentence.lower().split()

train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

对于每一个训练样本，我们应该有一个相应的标签。
还记得我们模型的目标是决定哪些词对应一个地点(LOCATION)。
换言之，我们想要我们的模型对所有不是地点的词输出0，而
对所有是地点的词输出1

In [3]:
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])
train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

将单词转换成向量
让我们进一步考察我们的训练数据。我们拥有的每一个数据点是一个单词的序列。
另一方面，我们知道机器学习模型处理的是向量中的数字。
那我们如何将单词转换成数字呢？你可能想到了词嵌入，没错你是对的！
假设我们有一个词嵌入的查找表E，E的每一行对应一个词嵌入。
因此在词汇表中的每一个单词在E中都有一个对应的嵌入行i。
每当我们想要为一个单词找到对应的嵌入时，我们将采用这些步骤：
1.word -> index
2.index -> embedding
让我们看看第一步。我们应该为词汇表中的所有单词分配一个对应的索引。
我们能做以下事情：
1.找到语料中所有去重后的词
2.为每个词分配索引

In [4]:
vocabulary = set([w for s in train_sentences for w in s])
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [5]:
# 词汇表中添加未知词"<unk>"
vocabulary.add("<unk>")

In [6]:
# 词汇表中添加填充词"<pad>"
# 两端填充
vocabulary.add("<pad>")

def pad_window(sentence, window_size, pad_token="<pad>"):
    window = [pad_token] * window_size
    return window + sentence + window

window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

In [7]:
vocabulary

{'<pad>',
 '<unk>',
 'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

In [8]:
# 索引到单词：列表
# 单词到索引：字典
ix_to_word = sorted(list(vocabulary))
word_to_ix = {word:ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

In [9]:
'we' in word_to_ix

True

我们已经准备好将我们的训练句子转换成一个与单词对应的索引的序列

In [10]:
# 给定一个句子，返回对应的索引
def convert_token_to_indices(sentence, word_to_ix):
    indices = []
    for token in sentence:
        if token in word_to_ix:
            indices.append(word_to_ix[token])
        else:
            indices.append(word_to_ix['<unk>'])
    return indices

# convert_token_to_indices的紧凑形式(?)
def _convert_token_to_indices(sentence, word_to_ix):
    return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]
    """
    dictionary.get(key, default)
    用于从字典中获取指定键的值，当key不存在于字典中时，返回default
    """

In [11]:
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']


In [12]:
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

我们可以用PyTorch中的nn.Embedding类创建一个嵌入表。
nn.Embedding(num_words, embedding_dimension)，其中num_words是
词汇表中的单词数，embedding_dimension是我们希望的词嵌入维数。
当训练网络时，梯度会反向一路传播到embedding层，因此我们的词嵌入将得到更新。

In [13]:
# 词向量初始化
import torch.nn as nn
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

list(embeds.parameters())

[Parameter containing:
 tensor([[-4.6027e-01, -8.4919e-01, -1.1172e+00,  4.7096e-01, -1.1386e+00],
         [ 9.7457e-01, -4.7157e-02, -1.9248e-01, -8.8030e-01,  1.7610e+00],
         [-8.8988e-01, -1.0015e+00,  2.2139e+00, -2.8757e-01, -4.0670e-03],
         [ 1.1170e+00, -2.6294e-01,  8.4020e-01,  1.2682e+00, -9.3508e-01],
         [ 1.9339e+00,  8.6110e-01,  5.9314e-01, -4.8939e-01, -6.1035e-01],
         [ 3.4974e-01, -2.1238e+00, -4.9988e-01, -7.8539e-01,  1.1435e-01],
         [ 1.1516e-01,  5.0008e-01, -7.3795e-01, -1.3123e+00,  1.8227e-01],
         [ 1.3435e+00, -3.7310e-01,  4.7257e-01, -8.9900e-01,  1.1649e+00],
         [-4.2022e-01,  5.9605e-01,  8.5568e-01,  5.5407e-01, -1.0130e-01],
         [-5.8360e-01,  8.9255e-01,  7.5948e-02, -1.2467e+00,  1.0256e+00],
         [-2.0083e+00,  1.7923e+00, -2.2006e+00,  5.6028e-01, -6.5850e-01],
         [ 1.8532e+00, -9.1294e-01,  8.7202e-01, -6.1887e-01, -1.8428e+00],
         [-4.5302e-01, -1.2417e-01, -9.4466e-01,  1.9372e+00,  8.

In [14]:
# 返回单个给定词的词向量
import torch
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([-1.6033,  0.0939, -0.5017,  1.4697, -0.7637],
       grad_fn=<EmbeddingBackward0>)

In [15]:
# 返回多个给定词(列表)的词向量
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[-1.6033,  0.0939, -0.5017,  1.4697, -0.7637],
        [ 1.1170, -0.2629,  0.8402,  1.2682, -0.9351]],
       grad_fn=<EmbeddingBackward0>)

我们可以写一个自定义函数并传递给collate_fn，以此打印批量的状态或者进行额外的处理。
在我们的例子中，我们将使用collate_fn：
1.用"<pad>"填充训练的句子
2.将训练样本中的单词转换成索引
3.将所有句子和标签填充到相同长度
当计算损失时，需要知道给定样本中确切的单词数，我们将在传递给collate_fn的函数中追踪
这个单词数。
在collate_fn函数中需要用到word_to_ix使单词转化成索引，这里使用了python中的
partial函数。

In [16]:
# DataLoader + 数据预处理
from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
    x, y = zip(*batch)
    """
    zip用于将多个可迭代对象(列表，元组，集合等)按照位置一一配对，接受一个
    或多个可迭代对象，返回迭代器，该迭代器包含输入可迭代对象中相同位置元素
    的元组
    zip的用法之一是解压元组：
    pairs = [(1, 'a'), (2, 'b'), (3, 'c')]
    numbers, letters = zip(*pairs)
    print(numbers) # -> (1,2,3)
    print(letters) # -> ('a','b','c')
    """
    def pad_window(sentence, window_size, pad_token="<pad>"):
        window = [pad_token] * window_size
        return window + sentence + window

    # 在两端用<pad>填充训练样本
    x = [pad_window(s, window_size) for s in x]

    # 将训练样本转换成索引
    def convert_tokens_to_indices(sentence, word_to_ix):
        return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]
    x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

    # 填充样本使得一个批量中的所有样本长度相同，便于完成矩阵运算
    # 我们将batch_first参数设置为True，由此返回的矩阵的第一维是批量
    pad_token_ix = word_to_ix["<pad>"]
    # pad_sequence函数期望输入是张量，因此先转成LongTensor
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True,padding_value=pad_token_ix)    # 填充后的一致长度我们暂时不知

    # 我们还需要填充标签，在此之前，我们将记录标签的数量以便知道每个样本中的单词数
    lengths = [len(label) for label in y]
    lenghts = torch.LongTensor(lengths)

    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    return x_padded, y_padded, lenghts

In [17]:
# 无注释精简版
def _custom_collate_fn(batch, window_size, word_to_ix):
    x, y = zip(*batch)
    def pad_window(sentence, window_size, pad_token="<pad>"):
        window = [pad_token] * window_size
        return window + sentence + window
    x = [pad_window(s, window_size) for s in x]
    def convert_tokens_to_indices(sentence, word_to_ix):
        return [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]
    x = [convert_tokens_to_indices(s, word_to_ix) for s in x]
    x = [torch.LongTensor(x_i) for x_i in x]
    x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=word_to_ix["<pad>"])

    # 没有对y进行pad_window
    # 无需对y进行convert_tokens_to_indices
    y = [torch.LongTensor(y_i) for y_i in y]
    y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

    lengths = [len(label) for label in y]
    lengths = torch.LongTensor(lengths)

    return x_padded, y_padded, lengths

In [18]:
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)
"""
functools.partial用于创建一个新函数，该函数对现有函数的部分参数
进行了预先设定，使这些参数在创建'partial'对象时就已经被设置
举例：
from functools import partial
add_five = partial(lambda x, y: x + y, 5)
result = add_five(10) # -> 15

join_words = partial(str.join, sep = ' ')
result = join_words(['hello', 'world']) # -> 'hello world'
"""

loader = DataLoader(data, batch_size=batch_size,shuffle=shuffle,collate_fn=collate_fn)
# 遍历一个epoch
counter = 0
for batched_x, batched_y, batched_lengths in loader:
    print(f"Iteration {counter}")
    print("Batched Input:")
    print(batched_x)
    print("Batched Labels:")
    print(batched_y)
    print("Batched Lengths:")
    print(batched_lengths)
    print("")
    counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])
Batched Lengths:
tensor([5, 4])

Iteration 1
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0],
        [ 0,  0,  9,  7,  8, 18,  0,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1],
        [0, 0, 0, 1, 0, 0]])
Batched Lengths:
tensor([6, 4])

Iteration 2
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1]])
Batched Lengths:
tensor([5])



In [19]:
# 由原始序列返回多窗口
print(f"Original Tensor: ")
print(batched_x)
print("")

chunk = batched_x.unfold(1, window_size*2 + 1, 1)
"""
unfold(dimension, size, step)
用于对张量进行滑动窗口操作
dim: 滑动窗口的维度(轴)索引
size: 滑动窗口的大小
step: 滑动窗口的步幅
"""
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0]])

Windows: 
tensor([[[ 0,  0, 22,  2,  6],
         [ 0, 22,  2,  6, 20],
         [22,  2,  6, 20, 15],
         [ 2,  6, 20, 15,  0],
         [ 6, 20, 15,  0,  0]]])


In [20]:
# 创建窗口分类器模型
import torch.nn as nn

class WordWindowClassifier(nn.Module):
    def __init__(self, hyperparameters, vocab_size, pad_ix=0):
        super(WordWindowClassifier, self).__init__()
        self.window_size = hyperparameters["window_size"]
        self.embed_dim = hyperparameters["embed_dim"]
        self.hidden_dim = hyperparameters["hidden_dim"]
        self.freeze_embeddings = hyperparameters["freeze_embeddings"]

        """
        Embedding层
        self.freeze_embeddings控制是否冻结Embedding层参数
        """
        self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
        if self.freeze_embeddings:
            self.embeds.weight.requires_grad = False

        """
        Hidden层
        """
        full_window_size = 2 * window_size + 1
        self.hidden_layer = nn.Sequential(
            nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
            nn.Tanh()
        )

        """
        Output层
        """
        self.output_layer = nn.Linear(self.hidden_dim, 1)
        """
        输出概率
        """
        self.probabilities = nn.Sigmoid()

    def forward(self, x):
        """
        参数说明：
        B: 批量大小
        L: 两端窗口填充后的句子长度
        D: 嵌入维数
        S: 窗口总长
        H: 隐层维数
        输入维数：
        形状为(B, L)的索引张量
        """
        B, L = x.size()
        """
        获取滑动窗口
        返回形状：(批量大小，窗口数量，窗口总长)
        """
        token_windows = x.unfold(1, 2 * self.window_size + 1, 1)
        _, adjusted_length, _ = token_windows.size()

        # 维数正确性检验
        assert token_windows.size() == (B, adjusted_length, 2 * self.window_size + 1)

        """
        Embedding
        输入: (批量大小，窗口数量，窗口总长) / (B, L~, S)
        输出: (批量大小，窗口数量，窗口总长，嵌入维数) / (B, L~, S, D)
        """
        embedded_windows = self.embeds(token_windows)

        """
        Reshapeing
        (B, L~, S, D) -> (B, L~, S * D)
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
        (B, L~, S * D) -> (B, L~, H)
        """
        layer_1 = self.hidden_layer(embedded_windows)

        """
        (B, L~, H) -> (B, L~, 1)
        """
        layer_2 = self.output_layer(layer_1)

        """
        probabilities output
        (B, L~, 1) -> (B, L~)
        1 probability for 1 window
        """
        output = self.probabilities(layer_2)
        output = output.view(B, -1)

        return output

In [21]:
# 准备数据，定义模型、损失函数、优化器
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

loader = DataLoader(data, batch_size=batch_size,shuffle=shuffle,collate_fn=collate_fn)

model_hyperparameters = {
    "batch_size":4,
    "window_size":2,
    "embed_dim":25,
    "hidden_dim":25,
    "freeze_embeddings":False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def loss_function(batch_outputs, batch_labels, batch_lengths):
    bceloss = nn.BCELoss()
    """
    nn.BCELoss
    measures the Binary Cross Entropy between the target and the input probabilities
    计算目标和输入概率的二分类交叉熵
    l(x, y) = mean(-w_n*[y_n*log(x_n)+(1-y_n)*log(1-x_n)]) / sum(-w_n*[y_n*log(x_n)+(1-y_n)*log(1-x_n)])
    y: label
    x: probability
    """
    loss = bceloss(batch_outputs, batch_labels.float())

    loss = loss / batch_lengths.sum().float()

    return loss

In [22]:
def train_epoch(loader, model, loss_function, optimizer):
    total_loss = 0
    # batch_lengths存放每个样本在两端窗口填充，批量长度一致填充前的单词数
    for batch_inputs, batch_labels, batch_lengths in loader:
        optimizer.zero_grad()
        outputs = model.forward(batch_inputs)
        batch_loss = loss_function(outputs, batch_labels, batch_lengths)
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()

    return total_loss

def train(loader, model, loss_function, optimizer, num_epochs=10000):
    for epoch in range(num_epochs):
        epoch_loss = train_epoch(loader, model, loss_function, optimizer)
        if epoch % 100 == 0: print(epoch_loss)

In [23]:
num_epochs = 1000
train(loader, model, loss_function, optimizer, num_epochs=num_epochs)

0.2751699239015579
0.2535048946738243
0.21989352256059647
0.17295077443122864
0.14909899234771729
0.12179402261972427
0.09929980896413326
0.08036387152969837
0.07015462033450603
0.05990997422486544


In [26]:
# 模型预测
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

test_sentences, test_labels

([['she', 'comes', 'from', 'paris']], [[0, 0, 0, 1]])

In [27]:
# list of tuples
test_data = list(zip(test_sentences, test_labels))
test_data

[(['she', 'comes', 'from', 'paris'], [0, 0, 0, 1])]

In [28]:
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data,
                                           batch_size=1,
                                           shuffle=False,
                                           collate_fn=collate_fn)

In [29]:
for test_instance, labels, _ in test_loader:
  outputs = model.forward(test_instance)
  print(labels)
  print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.2002, 0.2195, 0.1137, 0.9366]], grad_fn=<ViewBackward0>)


模块梳理
data = list(zip(sentences, labels))
sentences:[[word1, word2, ...], ...], labels:[[label1, label2, ...], ...]
torch.utils.data.DataLoader(data, batch_size, shuffle, collate_fn)
collate_fn is used for data preprocessing and is implemented using partial
collate_fn = partial(custom_collate_fn, argument1=value1, argument2=value2)
Here, argument1 is window_size, and argument2 is word_to_ix
custom_collate_fn returns window-padded and batch-padded sequences of indices and batch-padded labels and the number of words per example given batches.
senteces -> indices, "<pad>" for padding and "<unk>" represents word not in the vocabulary
In WindowClassifier model, a dictionary is used to manage hyperparameters, such as window_size, embed_dim, hidden_dim and
freeze_embeddings.
The dimension conversion include:
x.unfold: to get sliding windows, (B, L) -> (B, L~(the number of windows), S(full_window_size))
batch-padding ensures that the number of windows is equal for each example.
nn.Embedding: (B, L~, S) -> (B, L~, S, D)
tensor.view: (B, L~, S, D) -> (B, L~, S * D)
nn.Linear: (B, L~, S * D) -> (B, L~, self.hidden_dim)
nn.Linear: (B, L~, self.hidden_dim) -> (B, L~, 1)
nn.Sigmoid: to get probabilities
tensor.view: (B, L~, 1) -> (B, L~)

At last, define optimizer, loss_function and training process
loss_function: the number of windows is equal to the number of words per example, thus equal to the number of labels.

Iteration 0
Batched Input:
tensor([[ 0,  0, 19, 16, 12,  8,  4,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])
Batched Lengths:
tensor([5, 4])

1.Window-padding first and batch-padding second is equivalent to
batch-padding first and window-padding second.
2.Only batch-padding for labels.
So the length of model output is equal to the length of labels for each example in a batch.