In [66]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import pandas as pd


In [76]:
df_train = pd.read_csv('new_train.tsv', header=0, sep='\t')
train_sentences = list(np.array(df_train)[:, 0])
train_labels = list(np.array(df_train)[:, 1])
df_test = pd.read_csv('new_test.tsv', header=0, sep='\t')
test_sentences = list(np.array(df_test)[:, 0])
test_labels = list(np.array(df_test)[:, 1])

In [106]:
"""
1.Basic Embedding Model
    1-1. NNLM(Neural Network Language Model)
"""

dtype = torch.FloatTensor

word_list = " ".join(train_sentences+test_sentences).split()  # 制作词汇表
# print(word_list)
word_list = list(set(word_list))  # 去除词汇表中的重复元素
# print("去重后的word_list:", word_list)
word_dict = {w: i for i, w in enumerate(word_list)}  # 将每个单词对应于相应的索引
number_dict = {i: w for i, w in enumerate(word_list)}  # 将每个索引对应于相应的单词
n_class = len(word_dict)  # 单词的总数

# NNLM parameters
n_step = 2  # 根据前2个单词预测第3个单词 [0:1]--->[2]
n_hidden = 256  # 隐藏层神经元的个数
m = 1000  # 词向量的维度

In [107]:
max_length = 0
for sentence in train_sentences:
    if len(sentence.split()) > max_length:
        max_length = len(sentence.split())
max_length

52

In [110]:
def make_batch(sentences, n_step, max_length = 64):
    input_batch = torch.zeros(len(sentences), max_length)
    target_batch = torch.zeros(len(sentences))
    for i in range(len(sentences)):
        word = sentences[i].split()
        print(word)
        input = [word_dict[w] for w in word[:n_step]]
        print(input)
        target = word_dict[word[n_step+1]]
        input_batch[i, 0:len(sentences[i].split())] = torch.tensor(input)
        # target_batch[i] = target
    return input_batch, target_batch

In [111]:
# Model


class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, embedding_dim=m)
        self.H = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.W = nn.Parameter(torch.randn(n_step * m, n_class).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, x):
        x = self.C(x)
        x = x.view(-1, n_step * m)
        # x: [batch_size, n_step*n_class]
        tanh = torch.tanh(self.d + torch.mm(x, self.H))
        # tanh: [batch_size, n_hidden]
        output = self.b + torch.mm(x, self.W) + torch.mm(tanh, self.U)
        # output: [batch_size, n_class]
        return output

    def embed(self, x):
        return self.C(x)
        # print(self.C.weight)



model = NNLM()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 制作输入
input_batch, target_batch = make_batch(train_sentences, n_step)
input_batch = Variable(input_batch)
target_batch = Variable(target_batch)

input_batch

['Even', 'fans', 'of', 'Ismail', 'Merchant', "'s", 'work', ',', 'I', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']
[4881, 7757]


RuntimeError: The expanded size of the tensor (21) must match the existing size (2) at non-singleton dimension 0.  Target sizes: [21].  Tensor sizes: [2]

In [6]:
# 开始训练
for epoch in range(1000):
    optimizer.zero_grad()
    output = model(input_batch)
    # output : [batch_size, n_class], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print("Epoch:{}".format(epoch + 1), "Loss:{:.3f}".format(loss))
    loss.backward()
    optimizer.step()

# 预测
predict = model(input_batch).data.max(
    1, keepdim=True)[1]  # [batch_size, n_class]
print("predict: \n", predict)
# 测试
print([sentence.split()[:3] for sentence in sentences], "---->",
      [number_dict[n.item()] for n in predict.squeeze()])
model.embed(torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
for word in word_list:
    word_id = word_dict[word]
    print(f"{word} -> {word_id}")
    print(f"vector of '{word}': {model.embed(torch.tensor([word_id]))}")

Epoch:1000 Loss:0.041
predict: 
 tensor([[0],
        [2],
        [8]])
[['i', 'really', 'like'], ['i', 'doubtfully', 'love'], ['i', 'sincerely', 'hate']] ----> ['dog', 'coffee', 'milk']
dog -> 0
vector of 'dog': tensor([[ 0.7593, -0.3896, -0.6971]], grad_fn=<EmbeddingBackward0>)
doubtfully -> 1
vector of 'doubtfully': tensor([[-1.1559,  1.8775, -0.9968]], grad_fn=<EmbeddingBackward0>)
coffee -> 2
vector of 'coffee': tensor([[ 0.6769, -0.5308, -1.5418]], grad_fn=<EmbeddingBackward0>)
like -> 3
vector of 'like': tensor([[-0.6478,  0.2158,  0.7785]], grad_fn=<EmbeddingBackward0>)
sincerely -> 4
vector of 'sincerely': tensor([[-0.8695, -1.4554,  1.0802]], grad_fn=<EmbeddingBackward0>)
hate -> 5
vector of 'hate': tensor([[-0.1706,  0.8658,  1.0102]], grad_fn=<EmbeddingBackward0>)
really -> 6
vector of 'really': tensor([[-0.7004,  1.5601, -0.5770]], grad_fn=<EmbeddingBackward0>)
love -> 7
vector of 'love': tensor([[ 0.9230, -0.9520,  0.8691]], grad_fn=<EmbeddingBackward0>)
milk -> 8
vector

In [9]:
sentences[0].split()

['i', 'really', 'like', 'dog']

In [58]:
def sen2matrix(sentence):
    # to_return = torch.zeros(len(sentence.split()), m, requires_grad=False)
    a = torch.tensor([word_dict[word] for word in sentence.split()])
    to_return = model.C(a)
    return to_return

sen2matrix(sentences[0])

tensor([[ 0.9750,  0.1776, -0.7511],
        [-0.7004,  1.5601, -0.5770],
        [-0.6478,  0.2158,  0.7785],
        [ 0.7593, -0.3896, -0.6971]], grad_fn=<EmbeddingBackward0>)

In [59]:
def corr2d(X, K):  #@save
    """计算二维互相关运算"""
    h, w = K.shape
    Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y

In [60]:
k = torch.tensor([[-1.,1.],[-1.,1.]])
corr2d(sen2matrix(sentences[0]), k)


tensor([[ 1.4631, -3.0658],
        [ 3.1241, -1.5744],
        [-0.2854,  0.2553]], grad_fn=<CopySlices>)

In [61]:
class Conv2D(nn.Module):
    def __init__(self, kernel_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(kernel_size))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        return corr2d(x, self.weight) + self.bias

In [64]:
# 构造一个二维卷积层，它具有1个输出通道和形状为（1，2）的卷积核
conv2d = nn.Conv2d(1,1, kernel_size=(3, 3), bias=False)

# 这个二维卷积层使用四维输入和输出格式（批量大小、通道、高度、宽度），
# 其中批量大小和通道数都为1
X = sen2matrix(sentences[0]).reshape((1, 1, 4, 3))
Y = torch.tensor([[0.],[1.]]).reshape((1, 1, 2, 1))
lr = 3e-2  # 学习率

for i in range(10):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2
    conv2d.zero_grad()
    l.sum().backward(retain_graph=True)
    # 迭代卷积核
    conv2d.weight.data[:] -= lr * conv2d.weight.grad
    if (i + 1) % 2 == 0:
        print(f'epoch {i+1}, loss {l.sum():.3f}')

epoch 2, loss 0.245
epoch 4, loss 0.036
epoch 6, loss 0.005
epoch 8, loss 0.001
epoch 10, loss 0.000
