In [1]:
'''
  code by Tae Hwan Jung(Jeff Jung) @graykode, modify by wmathor
  Reference : https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM_Attn.py
'''
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.utils.data as Data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# Bi-LSTM(Attention) Parameters
batch_size = 2
embedding_dim = 2
n_hidden = 5  # number of hidden units in one cell
num_classes = 2  # 0 or 1

# 3 words sentences (=sequence_length is 3)
sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.

vocab = list(set(" ".join(sentences).split()))
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_size = len(word2idx)


def make_data(sentences):
    inputs = []
    for sen in sentences:
        inputs.append(np.asarray([word2idx[n] for n in sen.split()]))

    targets = []
    for out in labels:
        targets.append(out)  # To using Torch Softmax Loss function

    return torch.LongTensor(inputs), torch.LongTensor(targets)


inputs, targets = make_data(sentences)
print(inputs)
print(targets)
# dataset = TensorDataset(inps, tgts)
dataset = Data.TensorDataset(inputs, targets)
loader = Data.DataLoader(dataset, batch_size, True)

tensor([[ 2, 13,  3],
        [ 8,  0,  1],
        [ 6,  5, 11],
        [ 2,  9,  3],
        [ 4,  7, 10],
        [15, 12, 14]])
tensor([1, 1, 1, 0, 0, 0])


In [23]:
for i, j in loader:
    print(i)
    print(j)
    print('--------')

tensor([[ 2,  9,  3],
        [ 4,  7, 10]])
tensor([0, 0])
--------
tensor([[ 8,  0,  1],
        [ 2, 13,  3]])
tensor([1, 1])
--------
tensor([[15, 12, 14],
        [ 6,  5, 11]])
tensor([0, 1])
--------


In [53]:
class BiLSTM_Attention(nn.Module):
    def __init__(self):
        super(BiLSTM_Attention, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=n_hidden, num_layers=1, bidirectional=True,
                            batch_first=True)
        self.fc = nn.Linear(in_features=n_hidden * 2, out_features=num_classes)

    # 应该的[n_step(句子长度),batch_size,  n_hidden * num_directions(=2)]

    # 此处的lstm_output : [batch_size, n_step(句子长度), n_hidden * num_directions(=2)], F matrix----使用batch_first之后output的batch也会在最前
    # self.attention_net(output, final_hidden_state)
    def attention_net(self, lstm_output, final_state):
        batch_size = len(lstm_output)  # batch_size:
        # batch_size = lstm_output.shape[1]  # batch_size:
        print('batch_size:', batch_size, lstm_output.shape)
        print('hn final_state:', final_state, final_state.shape)# torch.Size([2, 2, 5])

        hidden = final_state.view(batch_size, -1, 1)

        # lstm_output:torch.Size([2, 3, 10])
        # hidden : [batch_size, n_hidden * num_directions(=2), n_layer(=1)]
        #[2, 5*2, 1]
        # [2,3,10]bmm[2,10,1]-->[2,3,1]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # attn_weights : [batch_size, n_step(句子长度)] [2,3]
        print('attn_weights',attn_weights.shape)# [2,3]
        # b = F.softmax(input,dim=0) # 按列SoftMax,列和为1
        # c = F.softmax(input,dim=1)  # 按行SoftMax,行和为1
        soft_attn_weights = F.softmax(attn_weights, dim=1)
        print('soft_attn_weights',soft_attn_weights.shape)
        #[2,3]
        # context : [batch_size, n_hidden * num_directions(=2)]
        # lstm_output:torch.Size([2, 3, 10])-->[2,10,3]*[2,3,1]
        ##[2,10,1]-sq->[2,10]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        print("context",context.shape)# [2,10]
        return context, soft_attn_weights

    def forward(self, X):
        '''
        X: [batch_size, seq_len]
        '''
        input = self.embedding(X)  # input : [batch_size, seq_len, embedding_dim]
        print("input.shape", input.shape)  # torch.Size([2, 3, 2])
        # input = input.transpose(0, 1)
        # input : [seq_len, batch_size, embedding_dim]

        # final_hidden_state, final_cell_state : [num_layers(=1) * num_directions(=2), batch_size, n_hidden]
        output, (final_hidden_state, final_cell_state) = self.lstm(input)
        print('lstm中h0的shape', final_hidden_state.shape)
        print('lstm中output的shape', output.shape)
        # output = output.transpose(0, 1)
        # output : [batch_size, seq_len, n_hidden]
        attn_output, attention = self.attention_net(output, final_hidden_state)
        return self.fc(attn_output), attention  # model : [batch_size, num_classes], attention : [batch_size, n_step]


model = BiLSTM_Attention().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [65]:
# Training
for epoch in range(1000):
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        pred, attention = model(x)
        loss = criterion(pred, y)
        if (epoch + 1) % 1 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

input.shape torch.Size([2, 3, 2])
lstm中h0的shape torch.Size([2, 2, 5])
lstm中output的shape torch.Size([2, 3, 10])
batch_size: 2 torch.Size([2, 3, 10])
hn final_state: tensor([[[ 0.0430,  0.0105, -0.0860,  0.0919,  0.1130],
         [ 0.0343,  0.0550, -0.0297,  0.2401,  0.0551]],

        [[-0.0545,  0.1114, -0.2408,  0.2723, -0.2515],
         [ 0.0186,  0.0677, -0.2474,  0.1571, -0.1745]]],
       grad_fn=<StackBackward0>) torch.Size([2, 2, 5])
attn_weights torch.Size([2, 3])
soft_attn_weights torch.Size([2, 3])
context torch.Size([2, 10])
Epoch: 0001 cost = 0.621468
input.shape torch.Size([2, 3, 2])
lstm中h0的shape torch.Size([2, 2, 5])
lstm中output的shape torch.Size([2, 3, 10])
batch_size: 2 torch.Size([2, 3, 10])
hn final_state: tensor([[[ 0.1865,  0.0725, -0.0467,  0.3273,  0.0246],
         [ 0.1054, -0.0171, -0.2518,  0.0575,  0.0955]],

        [[ 0.0551,  0.0337, -0.2221,  0.1019, -0.1564],
         [-0.1278,  0.0978, -0.0724,  0.3830, -0.3027]]],
       grad_fn=<StackBackward0>) tor

In [67]:
# Test

# sentences = ["i love you", "he loves me", "she likes baseball", "i hate you", "sorry for that", "this is awful"]
# labels = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is not good.
test_text = 'i hate you'
tests = [np.asarray([word2idx[n] for n in test_text.split()])]
test_batch = torch.LongTensor(tests).to(device)
print(test_batch.shape)
# X: [batch_size, seq_len]
# Predict
predict, _ = model(test_batch)
print(predict)
print(predict.data)
print('----------')
predict = torch.max(predict.data,dim=1, keepdim=True)[1]
print(predict.shape)
if predict[0][0] == 0:
    print(test_text, ": is Bad Mean...")
else:
    print(test_text, ": is Good Mean!!")

# fig = plt.figure(figsize=(6, 3)) # [batch_size, n_step]
# ax = fig.add_subplot(1, 1, 1)
# ax.matshow(attention.cpu().data, cmap='viridis')
# ax.set_xticklabels(['']+['first_word', 'second_word', 'third_word'], fontdict={'fontsize': 14}, rotation=90)
# ax.set_yticklabels(['']+['batch_1', 'batch_2', 'batch_3', 'batch_4', 'batch_5', 'batch_6'], fontdict={'fontsize': 14})
# plt.show()

torch.Size([1, 3])
input.shape torch.Size([1, 3, 2])
lstm中h0的shape torch.Size([2, 1, 5])
lstm中output的shape torch.Size([1, 3, 10])
batch_size: 1 torch.Size([1, 3, 10])
hn final_state: tensor([[[ 0.9605, -0.7003, -0.7533,  0.9440, -0.7098]],

        [[-0.4577, -0.5374,  0.5778, -0.1836, -0.7265]]],
       grad_fn=<StackBackward0>) torch.Size([2, 1, 5])
attn_weights torch.Size([1, 3])
soft_attn_weights torch.Size([1, 3])
context torch.Size([1, 10])
tensor([[ 3.5253, -2.8781]], grad_fn=<AddmmBackward0>)
tensor([[ 3.5253, -2.8781]])
----------
torch.Size([1, 1])
i hate you : is Bad Mean...
