In [49]:
import pandas as pd
import datetime
import json
import numpy as np
import string
import math
import re
from IPython.display import HTML

import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from itertools import chain
import sys
sys.path.append('..')

from model import weights_init
from utils.func import preprocessing_text, \
                                       tokenizer_punctuation, \
                                       tokenizer_with_preprocessing

# Dataの作成

In [50]:
max_length = 256
batch_size = 64

# 読み込んだ内容に対して行う処理を定義
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, 
                            use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, 
                            init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.float)

In [51]:
train_ds = torchtext.data.TabularDataset.splits(
    path='../data/news', train='text_train.tsv',
    format='tsv',
    fields=[('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)])
train_ds = train_ds[0]
# print(vars(train_ds[1]))

val_ds = torchtext.data.TabularDataset.splits(
    path='../data/news', train='text_val.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
val_ds = val_ds[0]

test_ds = torchtext.data.TabularDataset.splits(
    path='../data/news', train='text_test.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
test_ds = test_ds[0]

japanese_fasttext_vectors = Vectors(name='../data/news/cc.ja.300.vec')
TEXT.build_vocab(train_ds, 
                                 vectors=japanese_fasttext_vectors,
                                 min_freq=10)
TEXT.vocab.freqs

train_dl = torchtext.data.Iterator(
    train_ds, batch_size=batch_size, train=True)
val_dl = torchtext.data.Iterator(
    val_ds, batch_size=batch_size, train=False, sort=False)
test_dl = torchtext.data.Iterator(
    test_ds, batch_size=len(vars(test_ds)['examples']), train=False, sort=False)

In [52]:
# 動作確認
batch = next(iter(train_dl))
print(batch.Text1[0])
print(batch.Label)

tensor([[   2,   83, 1234,  ...,    1,    1,    1],
        [   2,  120,    4,  ...,    1,    1,    1],
        [   2,    4,  326,  ...,    1,    1,    1],
        ...,
        [   2, 2209, 1906,  ...,    1,    1,    1],
        [   2,   73,   37,  ...,    1,    1,    1],
        [   2,  775,  456,  ...,    1,    1,    1]])
tensor([12.7182, -0.7692,  2.5641, -2.0115, -2.3256,  7.6557,  1.8398,  2.3576,
         0.9238,  6.0573,  0.9320, -1.1215,  4.2220, -0.0500, -1.3091,  1.6836,
        -0.7884, -0.2865,  0.7254,  2.8846,  3.1707,  0.0000,  2.4390,  1.0309,
        -1.9149,  0.8581,  1.8579, -0.9628,  3.0747, -1.4286, -0.8850, -0.9743,
        -1.5686, -4.2553, -0.2719,  3.8732, -0.0567, -1.6550,  0.2066,  2.5185,
        -1.4205,  0.3779,  0.4413, -0.3490, -0.8331,  0.7059,  0.8224, -0.1575,
         1.4052,  0.9434, -0.4289, -0.6289, -1.2242, -5.6122, -1.1444, -1.2658,
         2.3438, -2.2436,  0.1333,  4.1026, -2.7537,  6.2657, -2.3723,  4.8485])


# モデル構築

In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [72]:
class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, device='cpu', v_vec=None, batch_first=True):
        super(EncoderRNN, self).__init__()
        self.device = device
        self.h_dim = h_dim
        self.embed = nn.Embedding(v_size, emb_dim)
        if v_vec is not None:
            self.embed.weight.data.copy_(v_vec)
        self.lstm = nn.LSTM(emb_dim, h_dim, dropout=0.1, batch_first=batch_first,
                            bidirectional=True)

    def init_hidden(self, b_size):
        h0 = torch.zeros(1*2, b_size, self.h_dim).to(self.device)
        c0 = torch.zeros(1*2, b_size, self.h_dim).to(self.device)
        return (h0, c0)

    def forward(self, sentence, lengths=None):
        self.hidden = self.init_hidden(sentence.size(0))
        emb = self.embed(sentence)
        packed_emb = emb

        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths)
        
        print(packed_emb.shape, ' ', self.hidden[0].shape, ' ', self.hidden[1].shape)
        out, hidden = self.lstm(packed_emb, self.hidden)

        if lengths is not None:
            out = nn.utils.rnn.pad_packed_sequence(output)[0]

        out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]

        return out

In [73]:
class Attn(nn.Module):
    def __init__(self, h_dim):
        super(Attn, self).__init__()
        self.h_dim = h_dim
        self.fc = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Dropout(0.1),
            nn.Linear(24,1)
        )

    def forward(self, encoder_outputs):
        b_size = encoder_outputs.size(0)
        attn_ene = self.fc(encoder_outputs.reshape(-1, self.h_dim))
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2)

In [74]:
class AttnClassifier(nn.Module):
    def __init__(self, h_dim, c_num):
        super(AttnClassifier, self).__init__()
        self.attn = Attn(h_dim)
        self.main = nn.Linear(h_dim, c_num)


    def forward(self, encoder_outputs):
        attns = self.attn(encoder_outputs) #(b, s, 1)
        feats = (encoder_outputs * attns).sum(dim=1) # (b, s, h) -> (b, h)
        return F.log_softmax(self.main(feats)), attns

In [75]:
torch.manual_seed(0)
emb_dim = 300
h_dim = 32
learning_rate = 1e-3

# make model
encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), 
                     device=device, v_vec = TEXT.vocab.vectors).to(device)
classifier = AttnClassifier(h_dim, 2).to(device)

In [76]:
enc_out = encoder(x)
out, attn = classifier(enc_out)

torch.Size([64, 256, 300])   torch.Size([2, 64, 32])   torch.Size([2, 64, 32])


  # This is added back by InteractiveShellApp.init_path()


In [60]:

for m in encoder.modules():
    print(m.__class__.__name__)
    weights_init(m)

for m in classifier.modules():
    print(m.__class__.__name__)
    weights_init(m)

optimizer = optim.Adam(
    chain(encoder.parameters(),classifier.parameters()), lr=learning_rate)

EncoderRNN
Embedding
LSTM
AttnClassifier
Attn
Sequential
Linear
ReLU
Dropout
Linear
Linear


In [38]:
def binary_accuracy(pred, y):
    #round predictions to the closest integer
    correct = (pred == y).float() #convert into float for division 
#     print(correct)
    acc = correct.sum()
    return acc

# 損失関数
criterion = nn.CrossEntropyLoss()

# 辞書オブジェクトにまとめる
dataloaders_dict = {'train': train_dl, 'val': val_dl}

In [40]:
# train model 
num_epochs = 30
for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        if phase == 'train':
            encoder.train()
            classifier.train()
        else:
            encoder.eval()
            classifier.eval()

        epoch_loss = 0
        epoch_corrects = 0

        for idx, batch in enumerate(dataloaders_dict[phase]):
            x = batch.Text1[0].to(device)
            y = batch.Label.to(device)

            optimizer.zero_grad()
            encoder_outputs = encoder(x)
            output, attn = classifier(encoder_outputs)
            loss = criterion(output, y)

            if phase == 'train':
                loss.backward()
                optimizer.step()

            pred = output.data.max(1, keepdim=True)[1]

            epoch_loss += loss.item() * x.size(0)
            epoch_corrects += binary_accuracy(pred.view(-1), y)


        # epochごとのlossと正解率
        epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)

        print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
            epoch+1,
            num_epochs,
            phase,
            epoch_loss,
            epoch_acc))


  # This is added back by InteractiveShellApp.init_path()


RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target' in call to _thnn_nll_loss_forward

In [None]:
# HTMLを作成する関数を実装
def highlight(word, attn):
    "Attentionの値が大きいと文字の背景が濃い赤になるhtmlを出力させる関数"

    html_color = '#%02X%02X%02X' % (
        255, int(255*(1 - attn)), int(255*(1 - attn)))
    return '<span style="background-color: {}"> {}</span>'.format(html_color, word)


def mk_html(sentence, attens, label, pred):
    # ラベルと予測結果を文字に置き換え
    if label == 0:
        label_str = "Negative"
    else:
        label_str = "Positive"

    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

#     # 表示用のHTMLを作成する
    html = '正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)
#     html = ""
    # 1段目のAttention
    html += '[TransformerBlockの1段目のAttentionを可視化]<br>'
    for word, attn in zip(sentence, attens):
        html += highlight(TEXT.vocab.itos[word], attn)
    html += "<br><br>"

    return html

In [None]:
# for batch in test_dl:
idx = 19
batch = next(iter(train_dl))
x = batch.Text[0].to(device)
y = batch.Label.to(device)
encoder_outputs = encoder(x)
output, attns = classifier(encoder_outputs)
pred = output.data.max(1, keepdim=True)[1]
attn = attns.data[idx,:,0]
html_output = mk_html(x[idx].cpu().detach().numpy(), 
                                           attn, 
                                           y[idx].item(),
                                           pred[idx].item())
HTML(html_output)

In [None]:
attn.shape