# 単語分割

In [13]:
import MeCab
m_t = MeCab.Tagger('-Ochasen')

In [14]:
text = '機械学習が好きです'

In [15]:
print(m_t.parse(text).strip())

機械	キカイ	機械	名詞-一般		
学習	ガクシュウ	学習	名詞-サ変接続		
が	ガ	が	助詞-格助詞-一般		
好き	スキ	好き	名詞-形容動詞語幹		
です	デス	です	助動詞	特殊・デス	基本形
EOS


In [16]:
def tokenizer_mecab(text):
    text = m_t.parse(text) # mecab
    ret = text.strip().split()  # text処理
    return ret

# 前処理

In [17]:
import re

def preprocessing_text(text):
    # 前処理
    text = re.sub('\r', '', text) # 
    text = re.sub('\n', '', text) # 改行
    text = re.sub(' ', '', text) # 半角
    text = re.sub('　　', '', text) # 全角
    text = re.sub(r'[0-9 ０-９]', '0', text)

    return text

# tokenizerの作成

In [18]:
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text) # textの前処理
    ret = tokenizer_mecab(text) # mecabの単語分割
    return ret

# torchtext

In [37]:
import torchtext

max_length = 25
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing,
                                                 use_vocab=True, lower=True, include_lengths=True, 
                                                 batch_first=True, fix_length=max_length)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# Datasetの作成

In [26]:
train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='./data/', train='text_train.tsv',
    validation='text_val.tsv', test='text_test.tsv', format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])

In [32]:
type(vars(train_ds[0])['Label'])

str

# ボキャブラリーの作成

In [33]:
TEXT.build_vocab(train_ds, min_freq=1)
TEXT.vocab.freqs

Counter({'王と王子と女王と姫と男性と女性がいました。': 1,
         '機械学習が好きです。': 1,
         '本章から自然言語処理に取り組みます。': 1,
         '本章では商品レビューの短い文章に対して、その文章がネガティブな評価をしている文章なのか、ポジティブな評価をしている文章なのか、2値のクラス分類する分類モデルを構築します。': 1})

In [30]:
print(TEXT.vocab.vectors)
print(TEXT.vocab.stoi)

None
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fe22deeaa90>>, {'<unk>': 0, '<pad>': 1, '本章から自然言語処理に取り組みます。': 2, '本章では商品レビューの短い文章に対して、その文章がネガティブな評価をしている文章なのか、ポジティブな評価をしている文章なのか、2値のクラス分類する分類モデルを構築します。': 3, '機械学習が好きです。': 4, '王と王子と女王と姫と男性と女性がいました。': 5})


# DataLoaderの作成

In [28]:
train_dl= torchtext.data.Iterator(train_ds, batch_size=2, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=2, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=2, train=False, sort=False)

In [29]:
# 動作確認
batch = next(iter(train_dl))
print(batch.Text)
print(batch.Label)

(tensor([[5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1],
        [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]]), tensor([1, 1]))
tensor([0, 1])


# IMDbデータセットの実装

In [24]:
from glob import glob
import os
import io

In [25]:
# 訓練データの作成
with open('./data/IMDb_train.tsv', 'w') as f:

    path = './data/aclImdb/train/pos/'
    for fname in glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding='utf-8') as ff:
            text = ff.readline()
            text = text.replace('\t', ' ')
            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)


    path = './data/aclImdb/train/neg/'
    for fname in glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding='utf-8') as ff:
            text = ff.readline()
            text = text.replace('\t', ' ')
            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)

In [26]:
# テストデータの作成
with open('./data/IMDb_test.tsv', 'w') as f:

    path = './data/aclImdb/test/pos/'
    for fname in glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding='utf-8') as ff:
            text = ff.readline()
            text = text.replace('\t', ' ')
            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)

    path = './data/aclImdb/test/neg/'
    for fname in glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding='utf-8') as ff:
            text = ff.readline()
            text = text.replace('\t', ' ')
            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)

#  前処理

In [2]:
import string
import re

# 前処理
def preprocessing_text(text):
    text = re.sub('<br />', '', text)
    
    # カンマ・ピリオド以外の記号をスペースに変換
    for p in string.punctuation:
        if (p =='.') or (p == ','):
            continue
        else:
            text = text.replace(p, ' ')
        
    # ピリオドの前後にはスペースを入れる
    text = text.replace('.', ' . ')
    text = text.replace(',', ' , ')
    text = re.sub(r'[0-9 ０-９]', '0', text)
    
    return text

# 分かち書き
def tokenizer_punctuation(text):
    return text.strip().split()

# 前処理と分かち書きをまとめる
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_punctuation(text)
    return ret

In [3]:
print(tokenizer_with_preprocessing('I lick cats.'))

['I0lick0cats0.0']


# Datasetの作成

In [4]:
import torchtext

max_length = 256
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

In [5]:
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='./data/', train='IMDb_train.tsv',
    test='IMDb_test.tsv', format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])

In [6]:
print(vars(train_val_ds[0]))

{'Text': ['very0different0topic0treated0in0this0film0.00a0straightforward0and0simple0description0of0local0chinese0customs0,00by0looking0at0the0daily0operation0of0a0public0bath0,00run0by0the0old0owner0and0his0retarded0son0,00when0older0son0returns0home0,00wrongly0believing0his0father0has0died0.00how0every0man0in0town0makes0his0daily0visit0to0chat0,00play0games0,00discuss0personal0matters0and0get0honest0advice0,00besides0the0usual0spa0like0therapies0.00when0old0man0dies0,00strong0and0loyal0family0ties0make0older0son0take0charge0,00so0public0bath0operation0is0not0disrupted0.00and0finally0,00the0arrival0of0modernization0to0end0this0way0of0spending0relaxed0hours0and0getting0along0.00the0public0bath0has0to0be0demolished0,00making0place0for0a0commercial0complex0to0be0constructed0.0'], 'Label': '1'}


# 訓練と検証を分ける

In [7]:
import random

train_ds, val_ds = train_val_ds.split(split_ratio=0.8, random_state=random.seed(1234))

# ボキャブラリーの作成

In [8]:
from torchtext.vocab import Vectors

In [9]:
english_fasttext_vocabs = Vectors(name='./data/wiki-news-300d-1M.vec')

In [10]:
print('単語の次元数', english_fasttext_vocabs.dim)
print('単語数', len(english_fasttext_vocabs))

単語の次元数 300
単語数 999994


In [11]:
TEXT.build_vocab(train_ds, vectors=english_fasttext_vocabs, min_freq=10)

In [12]:
print(TEXT.vocab.vectors)
print(TEXT.vocab.stoi)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f1e0932d5d0>>, {'<unk>': 0, '<pad>': 1, '<cls>': 2, '<eos>': 3})


# DataLoaderの作成

In [13]:
train_dl = torchtext.data.Iterator(train_ds, batch_size=24, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=24, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=24, train=False, sort=False)

In [14]:
batch = next(iter(val_dl))
print(batch.Text)
print(batch.Label)

(tensor([[2, 0, 3,  ..., 1, 1, 1],
        [2, 0, 3,  ..., 1, 1, 1],
        [2, 0, 3,  ..., 1, 1, 1],
        ...,
        [2, 0, 3,  ..., 1, 1, 1],
        [2, 0, 3,  ..., 1, 1, 1],
        [2, 0, 3,  ..., 1, 1, 1]]), tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]))
tensor([1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0])


# Transformerの作成

In [15]:
# Embedder (単語数　→ 単語数*分散表現数)
# Positional Encoder 単語数*分散表現数の位置情報を加える(単語数*分散表現数 → 単語数*分散表現数)
# Transformer Blockモジュール

# Embedder

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Embedder(nn.Module):
    def __init__(self, text_embedding_vectors):
        super(Embedder, self).__init__()
        
        self.embeddings = nn.Embedding.from_pretrained(
        embeddings=text_embedding_vectors, freeze=True)
        
    def forward(self, x):
        y = self.embeddings(x)
        return y

In [17]:
import glob
import os
import io
import string
import re
import random
import math
import spacy
import torchtext
from torchtext.vocab import Vectors


def get_IMDb_DataLoaders_and_TEXT(max_length=256, batch_size=24):
    """IMDbのDataLoaderとTEXTオブジェクトを取得する。 """

    # 訓練データのtsvファイルを作成します
    f = open('./data/IMDb_train.tsv', 'w')

    path = './data/aclImdb/train/pos/'
    for fname in glob.glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()

            # タブがあれば消しておきます
            text = text.replace('\t', " ")

            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)

    path = './data/aclImdb/train/neg/'
    for fname in glob.glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()

            # タブがあれば消しておきます
            text = text.replace('\t', " ")

            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)

    f.close()

   # テストデータの作成
    f = open('./data/IMDb_test.tsv', 'w')

    path = './data/aclImdb/test/pos/'
    for fname in glob.glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()

            # タブがあれば消しておきます
            text = text.replace('\t', " ")

            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)

    path = './data/aclImdb/test/neg/'
    for fname in glob.glob(os.path.join(path, '*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()

            # タブがあれば消しておきます
            text = text.replace('\t', " ")

            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)
    f.close()

    def preprocessing_text(text):
        # 改行コードを消去
        text = re.sub('<br />', '', text)

        # カンマ、ピリオド以外の記号をスペースに置換
        for p in string.punctuation:
            if (p == ".") or (p == ","):
                continue
            else:
                text = text.replace(p, " ")

        # ピリオドなどの前後にはスペースを入れておく
        text = text.replace(".", " . ")
        text = text.replace(",", " , ")
        return text

    # 分かち書き（今回はデータが英語で、簡易的にスペースで区切る）
    def tokenizer_punctuation(text):
        return text.strip().split()


    # 前処理と分かち書きをまとめた関数を定義
    def tokenizer_with_preprocessing(text):
        text = preprocessing_text(text)
        ret = tokenizer_punctuation(text)
        return ret


    # データを読み込んだときに、読み込んだ内容に対して行う処理を定義します
    # max_length
    TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True,
                                lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    # フォルダ「data」から各tsvファイルを読み込みます
    train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
        path='./data/', train='IMDb_train.tsv',
        test='IMDb_test.tsv', format='tsv',
        fields=[('Text', TEXT), ('Label', LABEL)])

    # torchtext.data.Datasetのsplit関数で訓練データとvalidationデータを分ける
    train_ds, val_ds = train_val_ds.split(
        split_ratio=0.8, random_state=random.seed(1234))

    # torchtextで単語ベクトルとして英語学習済みモデルを読み込みます
    english_fasttext_vectors = Vectors(name='data/wiki-news-300d-1M.vec')

    # ベクトル化したバージョンのボキャブラリーを作成します
    TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors, min_freq=10)

    # DataLoaderを作成します（torchtextの文脈では単純にiteraterと呼ばれています）
    train_dl = torchtext.data.Iterator(
        train_ds, batch_size=batch_size, train=True)

    val_dl = torchtext.data.Iterator(
        val_ds, batch_size=batch_size, train=False, sort=False)

    test_dl = torchtext.data.Iterator(
        test_ds, batch_size=batch_size, train=False, sort=False)

    return train_dl, val_dl, test_dl, TEXT

In [18]:
train_dl, val_dl, test_dl, TEXT = get_IMDb_DataLoaders_and_TEXT(
    max_length=256, batch_size=24)

In [19]:
# 動作確認

## ミニバッチ
batch = next(iter(train_dl))

## モデル構築
net1 = Embedder(TEXT.vocab.vectors)

## 入出力
x = batch.Text[0]
x1 = net1(x)

In [20]:
print(x.shape)
print(x1.shape)

torch.Size([24, 256])
torch.Size([24, 256, 300])


# Positional Encoder

In [21]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super(PositionalEncoder, self).__init__()
        
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # 単語ベクトルの次元数
        self.d_model = d_model
        
        # 単語の順番posとベクトルの次元位置iの(p, i)によって一意に定まる表を作成する
        pe = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(
                                        pos / (10000 ** ((2*i)/d_model)))
                pe[pos, i+1] = math.cos(
                                        pos / (10000 ** ((2*(i+1))/d_model)))
        
        self.pe = pe.to(device).unsqueeze(0)
        
        # 勾配を計算しないようにする
        self.pe.requires_grad = False
        
    def forward(self, x):
        # 入力xとPositional Encoderを足し算する
        ret = math.sqrt(self.d_model)*x + self.pe
        return ret

In [22]:
# # 動作確認

# ## モデル構築
# net1 = Embedder(TEXT.vocab.vectors)
# net2 = PositionalEncoder(d_model=300, max_seq_len=256)

# ## 入出力
# x = batch.Text[0]
# x1 = net1(x)
# x2 = net2(x1)

In [25]:
# print('入力テンソルサイズ', x1.shape)
# print('出力テンソルサイズ', x2.shape)

# TransformerBlockモジュール

In [26]:
# LayerNormalization: 特徴量の正規化を行う
# Dropout: 過学習防止
# Attention
# FeedForward: 特徴量変換
# からなる
# * 実際のTransformerのAttentionではMulti-Headed Attentionを採用している
# <pad>の部分にはmask=0をつけるが，Attentionでは-1e9とすることでsoftmaxの出力を0にする

In [None]:
# Attentionの作成
class Attention(nn.Module):
    def __init__(self, d_model=300):
        super().__init__()
        
        # 特徴量の作成
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        
        # 出力の全結合層
        self.out = nn.Linear(d_model, d_model)
        
        # Attentionの大きさ調整の変数
        self.d_k = d_model
        
    def forward(self, q, k, v, mask):
        q = self.q_linear(q)
        k = self.k_linear(k)
        v = self.v_linear(v)
        
        # Attentionの値を計算する
        weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.d_k)
        
        # maskを計算
        mask = mask.unsqueeze(1)
        weights = weights.masked_fill(mask==0, -1e9)
        
        # softmaxで規格化する
        normalized_weights = F.softmax(weights, dim=-1)
        
        # AttentionをValueと掛け算
        output = torch.matmul(normalized_weights, v)
        
        # 特徴量を変換
        output = self.out(output)
        
        return output, normalized_weights

In [None]:
# FeedForwardの作成
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()
        
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        return x

In [None]:
# Transformer Blockの作成
class TransformerBlock(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        
        # LayerNorm層
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        
        # Attention層
        self.attn = Attention(d_model)
        
        # 全結合層
        self.ff = FeedForward(d_model)
        
        # Dropout
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        # 正規化とAttention
        x_normalized = self.norm_1(x)
        output, normalized_weights = self.attn(
            x_normalized, x_normalized, x_normalized, mask)
        
        x2 = x + self.dropout_1(output)
        
        # 正規化と全結合層構築
        x_normalized2 = self.norm_2(x2)
        output = x2 + self.dropout_2(self.ff(x_normalized2))
        
        return output, normalized_weights

In [None]:
# 動作確認

## モデル構築
net1 = Embedder(TEXT.vocab.vectors)
net2 = PositionalEncoder(d_model=300, max_seq_len=256)
net3 = TransformerBlock(d_model=300)

## maskの作成
x = batch.Text[0]
input_pad = 1   # padding ID
input_mask = (x != input_pad)
# print(input_mask[0])

## 入出力
x1 = net1(x)
x2 = net2(x1)
x3, normalized_weights = net3(x2, input_mask)

# classificationHeadモジュール

In [None]:
class ClassificationHead(nn.Module):
    def __init__(self, d_model=300, output_dim=2):
        super().__init__()
        
        # 全結合層
        self.linear = nn.Linear(d_model, output_dim)
        
        # 重み初期化
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)
        
    def forward(self, x):
        x0 = x[:, 0, :]   # 各文の先頭の単語の特徴量を取り出す
        out = self.linear(x0)
        
        return out

# Transformerの実装

In [None]:
class TransformerClassification(nn.Module):
    def __init__(self, text_embedding_vectors, d_model=300, max_seq_len=256,
                           output_dim=2):
        super().__init__()
        
        # モデルの構築
        self.net1 = Embedder(text_embedding_vectors)
        self.net2 = PositionalEncoder(d_model, max_seq_len)
        self.net3_1 = TransformerBlock(d_model)
        self.net3_2 = TransformerBlock(d_model)
        self.net4 = ClassificationHead(d_model, output_dim)
        
    def forward(self, x, mask):
        x1 = self.net1(x)
        x2 = self.net2(x1)
        x3_1, normalized_weights_1 = self.net3_1(x2, mask)
        x3_2, normalized_weights_2 = self.net3_2(x3_1, mask)
        x4 = self.net4(x3_2)
        return x4, normalized_weights_1, normalized_weights_2

In [24]:
# 動作確認

## モデル構築
net = TransformerClassification(TEXT.vocab.vectors, d_model=300, max_seq_len=256,
                               output_dim=2)

## 入出力
x = batch.Text[0]
input_pad = 1
input_mask = (x != input_pad)
out, normalized_weights_1, normalized_weights_2 = net(x, input_mask)

NameError: name 'TransformerClassification' is not defined

# Transformerの学習・推論

In [71]:
# 読み込み
train_dl, val_dl, test_dl, TEXT = get_IMDb_DataLoaders_and_TEXT(
    max_length=256, batch_size=24)

In [72]:
# 辞書オブジェクトにまとめる
dataloaders_dict = {'train': train_dl, 'val': val_dl}

In [73]:
# モデルの構築
net = TransformerClassification(
    text_embedding_vectors=TEXT.vocab.vectors, d_model=300, max_seq_len=256, output_dim=2)

In [74]:
# パラメータの初期化を定義
def weights_init(m):
    classname =  m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)


# 訓練モード
net.train()

# パラメータ初期化
net.net3_1.apply(weights_init)
net.net3_2.apply(weights_init)

TransformerBlock(
  (norm_1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (norm_2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (attn): Attention(
    (q_linear): Linear(in_features=300, out_features=300, bias=True)
    (v_linear): Linear(in_features=300, out_features=300, bias=True)
    (k_linear): Linear(in_features=300, out_features=300, bias=True)
    (out): Linear(in_features=300, out_features=300, bias=True)
  )
  (ff): FeedForward(
    (linear_1): Linear(in_features=300, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear_2): Linear(in_features=1024, out_features=300, bias=True)
  )
  (dropout_1): Dropout(p=0.1, inplace=False)
  (dropout_2): Dropout(p=0.1, inplace=False)
)

In [86]:
# 損失関数
criterion = nn.CrossEntropyLoss()

# 最適化手法
learning_rate = 2e-5
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [91]:
# 訓練と検証

def train_model(net, datalloaders_dict, criterion, optimizer, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('----start----')
    net.to(device)
    
    torch.backends.cudnn.benchmark = True
    
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
            else:
                net.eval()
            
            epoch_loss = 0.0
            epoch_corrects = 0
            
            for batch in (dataloaders_dict[phase]):
                inputs = batch.Text[0].to(device)
                labels = batch.Label.to(device)
                
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    
                    # maskの作成
                    input_pad = 1
                    input_mask = (inputs != input_pad)
                    
                    # Transformerに入力
                    outputs, _, _ = net(inputs, input_mask)
                    loss = criterion(outputs, labels)
                    
                    _, preds = torch.max(outputs, 1)
                    
                    # 更新
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    
                    # 結果の計算
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
            
            # epochごとのlossと正解率
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)
            
            print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
                                                                         epoch+1,
                                                                         num_epochs,
                                                                         phase,
                                                                         epoch_loss,
                                                                         epoch_acc))
    return net

In [92]:
num_epochs = 10
net_trained = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

----start----
Epoch 1/10 | train | Loss: 0.3472 Acc: 0.8495
Epoch 1/10 |  val  | Loss: 0.3467 Acc: 0.8486
Epoch 2/10 | train | Loss: 0.3416 Acc: 0.8508
Epoch 2/10 |  val  | Loss: 0.3535 Acc: 0.8472
Epoch 3/10 | train | Loss: 0.3307 Acc: 0.8589
Epoch 3/10 |  val  | Loss: 0.3429 Acc: 0.8496
Epoch 4/10 | train | Loss: 0.3268 Acc: 0.8605
Epoch 4/10 |  val  | Loss: 0.3712 Acc: 0.8394
Epoch 5/10 | train | Loss: 0.3184 Acc: 0.8655
Epoch 5/10 |  val  | Loss: 0.4830 Acc: 0.7958
Epoch 6/10 | train | Loss: 0.3168 Acc: 0.8647
Epoch 6/10 |  val  | Loss: 0.3417 Acc: 0.8544
Epoch 7/10 | train | Loss: 0.3077 Acc: 0.8674
Epoch 7/10 |  val  | Loss: 0.3524 Acc: 0.8514
Epoch 8/10 | train | Loss: 0.3042 Acc: 0.8703
Epoch 8/10 |  val  | Loss: 0.3517 Acc: 0.8562
Epoch 9/10 | train | Loss: 0.2962 Acc: 0.8749
Epoch 9/10 |  val  | Loss: 0.3643 Acc: 0.8462
Epoch 10/10 | train | Loss: 0.2911 Acc: 0.8767
Epoch 10/10 |  val  | Loss: 0.3666 Acc: 0.8492


# テストデータでの判定根拠の可視化

In [96]:
# テストデータでの正答率を求める
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net_trained.eval()
net_trained.to(device)

epoch_corrects = 0

for batch in test_dl:
    inputs = batch.Text[0].to(device)
    labels = batch.Label.to(device)
    
    with torch.set_grad_enabled(False):
        input_pad = 1
        input_mask = (inputs != input_pad)
        
        outputs, _, _ = net_trained(inputs, input_mask)
        
        _, preds = torch.max(outputs, 1)
        
        epoch_corrects += torch.sum(preds == labels.data)

In [98]:
epoch_acc = epoch_corrects.double() / len(test_dl.dataset)
print('テストデータ{}個の正解率: {:.4f}'.format(len(test_dl.dataset), epoch_acc))

テストデータ25000個の正解率: 0.8509


In [106]:
# HTMLを作成する関数を実装


def highlight(word, attn):
    "Attentionの値が大きいと文字の背景が濃い赤になるhtmlを出力させる関数"

    html_color = '#%02X%02X%02X' % (
        255, int(255*(1 - attn)), int(255*(1 - attn)))
    return '<span style="background-color: {}"> {}</span>'.format(html_color, word)


def mk_html(index, batch, preds, normlized_weights_1, normlized_weights_2, TEXT):
    "HTMLデータを作成する"

    # indexの結果を抽出
    sentence = batch.Text[0][index]  # 文章
    label = batch.Label[index]  # ラベル
    pred = preds[index]  # 予測

    # indexのAttentionを抽出と規格化
    attens1 = normlized_weights_1[index, 0, :]  # 0番目の<cls>のAttention
    attens1 /= attens1.max()

    attens2 = normlized_weights_2[index, 0, :]  # 0番目の<cls>のAttention
    attens2 /= attens2.max()

    # ラベルと予測結果を文字に置き換え
    if label == 0:
        label_str = "Negative"
    else:
        label_str = "Positive"

    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # 表示用のHTMLを作成する
    html = '正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)

    # 1段目のAttention
    html += '[TransformerBlockの1段目のAttentionを可視化]<br>'
    for word, attn in zip(sentence, attens1):
        html += highlight(TEXT.vocab.itos[word], attn)
    html += "<br><br>"

    # 2段目のAttention
    html += '[TransformerBlockの2段目のAttentionを可視化]<br>'
    for word, attn in zip(sentence, attens2):
        html += highlight(TEXT.vocab.itos[word], attn)

    html += "<br><br>"

    return html

In [123]:
from IPython.display import HTML

batch = next(iter(test_dl))

inputs = batch.Text[0].to(device)
labels = batch.Label.to(device)

input_pad = 1
input_mask = (inputs != input_pad)

outputs, normilized_weights_1, normilized_weights_2 = net_trained(inputs, input_mask)
_, preds = torch.max(outputs, 1)

index = 10
html_output = mk_html(index, batch, preds, normilized_weights_1, normilized_weights_2, TEXT)
HTML(html_output)  # HTML形式で出力

In [124]:
HTML(html_output)  # HTML形式で出力