In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import re
from collections import defaultdict
import glob

np.random.seed(1234)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
sns.set_style('darkgrid')

%matplotlib inline

  (fname, cnt))
  (fname, cnt))


In [2]:
import torch
import torch.nn as nn
from torch import optim
from torch.nn import functional as F

In [3]:
torch.manual_seed(1234)

<torch._C.Generator at 0x7fc8c8d05470>

In [4]:
device = torch.device( "cuda" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda')

In [13]:
device = torch.device("cpu")

In [14]:
print(torch.__version__)

0.4.0


In [15]:
class Vocab:
    def __init__(self):
        self.word2index = defaultdict(int)
        self.word2count = defaultdict(int)
        self.index2word = defaultdict(str)
        self.n_words = 0
    def add_sentence(self, sentence):
        for word in sentence.split(" "):
            self.add_word(word)
    
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 0
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [16]:
%%time
vocab = Vocab()

for path in glob.glob('../preprocessed/*.csv'):
    series = pd.read_csv(path, header=None, dtype={0: str}, encoding='utf-8').dropna(axis=0)[0]
    for sentence in series:
        vocab.add_sentence(sentence)

# defaultdictは未知のkeyに対応するvalueを要求すると、defaultのvalueを作成してしまう
# 後々のバグを防ぐため、通常のdictに変えてロックする
vocab.word2index = dict(vocab.word2index)
vocab.index2word = dict(vocab.index2word)
vocab.word2count = dict(vocab.word2count)

CPU times: user 672 ms, sys: 16 ms, total: 688 ms
Wall time: 687 ms


In [17]:
def make_padded_array(reviews, vocab=vocab):  
    review_list = list()
    len_list = list()
    for r in reviews:
        r = str(r)
        review_indexes = [vocab.word2index[w] for w in r.split()]
        review_list.append(review_indexes)
        len_list.append(len(review_indexes))
    
    len_array = np.sort(len_list)[::-1].copy() # torch.Tensorは配列逆にしているとエラーを起こすので、コピーする
    idxes = np.argsort(len_list)[::-1].copy()
    text_array = np.zeros((len(review_list), max(len_list)), dtype=int)
    for i, idx in enumerate(idxes):
        text_array[i, :len(review_list[idx])] = review_list[idx]
    return text_array, len_array, idxes + reviews.index[0] # idxesは0スタートなので、入力reviewsのindexと一致するように調整する

In [18]:
class BatchIterator(object):
    def __init__(self, df, batch_len):
        self.df = df
        self.batch_len = batch_len
        self.n_batch = df.shape[0] // batch_len + 1
        self.n_data = df.shape[0]
    
    def __iter__(self):
        df = self.df.sample(frac=1).reset_index(drop=True) # DFをシャッフルする
        for b_idx in range(0, self.df.shape[0], self.batch_len):
            text_batch = df.loc[b_idx:b_idx+self.batch_len-1, "text"]
            target_batch = df.loc[b_idx:b_idx+self.batch_len-1, "label"]
            
            text_array, len_array, idxes = make_padded_array(text_batch)
            target_array = target_batch[idxes].values
            
            text_tensor = torch.LongTensor(text_array).to(device)
            lengths_tensor = torch.LongTensor(len_array).to(device)
            target_tensor = torch.LongTensor(target_array).to(device)
            
            yield text_tensor, lengths_tensor, target_tensor

# データの作成

多値分類のデータセットを作る  
クラスを0~5に降る

In [19]:
vg_train = pd.read_csv('../preprocessed/vg_train.csv', header=None, encoding='utf-8')
hk_train = pd.read_csv('../preprocessed/hk_train.csv', header=None, encoding='utf-8')
so_train = pd.read_csv('../preprocessed/so_train.csv', header=None, encoding='utf-8')
csj_train = pd.read_csv('../preprocessed/csj_train.csv', header=None, encoding='utf-8')
hpc_train = pd.read_csv('../preprocessed/hpc_train.csv', header=None, encoding='utf-8')
aa_train = pd.read_csv('../preprocessed/aa_train.csv', header=None, encoding='utf-8')

In [20]:
train_data = pd.concat([vg_train, hk_train, so_train, csj_train, hpc_train, aa_train], axis=0).reset_index(drop=True)

In [21]:
i = 1000
train_data['label'] = pd.Series([0]*i+[1]*i+[2]*i+[3]*i+[4]*i+[5]*i)

In [22]:
train_data = train_data.sample(frac=1).reset_index(drop=True)

In [23]:
train_data.columns = ['text', 'label']

In [24]:
vg_test = pd.read_csv('../preprocessed/vg_test.csv', header=None, encoding='utf-8')
hk_test = pd.read_csv('../preprocessed/hk_test.csv', header=None, encoding='utf-8')
so_test = pd.read_csv('../preprocessed/so_test.csv', header=None, encoding='utf-8')
csj_test = pd.read_csv('../preprocessed/csj_test.csv', header=None, encoding='utf-8')
hpc_test = pd.read_csv('../preprocessed/hpc_test.csv', header=None, encoding='utf-8')
aa_test = pd.read_csv('../preprocessed/aa_test.csv', header=None, encoding='utf-8')

test_data = pd.concat([vg_test, hk_test, so_test, csj_test, hpc_test, aa_test], axis=0).reset_index(drop=True)

i = 1000
test_data['label'] = pd.Series([0]*i+[1]*i+[2]*i+[3]*i+[4]*i+[5]*i)

test_data = test_data.sample(frac=1).reset_index(drop=True)

test_data.columns = ['text', 'label']

In [25]:
%%time
text_iterator = BatchIterator(train_data, 10)
cnt = 0
for te, l, ta in text_iterator:
    cnt += 1

CPU times: user 880 ms, sys: 0 ns, total: 880 ms
Wall time: 879 ms


# モデルの作成

基本的にここに準拠  
https://qiita.com/itok_msi/items/ad95425b6773985ef959

Embedding(146467→100)  
LSTM(100→32)  
Attention(24)  
MLP(32→1)  

Target(2), neg→[1,0], pos→[0,1]  

損失関数に binary_cross_entropy_with_logits を噛ませるので、モデルの出力を[0,1]に制限しなくても良い(sigmoidとlossを別にするより学習が安定する)  
予測時には出力層にsigmoid関数を噛ませる

## Attention有りのbiLSTM

In [28]:
class AttnClassifer(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, n_class=2, bidirectional=True,
                 batch_first=True):
        super(LSTMClassifer, self).__init__()
        self.h_dim = h_dim
        self.bi = 2 if bidirectional else 1
        self.emb = nn.Embedding(v_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first, 
                            bidirectional = bidirectional)
        
        self.attn = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Linear(24, 1)
        )
        
        self.affine = nn.Linear(self.h_dim, n_class)
        
    def init_hidden(self, b_size):
        h0 = torch.zeros(self.bi, b_size, self.h_dim, device=device)
        return (h0, h0) # LSTMはhiddenとcell2つの隠れ層が必要
    
    def forward(self, sentences, lengths):
        batch_len = sentences.shape[0]
        hidden, cell = self.init_hidden(batch_len)
        embed = self.emb(sentences)
        packed_input = nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)
        output, hidden = self.lstm(packed_input, (hidden, cell))
        output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0] # (b, s, h)
        output = output[:, :, :self.h_dim] + output[:, :, self.h_dim:] # 正方向の隠れ層と逆方向の隠れ層を加算
        
        # Attention
        attn = self.attn(output.view(-1, self.h_dim)) # (b,s,h)→(b*s,h)→(b*s,1)
        attn = F.softmax(attn.view(batch_len, -1), dim=1).unsqueeze(2) # (b*s,1)→(b,s)→(b,s,1)
        
        output = (output * attn).sum(dim=1) # (b, s, h)→(b, h)
        output = self.affine(output) # (b,h)→(b,c)
        output = F.log_softmax(output, dim=1) # (b, c), 各データが各クラスに属した場合の対数尤度を計算
        return output, attn

In [27]:
class LSTMClassifer2(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, n_class=2, bidirectional=True,
                 batch_first=True):
        super(LSTMClassifer2, self).__init__()
        self.h_dim = h_dim
        self.bi = 2 if bidirectional else 1
        self.emb = nn.Embedding(v_size, emb_dim)
        self.flstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first, 
                            bidirectional = False)
        self.blstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first, 
                            bidirectional = False)
        self.affine = nn.Linear(self.h_dim * self.bi, n_class)
        
    def init_hidden(self, b_size):
        h0 = torch.zeros(1, b_size, self.h_dim, device=device)
        return (h0, h0) # LSTMはhiddenとcell2つの隠れ層が必要
    
    def forward(self, sentences, l):
        hidden, cell = self.init_hidden(sentences.shape[0])
        embed = self.emb(sentences)
        f_l = []
        b_l = []
        
        fout, fhidden = self.flstm(embed[:,0,:].unsqueeze(1), (hidden, cell))
        bout, bhidden = self.blstm(embed[:,-1,:].unsqueeze(1), (hidden, cell))
        f_l.append(fout)
        b_l.append(b_out)
        for i in range(sentences.shape[1]-1):
            fout, fhidden = self.flstm(embed[:,i+1,:].unsqueeze(1), fhidden)
            bout, bhidden = self.blstm(embed[:,-i-2,:].unsqueeze(1), bhidden)
            f_l.append(fout)
            g_l.append(gout)
        
        output = torch.cat((fout, bout), dim=2).squeeze(0)
        
        output = self.affine(output)
        output = F.log_softmax(output, dim=1) # (b, n_class), 各データが各クラスに属した場合の対数尤度を計算
        # logをとっても、最終的に選択されるカテゴリは対数尤度が最も大きい次元になる
        # 損失関数は、F.nll_loss(negative loss likelihood loss)
        # 入力するものは、各ラベルに所属する対数尤度(log_softmaxで計算)と正解ラベル
        # 損失関数の中では、正解ラベルに所属する対数尤度の符号を反転させて足し合わせたものが加算されていく
        # 式の形を見ればわかるが、cross entropyは正解ラベルに属するlog(sigmoid(x))のみを足し合わせている
        # 従って、ネットワークのアウトプットをlog_softmax(今回は2カテゴリなのでlog_sigmoidとなる)とすれば良い
        # 但し、log_softmaxはexp(-x)を計算しているので、xが極端に小さいとオーバーフローを起こすので注意
        # とはいえlogとsoftmaxを別々で計算するよりは計算が安定する、、ここら辺は詳細不明
        return output

# 学習関数の設定

In [19]:
def train_model(epoch, train_iter, optimizer, log_interval=100):
    model.train()
    correct = 0
    all_ = 0
    epoch_loss = 0
    
    for idx, (x, x_l, y) in enumerate(train_iter):
        optimizer.zero_grad()
        output, attn = model(x, x_l)
        loss = F.nll_loss(output, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
        pred = output.data.max(dim=1)[1]
        correct += pred.eq(y).sum().item() # 予測と実測の正答数を加算
        all_ += len(y)
     
        if idx % log_interval == 0:
            # バッチ毎の更新で十分にaccuracyが上がっていくので、そこの進捗を表示する
            print('train epoch: {} [{}/{}], acc:{:.4f}, loss:{:.4f}'.format(
                epoch, idx+1, train_iter.n_batch, correct/all_, loss))
            correct = 0
            all_ = 0
    return epoch_loss / idx+1

In [20]:
def test_model(epoch, test_iter, log_interval=5):
    model.eval()
    with torch.no_grad():
        correct = 0
        epoch_loss = 0
        for idx, (x, x_l, y) in enumerate(test_iter):
            output, attn = model(x, x_l)
            loss = F.nll_loss(output, y)
            epoch_loss += loss.item()
            
            pred = output.data.max(dim=1)[1]
            correct += pred.eq(y).sum().item()
            
    if epoch % log_interval == 0:
        print('test epoch: {}, acc:{:.4f}, loss:{:.4f}'.format(
        epoch, correct/test_iter.n_data, epoch_loss))
    return epoch_loss / idx+1

In [21]:
def predict_model(review):
    review_idxes = [vocab.word2index[w] for w in str(review).split()]
    review_tensor = torch.LongTensor(review_idxes).to(device).unsqueeze(0)
    length_tensor = torch.LongTensor([len(review_idxes)]).to(device)
    model.eval()
    with torch.no_grad():
        out, attn = model(review_tensor, length_tensor)
    
    return out.max(dim=1)[1].item(), attn

# 学習の実行

In [28]:
model = LSTMClassifer(100, 32, vocab.n_words, n_class=6).to(device)

In [29]:
train_iter = BatchIterator(train_data, batch_len=1)
test_iter = BatchIterator(test_data, batch_len=1)

In [30]:
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)
patience = 5
n_epoch = 10

In [31]:
print(train_data.shape, test_data.shape)

(6000, 2) (6000, 2)


In [32]:
%%time
train_lc = []
test_lc = []
cnt = 0
for epoch in range(n_epoch):
    train_loss = train_model(epoch, train_iter, optimizer, log_interval=1000) # 学習が高速に進む+iter時間かかる→batch毎に進捗プリント
    train_lc.append(train_loss)
    
    test_loss = test_model(epoch, test_iter, log_interval=1)
    test_lc.append(test_loss)
    
    if epoch > 0:
        if test_loss > min(test_lc[:-1]):
            cnt += 1
        else:
            cnt = 0
    
    if cnt >= patience:
            print('early stopping: epoch {}'.format(epoch))
            break
    
print("Done !")

train epoch: 0 [1/6001], acc:0.0000, loss:1.7187
train epoch: 0 [1001/6001], acc:0.3340, loss:1.3989
train epoch: 0 [2001/6001], acc:0.4890, loss:0.6746
train epoch: 0 [3001/6001], acc:0.5640, loss:1.3322
train epoch: 0 [4001/6001], acc:0.5970, loss:1.3639
train epoch: 0 [5001/6001], acc:0.6370, loss:0.0262
test epoch: 0, acc:0.6742, loss:5191.3131
train epoch: 1 [1/6001], acc:1.0000, loss:0.4417
train epoch: 1 [1001/6001], acc:0.7960, loss:0.0403
train epoch: 1 [2001/6001], acc:0.7770, loss:1.6920
train epoch: 1 [3001/6001], acc:0.7930, loss:2.6870
train epoch: 1 [4001/6001], acc:0.7950, loss:0.0237
train epoch: 1 [5001/6001], acc:0.7870, loss:0.0615
test epoch: 1, acc:0.7143, loss:4667.1000
train epoch: 2 [1/6001], acc:1.0000, loss:0.0230
train epoch: 2 [1001/6001], acc:0.9220, loss:0.0107
train epoch: 2 [2001/6001], acc:0.9120, loss:2.0058
train epoch: 2 [3001/6001], acc:0.9050, loss:0.0073
train epoch: 2 [4001/6001], acc:0.8990, loss:0.0001
train epoch: 2 [5001/6001], acc:0.9050, l

In [60]:
test_loss

2.843774317522648

In [62]:
test_lc

[2.7891019178164127, 2.843774317522648]

データ数60万(10万×6カテゴリ)だと学習が遅いし別にここまでデータ増やす必要もないかも  
データ数6万(1万×6カテゴリ)に変えようかな

In [26]:
torch.save(model.state_dict(), "../output/attn_params_600k.picke")

In [28]:
te.cpu().numpy()

array([[  18,  136,   53, ...,    3,   23,   68],
       [ 336,  386,  193, ...,    0,    0,    0],
       [ 537, 1318, 1278, ...,    0,    0,    0],
       ...,
       [1332,   20,  208, ...,    0,    0,    0],
       [  18,  413,  118, ...,    0,    0,    0],
       [  23, 2384,   28, ...,    0,    0,    0]])

In [30]:
vocab.index2word[0]

'solid'

index:0の単語と、0埋めした0が被っていそう  
全体の精度には影響しないけど、気持ち悪いので最終的には取り除く