# 第9章: RNN, CNN

In [1]:
import spacy
from stemming.porter2 import stem
from collections import Counter
import numpy as np
from tqdm import tqdm
import pickle
import random

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils.rnn as rnn
from torch.nn.utils.rnn import pad_sequence

## 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
with open("../data/NewsAggregatorDataset/newsCorpora.csv", "r") as intxt:
    inline = intxt.readlines()
    
select_publisher = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
datas = []

for line in inline:
    if line != "":
        ll = line.split("\t")
        if ll[3] in select_publisher:
            datas.append((ll[4],ll[1]))

In [5]:
def get_feat_from_sentence(string):
    sl = nlp.make_doc(string)
    string = [i.lemma_.lower() for i in sl]
    return string

def data2stemmed(data):
    return [(label, get_feat_from_sentence(string)) for label,string in data]

def get_feature_stems(stemmed):
    counter = Counter([tok for _,toks in stemmed for tok in toks])
    return [stem for stem,num in counter.most_common() if 2<=num]

def make_feature_dic(features):
    dic = {}
    for i, word in enumerate(features):
        dic[word] = i+1
    return dic

def get_id(stemmed, feature_dic):
    if stemmed in feature_dic:
        return feature_dic[stemmed]
    else:
        return 0

def get_ids(stemmedl, feature_dic):
    return [get_id(i, feature_dic) for i in stemmedl]

def stemmed2ids(stemmed, feature_dic):
    return [(label, get_ids(steml, feature_dic)) for label,steml in stemmed]

In [6]:
stemmed = data2stemmed(datas)
features = get_feature_stems(stemmed)
feature_dic = make_feature_dic(features)
ids = stemmed2ids(stemmed, feature_dic)

In [7]:
len(features)

8600

In [8]:
print(get_id("-", feature_dic))
print(get_id("hogehoge", feature_dic))

1
0


In [9]:
stemmed[0][1]

['europe', 'reach', 'crunch', 'point', 'on', 'bank', 'union']

In [10]:
get_ids(stemmed[0][1], feature_dic)

[247, 858, 0, 927, 13, 60, 957]

In [11]:
for i in ids[:3]:
   print(i)

for i in ids[-3:]:
   print(i)

('b', [247, 858, 0, 927, 13, 60, 957])
('b', [54, 604, 1, 2586, 43, 0, 61, 54, 5, 1249, 3, 419, 72, 45, 12, 7, 6])
('b', [43, 0, 0, 9, 6314, 116, 550, 4, 1900, 243, 4220])
('m', [14, 0, 1611, 116, 8600, 4, 669, 1737, 1, 126])
('m', [246, 1301, 49, 0, 4943, 4, 126, 65, 8, 4975, 158])
('m', [337, 707, 1692, 745, 3, 246, 1, 104, 2123, 2124, 20, 2898, 11, 463, 7, 6])


In [12]:
def ids2ids_onehot(ids, features):
    size = len(features) + 1
    identity = np.identity(size)
    return [(label, identity[i]) for label,i in tqdm(ids)]

In [13]:
#ids_onehot = ids2ids_onehot(ids, features)

In [14]:
#for i in ids_onehot[:3]:
#   print(i)

#for i in ids_onehot[-3:]:
#   print(i)

## 81. RNNによる予測

In [15]:
#random.shuffle(ids_onehot)
random.shuffle(ids)

In [16]:
class Model81(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, out_dim):
        super(Model81, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.out = nn.Linear(hidden_dim, out_dim)
        nn.init.normal_(self.embed.weight, 0.0, 1.0)
        #self.loss = nn.CrossEntropyLoss()
    
    def predict(self, x, h=None):
        x = self.embed(x.to(torch.int64))
        x, hp = self.rnn(x, h)
        h = self.out(hp)
        #print(h)
        h = h.squeeze(0)
        return h

    def forward(self, x):
        label = self.predict(x, None)
        #return self.loss(label, ans)
        return label

In [17]:
model = Model81(8601, 300, 50, 4)

In [18]:
print(model.predict(torch.tensor(ids[0][1]).unsqueeze(0)))
print(ids[0][0])

tensor([[-0.0797,  0.3024, -0.2443, -0.1584]], grad_fn=<SqueezeBackward1>)
b


In [19]:
print(model.predict(torch.tensor(ids[1][1]).unsqueeze(0)))
print(ids[1][0])

tensor([[-0.5165, -0.2733,  0.4425,  0.5172]], grad_fn=<SqueezeBackward1>)
b


## 82. 確率的勾配降下法による学習

In [20]:
label_dic = {"b":0, "e":1, "m":2, "t":3}

def div_vecs(vecs):
    x = [np.array(i[1]) for i in vecs]
    label = [label_dic[i[0]] for i in vecs]
    return x,label

trains = ids[:len(ids)*4//5]
print("train data :",len(trains))

tests = ids[len(ids)*8//10:len(ids)*9//10]
print("valid data :",len(tests))

valids = ids[len(ids)*9//10:]
print("test data :",len(valids))

train data : 10684
valid data : 1336
test data : 1336


In [21]:
trainx, trainy = div_vecs(trains)
testx, testy = div_vecs(tests)
validx, validy = div_vecs(valids)

In [22]:
datasets = [trainx, trainy, testx, testy, validx, validy]

In [23]:
def try_gpu(e):
    if torch.cuda.is_available():
        return e.cuda()
    return e

In [24]:
class Model81(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, out_dim):
        super(Model81, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.out = nn.Linear(hidden_dim, out_dim)
        nn.init.normal_(self.embed.weight, 0.0, 1.0)
    
    def predict(self, x, h=None):
        x = self.forward(x, h)
        x = F.softmax(x, dim=1)
        return x

    def forward(self, x, h=None):
        x = self.embed(x.to(torch.int64))
        x, hp = self.rnn(x, h)
        h = self.out(hp)
        #print(h)
        h = h.squeeze(0)
        return h

In [25]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, status, use_gpu=False):
        dic = {"train":0, "test":1, "valid":2}
        self.status = status
        vecs = dataset
        self.data = vecs[dic[status]*2]
        self.label = vecs[dic[status]*2+1]
        self.data_num = len(self.data)
        self.use_gpu = use_gpu
    
    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        out_data = self.data[idx]
        out_label =  self.label[idx]
        return out_data, out_label
    
    def collate(self, batch):
        datas, labels = list(zip(*batch))
        datas = [torch.tensor(data) for data in datas]
        datas = pad_sequence(datas, batch_first=True)
        if self.use_gpu:
            return try_gpu(datas), try_gpu(torch.tensor(labels))
        else:
            return datas, torch.tensor(labels)

def generator(dataset, status, batch_size, shuffle=True, use_gpu=False):
    data_set = Dataset(dataset, status, use_gpu)
    return torch.utils.data.DataLoader(data_set, collate_fn = data_set.collate, batch_size=batch_size, shuffle=shuffle)

In [26]:
ds = generator(datasets, "train", 1)
for x, y in ds:
  print(x,y)
  break

tensor([[  10,  801,  345,   12,    2, 1238,    1,  254,   15, 1086,   11,  444,
         1009,   28,    9,    0,   11, 3499,   80,    7,    6]]) tensor([1])


In [27]:
ds = generator(datasets, "train", 100)
for x, y in ds:
  print(x,y)
  break

tensor([[ 103,   86,   16,  ...,    0,    0,    0],
        [  14,   47,    1,  ...,    0,    0,    0],
        [  10,  392, 2479,  ...,    0,    0,    0],
        ...,
        [3625, 3626,    1,  ...,    0,    0,    0],
        [ 186, 4670, 4775,  ...,    0,    0,    0],
        [5089,  330,    8,  ...,    0,    0,    0]]) tensor([1, 0, 3, 0, 1, 3, 0, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 0, 3, 1, 0,
        3, 3, 0, 1, 1, 1, 0, 1, 2, 0, 2, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 0, 0, 3, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 2, 1, 0, 1, 0, 1, 1, 0,
        1, 1, 1, 3, 0, 0, 3, 0, 1, 1, 3, 0, 0, 1, 0, 1, 3, 0, 1, 3, 1, 1, 1, 1,
        3, 1, 3, 2])


In [28]:
def accuracy(dataset, md, use_gpu=False):
    with torch.no_grad():
        if use_gpu:
            predicts = [md.predict(try_gpu(torch.tensor(dat).unsqueeze(0))) for dat in dataset.data]
        else:
            predicts = [md.predict(torch.tensor(dat).unsqueeze(0)) for dat in dataset.data]
        return np.mean([int(p.argmax()==a) for p,a in zip(predicts, dataset.label)])

In [29]:
## batch>=2のときスコアが明らかに悪い(パディングがヤバそう)　要改善
class TRAINER:
    def __init__(self, model, criterion, vecs, optimizer, gen_batchsize, max_iter, use_gpu=False):
        self.train_generator = generator(datasets, "train", gen_batchsize, True,  use_gpu)
        self.valid_generator = generator(datasets, "valid", 1, True, use_gpu)
        self.train_dataset = Dataset(datasets, "train")
        self.valid_dataset = Dataset(datasets, "valid")
        self.model = model
        if use_gpu:
            self.model = try_gpu(self.model)
        self.criterion = criterion
        self.optimizer = optimizer
        self.max_iter = max_iter
        self.use_gpu = use_gpu

    def train(self):
        self.model.train()
        train_losses = []
        for x,y in self.train_generator:
            out = self.model(x)
            loss = self.criterion(out, y)
            #self.optimizer.zero_grad()
            self.model.zero_grad()
            loss.backward()
            self.optimizer.step()
            train_losses.append(loss.item())
        return accuracy(self.train_dataset, self.model, self.use_gpu), np.mean(train_losses)
        
    def valid(self):
        self.model.eval()
        valid_losses = []
        for x, y in self.valid_generator:
            with torch.no_grad():
                out = self.model(x)
                loss = self.criterion(out, y)
                valid_losses.append(loss.item())
        return accuracy(self.valid_dataset, self.model, self.use_gpu), np.mean(valid_losses)

    def learning(self):
        for ep in range(self.max_iter):
            train_acc, train_loss = self.train()
            valid_acc, valid_loss = self.valid()
            print(train_acc, valid_acc, train_loss, valid_loss)
        return train_acc, valid_acc, train_loss, valid_loss

In [30]:
model = Model81(8601, 300, 50, 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, False)

In [31]:
train_dataset = Dataset(datasets, "train")
accuracy(train_dataset, model)

0.2283788843129914

In [32]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.7329651815799326 0.6968562874251497 0.9790753210313864 0.8765985816995987
0.7616997379258704 0.6923652694610778 0.7898157525766475 0.8894363186980213
0.7730250842381131 0.7080838323353293 0.6881962693207351 0.7887042267920734
0.801572444777237 0.7335329341317365 0.6126789602504298 0.8201215744227162
0.830026207412954 0.7462574850299402 0.5632478011491072 0.769931346654434


(0.830026207412954, 0.7462574850299402, 0.5632478011491072, 0.769931346654434)

## 83. ミニバッチ化・GPU上での学習

In [34]:
print(torch.cuda.is_available())

True


In [34]:
model = Model81(8601, 300, 50, 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 4, 10, True)

In [35]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.5416510670160989 0.5351796407185628 1.1382277091314443 1.1290728987423246
0.5765630849868963 0.5494011976047904 1.1112408380814591 1.089036562262538
0.6157806065144141 0.5808383233532934 1.0950248600480619 1.0514136997614791
0.6278547360539124 0.5988023952095808 1.049371959564169 1.0268958297496784
0.5312616997379259 0.5224550898203593 1.0379044489662372 1.1977723392540822
0.662392362411082 0.6392215568862275 1.0663345205743675 1.0203484702788428
0.5674840883564208 0.5576347305389222 1.0102407737690515 1.0974903584061981
0.5950954698614751 0.5785928143712575 1.1121971204860164 1.0818854991101219
0.6106327218270311 0.594311377245509 1.098163985014408 1.0786356266119523
0.6334706102583302 0.5995508982035929 1.086937260556337 1.0383929484440182


(0.6334706102583302, 0.5995508982035929, 1.086937260556337, 1.0383929484440182)

In [36]:
model = Model81(8601, 300, 50, 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, True)

In [37]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.7122800449269936 0.6811377245508982 1.0110051939769524 0.9227620379296605
0.7519655559715462 0.688622754491018 0.8233362262847495 0.8620346756930837
0.799045301385249 0.7297904191616766 0.7301055453871127 0.7879362361516781
0.8005428678397604 0.7290419161676647 0.662836114913281 0.7820159638534763
0.8200112317484088 0.7425149700598802 0.6067876322846965 0.785502018975819


(0.8200112317484088, 0.7425149700598802, 0.6067876322846965, 0.785502018975819)

## 84. 単語ベクトルの導入

In [32]:
import gensim
googlenews_w2v = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

In [33]:
import copy
def init_emb_w2v(model):
    for i, token in enumerate(features):
        if token in googlenews_w2v:
            vc = copy.deepcopy(googlenews_w2v[token])
            model.embed.weight.data[i] = torch.tensor(vc)

In [34]:
model = Model81(8601, 300, 50, 4)
init_emb_w2v(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, False)

In [41]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.667633845001872 0.625748502994012 1.0264221317438498 1.0009513532988237
0.744945713216024 0.6931137724550899 0.8813914419030676 0.8295636649946209
0.7675964058405091 0.7238023952095808 0.7443110938247597 0.7984993853013751
0.8102770497940847 0.7297904191616766 0.671383581644966 0.7591646424652227
0.7967989517034818 0.7312874251497006 0.6248228048183266 0.7885292599853937


(0.7967989517034818,
 0.7312874251497006,
 0.6248228048183266,
 0.7885292599853937)

## 85. 双方向RNN・多層化

In [35]:
class Model85(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, out_dim):
        super(Model85, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.out = nn.Linear(hidden_dim*2, out_dim)
        nn.init.normal_(self.embed.weight, 0.0, 1.0)
    
    def predict(self, x, h=None):
        x = self.forward(x, h)
        x = F.softmax(x, dim=1)
        return x

    def forward(self, x, h=None):
        x = self.embed(x.to(torch.int64))
        x, hp = self.rnn(x, h)
        h = hp[-2:]
        h = h.transpose(0,1)
        h = h.contiguous().view(-1, h.size(1) * h.size(2))
        h = self.out(h)
        return h

In [36]:
model = Model85(8601, 300, 50, 4)
init_emb_w2v(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, False)

In [44]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.7647884687383003 0.7245508982035929 0.8773009215159225 0.7467689115706062
0.8490265818045676 0.7754491017964071 0.606193734710893 0.6142555233059747
0.8977910894795956 0.8098802395209581 0.44001128170168397 0.5140026786830494
0.9133283414451516 0.8001497005988024 0.3168558874900571 0.5699538128082151
0.9269000374391614 0.8143712574850299 0.23804451305247773 0.6041384062741062


(0.9269000374391614,
 0.8143712574850299,
 0.23804451305247773,
 0.6041384062741062)

In [45]:
model = Model85(8601, 300, 50, 4)
init_emb_w2v(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.02)
trainer = TRAINER(model, criterion, datasets, optimizer, 8, 10, True)

In [46]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.6843878697117185 0.6601796407185628 0.9570134239282436 0.92695612000848
0.7208910520404342 0.6841317365269461 0.7932636824932819 0.8341679519521976
0.7338075627105953 0.6744011976047904 0.6850977506555483 0.8166256576865732
0.8158929239985024 0.7537425149700598 0.5939160730622842 0.6831368532277153
0.8390116061400225 0.7649700598802395 0.5103748515805977 0.6672177050195768
0.8638150505428679 0.7761976047904192 0.44409274949530464 0.6393207655160013
0.8807562710595283 0.7956586826347305 0.3791992886014238 0.6303435368184558
0.9128603519281168 0.8031437125748503 0.33079202953882203 0.5759019294006382
0.9039685511044553 0.7919161676646707 0.2786156015727156 0.6224816416105824
0.9465555971546238 0.812874251497006 0.23896729661162922 0.5855396860641634


(0.9465555971546238,
 0.812874251497006,
 0.23896729661162922,
 0.5855396860641634)

## 86. 畳み込みニューラルネットワーク (CNN)

In [37]:
class Model86(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, out_dim):
        super(Model86, self).__init__()
        self.embed = nn.Embedding(input_dim, embed_dim)
        self.conv = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1)
        self.act = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=3)
        self.out = nn.Linear(hidden_dim, out_dim)
        nn.init.normal_(self.embed.weight, 0.0, 1.0)
    
    def predict(self, x):
        x = self.forward(x)
        x = F.softmax(x, dim=1)
        return x

    def forward(self, x):
        x = self.embed(x.to(torch.int64))
        x = self.conv(x.transpose(-1,-2))
        x = self.act(x)
        x = F.max_pool1d(x, x.size(-1))
        x = x.squeeze(-1)
        x = self.out(x)
        return x

## 87. 確率的勾配降下法によるCNNの学習

In [38]:
model = Model86(8601, 300, 50, 4)

In [39]:
print(model.predict(torch.tensor(ids[0][1]).unsqueeze(0)))

tensor([[0.2813, 0.2301, 0.1819, 0.3068]], grad_fn=<SoftmaxBackward>)


In [40]:
model = Model86(8601, 300, 50, 4)
init_emb_w2v(model)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, False)

In [51]:
print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

train_accuracy / valid_accuracy / train_loss / valid_loss
0.7155559715462374 0.6534431137724551 1.520299936121617 2.336850895940203
0.8147697491576189 0.7410179640718563 6.438891968266377 6.224710974875008
0.8407899663047548 0.7589820359281437 9.905072320440432 11.071071534945808
0.8916136278547361 0.7776946107784432 7.799505887268891 10.71391025770076
0.907244477723699 0.7904191616766467 5.352192184770551 14.411713592215547


(0.907244477723699, 0.7904191616766467, 5.352192184770551, 14.411713592215547)

## 88. パラメータチューニング

In [41]:
def obj(param1):
    print("#################################")
    print("lr :", param1)
    model = Model86(8601, 300, 50, 4)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=param1)
    trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, False)
    trainer.learning()

In [42]:
for param1 in [5e-4, 5e-3, 5e-2]:
    obj(param1)

#################################
lr : 0.0005
0.7641332834144515 0.7260479041916168 0.9085624298658825 0.7925801760953444
0.8458442530887308 0.7761976047904192 0.6179975375333033 0.6329990813025367
0.9090228378884313 0.8031437125748503 0.43359856729503315 0.5518283525968459
0.9584425308873081 0.812874251497006 0.30569945089689055 0.5245845201402435
0.9790340696368401 0.8233532934131736 0.21584127549701265 0.5083019131219224
#################################
lr : 0.005
0.8746724073380756 0.7769461077844312 0.7315544986436615 0.6259733908467116
0.8446274803444402 0.7223053892215568 0.44217909202098127 1.0277974685956648
0.9259640584050918 0.7934131736526946 0.30350672302174136 0.9479423128073134
0.9031261699737926 0.75 0.265390673933642 1.560742645719535
0.9519842755522276 0.8083832335329342 0.24084692438854516 1.2885715534745141
#################################
lr : 0.05
0.4170722575814302 0.4408682634730539 nan nan
0.4170722575814302 0.4408682634730539 nan nan
0.4170722575814302 0.440

## 89. 事前学習済み言語モデルからの転移学習

In [55]:
!pip install transformers

Collecting transformers
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 3.6 MB/s eta 0:00:01
Collecting filelock
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 926 kB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 20.1 MB/s eta 0:00:01
[?25hCollecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 56.6 MB/s eta 0:00:01
Collecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 1.3 MB/s  eta 0:00:01
Using legacy setup.py install for sacremoses, since package 'wheel' is not installed.
Installing collected pack

In [57]:
import transformers
import time

In [58]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [61]:
label_dic = {"b":0, "e":1, "m":2, "t":3}

datas_bert = [(label_dic[data[0]], torch.tensor(tokenizer.encode(data[1]), dtype=torch.long)) for data in datas]

In [63]:
def div_vecs(vecs):
    x = [i[1] for i in vecs]
    label = [i[0] for i in vecs]
    return x,label

trains = datas_bert[:len(datas_bert)*4//5]
print("train data :",len(trains))

tests = datas_bert[len(datas_bert)*8//10:len(datas_bert)*9//10]
print("valid data :",len(tests))

valids = datas_bert[len(datas_bert)*9//10:]
print("test data :",len(valids))

trainx, trainy = div_vecs(trains)
testx, testy = div_vecs(tests)
validx, validy = div_vecs(valids)

datasets = [trainx, trainy, testx, testy, validx, validy]

train data : 10684
valid data : 1336
test data : 1336


In [77]:
trainx[:5]

[tensor([  101,  1980,  5965,   172, 22715,  1553,  1113,  9339,  3779,   102]),
 tensor([  101, 16028,  2064,   143,  9244, 13329,   118, 11661,  1200, 27772,
         22592,  1116,  1149, 16028,  2064,   112,   188,  3802,  1106,  1712,
          5600,  1822,  1111,   119,   119,   119,   102]),
 tensor([  101, 11854,  1760,  8745, 26883,  1279, 17878,  1162,  1112,   139,
         20408,  3299,   157, 11811,  6385,  3377,   117,  2722,  3177, 21238,
         16890, 14367,  1116,   102]),
 tensor([  101,  1302,  7904,  8652,  1116, 11661, 11854,   140, 15998,  1116,
         12118,  7200,  6922,  1174,  6051,  2544,  3313,   113,   122,   114,
           102]),
 tensor([  101,   155, 14663, 17656,  2036,   118,  6304,  4891,  9887,  1116,
          2501,  2672,  1107, 16028,  2064,  3085,  2774,  8679,   118,  3509,
           102])]

In [73]:
class BertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        config = transformers.BertConfig.from_pretrained('bert-base-cased', num_labels=4)
        self.bert = transformers.BertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
    
    def forward(self, batch):
        x = self.bert(batch)
        return x[0]
    
    def predict(self, x):
        x = self.forward(x)
        x = F.softmax(x, dim=1)
        return x

In [78]:
model = BertClassifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)
trainer = TRAINER(model, criterion, datasets, optimizer, 1, 5, True)

print("train_accuracy / valid_accuracy / train_loss / valid_loss")
trainer.learning()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

train_accuracy / valid_accuracy / train_loss / valid_loss


  after removing the cwd from sys.path.


0.8143953575439911 0.812874251497006 0.5683475751492558 0.502018268950685
0.8893672781729689 0.8525449101796407 0.41556599898000907 0.4220233768849316
0.9262448521153126 0.8997005988023952 0.2831514143053399 0.3331132270172685
0.9427180831149382 0.9019461077844312 0.22055188061279646 0.29990833734502337
0.95376263571696 0.8952095808383234 0.18142005102168482 0.3204194675841017


(0.95376263571696, 0.8952095808383234, 0.18142005102168482, 0.3204194675841017)