In [1]:
#Test
import sys
sys.path.insert(0, '../script/')

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import glob
import time
import numpy as np

import preprocess
X,Y = preprocess.load_data()

In [2]:
print("Finished cleaning")
def charToNumber(c):
    n = ord(c)
    if(n>=3585 and n<=3673):
        return n-3585
    else:
        return 89 #For Other

def prepare_sequence(sentence):
    idxs = [charToNumber(c) for c in sentence]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)
prepare_sequence("ทดสอบ")

Finished cleaning


Variable containing:
 22
 19
 41
 44
 25
[torch.LongTensor of size 5]

In [3]:
#Initiate parameters
emb_size = 32
nhidden = 300
nlayers = 5
dropout = 0.5
rnn_type = 'LSTM'
n_char = 90
clip = 0.25
nout = 1

In [4]:
class BiLSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, n_layers,char_size, tagset_size):
        super(BiLSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.word_embeddings = nn.Embedding(char_size, embedding_dim)
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim//2,num_layers = n_layers,bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(self.n_layers*2, 1, self.hidden_dim//2)),
                autograd.Variable(torch.zeros(self.n_layers*2, 1, self.hidden_dim//2)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.softmax(tag_space)
        return tag_scores


In [5]:
model = BiLSTMTagger(emb_size,nhidden,nlayers,n_char, nout)
loss_function = nn.BCELoss()
#loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [6]:
epochs = 20
printEvery = 50
i=0
j=0
for i in range(epochs):
    print("Starting epochs %d..."%i)
    start_time = time.time()
    shuffled_indexed = np.random.permutation(len(X))
    print(X.shape)
    print(Y.shape)
    for sentence, tags in zip(X[shuffled_indexed],Y[shuffled_indexed]):
        piped = preprocess.create_pipe(sentence,tags)
        if(piped):
            j+=1
            if(j%printEvery==0): print(piped)
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            model.hidden = model.init_hidden()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Variables of word indices.
            sentence_in = prepare_sequence(sentence)
            #targets = prepare_sequence(tags, tag_to_ix)

            # Step 3. Run our forward pass.
            tag_scores = model(sentence_in)
            targets = Variable(torch.FloatTensor([int(i) for i in tags])).view(-1,1)


            pred_string = "".join([str(int(i[0])) for i in tag_scores.data.numpy()])
            if(j%printEvery==0): 
                print("Pred: ",end='')
                print(preprocess.create_pipe(sentence,pred_string))

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = loss_function(tag_scores, targets)
            print("Loss: ",loss.data.numpy()[0],end='\r')
            loss.backward()
            optimizer.step()
        else:
            print("Skipping")
        
    print("Epochs: %d"%i)
    print("Done!")
    print("--- %s seconds ---" % (time.time() - start_time))
    print("Loss: %f"%(loss.data.numpy()[0]))
    train_loss.append(loss)
    print()

Starting epochs 0...
(21589,)
(21589,)
าง|ใน|ปี| |พ.ศ.| |2470| |พระอาจารย์มั่น| |เดินทาง|เข้า|มา|ใน|เขต|อำเภออำนาจเจริญ| |จังหวัดอุบลราชธานี| |(|ขณะ|นั้น|)| |ได้|มี
Pred: |า|ง|ใ|น|ป|ี| |พ|.|ศ|.| |2|4|7|0| |พ|ร|ะ|อ|า|จ|า|ร|ย|์|ม|ั|่|น| |เ|ด|ิ|น|ท|า|ง|เ|ข|้|า|ม|า|ใ|น|เ|ข|ต|อ|ำ|เ|ภ|อ|อ|ำ|น|า|จ|เ|จ|ร|ิ|ญ| |จ|ั|ง|ห|ว|ั|ด|อ|ุ|บ|ล|ร|า|ช|ธ|า|น|ี| |(|ข|ณ|ะ|น|ั|้|น|)| |ไ|ด|้|ม|ี
ือ|สมภาร|ของ|วัด|หนึ่ง|ๆ| |?|สอง|รูป|จาก|ทาง|เหนือ|ของ|ไทย| |และ|หนึ่ง|รูป|จาก|กรุงเทพฯ| |กำลัง|ก้ม|คาราวะ|จรด|พื้น|ต่อ|ผู้|หญิง|ท
Pred: |ื|อ|ส|ม|ภ|า|ร|ข|อ|ง|ว|ั|ด|ห|น|ึ|่|ง|ๆ| |?|ส|อ|ง|ร|ู|ป|จ|า|ก|ท|า|ง|เ|ห|น|ื|อ|ข|อ|ง|ไ|ท|ย| |แ|ล|ะ|ห|น|ึ|่|ง|ร|ู|ป|จ|า|ก|ก|ร|ุ|ง|เ|ท|พ|ฯ| |ก|ำ|ล|ั|ง|ก|้|ม|ค|า|ร|า|ว|ะ|จ|ร|ด|พ|ื|้|น|ต|่|อ|ผ|ู|้|ห|ญ|ิ|ง|ท
ัน|)| |...| |มัน|ไม่|เหมือน|การ|จ้าง| |...| |การ|จ้าง| |เงิน|คือ|สิ่ง|สำคัญ| |คุณ|จะ|ทำ|งาน|ใน|ช่วง|เวลา|ที่|แน่นอน| |...| |แต่|ทว่า|กา
Pred: |ั|น|)| |.|.|.| |ม|ั|น|ไ|ม|่|เ|ห|ม|ื|อ|น|ก|า|ร|จ|้|า|ง| |.|.|.| |ก|า|ร|จ|้|า|ง| |เ|ง|ิ|น|ค|ื|อ|ส|ิ|่|ง|ส|ำ|ค|ั|ญ| |ค|ุ|ณ|จ|ะ|ท|ำ|ง|า|น|ใ|น|ช|่|ว|ง|เ

KeyboardInterrupt: 