## LSTM - pytorch

<img src="image/pytorch_LSTM_1.png" width="550">

### Define LSTM architecture：
>1. input_size: 每个单词的 embedding or one-hot encoding 的尺寸。
2. hidden_size: 隐含层神经元的个数。
3. num_layers: LSTM的层数。
4. bias:
5. batch_first: 
6. dropout: 0-1
7. bidirectional: true or false.

### LSTM input：
>
### LSTM output：
>

In [1]:
# load pre-processed dataset.
import pandas as pd
import json
import torch   
from torchtext import data 
import random
import os
from torchtext.vocab import Vectors
import torch.nn as nn
import torch.optim as optim

In [2]:
dataset = pd.read_csv("pre-processed data/new_label_dataset.csv", usecols=['content', 'categories', 'priority'])

len(dataset)

37293

In [3]:
torch.manual_seed(2020)

# loading custom dataset
TEXT = data.Field(tokenize=lambda x: x.split() ,batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype = torch.float, batch_first=True)

fields = [(None, None), ('content', TEXT), (None, None), ('priority', LABEL)]

dataset=data.TabularDataset(path = 'pre-processed data/new_label_dataset.csv',format = 'csv',fields = fields,skip_header = True)

print(vars(dataset.examples[0]))

{'content': ['philippine', 'flood', 'worsen', 'death', 'toll', 'hit', 'wake', 'gener'], 'priority': 'Low'}


In [4]:
tr_X, te_X = dataset.split(split_ratio=0.8, random_state = random.seed(2020))
tr_x, val_x = tr_X.split(split_ratio=0.7, random_state = random.seed(2020))

# load downloaded glove word embedding.
cache = '.vector_cache'
if not os.path.exists(cache): os.mkdir(cache)
vectors = Vectors(name='./glove.840B.300d.txt', cache=cache)

# create vocab.
TEXT.build_vocab(tr_X, min_freq=3, vectors=vectors)
LABEL.build_vocab(tr_X)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

print("Size of LABEL vocabulary:",len(LABEL.vocab))

print("Top words: ", TEXT.vocab.freqs.most_common(5))  

# Word dictionary.
print("LABEL vocabulary: ", LABEL.vocab.stoi)

Size of TEXT vocabulary: 6543
Size of LABEL vocabulary: 5
Top words:  [('shoot', 3323), ('earthquake', 2530), ('people', 2370), ('philippine', 2132), ('school', 2119)]
LABEL vocabulary:  defaultdict(None, {'Low': 0, 'Medium': 1, 'High': 2, 'Critical': 3, 'Unknown': 4})


In [5]:
# GPU or CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits((tr_x, val_x), 
                                                            batch_size = BATCH_SIZE,
                                                            sort_key = lambda x: len(x.content),
                                                            sort_within_batch=True,
                                                            device=device)


In [6]:
class RNN_classifier(nn.Module):
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # bi-directional LSTM
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim, 
                            num_layers=n_layers, 
                            bidirectional=bidirectional, 
                            dropout=dropout,
                            batch_first=True)

        #dense layer
        self.fc = nn.Linear(hidden_dim*2, output_dim)

        #activation function
        self.act = nn.Softmax(dim=1)
    
    def forward(self, text, text_lengths):
        
        embedded = self.embedding(text)
        
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)
        
        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [10]:
def train(model, iterator, optimizer, criterion):
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        cont, cont_lengths = batch.content 

        #convert to 1D tensor
        predictions = model(cont, cont_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.priority.long())  
        
        # backward propagation
        loss.backward()       
        
        #update the weights
        optimizer.step() 
        
        #loss and accuracy
        epoch_loss += loss.item()     
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [11]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and length
            cont, cont_lengths = batch.content 

            predictions = model(cont, cont_lengths).squeeze() 
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.priority.long())
            
            # train loss.
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
size_of_vocab = len(TEXT.vocab)

def Adam(n_epoches=100, lr = 0.001, dropout = 0.3, num_hidden_nodes = 32):
    
    train_loss_list, valid_loss_list = [], []
    
    embedding_dim = 300
    num_output_nodes = 5
    num_layers = 1 # depth
    bidirection = True

    # RNN model
    model = RNN_classifier(size_of_vocab, embedding_dim, 
                       num_hidden_nodes,num_output_nodes, 
                       num_layers, bidirectional = True, 
                       dropout = dropout)
    print(model)
    
    # pre-trained Glove.
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)
    
    # define optimizer and loss.
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-8)
    criterion = nn.CrossEntropyLoss()
    
    # training
    for epoch in range(n_epoches):

        #train the model
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        #evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        train_loss_list.append(train_loss)
        valid_loss_list.append(valid_loss)

        print(f"Epoch: {epoch:d} : Train Loss: {train_loss:.4f} | Valid loss: {valid_loss:.4f}")
        
    return train_loss_list, valid_loss_list

In [32]:
train_loss, valid_loss = Adam(n_epoches=20, lr=0.001, dropout=0.3, num_hidden_nodes = 20)

RNN_classifier(
  (embedding): Embedding(6543, 300)
  (lstm): LSTM(300, 20, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=40, out_features=5, bias=True)
  (act): Softmax(dim=1)
)
Epoch: 0 : Train Loss: 1.2181 | Valid loss: 1.1753
Epoch: 1 : Train Loss: 1.1758 | Valid loss: 1.1746
Epoch: 2 : Train Loss: 1.1753 | Valid loss: 1.1746
Epoch: 3 : Train Loss: 1.1751 | Valid loss: 1.1751
Epoch: 4 : Train Loss: 1.1602 | Valid loss: 1.1576
Epoch: 5 : Train Loss: 1.1383 | Valid loss: 1.1539
Epoch: 6 : Train Loss: 1.1179 | Valid loss: 1.1495
Epoch: 7 : Train Loss: 1.0971 | Valid loss: 1.1513
Epoch: 8 : Train Loss: 1.0819 | Valid loss: 1.1534
Epoch: 9 : Train Loss: 1.0719 | Valid loss: 1.1524
Epoch: 10 : Train Loss: 1.0644 | Valid loss: 1.1529
Epoch: 11 : Train Loss: 1.0583 | Valid loss: 1.1556
Epoch: 12 : Train Loss: 1.0549 | Valid loss: 1.1638
Epoch: 13 : Train Loss: 1.0515 | Valid loss: 1.1590
Epoch: 14 : Train Loss: 1.0485 | Valid loss: 1.1621
Epoch: 15 : Train L