In [None]:
import functools
import sys
import numpy as np
import pandas as pd
import random
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
import nltk
import json
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import string
from torch.utils.data import Dataset

In [None]:
# Hyperparameters and shared constants -- do not change.

# Vocabulary ids reserved for the padding and unknown-word tokens.
PAD_INDEX = 0
UNK_INDEX = 1
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
MAX_LENGTH = 256  # maximum sentence length handed to TaggedDataset
BATCH_SIZE = 1  # batch size of both DataLoaders
EMBEDDING_DIM = 256  # word-embedding size of BiLSTMTagger
HIDDEN_DIM = 256  # LSTM hidden size per direction (the tagger is bidirectional)
OUTPUT_DIM = 12  # number of output classes; equals len(UNIVERSAL_TAGS)
N_LAYERS = 3  # number of stacked LSTM layers
DROPOUT_RATE = 0.1  # dropout probability passed to BiLSTMTagger
LR = 3e-4  # learning rate given to the Adam optimizer
N_EPOCHS = 5  # number of passes over the training data

# Tag inventory. Tokenizer maps every tag to its position in this list,
# so the order below fixes the class index of each tag.
UNIVERSAL_TAGS = [
    "VERB",
    "NOUN",
    "PRON",
    "ADJ",
    "ADV",
    "ADP",
    "CONJ",
    "DET",
    "NUM",
    "PRT",
    "X",
    ".",
]

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:

class Tokenizer:
    """Maps words to integer vocabulary ids and POS tags to class indices."""

    def __init__(self):
        # word -> vocabulary id; empty until train() has been called.
        self.vocab = {}
        # tag -> class index, following the order of UNIVERSAL_TAGS.
        self.tags = {w: i for i, w in enumerate(UNIVERSAL_TAGS)}

    def train(self, corpus):
        """Build the vocabulary from an iterable of sentences.

        Args:
            corpus: iterable of sentences, each an iterable of word strings.

        Every distinct word receives the next free id in order of first
        appearance. The result replaces ``self.vocab``.
        """
        # Reserve the special tokens first. If they were added last, a literal
        # '<pad>' or '<unk>' word in the corpus would first take a regular id
        # and then be overwritten, leaving an id equal to len(vocab) -- out of
        # range for an embedding table sized with len(vocab).
        vocab = {PAD_TOKEN: PAD_INDEX, UNK_TOKEN: UNK_INDEX}
        counter = len(vocab)
        for sentence in tqdm(corpus, desc='Constructing Embeddings', file=sys.stdout):
            for word in sentence:
                if word not in vocab:
                    vocab[word] = counter
                    counter += 1

        self.vocab = vocab

    def tokenize(self, sentence):
        """Convert a sentence into a list of vocabulary ids.

        Args:
            sentence: iterable of word strings.

        Returns:
            A list of ids; words missing from the vocabulary map to the id
            stored for UNK_TOKEN.
        """
        vocab = self.vocab
        return [vocab[word] if word in vocab else vocab[UNK_TOKEN] for word in sentence]

    def tag(self, tag):
        """Return the integer class index of ``tag``.

        Unknown tags fall back to the index of the catch-all tag 'X', so the
        result is always an integer that can be placed in a label tensor.
        """
        if tag in self.tags:
            return self.tags[tag]
        return self.tags['X']

In [None]:
class TaggedDataset(Dataset):
    """Dataset pairing tokenised sentences with their per-word tag indices."""

    def __init__(self, x, y, tokenizer, max_length):
        """Store the raw data and the helpers used to encode it.

        Args:
            x: indexable collection of sentences (sequences of words).
            y: indexable collection of tag sequences, parallel to ``x``.
            tokenizer: object providing ``tokenize(sentence)`` and ``tag(tag)``.
            max_length: maximum number of tokens kept per sentence.
        """
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``{"ids": [...], "label": [...]}`` for the sentence at ``index``.

        Both lists have the same length, at most ``self.max_length``.
        """
        # Truncate with the configured limit rather than a hard-coded 256, so
        # the dataset honours whatever max_length it was constructed with.
        tokens = self.tokenizer.tokenize(self.x[index])[:self.max_length]
        tags = self.y[index]
        # Index explicitly so a tag sequence shorter than the sentence still
        # raises instead of silently producing misaligned labels.
        labels = [self.tokenizer.tag(tags[i]) for i in range(len(tokens))]
        return {"ids": tokens, "label": labels}

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.x)

def collate(batch, pad_index, label_pad_index=-100):
    """Collate dataset items into padded id and label tensors.

    Args:
        batch: list of dicts with keys 'ids' and 'label', each a list of ints.
        pad_index: value used to pad the shorter id sequences.
        label_pad_index: value used to pad the shorter label sequences. The
            default, -100, is the default ``ignore_index`` of
            ``nn.CrossEntropyLoss``.

    Returns:
        Dict with 'ids' and 'label' LongTensors, both padded to the longest
        sequence in the batch with the batch dimension first.
    """
    ids = [torch.LongTensor(item['ids']) for item in batch]
    labels = [torch.LongTensor(item['label']) for item in batch]
    batch_ids = nn.utils.rnn.pad_sequence(ids, padding_value=pad_index, batch_first=True)
    # Labels are padded like the ids: building one tensor straight from the
    # ragged list fails as soon as a batch holds sentences of different length.
    batch_label = nn.utils.rnn.pad_sequence(labels, padding_value=label_pad_index, batch_first=True)
    return {'ids': batch_ids, 'label': batch_label}


    

In [None]:
class BiLSTMTagger(nn.Module):
    """Embedding -> dropout -> bidirectional LSTM -> per-token linear classifier."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout_rate, pad_index):
        """Create the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each word embedding.
            hidden_dim: LSTM hidden size per direction.
            output_dim: number of classes scored for every token.
            n_layers: number of stacked LSTM layers.
            dropout_rate: dropout probability applied to the embeddings.
            pad_index: embedding row treated as padding.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_index,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence twice hidden_dim.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, ids):
        """Score every token of ``ids``.

        Args:
            ids: LongTensor of token ids with the batch dimension first.

        Returns:
            Tensor of unnormalised class scores, one vector per input token.
        """
        embedded = self.dropout(self.embedding(ids))
        # Only the per-step outputs are needed; the final LSTM state is dropped.
        lstm_out, _state = self.lstm(embedded)
        return self.fc(lstm_out)

In [None]:
class Trainer:
    """Static helpers for training, evaluating and querying a tagging model."""

    @staticmethod
    def _forward_batch(batch, model, criterion, device):
        """Run the model on one collated batch.

        Args:
            batch: dict with 'ids' and 'label' tensors, batch dimension first.
            model: module mapping ids to per-token class scores.
            criterion: loss taking (scores, labels).
            device: device the tensors are moved to.

        Returns:
            Tuple ``(loss, accuracy)`` of scalar tensors.
        """
        ids = batch['ids'].to(device)
        label = batch['label'].to(device)
        scores = model(ids)
        # Flatten the batch and sequence dimensions together. For a batch of
        # one sentence this equals squeezing the batch dimension, and it also
        # works for larger batches.
        # Assumes the model returns (batch, seq_len, n_classes).
        prediction = scores.reshape(-1, scores.shape[-1])
        label = label.reshape(-1)
        loss = criterion(prediction, label)
        accuracy = Trainer.get_accuracy(prediction, label)
        return loss, accuracy

    @staticmethod
    def train(dataloader, model, criterion, optimizer, device):
        """Train ``model`` for one pass over ``dataloader``.

        Returns:
            Tuple ``(losses, accuracies)``: two lists with one float per batch.
        """
        model.train()
        epoch_losses = []
        epoch_accs = []

        for batch in tqdm(dataloader, desc='Training', file=sys.stdout):
            loss, accuracy = Trainer._forward_batch(batch, model, criterion, device)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())

        return epoch_losses, epoch_accs

    @staticmethod
    def evaluate(dataloader, model, criterion, device):
        """Evaluate ``model`` on ``dataloader`` without updating it.

        Returns:
            Tuple ``(losses, accuracies)``: two lists with one float per batch.
        """
        model.eval()
        epoch_losses = []
        epoch_accs = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc='Evaluating', file=sys.stdout):
                loss, accuracy = Trainer._forward_batch(batch, model, criterion, device)
                epoch_losses.append(loss.item())
                epoch_accs.append(accuracy.item())

        return epoch_losses, epoch_accs

    @staticmethod
    def get_accuracy(prediction, label, ignore_index=-100):
        """Fraction of positions whose highest-scoring class equals the label.

        Args:
            prediction: class scores with the class dimension last.
            label: integer labels, one per scored position.
            ignore_index: label value excluded from the statistic, such as
                padding. The default matches ``nn.CrossEntropyLoss``.

        Returns:
            Scalar float tensor; NaN when no position is counted.
        """
        predicted_classes = prediction.argmax(dim=-1)
        counted = label.ne(ignore_index)
        correct_predictions = (predicted_classes.eq(label) & counted).sum()
        return correct_predictions / counted.sum()

    @staticmethod
    def count_parameters(model):
        """Number of trainable scalar parameters in ``model``."""
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    @staticmethod
    def predict_sentiment(text, model, tokenizer, device):
        """Predict a class for every token of ``text``.

        Despite its name, this returns per-token class predictions.

        Args:
            text: sentence in the form expected by ``tokenizer.tokenize``.
            model: module mapping ids to per-token class scores.
            tokenizer: object providing ``tokenize``.
            device: device used for inference.

        Returns:
            Tuple ``(predicted_class, predicted_probability)`` of NumPy arrays
            with one entry per token: the arg-max class and its softmax
            probability.
        """
        tokens = tokenizer.tokenize(text)
        tensor = torch.LongTensor(tokens).unsqueeze(dim=0).to(device)

        # Run inference in eval mode and without autograd, so dropout cannot
        # perturb the result. The caller's train/eval mode is restored after.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                prediction = model(tensor).squeeze(dim=0)
                probability = torch.softmax(prediction, dim=-1)
        finally:
            model.train(was_training)

        predicted_class = probability.argmax(dim=-1).to('cpu').numpy()
        predicted_probability = np.array(
            [probability[i][c].item() for i, c in enumerate(predicted_class)]
        )
        return predicted_class, predicted_probability
    

In [None]:
def initialize_weights(m):
    """Initialise one module in place; meant to be used with ``model.apply``.

    ``nn.Linear`` gets Xavier-normal weights and a zero bias. ``nn.LSTM``
    gets orthogonal weight matrices and zero biases. Any other module is
    left untouched.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        # A layer built with bias=False has m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
               
def build_dataset(loc=""):
    """Load tagged sentences and split them into parallel word and tag lists.

    Args:
        loc: path of a JSON file whose top-level object maps arbitrary keys
            to sentences, each a list of ``[word, tag]`` pairs. When empty
            (the default), the Brown corpus is read through NLTK with the
            universal tagset.

    Returns:
        Tuple ``(x, y)``: ``x[i]`` is the list of words of sentence ``i`` and
        ``y[i]`` the list of its tags.
    """
    if loc == "":
        data = nltk.corpus.brown.tagged_sents(tagset="universal")
    else:
        # Decode explicitly as UTF-8, so loading does not depend on the
        # platform's default locale encoding.
        with open(loc, 'r', encoding='utf-8') as f:
            data = list(json.load(f).values())

    x = []
    y = []

    for sentence in data:
        words = []
        tags = []
        for word, tag in sentence:
            words.append(word)
            tags.append(tag)
        x.append(words)
        y.append(tags)

    return x, y

In [None]:
# Data processing: fetch the corpus, split it, fit the tokenizer and build the loaders.
nltk.download('brown')
nltk.download('universal_tagset')

# Seed every random number generator the notebook draws from, so that a fresh
# kernel reproduces the same train/validation split, loader shuffling and
# parameter initialisation. GPU kernels may still be non-deterministic.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

X, y = build_dataset()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)

# The vocabulary is built from the training sentences only; validation words
# that never occur in training are tokenised as the unknown token.
tokenizer = Tokenizer()
tokenizer.train(X_train)

train_data = TaggedDataset(X_train, y_train, tokenizer, MAX_LENGTH)
valid_data = TaggedDataset(X_valid, y_valid, tokenizer, MAX_LENGTH)

# Bind the padding index so the loaders can call collate with the batch alone.
# Note that this rebinds the module-level name `collate` to the partial.
collate = functools.partial(collate, pad_index=PAD_INDEX)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=collate)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
Constructing Embeddings: 100%|██████████| 45872/45872 [00:00<00:00, 222236.60it/s]


In [None]:
# Build the model, loss and optimizer used for training.
vocab_size = len(tokenizer.vocab)
model = BiLSTMTagger(
    vocab_size,
    EMBEDDING_DIM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    N_LAYERS,
    DROPOUT_RATE,
    PAD_INDEX,
)
model.apply(initialize_weights)

# Place the model and the loss on the target device first, then hand the
# model's parameters to the optimizer.
model = model.to(device)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters(), lr=LR)

In [None]:
# Training loop: run N_EPOCHS epochs, record metrics and checkpoint the best models.
best_valid_loss, best_valid_acc = float('inf'), -1

# Flat per-batch histories accumulated over all epochs.
train_losses, train_accs = [], []
valid_losses, valid_accs = [], []

# Per-batch metrics of each epoch, keyed by the zero-based epoch index.
epoch_to_train_losses, epoch_to_train_accs = {}, {}
epoch_to_valid_losses, epoch_to_valid_accs = {}, {}

for epoch in range(N_EPOCHS):
    train_loss, train_acc = Trainer.train(train_dataloader, model, criterion, optimizer, device)
    valid_loss, valid_acc = Trainer.evaluate(valid_dataloader, model, criterion, device)

    train_losses += train_loss
    train_accs += train_acc
    valid_losses += valid_loss
    valid_accs += valid_acc

    epoch_to_train_losses[epoch], epoch_to_train_accs[epoch] = train_loss, train_acc
    epoch_to_valid_losses[epoch], epoch_to_valid_accs[epoch] = valid_loss, valid_acc

    # Epoch-level figures are the unweighted means of the per-batch values.
    epoch_train_loss, epoch_train_acc = np.mean(train_loss), np.mean(train_acc)
    epoch_valid_loss, epoch_valid_acc = np.mean(valid_loss), np.mean(valid_acc)

    # Keep the weights with the lowest validation loss seen so far.
    if best_valid_loss > epoch_valid_loss:
        best_valid_loss = epoch_valid_loss
        torch.save(model.state_dict(), 'valid_loss_lstm.pt')

    # Separately keep the weights with the highest validation accuracy.
    if best_valid_acc < epoch_valid_acc:
        best_valid_acc = epoch_valid_acc
        torch.save(model.state_dict(), 'valid_acc_lstm.pt')

    print(
        f'epoch: {epoch+1}',
        f'train_loss: {epoch_train_loss:.3f}, train_acc: {epoch_train_acc:.3f}',
        f'valid_loss: {epoch_valid_loss:.3f}, valid_acc: {epoch_valid_acc:.3f}',
        sep='\n',
    )


Training: 100%|██████████| 45872/45872 [09:08<00:00, 83.65it/s]
Evaluating: 100%|██████████| 11468/11468 [00:31<00:00, 358.38it/s]
epoch: 1
train_loss: 0.235, train_acc: 0.921
valid_loss: 0.137, valid_acc: 0.955
Training: 100%|██████████| 45872/45872 [09:10<00:00, 83.32it/s]
Evaluating: 100%|██████████| 11468/11468 [00:32<00:00, 356.46it/s]
epoch: 2
train_loss: 0.100, train_acc: 0.968
valid_loss: 0.102, valid_acc: 0.967
Training: 100%|██████████| 45872/45872 [09:10<00:00, 83.34it/s]
Evaluating: 100%|██████████| 11468/11468 [00:31<00:00, 360.40it/s]
epoch: 3
train_loss: 0.062, train_acc: 0.980
valid_loss: 0.098, valid_acc: 0.970
Training: 100%|██████████| 45872/45872 [09:10<00:00, 83.29it/s]
Evaluating: 100%|██████████| 11468/11468 [00:31<00:00, 359.07it/s]
epoch: 4
train_loss: 0.042, train_acc: 0.986
valid_loss: 0.108, valid_acc: 0.970
Training: 100%|██████████| 45872/45872 [09:08<00:00, 83.61it/s]
Evaluating: 100%|██████████| 11468/11468 [00:31<00:00, 361.57it/s]
epoch: 5
train_loss: 

In [None]:
# Turn each per-epoch metric dict into a table (one column per epoch, one row
# per batch) and write it to a CSV file named after the variable.

# Training loss
epoch_to_train_losses = pd.DataFrame(data=epoch_to_train_losses)
epoch_to_train_losses.to_csv('epoch_to_train_losses.csv')

# Training accuracy
epoch_to_train_accs = pd.DataFrame(data=epoch_to_train_accs)
epoch_to_train_accs.to_csv('epoch_to_train_accs.csv')

# Validation loss
epoch_to_valid_losses = pd.DataFrame(data=epoch_to_valid_losses)
epoch_to_valid_losses.to_csv('epoch_to_valid_losses.csv')

# Validation accuracy
epoch_to_valid_accs = pd.DataFrame(data=epoch_to_valid_accs)
epoch_to_valid_accs.to_csv('epoch_to_valid_accs.csv')

In [None]:
# Offer the metric tables and checkpoints for download when running in Google
# Colab. The google.colab package only exists there, so elsewhere the import
# is skipped and the files simply stay in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None
    print('google.colab is not available; skipping downloads (files remain in the working directory).')

if files is not None:
    for artifact in (
        'epoch_to_train_losses.csv',
        'epoch_to_train_accs.csv',
        'epoch_to_valid_losses.csv',
        'epoch_to_valid_accs.csv',
        'valid_loss_lstm.pt',
        'valid_acc_lstm.pt',
    ):
        files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>