<a href="https://colab.research.google.com/github/TongleiChen/sketch_to_image/blob/main/0329_a4_pos_lstm_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A4 - POS LSTM Classifier

Author: Austin Blodgett

Adaptation to colab: Nitin Venkateswaran


### Follow the steps to use this notebook for your A4. 
**NOTE**: It is best to use your Georgetown Google account.
##### 1. Save a copy of this notebook starter template in your Google Drive (File -> Save a copy in drive)
##### 2. Upload all 3 TSV files from the **pos-data** directory (available in a4.zip) to your Google Drive under **A4/pos-data/**; you will need to create the folder 'A4' at the root of your Drive, and then the subfolder 'pos-data' inside it
##### 3. You are all set!


### Import libraries and mount Google Drive





In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
import tensorflow as tf

from collections import Counter

from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed,Input,Dropout
from keras.activations import softmax

import numpy as np
from keras import backend as K
from keras import optimizers

Mounted at /content/drive


In [None]:
# Paths to the train/dev/test TSV files under the mounted Google Drive folder A4/pos-data/.
train_file = '/content/drive/My Drive/A4/pos-data/en-ud-train.upos.tsv'
dev_file = '/content/drive/My Drive/A4/pos-data/en-ud-dev.upos.tsv'
test_file = '/content/drive/My Drive/A4/pos-data/en-ud-test.upos.tsv'
# Reserved vocabulary entries: UNK stands in for out-of-vocabulary tokens when a
# sentence is vectorized, PAD fills sequences out to a common length within a batch.
UNK = '[UNK]'
PAD = '[PAD]'


### Implement this function if you want to transform the input text, e.g. normalizing case


In [None]:
# TODO
def transform_text_sequence(seq):
    '''
    Hook for transforming a sentence (a list of tokens) before it is stored
    as data, for example normalizing case.

    Currently the identity: the sequence is returned unchanged. The sequence
    passed in by get_vocabulary_and_data includes the '<s>' and '</s>'
    boundary markers.
    '''
    return seq


### Helper Functions (no need to implement)



In [None]:
def get_vocabulary_and_data(data_file, max_vocab_size=None):
    '''
    Read a tab-separated tagging file and build the vocabulary, the tag
    inventory and the sentences with their gold tags.

    Each non-blank line holds tab-separated fields with the token first and
    its tag second; blank lines separate sentences. Every sentence is wrapped
    in '<s>' ... '</s>' markers, which also serve as their own tags. Runs of
    blank lines do not produce empty sentences, and a final sentence that is
    not followed by a blank line is still kept.

    Args:
        data_file: path of the UTF-8 TSV file to read.
        max_vocab_size: optional cap on the vocabulary size, counting the
            reserved UNK and PAD entries; the most frequent tokens are kept.

    Returns:
        A 4-tuple (vocab, pos_vocab, data, gold_labels) where
        vocab maps token -> id with UNK at 0, PAD at 1 and no gaps in the ids,
        pos_vocab is the alphabetically sorted list of tags,
        data is a list of token lists (after transform_text_sequence), and
        gold_labels is the parallel list of tag lists.
    '''
    counts = Counter()
    pos_vocab = {'<s>', '</s>'}
    data = []
    gold_labels = []

    def _finish_sentence(tokens, tags):
        # Close the sentence and transform it first, then count the transformed
        # tokens so vocabulary keys match exactly what gets vectorized later.
        tokens = transform_text_sequence(tokens + ['</s>'])
        counts.update(tokens)
        data.append(tokens)
        gold_labels.append(tags + ['</s>'])

    sent, sent_pos = ['<s>'], ['<s>']
    with open(data_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if line:
                fields = line.split('\t')
                tok, pos = fields[0], fields[1]
                sent.append(tok)
                sent_pos.append(pos)
                pos_vocab.add(pos)
            elif len(sent) > 1:
                # Only '<s>' so far means consecutive blank lines: nothing to store.
                _finish_sentence(sent, sent_pos)
                sent, sent_pos = ['<s>'], ['<s>']
    # The file may end without a trailing blank line; keep that last sentence.
    if len(sent) > 1:
        _finish_sentence(sent, sent_pos)

    # Most frequent first. UNK and PAD are excluded here and prepended exactly
    # once below, so every id in 0..len(vocab)-1 is assigned to one token.
    ranked = [tok for tok, _ in counts.most_common() if tok not in (UNK, PAD)]
    if max_vocab_size:
        ranked = ranked[:max(max_vocab_size - 2, 0)]
    vocab = [UNK, PAD] + ranked
    # Sorting the tags keeps label indices identical from one run to the next.
    return {tok: idx for idx, tok in enumerate(vocab)}, sorted(pos_vocab), data, gold_labels


def vectorize_sequence(seq, vocab):
    '''
    Translate a sequence of tokens into the list of their vocabulary ids.

    A token that is not a key of vocab is looked up under the UNK entry.
    '''
    ids = []
    for tok in seq:
        key = tok if tok in vocab else UNK
        ids.append(vocab[key])
    return ids


def unvectorize_sequence(seq, vocab):
    '''
    Map a sequence of integer ids back to their tokens.

    Args:
        seq: iterable of ids.
        vocab: dict mapping token -> id.

    Returns:
        A list with the token for each id in seq, in order.

    Raises:
        KeyError: if an id in seq is not assigned to any token in vocab.
    '''
    # Invert the mapping explicitly instead of indexing into the id-sorted key
    # list: a list position only equals the id when the ids are exactly
    # 0..N-1, which a plain token -> id dict does not guarantee.
    id_to_token = {idx: tok for tok, idx in vocab.items()}
    return [id_to_token[i] for i in seq]


def one_hot_encode_label(label, label_set):
    '''
    Build an indicator vector for label over label_set.

    The result is a float numpy array with one entry per element of
    label_set, in iteration order: 1.0 where the element equals label and
    0.0 elsewhere. A label that does not occur yields an all-zero vector.
    '''
    return np.array([float(candidate == label) for candidate in label_set])

def clean(seqs, vocab, unk):
    '''
    Replace, in place, every id that is not below len(vocab) with unk.

    seqs is a list of mutable id sequences; nothing is returned.
    '''
    size = len(vocab)
    for seq in seqs:
        # Collect the offending positions first, then overwrite them.
        out_of_range = [j for j, tok in enumerate(seq) if tok >= size]
        for j in out_of_range:
            seq[j] = unk

def batch_generator(data, labels, vocab, label_set, batch_size=1):
    '''
    Endlessly yield (batch_x, batch_y) numpy arrays built from the sentences.

    Each pass walks data and labels in order. Sentences are vectorized with
    vocab, tags are one-hot encoded against label_set, and the sequences of a
    batch are padded to the length of the longest one (inputs with the PAD id,
    labels with the one-hot vector of PAD). The sentences left over at the end
    of a pass are yielded as a final, smaller batch instead of being
    discarded, after which the next pass starts from the beginning.

    Args:
        data: list of token lists.
        labels: list of tag lists, parallel to data.
        vocab: dict mapping token -> id; must contain UNK and PAD.
        label_set: ordered collection of tags.
        batch_size: number of sentences per batch.

    Yields:
        Tuples (batch_x, batch_y) of numpy arrays.

    Raises:
        ValueError: on first iteration, if data or labels is empty or
            batch_size is below 1.
    '''
    # Without this check an empty dataset would make the loop below spin
    # forever without ever yielding.
    if len(data) == 0 or len(labels) == 0:
        raise ValueError('batch_generator needs at least one sentence')
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # The padding label is the same for every batch, so build it once.
    pad_label = one_hot_encode_label(PAD, label_set)

    def _finalize(batch_x, batch_y):
        # Replace out-of-range ids, pad to a common length, convert to arrays.
        clean(batch_x, vocab, vocab[UNK])
        batch_x = pad_sequences(batch_x, vocab[PAD])
        batch_y = pad_sequences(batch_y, pad_label)
        return np.array(batch_x), np.array(batch_y)

    while True:
        batch_x, batch_y = [], []
        for sent, sent_pos in zip(data, labels):
            batch_x.append(vectorize_sequence(sent, vocab))
            batch_y.append([one_hot_encode_label(label, label_set) for label in sent_pos])
            if len(batch_x) >= batch_size:
                yield _finalize(batch_x, batch_y)
                batch_x, batch_y = [], []
        # Emit the trailing partial batch so no sentence is skipped on any pass.
        if batch_x:
            yield _finalize(batch_x, batch_y)


def describe_data(data, gold_labels, label_set, generator):
    '''
    Print a short summary of the dataset: the first sentence and its tags,
    the number of labels and sentences, and the shapes of one batch taken
    from generator.
    '''
    # Take a single batch; fall back to empty lists if nothing is produced.
    batch_x, batch_y = next(iter(generator), ([], []))
    print('Data example:',data[0])
    print('Label:',gold_labels[0])
    print('Label count:', len(label_set))
    print('Data size', len(data))
    print('Batch input shape:', batch_x.shape)
    print('Batch output shape:', batch_y.shape)


def pad_sequences(batch_x, pad_value):
    '''
    Right-pad every sequence in a batch to the length of the longest one.

    Args:
        batch_x: list of lists. Modified in place: entries shorter than the
            longest sequence are replaced by padded copies, the others are
            left as they are.
        pad_value: item appended to the short sequences.

    Returns:
        The same batch_x list, now holding sequences of equal length. An
        empty batch is returned unchanged.
    '''
    # Nothing to pad in an empty batch, and max() below would raise on it.
    if not batch_x:
        return batch_x
    pad_length = max(len(x) for x in batch_x)
    for i, x in enumerate(batch_x):
        shortfall = pad_length - len(x)
        if shortfall > 0:
            batch_x[i] = x + [pad_value] * shortfall

    return batch_x

### Change these arguments for the main procedure call as needed for your experiments

In [None]:
epochs = 20 # number of train-then-evaluate rounds run by the main loop
learning_rate = 10 # step size passed to the SGD optimizer; NOTE(review): 10 is unusually large for SGD, confirm this is intended
dropout = 0.3 # dropout rate
early_stopping = -1 # early stopping criteria; NOTE(review): not read by any of the visible cells
embedding_size = 100 # embedding dimension size (output_dim of the Embedding layer)
hidden_size = 10 # units of the LSTM that is wrapped in Bidirectional
batch_size = 40 # sentences per training batch

### Check the GPU is available

In [None]:
# Ask TensorFlow for a GPU device; anything other than the first GPU falls back
# to the CPU. device_name is later used to place the model.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('GPU Device found: ' + device_name)
else:
    device_name = '/cpu:0'
    print('\n\n This notebook is not '
          'configured to use a GPU.  You can change this in Notebook Settings. Defaulting to:'
          + device_name)

GPU Device found: /device:GPU:0


In [None]:
# Build the vocabulary and the tag inventory from the training split only; the
# dev split's own vocabulary and tags are discarded so both splits are encoded
# with the same mappings.
vocab, labels, train_data, train_labels = get_vocabulary_and_data(train_file)
_, _, dev_data, dev_labels = get_vocabulary_and_data(dev_file)
# Sanity check: print the first example and the shapes of one training batch.
describe_data(train_data, train_labels, labels,
              batch_generator(train_data, train_labels, vocab, labels, batch_size))

Data example: ['<s>', 'Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.', '</s>']
Label: ['<s>', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', '</s>']
Label count: 19
Data size 12543
Batch input shape: (40, 53)
Batch output shape: (40, 53, 19)


In [None]:
# Number of distinct vocabulary entries; the model cell uses this value as the
# input_dim of the embedding layer.
len(vocab)

19676

### Main procedure call: Implement the keras model here


In [None]:
# Load the data. The vocabulary and the tag inventory come from the training
# split only, so the dev split is encoded with the same mappings.
vocab, labels, train_data, train_labels = get_vocabulary_and_data(train_file)
_, _, dev_data, dev_labels = get_vocabulary_and_data(dev_file)

describe_data(train_data, train_labels, labels,
              batch_generator(train_data, train_labels, vocab, labels, batch_size))

# Keras expects a whole number of steps; len(train_data) / batch_size is a
# float, so use the batch count rounded up instead.
train_steps = (len(train_data) + batch_size - 1) // batch_size

with tf.device(device_name):

    # Implement your model here! ----------------------------------------------------------------------
    # Use the variables batch_size, hidden_size, embedding_size, dropout, epochs

    input_size = len(vocab)
    output_size = len(labels)

    # Embedding -> dropout -> BiLSTM -> dropout -> per-token softmax.
    # All dropout rates come from the `dropout` setting in the configuration cell.
    pos_tagger = Sequential()
    pos_tagger.add(Embedding(input_dim=input_size, output_dim=embedding_size))
    pos_tagger.add(Dropout(dropout))
    pos_tagger.add(Bidirectional(LSTM(hidden_size, return_sequences=True, dropout=dropout)))
    # Dropout has to sit in front of the output layer: applied after the
    # softmax it zeroes and rescales the predicted probabilities during
    # training, so the loss is no longer computed on a valid distribution.
    pos_tagger.add(Dropout(dropout))
    pos_tagger.add(TimeDistributed(Dense(output_size, activation='softmax')))
    # ------------------------------------------------------------------------------------------------

    pos_tagger.summary()
    sgd = optimizers.SGD(learning_rate=learning_rate)
    pos_tagger.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

    for i in range(epochs):
        print('Epoch',i+1,'/',epochs)
        # Training: a fresh generator each round, so every round starts at the
        # first training sentence.
        pos_tagger.fit(batch_generator(train_data, train_labels, vocab, labels, batch_size),
                       epochs=1, steps_per_epoch=train_steps)
        # Evaluation: one dev sentence per step (the generator's default batch size is 1).
        loss, acc = pos_tagger.evaluate(batch_generator(dev_data, dev_labels, vocab, labels),
                                        steps=len(dev_data))
        print('Dev Loss:', loss, 'Dev Acc:', acc)

Data example: ['<s>', 'Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.', '</s>']
Label: ['<s>', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', '</s>']
Label count: 19
Data size 12543
Batch input shape: (40, 53)
Batch output shape: (40, 53, 19)
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 100)         1967600   
                                                                 
 dropout_8 (Dropout)         (None, None, 100)         0         
                                           

In [None]:
# Show the tag inventory. one_hot_encode_label walks this list in order, so a
# tag's position here is its index in the one-hot label vectors.
labels

['NOUN',
 'VERB',
 'CCONJ',
 'SYM',
 'INTJ',
 '</s>',
 'ADV',
 'DET',
 'PUNCT',
 'ADP',
 'AUX',
 'ADJ',
 'X',
 'PROPN',
 '<s>',
 'PART',
 'SCONJ',
 'NUM',
 'PRON']