<a href="https://colab.research.google.com/github/TongleiChen/sketch_to_image/blob/main/a4_surname_classifier_lstm_template_0322.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A4 - Surname Classifier

Author: Austin Blodgett

Adaptation to colab: Nitin Venkateswaran


### Follow the steps to use this notebook for your A4. 
**NOTE**: It is best to use your Georgetown Google accounts.
##### 1. Save a copy of this notebook starter template in your Google Drive (File -> Save a copy in drive)
##### 2. Upload a copy of the datafile files from **surname-data** directory (available in a4.zip) to your Google Drive in the location **A4/surname-data/surnames.csv**; you will need to create the folder 'A4' at the root location in your Drive, followed by the subfolder 'surname-data' 
##### 3. You are all set!



###Import libraries and mount Google Drive





In [1]:
from google.colab import drive
drive.mount('/content/drive')


import os, random
import tensorflow as tf

from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.regularizers import l2

import numpy as np
from keras import backend as K



Mounted at /content/drive


In [2]:
data_file = '/content/drive/My Drive/A4/surname-data/surnames.csv'
UNK = '[UNK]'
PAD = '[PAD]'
START = '<s>'
END = '</s>'

###Implement this function if you want to transform the input text, e.g. normalizing case

In [3]:
# TODO
def transform_text_sequence(seq):
    '''
    Implement this function if you want to transform the input text,
    for example normalizing case.
    '''
    return seq

###Helper Functions (no need to implement)

In [4]:
def get_vocabulary_and_data(data_file, split, max_vocab_size=None):
    vocab = Counter()
    data = []
    labels = []
    with open(data_file, 'r', encoding='utf8') as f:
        for line in f:
            cols = line.split(',')
            s, surname, label = cols[0].strip(), cols[1].strip(), cols[2].strip()
            if s==split:
                surname = list(surname)
                surname = [START]+surname+[END]
                data.append(transform_text_sequence(surname))
                labels.append(label)
            for tok in surname:
                vocab[tok]+=1

    vocab = sorted(vocab.keys(), key=lambda k: vocab[k], reverse=True)
    if max_vocab_size:
        vocab = vocab[:max_vocab_size-2]
    vocab = [UNK, PAD] + vocab

    return {k:v for v,k in enumerate(vocab)}, set(labels), data, labels


def vectorize_sequence(seq, vocab):
    seq = [tok if tok in vocab else UNK for tok in seq]
    return [vocab[tok] for tok in seq]


def unvectorize_sequence(seq, vocab):
    translate = sorted(vocab.keys(),key=lambda k:vocab[k])
    return [translate[i] for i in seq]


def one_hot_encode_label(label, label_set):
    vec = [1.0 if l==label else 0.0 for l in label_set]
    return np.array(vec)


def batch_generator(data, labels, vocab, label_set, batch_size=1):
    while True:
        batch_x = []
        batch_y = []
        for doc, label in zip(data,labels):
            batch_x.append(vectorize_sequence(doc, vocab))
            batch_y.append(one_hot_encode_label(label, label_set))
            if len(batch_x) >= batch_size:
                # Pad Sequences in batch to same length
                batch_x = pad_sequences(batch_x, vocab[PAD])
                yield np.array(batch_x), np.array(batch_y)
                batch_x = []
                batch_y = []


def describe_data(data, gold_labels, label_set, generator):
    batch_x, batch_y = [], []
    for bx, by in generator:
        batch_x = bx
        batch_y = by
        break
    print('Data example:',data[0])
    print('Label:',gold_labels[0])
    print('Label count:', len(label_set))
    print('Data size', len(data))
    print('Batch input shape:', batch_x.shape)
    print('Batch output shape:', batch_y.shape)


def pad_sequences(batch_x, pad_value):
    ''' This function should take a batch of sequences of different lengths
        and pad them with the pad_value token so that they are all the same length.

        Assume that batch_x is a list of lists.
    '''
    pad_length = len(max(batch_x, key=lambda x: len(x)))
    for i, x in enumerate(batch_x):
        if len(x) < pad_length:
            batch_x[i] = x + ([pad_value] * (pad_length - len(x)))

    return batch_x

###Change these arguments for the main procedure call as needed for your experiments

In [5]:
epochs = 10 # number of epochs
learning_rate = 0.1 # learning rate
dropout = 0.3 # dropout rate
early_stopping = -1 # early stopping criteria
embedding_size = 100 # embedding dimension size
hidden_size = 10 # hidden layer size
batch_size = 50 # batch size

###Check the GPU is available

In [6]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  device_name = '/cpu:0'
  print(
      '\n\n This notebook is not '
      'configured to use a GPU.  You can change this in Notebook Settings. Defaulting to:' + device_name)
else:
  print ('GPU Device found: ' + device_name)

GPU Device found: /device:GPU:0


In [7]:
vocab, labels, train_data, train_labels = get_vocabulary_and_data(data_file, 'train')
_, _, dev_data, dev_labels = get_vocabulary_and_data(data_file, 'dev')
_, _, test_data, test_labels = get_vocabulary_and_data(data_file, 'test')

describe_data(train_data, train_labels, labels,
                  batch_generator(train_data, train_labels, vocab, labels, batch_size))

Data example: ['<s>', 'H', 'a', 'd', 'a', 'd', '</s>']
Label: arabic
Label count: 19
Data size 15000
Batch input shape: (50, 14)
Batch output shape: (50, 19)


In [10]:
input_size = 14
output_size = 19

In [None]:
train_data

###Main procedure call: Implement the keras model here

##### Use the variables batch_size, hidden_size, embedding_size, dropout, epochs here.

In [11]:
vocab, labels, train_data, train_labels = get_vocabulary_and_data(data_file, 'train')
_, _, dev_data, dev_labels = get_vocabulary_and_data(data_file, 'dev')
_, _, test_data, test_labels = get_vocabulary_and_data(data_file, 'test')

describe_data(train_data, train_labels, labels,
                  batch_generator(train_data, train_labels, vocab, labels, batch_size))

with tf.device(device_name):
    # Implement your model here! ----------------------------------------------------------------------
    # Use the variables batch_size, hidden_size, embedding_size, dropout, epochs
    classifier = tf.keras.Sequential()
    classifier.add(tf.keras.layers.Embedding(input_size, embedding_size, input_length=batch_size))
    classifier.add(Bidirectional(LSTM(hidden_size, return_sequences=False)))
    classifier.add(tf.keras.layers.Dense(output_size, activation='softmax'))

    # ------------------------------------------------------------------------------------------------

    classifier.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])

    for i in range(epochs):
        print('Epoch',i+1,'/',epochs)
        # Training
        classifier.fit(batch_generator(train_data, train_labels, vocab, labels, batch_size=batch_size),
                                  epochs=1, steps_per_epoch=len(train_data)/batch_size)
        # Evaluation
        loss, acc = classifier.evaluate(batch_generator(dev_data, dev_labels, vocab, labels),
                                                  steps=len(dev_data))
        print('Dev Loss:', loss, 'Dev Acc:', acc)


Data example: ['<s>', 'H', 'a', 'd', 'a', 'd', '</s>']
Label: arabic
Label count: 19
Data size 15000
Batch input shape: (50, 14)
Batch output shape: (50, 19)
Epoch 1 / 10
Dev Loss: 2.945093870162964 Dev Acc: 0.030392156913876534
Epoch 2 / 10
Dev Loss: 2.9413952827453613 Dev Acc: 0.05228758230805397
Epoch 3 / 10
Dev Loss: 2.9373579025268555 Dev Acc: 0.1689542531967163
Epoch 4 / 10
Dev Loss: 2.9330618381500244 Dev Acc: 0.34477123618125916
Epoch 5 / 10
Dev Loss: 2.9285316467285156 Dev Acc: 0.4490196108818054
Epoch 6 / 10
Dev Loss: 2.9237844944000244 Dev Acc: 0.4696078300476074
Epoch 7 / 10
Dev Loss: 2.91884708404541 Dev Acc: 0.46568626165390015
Epoch 8 / 10
Dev Loss: 2.913701295852661 Dev Acc: 0.4650326669216156
Epoch 9 / 10
Dev Loss: 2.908355712890625 Dev Acc: 0.4650326669216156
Epoch 10 / 10
Dev Loss: 2.9027915000915527 Dev Acc: 0.4650326669216156
