In [9]:
from argparse import Namespace
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [3]:
class ElmanRNN(nn.Module):
  """An Elman RNN built by unrolling a single ``nn.RNNCell`` over time.

  Args:
    input_size (int): size of each input feature vector
    hidden_size (int): size of the hidden state vectors
    batch_first (bool): if True, inputs and outputs are laid out as
      (batch, seq, feature); otherwise as (seq, batch, feature)
  """

  def __init__(self, input_size, hidden_size, batch_first=False):
    super(ElmanRNN, self).__init__()

    self.rnn_cell = nn.RNNCell(input_size, hidden_size)

    self.batch_first = batch_first
    self.hidden_size = hidden_size

  def _initialize_hidden(self, batch_size, device=None, dtype=None):
    """Return an all-zero hidden state of shape (batch_size, hidden_size).

    ``device`` and ``dtype`` are optional so existing one-argument calls keep
    producing a default (CPU, default dtype) tensor.
    """
    return torch.zeros((batch_size, self.hidden_size),
                       device=device, dtype=dtype)

  def forward(self, x_in, initial_hidden=None):
    """Run the cell over every time step of ``x_in``.

    Args:
      x_in (torch.Tensor): a 3-D tensor, (batch, seq, feature) when
        ``batch_first`` is True, else (seq, batch, feature)
      initial_hidden (torch.Tensor, optional): starting hidden state of shape
        (batch, hidden_size); zeros are used when omitted

    Returns:
      torch.Tensor: the hidden state at every time step, laid out like the
        input: (batch, seq, hidden_size) when ``batch_first`` is True, else
        (seq, batch, hidden_size)
    """
    if self.batch_first:
      batch_size, seq_size, _ = x_in.size()
      # The unrolling loop below indexes time on dim 0.
      x_in = x_in.permute(1, 0, 2)
    else:
      seq_size, batch_size, _ = x_in.size()

    if initial_hidden is None:
      # Match the input's device AND dtype; a float32 state would make the
      # cell fail for modules converted with .double() / .half().
      initial_hidden = self._initialize_hidden(batch_size,
                                               device=x_in.device,
                                               dtype=x_in.dtype)

    hidden_t = initial_hidden
    hiddens = []
    for t in range(seq_size):
      hidden_t = self.rnn_cell(x_in[t], hidden_t)
      hiddens.append(hidden_t)

    if hiddens:
      hiddens = torch.stack(hiddens)
    else:
      # torch.stack rejects an empty list; return an empty sequence instead.
      hiddens = x_in.new_zeros((0, batch_size, self.hidden_size))

    if self.batch_first:
      hiddens = hiddens.permute(1, 0, 2)

    return hiddens

In [4]:
class SurnameDataset(Dataset):
  """Map-style dataset yielding vectorized surnames and nationality indices.

  Args:
    surname_df (pandas.DataFrame): frame with ``surname``, ``nationality``
      and ``split`` columns
    vectorizer: object exposing ``vectorize(surname, vector_length)`` and a
      ``nationality_vocab`` with ``lookup_token`` (e.g. a SurnameVectorizer)
  """

  def __init__(self, surname_df, vectorizer):
    self.surname_df = surname_df
    self._vectorizer = vectorizer

    # Every surname is padded to the longest one; +2 leaves room for the
    # begin/end-of-sequence indices the vectorizer is expected to add.
    self._max_seq_length = \
      max(map(len, surname_df["surname"]), default=0) + 2

    self.set_split("train")

  @classmethod
  def load_dataset_and_make_vectorize(cls, surname_csv):
    """Load the CSV and build a dataset with a freshly fitted vectorizer.

    The vectorizer is built from the rows whose ``split`` is ``'train'``
    only, while the dataset itself keeps the full frame.

    Args:
      surname_csv (str): path to the surnames CSV file
    Returns:
      SurnameDataset: the dataset wrapping the loaded frame
    """
    surname_df = pd.read_csv(surname_csv)
    train_surname_df = surname_df[surname_df["split"] == "train"]
    return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))

  def set_split(self, split="train"):
    """Select which rows ``__len__`` / ``__getitem__`` operate on.

    Args:
      split (str): value of the ``split`` column to keep
    """
    self._target_split = split
    self._target_df = self.surname_df[self.surname_df["split"] == split]

  def __len__(self):
    """Number of rows in the currently selected split."""
    return len(self._target_df)

  def __getitem__(self, index):
    """Return one data point from the currently selected split.

    Args:
      index (int): positional index into the selected split
    Returns:
      dict: ``x_data`` (vectorized surname), ``y_target`` (nationality
        index) and ``x_length`` (length reported by the vectorizer)
    """
    row = self._target_df.iloc[index]

    surname_vector, vec_length = \
      self._vectorizer.vectorize(row["surname"], self._max_seq_length)

    nationality_index = \
      self._vectorizer.nationality_vocab.lookup_token(row["nationality"])

    return {'x_data': surname_vector,
            'y_target': nationality_index,
            'x_length': vec_length}

In [5]:
class SurnameVectorizer(object):
  """Turns surnames into fixed-length integer vectors using two vocabularies.

  Args:
    char_vocab: vocabulary mapping characters to indices; must expose
      ``begin_seq_index``, ``end_seq_index``, ``mask_index`` and
      ``lookup_token``
    nationality_vocab: vocabulary mapping nationalities to indices
  """

  def __init__(self, char_vocab, nationality_vocab):
    self.char_vocab = char_vocab
    self.nationality_vocab = nationality_vocab

  def vectorize(self, surname, vector_length=-1):
    """Encode a surname as character indices wrapped in begin/end markers.

    Args:
      surname (str): the surname to encode, one token per character
      vector_length (int): length of the returned vector; a negative value
        means "exactly as long as the encoded surname"
    Returns:
      tuple: (``numpy.ndarray`` of dtype int64 padded with the mask index,
        number of non-padding positions)
    Raises:
      ValueError: if ``vector_length`` is non-negative but too small to hold
        the encoded surname
    """
    indices = [self.char_vocab.begin_seq_index]
    indices.extend(self.char_vocab.lookup_token(token)
                    for token in surname)
    indices.append(self.char_vocab.end_seq_index)

    if vector_length < 0:
      vector_length = len(indices)
    elif vector_length < len(indices):
      raise ValueError(
        "vector_length=%d is too small for %d encoded indices"
        % (vector_length, len(indices)))

    out_vector = np.zeros(vector_length, dtype=np.int64)
    out_vector[:len(indices)] = indices
    # Everything past the real sequence is padding.
    out_vector[len(indices):] = self.char_vocab.mask_index

    return out_vector, len(indices)

  @classmethod
  def from_dataframe(cls, surname_df):
    """Build a vectorizer whose vocabularies cover the given frame.

    Args:
      surname_df (pandas.DataFrame): frame with ``surname`` and
        ``nationality`` columns
    Returns:
      SurnameVectorizer: a new instance with populated vocabularies
    """
    char_vocab = SequenceVocabulary()
    nationality_vocab = Vocabulary()
    # Iterating the two columns directly avoids building a Series per row.
    for surname, nationality in zip(surname_df["surname"],
                                    surname_df["nationality"]):
      for char in surname:
        char_vocab.add_token(char)
      nationality_vocab.add_token(nationality)

    return cls(char_vocab, nationality_vocab)

In [6]:
class SurnameClassifier(nn.Module):
    """Character-level RNN classifier: embedding -> Elman RNN -> 2-layer MLP.

    Args:
        embedding_size (int): size of each character embedding
        num_embeddings (int): number of rows in the embedding table
        num_classes (int): size of the prediction vector
        rnn_hidden_size (int): size of the RNN hidden state
        batch_first (bool): whether dim 0 of the input is the batch
            dimension (True) or the sequence dimension (False)
        padding_idx (int): index passed to ``nn.Embedding`` as padding
    """

    def __init__(self, embedding_size, num_embeddings, num_classes,
                 rnn_hidden_size, batch_first=True, padding_idx=0):

        super(SurnameClassifier, self).__init__()

        # Remembered so forward() can locate the time dimension of the
        # RNN output.
        self.batch_first = batch_first

        self.emb = nn.Embedding(num_embeddings=num_embeddings,
                                embedding_dim=embedding_size,
                                padding_idx=padding_idx)
        self.rnn = ElmanRNN(input_size=embedding_size,
                             hidden_size=rnn_hidden_size,
                             batch_first=batch_first)
        self.fc1 = nn.Linear(in_features=rnn_hidden_size,
                         out_features=rnn_hidden_size)
        self.fc2 = nn.Linear(in_features=rnn_hidden_size,
                          out_features=num_classes)

    def forward(self, x_in, x_lengths=None, apply_softmax=False):
        """Classify a batch of index sequences.

        Args:
            x_in (torch.Tensor): integer index tensor, (batch, seq) when
                ``batch_first`` is True, else (seq, batch)
            x_lengths (torch.Tensor, optional): true length of each sequence;
                when given, the hidden state at position ``length - 1`` is
                used instead of the last time step
            apply_softmax (bool): apply softmax over the class dimension;
                leave False when the loss expects raw scores
        Returns:
            torch.Tensor: tensor of shape (batch, num_classes)
        """
        x_embedded = self.emb(x_in)
        y_out = self.rnn(x_embedded)

        if not self.batch_first:
            # The selection below indexes (batch, time); bring the RNN
            # output into that layout first.
            y_out = y_out.permute(1, 0, 2)

        if x_lengths is not None:
            y_out = column_gather(y_out, x_lengths)
        else:
            y_out = y_out[:, -1, :]

        # F.dropout defaults to training=True, so the flag must be passed
        # explicitly or dropout stays active after model.eval().
        y_out = F.relu(self.fc1(F.dropout(y_out, 0.5,
                                          training=self.training)))
        y_out = self.fc2(F.dropout(y_out, 0.5, training=self.training))

        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)

        return y_out

In [7]:
def column_gather(y_out, x_lengths):
    """Pick, for each batch row, the vector at that row's final position.

    Args:
        y_out (torch.Tensor): tensor indexed as ``y_out[batch, position]``
        x_lengths (torch.Tensor): one length per batch row; the position
            selected for a row is ``length - 1``
    Returns:
        torch.Tensor: the selected vectors stacked along a new first dim
    """
    last_positions = x_lengths.long().detach().cpu().numpy() - 1

    selected = [y_out[row, position]
                for row, position in enumerate(last_positions)]

    return torch.stack(selected)

In [8]:
# Single place for every tunable of the run; later cells read `args.<name>`.
args = Namespace(**{
    # --- where data is read from and artifacts are written to ---
    "surname_csv": "data/surnames/surnames_with_splits.csv",
    "vectorizer_file": "vectorizer.json",
    "model_state_file": "model.pth",
    "save_dir": "model_storage/ch6/surname_classification",
    # --- model sizes ---
    "char_embedding_size": 100,
    "rnn_hidden_size": 64,
    # --- optimisation settings ---
    "num_epochs": 100,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "seed": 1337,
    "early_stopping_criteria": 5,
    # --- runtime switches ---
    "cuda": True,
    "catch_keyboard_interrupt": True,
    "reload_from_files": False,
    "expand_filepaths_to_save_dir": True,
})