## Data Reader

In [1]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np

from allennlp.data.dataset_readers import DatasetReader

from allennlp.data import Instance
from allennlp.data.fields import TextField, LabelField, SequenceLabelField

from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import Token

import glob
import os
import unicodedata
import string

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
import allennlp
allennlp.__version__

'0.8.3'

In [3]:
class NameDataReader(DatasetReader):
    """Dataset reader for a directory of per-language name lists.

    Every file matched by the glob passed to ``read`` is expected to hold one
    name per line. The file's base name (without extension) is used as the
    category label for every name in that file. All (name, category) pairs are
    shuffled and cut into chunks; each chunk becomes one ``Instance`` whose
    tokens are the names and whose sequence labels are the categories.

    Parameters
    ----------
    token_indexers:
        Indexers for the "tokens" field; defaults to a ``SingleIdTokenIndexer``
        under the key ``"tokens"``.
    char_indexers:
        Indexers for the "token_characters" field; defaults to a
        ``TokenCharactersIndexer`` under the key ``"token_characters"``.
    names_per_instance:
        Maximum number of names bundled into a single ``Instance`` (the last
        chunk may be shorter). Defaults to 10.

    Raises
    ------
    ValueError
        If ``names_per_instance`` is smaller than 1.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None,
                 char_indexers: Dict[str, TokenCharactersIndexer] = None,
                 names_per_instance: int = 10) -> None:
        super().__init__(lazy=False)
        if names_per_instance < 1:
            raise ValueError("names_per_instance must be a positive integer")
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.char_indexers = char_indexers or {"token_characters": TokenCharactersIndexer()}
        self.names_per_instance = names_per_instance
        # Characters that survive ASCII normalisation in unicodeToAscii.
        self.all_letters = string.ascii_letters + " .,;'"
        # Maps each category (language) to the list of names read for it.
        self.category_lines = {}
        # Categories seen so far, without duplicates, and their count.
        self.all_categories = []
        self.n_categories = None

    def unicodeToAscii(self, s: str) -> str:
        """Turn a Unicode string into plain ASCII.

        The string is NFD-decomposed, combining marks (category ``Mn``) are
        dropped, and any remaining character not in ``self.all_letters`` is
        removed.
        """
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in self.all_letters)

    def readLines(self, filename: str) -> List[str]:
        """Read a UTF-8 file and return its lines converted to ASCII.

        Lines that are empty after conversion (e.g. from an empty file or a
        blank line) are skipped so they never become empty tokens.
        """
        # The context manager guarantees the handle is closed after reading.
        with open(filename, encoding='utf-8') as handle:
            raw_lines = handle.read().strip().split('\n')
        converted = (self.unicodeToAscii(line) for line in raw_lines)
        return [name for name in converted if name]

    def toInstance(self, names: List[str], categories: List[str] = None) -> Instance:
        """Convert a group of names (and optionally their categories) to an Instance.

        The names are indexed twice: once as whole tokens ("tokens") and once
        per character ("token_characters"). When ``categories`` is given and
        non-empty, a "labels" field aligned with the token field is added.
        """
        token_field = TextField([Token(nm) for nm in names], self.token_indexers)
        char_field = TextField([Token(nm) for nm in names], self.char_indexers)

        fields = {"tokens": token_field, "token_characters": char_field}

        if categories:
            fields["labels"] = SequenceLabelField(labels=categories,
                                                  sequence_field=token_field)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield shuffled Instances built from every file matching ``file_path``.

        ``file_path`` is a glob pattern. Reader state (``category_lines``,
        ``all_categories``, ``n_categories``) is updated with what was read,
        but only the names read by this call are turned into Instances, so
        calling ``read`` more than once does not duplicate examples.
        """
        # Names collected by this call only, keyed by category.
        lines_by_category = {}
        # glob returns matches in arbitrary order; sort for a stable category order.
        for path in sorted(glob.glob(file_path)):
            category = os.path.splitext(os.path.basename(path))[0]
            # Files sharing a base name contribute to the same category.
            lines_by_category.setdefault(category, []).extend(self.readLines(path))

        for category, lines in lines_by_category.items():
            if category not in self.category_lines:
                self.all_categories.append(category)
            self.category_lines[category] = lines
        self.n_categories = len(self.all_categories)

        name_and_category = [(name, category)
                             for category, lines in lines_by_category.items()
                             for name in lines]

        # In-place shuffle using numpy's global random state.
        np.random.shuffle(name_and_category)

        step = self.names_per_instance
        for start in range(0, len(name_and_category), step):
            chunk = name_and_category[start:start + step]
            yield self.toInstance([name for name, _ in chunk],
                                  [category for _, category in chunk])
        



#### Read data

In [4]:
# Build the reader with its default indexers (single-id tokens plus per-token characters).
data_reader = NameDataReader()



In [5]:
# Read every file matching the glob; the reader is non-lazy, so all instances are materialised here.
data = data_reader.read('data/names/*.txt')

2008it [00:00, 6224.36it/s]


In [6]:
# Inspect the category labels attached to the first instance (one label per name).
data[0].fields['labels'].labels

['Korean',
 'Arabic',
 'Russian',
 'Russian',
 'Russian',
 'English',
 'Russian',
 'Russian',
 'Russian',
 'Russian']

## Model

In [7]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.data.iterators import BucketIterator
from sklearn.model_selection import train_test_split
from allennlp.training.trainer import Trainer
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from tqdm import tqdm_notebook as tqdm
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.token_embedders.token_characters_encoder import TokenCharactersEncoder

In [8]:
# Build the vocabulary from all instances.
# NOTE(review): this runs on the full dataset, before the train/validation split below,
# so names that end up in the validation set are already in the vocabulary.
vocab = Vocabulary.from_instances(data)

100%|██████████| 2008/2008 [00:00<00:00, 7287.81it/s]


In [9]:
# Display the vocabulary's namespaces and their sizes.
vocab

Vocabulary with namespaces:  tokens, Size: 17424 || token_characters, Size: 57 || labels, Size: 18 || Non Padded Namespaces: {'*tags', '*labels'}

In [10]:
class NameNet(Model):
    """Sequence tagger that assigns a language label to every name in an instance.

    The names are embedded by ``word_embeddings``, passed through ``encoder``
    and projected by a linear layer to one score per entry of the vocabulary's
    ``labels`` namespace.

    Parameters
    ----------
    word_embeddings:
        Embedder applied to the merged "tokens" / "token_characters" inputs.
    encoder:
        Sequence-to-sequence encoder run over the embedded names.
    vocab:
        Vocabulary; the size of its ``labels`` namespace fixes the number of
        output classes.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # Maps each encoder output vector to one logit per label.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                token_characters: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute per-name label logits and, when labels are given, the loss.

        Parameters
        ----------
        tokens:
            Indexed output of the "tokens" text field.
        token_characters:
            Indexed output of the "token_characters" text field.
        labels:
            Optional gold label ids aligned with the names.

        Returns
        -------
        A dict with ``"tag_logits"`` and, if ``labels`` is not ``None``,
        ``"loss"``.
        """
        # The padding mask is derived from the token ids only.
        mask = get_text_field_mask(tokens)

        # Both index dicts are merged so the embedder sees every indexer key at once.
        embeddings = self.word_embeddings({**tokens, **token_characters})

        encoder_out = self.encoder(embeddings, mask)

        tag_logits = self.hidden2tag(encoder_out)

        output = {"tag_logits": tag_logits}

        if labels is not None:
            # Updates the running accuracy; padded positions are excluded via the mask.
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy, passing ``reset`` through to the metric."""
        return {"accuracy": self.accuracy.get_metric(reset)}

#### Training the model

In [11]:
# Hold out 20% of the instances for validation; the fixed random_state keeps the split repeatable.
# Note that the split is over instances (groups of names), not over individual names.
train, val = train_test_split(data,test_size=0.2, random_state=100)

In [12]:
# Size of the character-level representation of a name.
CHAR_EMB_DIM = 5
# Size of the whole-name (token) embedding.
WORD_EMB_DIM = 5
# The sequence encoder receives the token and character representations
# concatenated, so its input size is derived rather than hard-coded.
EMBEDDING_DIM = CHAR_EMB_DIM + WORD_EMB_DIM
# Hidden size of the sequence encoder.
HIDDEN_DIM = 10

In [13]:
# Character view: embed each character, then summarise a name's characters with an RNN.
char_encoder = PytorchSeq2VecWrapper(torch.nn.RNN(CHAR_EMB_DIM, CHAR_EMB_DIM, batch_first=True))
# The character embedding must produce CHAR_EMB_DIM vectors because that is the
# RNN's input size (previously WORD_EMB_DIM, which only worked while both were equal).
token_char_embedding = Embedding(num_embeddings=vocab.get_vocab_size('token_characters'),
                                 embedding_dim=CHAR_EMB_DIM)
char_embeddings = TokenCharactersEncoder(token_char_embedding, char_encoder)

# Token view: one embedding per distinct name.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=WORD_EMB_DIM)

# Keys match the indexer names used by the data reader.
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding, "token_characters": char_embeddings})

# EMBEDDING_DIM has to equal WORD_EMB_DIM + CHAR_EMB_DIM (both views are fed to the LSTM together).
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = NameNet(word_embeddings, lstm, vocab)

In [14]:
# Plain SGD over every trainable parameter of the model.
optimizer = optim.SGD(model.parameters(), lr=0.05)

# Batches of two instances, sorted first by number of names and then by
# number of characters per name.
iterator = BucketIterator(
    batch_size=2,
    sorting_keys=[
        ("tokens", "num_tokens"),
        ("token_characters", "num_token_characters"),
    ],
)
iterator.index_with(vocab)

# Train for at most 20 epochs with a patience of 5; cuda_device=-1 selects the CPU.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train,
    validation_dataset=val,
    patience=5,
    num_epochs=20,
    cuda_device=-1,
)
# Keep this call last so the returned metrics dict is shown as the cell output.
trainer.train()

accuracy: 0.4597, loss: 1.9598 ||: 100%|██████████| 803/803 [00:05<00:00, 136.44it/s]
accuracy: 0.4689, loss: 1.8376 ||: 100%|██████████| 201/201 [00:00<00:00, 276.71it/s]
accuracy: 0.4686, loss: 1.8519 ||: 100%|██████████| 803/803 [00:06<00:00, 125.31it/s]
accuracy: 0.4689, loss: 1.8305 ||: 100%|██████████| 201/201 [00:00<00:00, 488.49it/s]
accuracy: 0.4686, loss: 1.8391 ||: 100%|██████████| 803/803 [00:06<00:00, 126.74it/s]
accuracy: 0.4689, loss: 1.7969 ||: 100%|██████████| 201/201 [00:00<00:00, 511.00it/s]
accuracy: 0.4712, loss: 1.7069 ||: 100%|██████████| 803/803 [00:05<00:00, 142.24it/s]
accuracy: 0.4856, loss: 1.5815 ||: 100%|██████████| 201/201 [00:00<00:00, 429.29it/s]
accuracy: 0.5204, loss: 1.5385 ||: 100%|██████████| 803/803 [00:05<00:00, 139.13it/s]
accuracy: 0.5414, loss: 1.4712 ||: 100%|██████████| 201/201 [00:00<00:00, 445.99it/s]
accuracy: 0.5398, loss: 1.4778 ||: 100%|██████████| 803/803 [00:06<00:00, 115.26it/s]
accuracy: 0.5476, loss: 1.4389 ||: 100%|██████████| 20

{'best_epoch': 19,
 'peak_cpu_memory_MB': 297.028,
 'training_duration': '00:02:23',
 'training_start_epoch': 0,
 'training_epochs': 19,
 'epoch': 19,
 'training_accuracy': 0.7060398505603985,
 'training_loss': 1.0199193247636853,
 'training_cpu_memory_MB': 297.028,
 'validation_accuracy': 0.7040358744394619,
 'validation_loss': 1.0147619629973796,
 'best_validation_accuracy': 0.7040358744394619,
 'best_validation_loss': 1.0147619629973796}

## Predictor

In [15]:
from allennlp.predictors import Predictor
from allennlp.common.util import JsonDict

In [16]:
class LanguagePredictor(Predictor):
    """Predictor that maps a list of names to their predicted language labels."""

    def predict_json(self, inputs: List[str]) -> List[str]:
        """Predict one language label per name.

        Parameters
        ----------
        inputs:
            The names to classify; they are turned into a single instance by
            the dataset reader's ``toInstance``.

        Returns
        -------
        The predicted label for each name, in input order. An empty input
        yields an empty list.
        """
        # Nothing to classify; avoid running the model on an empty sequence.
        if not inputs:
            return []
        instance = self._dataset_reader.toInstance(inputs)
        out = self.predict_instance(instance)
        # Pick the highest-scoring label index for every name.
        best_label_ids = np.argmax(out['tag_logits'], axis=-1)
        # Cast to a plain int so the vocabulary lookup does not depend on numpy scalar types.
        return [self._model.vocab.get_token_from_index(int(i), 'labels')
                for i in best_label_ids]

#### Predict name category

In [17]:
# Wrap the trained model and the reader (needed for toInstance) in a predictor.
predictor = LanguagePredictor(model,data_reader)

In [18]:
# Classify a few sample names; the result lists one predicted language per name.
predictor.predict_json(['Takahashi','Sokolov','Foster','Abboud'])

['Japanese', 'Russian', 'English', 'Arabic']