**As part of semantic parsing, this chatbot does the following:**

Given a sentence (e.g. "The dog ate the apple"), we want to predict a part-of-speech tag for each word (e.g. ["DET", "NN", "V", "DET", "NN"]).

We'll embed each word in a low-dimensional space, pass the embeddings through an LSTM to get a sequence of encodings, and use a feedforward layer to transform those encodings into a sequence of logits (one score per possible part-of-speech tag).

In [1]:
!pip install overrides

Collecting overrides
  Downloading https://files.pythonhosted.org/packages/de/55/3100c6d14c1ed177492fcf8f07c4a7d2d6c996c0a7fc6a9a0a41308e7eec/overrides-1.9.tar.gz
Building wheels for collected packages: overrides
  Building wheel for overrides (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/8d/52/86/e5a83b1797e7d263b458d2334edd2704c78508b3eea9323718
Successfully built overrides
Installing collected packages: overrides
Successfully installed overrides-1.9


In [2]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/a4/c8/10342a6068a8d156a5947e03c95525d559e71ad62de0f2585ab922e14533/allennlp-0.8.3-py3-none-any.whl (5.6MB)
[K    100% |████████████████████████████████| 5.6MB 7.0MB/s 
[?25hCollecting conllu==0.11 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/d4/2c/856344d9b69baf5b374c395b4286626181a80f0c2b2f704914d18a1cea47/conllu-0.11-py2.py3-none-any.whl
Collecting jsonnet>=0.10.0; sys_platform != "win32" (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/60/dc/3abd3971869a741d7acdba166d71d4f9366b6b53028dfd56f95de356af0f/jsonnet-0.12.1.tar.gz (240kB)
[K    100% |████████████████████████████████| 245kB 28.3MB/s 
[?25hCollecting responses>=0.7 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/d1/5a/b887e89925f1de7890ef298a74438371ed4ed29b33def9e6d02dc6036fd8/responses-0.10.6-py2.py3-none-any.whl
Collecting awscli>=1.11.91 (from allennlp)
[?25l  Downl

In [3]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

import random

# Seed every RNG the run can touch, not just torch, so shuffling and any
# NumPy-based steps are repeatable as well as weight initialisation.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


<torch._C.Generator at 0x7f527d0832f0>

In [0]:
class PosDatasetReader(DatasetReader):
    """
    DatasetReader for PoS tagging data, one sentence per line, like

        The###DET dog###NN ate###V the###DET apple###NN

    Each whitespace-separated item is a ``word###TAG`` pair. Blank lines are
    skipped; an item without the ``###`` separator raises ``ValueError``.
    """
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        """
        Parameters
        ----------
        token_indexers:
            Indexers handed to every ``TextField`` this reader builds. When
            omitted (or empty), a ``SingleIdTokenIndexer`` under the
            ``"tokens"`` key is used.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """
        Build an ``Instance`` with a ``"sentence"`` field and, when ``tags``
        is non-empty, a ``"labels"`` field aligned to that sentence.

        ``tags`` is omitted at prediction time, in which case the instance
        carries only the ``"sentence"`` field.
        """
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}

        if tags:
            # list(...) so the field always receives a list, whatever sequence was passed in.
            fields["labels"] = SequenceLabelField(labels=list(tags),
                                                  sequence_field=sentence_field)

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Yield one ``Instance`` per non-blank line of ``file_path``.

        Raises
        ------
        ValueError
            If an item on a line does not contain the ``###`` separator.
        """
        # Explicit encoding: the default depends on the platform's locale.
        with open(file_path, encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                pairs = line.split()
                if not pairs:
                    # Blank (e.g. trailing) lines would otherwise fail the unpacking below.
                    continue

                words, tags = [], []
                for pair in pairs:
                    # Split on the LAST separator so a word may itself contain "###"
                    # without shifting words into the tag sequence.
                    word, separator, tag = pair.rpartition("###")
                    if not separator:
                        raise ValueError(
                            "{}:{}: expected 'word###TAG' but found {!r}".format(
                                file_path, line_number, pair))
                    words.append(word)
                    tags.append(tag)

                yield self.text_to_instance([Token(word) for word in words], tags)

In [0]:
class LstmTagger(Model):
    """
    Sequence tagger: embed the tokens, encode them with a seq2seq encoder,
    then project every encoder output onto the ``'labels'`` vocabulary.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """
        Store the embedder and encoder, and create the linear tag projection
        (encoder output dim -> size of the ``'labels'`` namespace) together
        with the accuracy metric.
        """
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder

        num_tags = vocab.get_vocab_size('labels')
        encoding_dim = encoder.get_output_dim()
        self.hidden2tag = torch.nn.Linear(in_features=encoding_dim,
                                          out_features=num_tags)
        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """
        Compute tag logits for ``sentence``.

        Returns a dict with ``"tag_logits"``; when ``labels`` is given, the
        accuracy metric is updated and a ``"loss"`` entry is added as well.
        """
        token_mask = get_text_field_mask(sentence)
        encoded = self.encoder(self.word_embeddings(sentence), token_mask)
        tag_logits = self.hidden2tag(encoded)

        if labels is None:
            # Prediction mode: no gold tags, so nothing to score.
            return {"tag_logits": tag_logits}

        self.accuracy(tag_logits, labels, token_mask)
        loss = sequence_cross_entropy_with_logits(tag_logits, labels, token_mask)
        return {"tag_logits": tag_logits, "loss": loss}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Report the accumulated accuracy under the ``"accuracy"`` key."""
        accuracy_value = self.accuracy.get_metric(reset)
        return {"accuracy": accuracy_value}

In [6]:
# Fetch (and locally cache) the tutorial's tiny train / validation files, then parse them.
train_url = ('https://raw.githubusercontent.com/allenai/allennlp'
             '/master/tutorials/tagger/training.txt')
validation_url = ('https://raw.githubusercontent.com/allenai/allennlp'
                  '/master/tutorials/tagger/validation.txt')

reader = PosDatasetReader()
train_dataset = reader.read(cached_path(train_url))
validation_dataset = reader.read(cached_path(validation_url))

93B [00:00, 58851.88B/s]             
2it [00:00, 1137.44it/s]
93B [00:00, 19738.40B/s]             
2it [00:00, 5121.25it/s]


In [7]:
# Model sizes used by the embedding and LSTM cells below.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# The vocabulary is built from the instances of both splits combined.
all_instances = train_dataset + validation_dataset
vocab = Vocabulary.from_instances(all_instances)
vocab

100%|██████████| 4/4 [00:00<00:00, 2832.08it/s]


Vocabulary with namespaces:  tokens, Size: 11 || labels, Size: 3 || Non Padded Namespaces: {'*tags', '*labels'}

In [0]:
# Embedding table sized to the 'tokens' namespace, wrapped so it can consume a TextField.
num_tokens = vocab.get_vocab_size('tokens')
token_embedding = Embedding(num_embeddings=num_tokens, embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

# Batch-first LSTM wrapped in AllenNLP's seq2seq wrapper so the model can call it with a mask.
torch_lstm = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
lstm = PytorchSeq2SeqWrapper(torch_lstm)

model = LstmTagger(word_embeddings, lstm, vocab)

In [0]:
# Device 0 when CUDA is available, otherwise -1 (the value used here for CPU).
cuda_device = 0 if torch.cuda.is_available() else -1
if cuda_device != -1:
    model = model.cuda(cuda_device)

In [11]:
# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Batches of two, sorted by the number of tokens in the "sentence" field.
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])
iterator.index_with(vocab)

trainer_options = dict(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=validation_dataset,
    patience=10,
    num_epochs=1000,
    cuda_device=cuda_device,
)
trainer = Trainer(**trainer_options)
trainer.train()

accuracy: 0.3333, loss: 1.1334 ||: 100%|██████████| 1/1 [00:00<00:00, 92.23it/s]
accuracy: 0.3333, loss: 1.1263 ||: 100%|██████████| 1/1 [00:00<00:00, 188.75it/s]
accuracy: 0.3333, loss: 1.1276 ||: 100%|██████████| 1/1 [00:00<00:00, 89.62it/s]
accuracy: 0.3333, loss: 1.1209 ||: 100%|██████████| 1/1 [00:00<00:00, 178.28it/s]
accuracy: 0.3333, loss: 1.1222 ||: 100%|██████████| 1/1 [00:00<00:00, 90.84it/s]
accuracy: 0.3333, loss: 1.1158 ||: 100%|██████████| 1/1 [00:00<00:00, 157.35it/s]
accuracy: 0.3333, loss: 1.1171 ||: 100%|██████████| 1/1 [00:00<00:00, 95.53it/s]
accuracy: 0.3333, loss: 1.1111 ||: 100%|██████████| 1/1 [00:00<00:00, 170.85it/s]
accuracy: 0.3333, loss: 1.1124 ||: 100%|██████████| 1/1 [00:00<00:00, 100.90it/s]
accuracy: 0.3333, loss: 1.1067 ||: 100%|██████████| 1/1 [00:00<00:00, 164.85it/s]
accuracy: 0.3333, loss: 1.1080 ||: 100%|██████████| 1/1 [00:00<00:00, 104.79it/s]
accuracy: 0.3333, loss: 1.1026 ||: 100%|██████████| 1/1 [00:00<00:00, 165.31it/s]
accuracy: 0.3333, lo

{'best_epoch': 999,
 'best_validation_accuracy': 1.0,
 'best_validation_loss': 0.017854422330856323,
 'epoch': 999,
 'peak_cpu_memory_MB': 1977.34,
 'peak_gpu_0_memory_MB': 324,
 'training_accuracy': 1.0,
 'training_cpu_memory_MB': 1977.34,
 'training_duration': '00:01:18',
 'training_epochs': 999,
 'training_gpu_0_memory_MB': 324,
 'training_loss': 0.01790783368051052,
 'training_start_epoch': 0,
 'validation_accuracy': 1.0,
 'validation_loss': 0.017854422330856323}

In [12]:
# Tag a sentence with the trained model: take the arg-max logit per token
# and map each index back to its label string.
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)
prediction = predictor.predict("The dog ate the apple")
tag_logits = prediction['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
predicted_tags = [model.vocab.get_token_from_index(tag_id, 'labels') for tag_id in tag_ids]
print(predicted_tags)

['DET', 'NN', 'V', 'DET', 'NN']


In [13]:
# Same decoding as above, on a different input sentence.
banana_prediction = predictor.predict("I have eaten a banana")
tag_logits = banana_prediction['tag_logits']
tag_ids = np.argmax(tag_logits, axis=-1)
label_for = model.vocab.get_token_from_index
print([label_for(tag_id, 'labels') for tag_id in tag_ids])

['DET', 'NN', 'NN', 'NN', 'NN']


In [0]:
# Persist the two things needed to rebuild the tagger later:
# the vocabulary files and the model's weights (state dict).
vocab.save_to_files("/tmp/vocabulary")
model_weights = model.state_dict()
with open("/tmp/model.th", 'wb') as weights_file:
    torch.save(model_weights, weights_file)

In [0]:
# And here's how to reload the model.
# Build brand-new (untrained) modules for the second model. Reusing the
# `word_embeddings` / `lstm` objects from above would make model2 share its
# parameters with the already-trained `model`, so loading the checkpoint
# would not actually demonstrate that the weights were restored from disk.
vocab2 = Vocabulary.from_files("/tmp/vocabulary")
token_embedding2 = Embedding(num_embeddings=vocab2.get_vocab_size('tokens'),
                             embedding_dim=EMBEDDING_DIM)
word_embeddings2 = BasicTextFieldEmbedder({"tokens": token_embedding2})
lstm2 = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model2 = LstmTagger(word_embeddings2, lstm2, vocab2)

In [0]:
# Read the saved state dict back from disk, copy it into model2,
# then move model2 to the GPU when one was selected earlier.
with open("/tmp/model.th", 'rb') as weights_file:
    saved_state = torch.load(weights_file)
model2.load_state_dict(saved_state)
if cuda_device >= 0:
    model2.cuda(cuda_device)

In [17]:
check_sentence = "The dog ate the apple"

predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict(check_sentence)['tag_logits']

# Sanity check: the reloaded model must reproduce the original model's logits.
# The reference is recomputed on the SAME sentence here, because the global
# `tag_logits` was overwritten by a different sentence in an earlier cell.
reference_logits = predictor.predict(check_sentence)['tag_logits']
np.testing.assert_allclose(tag_logits2, reference_logits, rtol=1e-4, atol=1e-4)

tag_ids2 = np.argmax(tag_logits2, axis=-1)
print([model2.vocab.get_token_from_index(i, 'labels') for i in tag_ids2])

['DET', 'NN', 'V', 'DET', 'NN']


In [36]:
# Tag a new sentence with the reloaded model.
reloaded_prediction = predictor2.predict("I read a lot")
tag_logits2 = reloaded_prediction['tag_logits']

tag_ids2 = np.argmax(tag_logits2, axis=-1)
predicted_tags2 = [model2.vocab.get_token_from_index(tag_id, 'labels') for tag_id in tag_ids2]
print(predicted_tags2)

['DET', 'V', 'DET', 'NN']


In [34]:
# Tag another sentence with the reloaded model.
reloaded_prediction = predictor2.predict("I ate the tomato")
tag_logits2 = reloaded_prediction['tag_logits']

tag_ids2 = np.argmax(tag_logits2, axis=-1)
label_for = model2.vocab.get_token_from_index
print([label_for(tag_id, 'labels') for tag_id in tag_ids2])

['DET', 'V', 'DET', 'NN']


In [33]:
# One more sentence through the reloaded model.
reloaded_prediction = predictor2.predict("Everybody read that book")
tag_logits2 = reloaded_prediction['tag_logits']

tag_ids2 = np.argmax(tag_logits2, axis=-1)
predicted_tags2 = [model2.vocab.get_token_from_index(tag_id, 'labels') for tag_id in tag_ids2]
print(predicted_tags2)

['NN', 'V', 'DET', 'NN']
