In [113]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
torch.manual_seed(1)

<torch._C.Generator at 0x26552761bb0>

In [114]:
from __future__ import unicode_literals, print_function, division
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField,LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy , Average
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.data.iterators import BucketIterator, BasicIterator
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder,PytorchSeq2VecWrapper
from torch.nn import LogSoftmax
from torch.nn.modules import NLLLoss
from io import open
import glob
import os
torch.manual_seed(1)

<torch._C.Generator at 0x26552761bb0>

In [115]:
import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.training.trainer import Trainer
import json

In [116]:
class NewsDatasetReader(DatasetReader):
    """
    DatasetReader for PoS tagging data, one word per line and its label 
    Doveyski , Russian.txt
    """
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    def text_to_instance(self, tokens: List[Token], label: str =None) -> Instance:
        word_field = TextField(tokens, self.token_indexers)
        fields = {"word": word_field}
 
        if label is None:
            return Instance(fields)
        
        
        label_field = LabelField(label=label)
        fields["label"] = label_field
            
        return Instance(fields)   
    
    def findFiles(self,path): 
        return glob.glob(path)
    
    def _read(self, file_path: str) -> Iterator[Instance]:
        
        with open(file_path,encoding='utf-8') as f:
            data=json.loads(f.read())
            for item in data:
                yield self.text_to_instance([Token(w) for w in item['headline'].split(" ") + item['short_description'].split(" ")], item['category'])

In [117]:
elmo_token_indexer = ELMoTokenCharactersIndexer()
reader =NewsDatasetReader(
        token_indexers={'tokens': elmo_token_indexer})
train_dataset = reader.read('train_.txt')
validation_dataset=reader.read('validation_.txt')


0it [00:00, ?it/s]
1it [00:00,  3.09it/s]
20000it [00:03, 5784.11it/s]
3000it [00:00, 29783.10it/s]


In [118]:
from typing import Dict

import numpy as np
import torch
import torch.optim as optim
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from allennlp.data.iterators import BucketIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import Trainer

# from realworldnlp.predictors import SentenceClassifierPredictor

EMBEDDING_DIM = 128
HIDDEN_DIM = 256

# Model in AllenNLP represents a model that is trained.
#@Model.register("lstm_classifier")
class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: int = 4) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=128)

        # Monitor the metrics - we use accuracy, as well as prec, rec, f1 for 4 (very positive)
        self.accuracy = CategoricalAccuracy()
        self.f1_measure = F1Measure(positive_label)

        # We use the cross entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                word: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(word)

        # Forward pass
        embeddings = self.word_embeddings(word)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.linear(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        precision, recall, f1_measure = self.f1_measure.get_metric(reset)
        return {'accuracy': self.accuracy.get_metric(reset),
                'precision': precision,
                'recall': recall,
                'f1_measure': f1_measure}

In [119]:
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

In [120]:
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

# Pass in the ElmoTokenEmbedder instance instead
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

# The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
elmo_embedding_dim = 256
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, lstm, vocab)
optimizer = optim.Adam(model.parameters())

iterator = BucketIterator(batch_size=100, sorting_keys=[("word", "num_tokens")])

iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=20)


100%|████████████████████████████████████████████████████████████████████████| 23000/23000 [00:00<00:00, 173396.54it/s]


In [121]:
trainer.train()

accuracy: 0.5942, precision: 0.7560, recall: 0.3870, f1_measure: 0.5119, loss: 1.1903 ||: 100%|█| 200/200 [16:42<00:00,  3.32s/it]
accuracy: 0.7273, precision: 0.8696, recall: 0.7092, f1_measure: 0.7812, loss: 0.7609 ||: 100%|█| 30/30 [02:12<00:00,  8.70s/it]
accuracy: 0.7402, precision: 0.7975, recall: 0.7756, f1_measure: 0.7864, loss: 0.7093 ||: 100%|█| 200/200 [17:09<00:00,  6.82s/it]
accuracy: 0.7643, precision: 0.7940, recall: 0.8475, f1_measure: 0.8199, loss: 0.6416 ||: 100%|█| 30/30 [02:16<00:00,  8.93s/it]
accuracy: 0.7677, precision: 0.8425, recall: 0.8149, f1_measure: 0.8285, loss: 0.6346 ||: 100%|█| 200/200 [1:04:52<00:00, 51.86s/it]
accuracy: 0.7740, precision: 0.7830, recall: 0.8830, f1_measure: 0.8300, loss: 0.6166 ||: 100%|█| 30/30 [24:18<00:00, 116.50s/it]
accuracy: 0.7844, precision: 0.8448, recall: 0.8294, f1_measure: 0.8370, loss: 0.5856 ||: 100%|█| 200/200 [2:42:27<00:00, 48.76s/it]   
accuracy: 0.7783, precision: 0.7405, recall: 0.9007, f1_measure: 0.8128, loss: 0.

{'best_epoch': 12,
 'peak_cpu_memory_MB': 0,
 'training_duration': '19:02:53',
 'training_start_epoch': 0,
 'training_epochs': 19,
 'epoch': 19,
 'training_accuracy': 0.88865,
 'training_precision': 0.9373988127361036,
 'training_recall': 0.9348762109795479,
 'training_f1_measure': 0.9361358124494247,
 'training_loss': 0.30314835950732233,
 'training_cpu_memory_MB': 0.0,
 'validation_accuracy': 0.8203333333333334,
 'validation_precision': 0.8551236749116607,
 'validation_recall': 0.8581560283687943,
 'validation_f1_measure': 0.8566371681415429,
 'validation_loss': 0.5402635181943576,
 'best_validation_accuracy': 0.8133333333333334,
 'best_validation_precision': 0.8419243986254296,
 'best_validation_recall': 0.8687943262411347,
 'best_validation_f1_measure': 0.8551483420592868,
 'best_validation_loss': 0.5160865028699239}

In [33]:
from allennlp.predictors import Predictor
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

In [62]:
logits = predictor.predict("Why Disturbing Leaked Video Of Texas Cops May Be Relevant To Jordan Edwards' Killing")['logits']
tag_ids = np.argmax(logits, axis=-1)

In [63]:
tag_ids

0

In [64]:
print([model.vocab.get_token_from_index(tag_ids, 'labels')])

['POLITICS']


In [9]:
options_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json')
weight_file = ('https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5')


In [10]:
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)

vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

# Pass in the ElmoTokenEmbedder instance instead
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

# The dimension of the ELMo embedding will be 2 x [size of LSTM hidden states]
elmo_embedding_dim = 1024
lstm = PytorchSeq2VecWrapper(
    torch.nn.LSTM(elmo_embedding_dim, HIDDEN_DIM, batch_first=True))

model = LstmClassifier(word_embeddings, lstm, vocab)
optimizer = optim.Adam(model.parameters())

iterator = BucketIterator(batch_size=100, sorting_keys=[("word", "num_tokens")])

iterator.index_with(vocab)

trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=20)


100%|████████████████████████████████████████████████████████████████████████| 23000/23000 [00:00<00:00, 147827.30it/s]


In [11]:
trainer.train()

accuracy: 0.5189, precision: 0.3298, recall: 0.1097, f1_measure: 0.1646, loss: 1.8364 ||: 100%|█| 200/200 [34:00<00:00, 10.00s/it]
accuracy: 0.5917, precision: 0.4444, recall: 0.0519, f1_measure: 0.0930, loss: 1.4797 ||: 100%|█| 30/30 [04:08<00:00, 11.50s/it]
accuracy: 0.6219, precision: 0.5157, recall: 0.3827, f1_measure: 0.4394, loss: 1.2943 ||: 100%|█| 200/200 [32:55<00:00, 11.13s/it]
accuracy: 0.6160, precision: 0.4026, recall: 0.4026, f1_measure: 0.4026, loss: 1.3369 ||: 100%|█| 30/30 [03:47<00:00, 10.76s/it]
accuracy: 0.6625, precision: 0.5787, recall: 0.4889, f1_measure: 0.5300, loss: 1.1269 ||: 100%|█| 200/200 [31:37<00:00,  8.07s/it]
accuracy: 0.6247, precision: 0.3832, recall: 0.5325, f1_measure: 0.4457, loss: 1.2834 ||: 100%|█| 30/30 [03:38<00:00, 10.08s/it]
accuracy: 0.6895, precision: 0.6603, recall: 0.5648, f1_measure: 0.6088, loss: 1.0160 ||: 100%|█| 200/200 [31:10<00:00, 11.09s/it]
accuracy: 0.6373, precision: 0.2812, recall: 0.5844, f1_measure: 0.3797, loss: 1.2506 ||:

{'best_epoch': 4,
 'peak_cpu_memory_MB': 0,
 'training_duration': '08:14:28',
 'training_start_epoch': 0,
 'training_epochs': 13,
 'epoch': 13,
 'training_accuracy': 0.9037,
 'training_precision': 0.859338061465721,
 'training_recall': 0.8483080513418904,
 'training_f1_measure': 0.8537874339400557,
 'training_loss': 0.3067641580849886,
 'training_cpu_memory_MB': 0.0,
 'validation_accuracy': 0.6396666666666667,
 'validation_precision': 0.4691358024691358,
 'validation_recall': 0.4935064935064935,
 'validation_f1_measure': 0.48101265822779804,
 'validation_loss': 1.534892777601878,
 'best_validation_accuracy': 0.653,
 'best_validation_precision': 0.5161290322580645,
 'best_validation_recall': 0.4155844155844156,
 'best_validation_f1_measure': 0.46043165467620956,
 'best_validation_loss': 1.2359104375044505}

In [122]:
from allennlp.common import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.models import Model
from allennlp.predictors import Predictor
from overrides import overrides
import numpy as np

class NewsClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self.model = model
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        scores = self.predict_json({"sentence" : sentence})['logits']
        label_id = np.argmax(scores)
        return self.model.vocab.get_token_from_index(label_id, 'labels')

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        return self._dataset_reader.text_to_instance(tokens)

In [123]:
predictor = NewsClassifierPredictor(model, dataset_reader=reader)

In [137]:
predictor.predict("")


'WORLD NEWS'