In [None]:
!pip list | grep allennlp || pip install allennlp

allennlp                      2.2.0         


In [None]:
from typing import Dict, Iterable, List, Tuple

import allennlp
from allennlp.common.util import JsonDict
from allennlp.data import DataLoader, DatasetReader, Instance, Vocabulary, TextFieldTensors
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.fields import Field, LabelField, TextField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2VecEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2vec_encoders import BagOfEmbeddingsEncoder
from allennlp.nn import util
from allennlp.predictors import Predictor
from allennlp.training.trainer import GradientDescentTrainer, Trainer
from allennlp.training.optimizers import AdamOptimizer
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training.util import evaluate
from allennlp.predictors.text_classifier import TextClassifierPredictor
import torch
import torch.nn as nn
import torch.nn.functional as func
import pandas as pd

In [None]:
class ClassificationReader(DatasetReader):
  """Dataset reader that turns rows of a CSV file into classification instances.

  Every instance has a ``text`` field (the tokenized text) and, when a label is
  supplied, a ``label`` field.
  """

  def __init__(
      self,
      tokenizer: Tokenizer = None,
      tokenIndexers: Dict[str, TokenIndexer] = None,
      maxTokens: int = None,
      **kwargs
      ):
    """
    Args:
      tokenizer: tokenizer applied to raw text; a ``WhitespaceTokenizer`` is
        used when none is given.
      tokenIndexers: indexers attached to each ``TextField``; defaults to a
        single ``SingleIdTokenIndexer`` registered under ``'tokens'``.
      maxTokens: when not ``None``, token sequences are cut to at most this
        many tokens.
      **kwargs: forwarded unchanged to ``DatasetReader.__init__``.
    """
    super().__init__(**kwargs)
    self.tokenizer = tokenizer or WhitespaceTokenizer()
    self.tokenIndexers = tokenIndexers or {'tokens': SingleIdTokenIndexer()}
    self.maxTokens = maxTokens

  def _read(self, filePath: str) -> Iterable[Instance]:
    """Yield one labeled instance per row of the CSV at ``filePath``.

    The file must provide the columns ``'Text'`` and ``'sentiment '``.
    """
    df = pd.read_csv(filePath)
    # NOTE(review): the label column is looked up as 'sentiment ' with a
    # trailing space -- presumably this mirrors the CSV header; confirm.
    for text, label in zip(df['Text'], df['sentiment ']):
      # Labels are stringified so that LabelField indexes them via the vocabulary.
      yield self.text_to_instance(text, str(label))

  def text_to_instance(self, text: str, label: str = None) -> Instance:
    """Build an instance from raw ``text`` and an optional ``label``.

    Args:
      text: raw text to tokenize.
      label: gold label; the ``label`` field is omitted when this is falsy.

    Returns:
      An ``Instance`` with a ``text`` field and possibly a ``label`` field.
    """
    tokens = self.tokenizer.tokenize(text)
    if self.maxTokens is not None:
      # Honor the configured limit (previously stored but never applied).
      tokens = tokens[:self.maxTokens]
    fields: Dict[str, Field] = {'text': TextField(tokens, self.tokenIndexers)}
    if label:
      fields['label'] = LabelField(label)
    return Instance(fields)

In [None]:
class Classifier(Model):
  """Text classifier: embed the tokens, encode them to one vector, apply a linear layer."""

  def __init__(self,
               vocab: Vocabulary,
               embedder: TextFieldEmbedder,
               encoder: Seq2VecEncoder):
    """
    Args:
      vocab: vocabulary; the size of its ``'labels'`` namespace sets the
        number of output classes.
      embedder: maps the indexed text field to token embeddings.
      encoder: reduces the embedded sequence to a single vector whose size is
        given by ``encoder.get_output_dim()``.
    """
    super().__init__(vocab)
    self.embedder = embedder
    self.encoder = encoder
    numLabels = vocab.get_vocab_size('labels')
    # Prints the number of label classes found in the vocabulary.
    print(numLabels)
    self.classifier = nn.Linear(encoder.get_output_dim(), numLabels)
    self.accuracy = CategoricalAccuracy()

  def forward(
      self,
      text: TextFieldTensors,
      label: torch.Tensor = None
      ) -> Dict[str, torch.Tensor]:
    """Compute class probabilities and, when labels are given, the loss.

    Args:
      text: indexed text field tensors for a batch.
      label: gold label indices for the batch, or ``None`` at prediction time.

    Returns:
      A dict with ``'probs'`` and, if ``label`` is provided, ``'loss'``
      (cross-entropy). The accuracy metric is updated as a side effect when
      ``label`` is provided.
    """
    embeddedText = self.embedder(text)
    mask = util.get_text_field_mask(text)
    encodedText = self.encoder(embeddedText, mask)
    logits = self.classifier(encodedText)
    # Normalize over the last axis, which the linear layer sizes to numLabels.
    # Passing dim explicitly avoids the deprecated implicit-dimension softmax.
    probs = func.softmax(logits, dim=-1)
    output = {'probs': probs}
    if label is not None:
      self.accuracy(logits, label)
      output['loss'] = func.cross_entropy(logits, label)

    return output

  def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    """Return the running accuracy; ``reset`` is passed on to the metric."""
    return {'accuracy': self.accuracy.get_metric(reset)}

In [None]:
def buildVocab(instances: Iterable[Instance]) -> Vocabulary:
  """Create a ``Vocabulary`` from ``instances``, announcing start and completion."""
  print('Building the vocabulary...')
  vocabulary = Vocabulary.from_instances(instances)
  print('Built the vocabulary!')
  return vocabulary

In [None]:
def buildModel(vocab: Vocabulary, embeddingDim: int = 10) -> Model:
  """Assemble the classifier: token embedding -> bag-of-embeddings -> linear layer.

  Args:
    vocab: vocabulary; its ``'tokens'`` namespace size sets the number of
      embeddings.
    embeddingDim: size of each token embedding. The same value is passed to
      the encoder so the two layers cannot drift apart.

  Returns:
    The constructed ``Classifier``.
  """
  print('Building the model...')
  vocabSize = vocab.get_vocab_size('tokens')
  embedder = BasicTextFieldEmbedder(
      {'tokens': Embedding(embedding_dim=embeddingDim, num_embeddings=vocabSize)}
  )
  encoder = BagOfEmbeddingsEncoder(embedding_dim=embeddingDim)
  model = Classifier(vocab, embedder, encoder)
  print('Built the model!')
  return model

In [None]:
def buildDatasetReader() -> DatasetReader:
  """Return a ``ClassificationReader`` constructed with its default settings."""
  reader = ClassificationReader()
  return reader

def readData(
    reader: DatasetReader,
    filePath: str = 'nlp.csv',
    trainEnd: int = 100,
    devEnd: int = 195,
) -> Tuple[List[Instance], List[Instance]]:
  """Read the dataset once and split it into training and validation instances.

  Args:
    reader: dataset reader whose ``read`` method yields the instances.
    filePath: file handed to ``reader.read``.
    trainEnd: instances ``[0, trainEnd)`` form the training split.
    devEnd: instances ``[trainEnd, devEnd)`` form the validation split.

  Returns:
    ``(trainingData, validationData)`` as two lists.
  """
  # Read the file a single time; both splits are slices of the same list.
  instances = list(reader.read(filePath))
  trainingData = instances[:trainEnd]
  validationData = instances[trainEnd:devEnd]
  return trainingData, validationData

def buildDataLoaders(
    trainData: List[Instance],
    devData: List[Instance],
) -> Tuple[DataLoader, DataLoader]:
  """Wrap both splits in ``SimpleDataLoader`` objects with batches of 8.

  Only the training loader shuffles its instances.
  """
  batchSize = 8
  loaders = (
      SimpleDataLoader(trainData, batchSize, shuffle=True),
      SimpleDataLoader(devData, batchSize, shuffle=False),
  )
  return loaders

In [None]:
def buildTrainer(
    model: Model,
    trainLoader: DataLoader,
    devLoader: DataLoader,
    numEpochs: int
) -> Trainer:
  """Create a ``GradientDescentTrainer`` that optimizes ``model`` with Adam.

  Only parameters with ``requires_grad`` set are handed to the optimizer.
  """
  trainable = []
  for name, param in model.named_parameters():
    if param.requires_grad:
      trainable.append((name, param))

  adam = AdamOptimizer(trainable)
  return GradientDescentTrainer(
      model=model,
      data_loader=trainLoader,
      validation_data_loader=devLoader,
      num_epochs=numEpochs,
      optimizer=adam,
  )

In [None]:
def runTrainingLoop(numEpochs: int):
  """Read the data, build vocabulary and model, train, and return the results.

  Args:
    numEpochs: number of epochs passed to the trainer.

  Returns:
    A ``(model, datasetReader)`` pair: the trained model and the reader that
    produced its data.
  """
  reader = buildDatasetReader()

  print('Reading data...')
  trainInstances, devInstances = readData(reader)
  print('Reading done!')

  # The vocabulary covers both splits.
  vocabulary = buildVocab(trainInstances + devInstances)
  classifier = buildModel(vocabulary)

  trainLoader, devLoader = buildDataLoaders(trainInstances, devInstances)
  for loader in (trainLoader, devLoader):
    loader.index_with(vocabulary)

  trainer = buildTrainer(classifier, trainLoader, devLoader, numEpochs)
  print('Starting training...')
  trainer.train()
  print('Finished training...')

  return classifier, reader

In [None]:
# Train for 100 epochs; the returned model and reader are reused by the
# prediction cell further down.
model, datasetReader = runTrainingLoop(numEpochs=100)

Reading data...


building vocab: 100%|##########| 195/195 [00:00<00:00, 14840.22it/s]
You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
accuracy: 0.3100, batch_loss: 1.0986, loss: 1.1157 ||: 100%|##########| 13/13 [00:00<00:00, 233.41it/s]
accuracy: 0.4526, batch_loss: 1.0171, loss: 1.0640 ||: 100%|##########| 12/12 [00:00<00:00, 597.52it/s]
accuracy: 0.7200, batch_loss: 0.9478, loss: 0.9887 ||: 100%|##########| 13/13 [00:00<00:00, 359.63it/s]
  0%|          | 0/12 [00:00<?, ?it/s]

Reading done!
Building the vocabulary...
Built the vocabulary!
Building the model...
3
Built the model!
Starting training...


accuracy: 0.5474, batch_loss: 0.9866, loss: 1.0424 ||: 100%|##########| 12/12 [00:00<00:00, 523.53it/s]
accuracy: 0.8100, batch_loss: 0.7233, loss: 0.8701 ||: 100%|##########| 13/13 [00:00<00:00, 263.96it/s]
accuracy: 0.4947, batch_loss: 0.9764, loss: 1.0394 ||: 100%|##########| 12/12 [00:00<00:00, 479.27it/s]
accuracy: 0.8700, batch_loss: 0.7777, loss: 0.7711 ||: 100%|##########| 13/13 [00:00<00:00, 305.42it/s]
accuracy: 0.5368, batch_loss: 0.9696, loss: 1.0454 ||: 100%|##########| 12/12 [00:00<00:00, 736.09it/s]
accuracy: 0.8600, batch_loss: 1.0020, loss: 0.6897 ||: 100%|##########| 13/13 [00:00<00:00, 271.02it/s]
accuracy: 0.5158, batch_loss: 0.9752, loss: 1.0594 ||: 100%|##########| 12/12 [00:00<00:00, 743.56it/s]
accuracy: 0.8600, batch_loss: 0.5326, loss: 0.5891 ||: 100%|##########| 13/13 [00:00<00:00, 317.57it/s]
accuracy: 0.5158, batch_loss: 0.9825, loss: 1.0736 ||: 100%|##########| 12/12 [00:00<00:00, 501.10it/s]
accuracy: 0.9000, batch_loss: 0.4003, loss: 0.5149 ||: 100%|####

Finished training...


In [None]:
class SentenceClassifierPredictor(Predictor):
  """Predictor that classifies a single raw sentence."""

  def predict(self, sentence: str) -> JsonDict:
    """Wrap ``sentence`` in a JSON dict and run it through ``predict_json``."""
    payload = {'sentence': sentence}
    return self.predict_json(payload)

  def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """Build an unlabeled instance from the ``'sentence'`` entry of ``json_dict``."""
    reader = self._dataset_reader
    return reader.text_to_instance(json_dict['sentence'])

# Classify one example sentence and print each label next to its probability.
vocab = model.vocab
predictor = SentenceClassifierPredictor(model, datasetReader)
output = predictor.predict('i want the vaccine')
print(
  list(
    map(
      lambda pair: (vocab.get_token_from_index(pair[0], "labels"), pair[1]),
      enumerate(output["probs"]),
    )
  )
)


[('1', 0.22575588524341583), ('-1', 0.4164092540740967), ('0', 0.3578348457813263)]


