# Subject Recognition CRF Tagger Model

In [1]:
import sys

# Prepend two relative locations to the import search path in one step.
# Resulting priority is '../../' first, then '../../../allennlp' -- the same
# order two successive insert(0, ...) calls would produce.
# NOTE(review): '../../../allennlp' is presumably a local AllenNLP checkout and
# '../../' presumably the project root holding the `lib` package imported
# further down -- confirm against the repository layout.
sys.path[:0] = ['../../', '../../../allennlp']

In [2]:
import os
from allennlp.common.params import Params

# Sizes referenced from more than one place in the configuration below.
# Naming them keeps the encoder input width visibly tied to its sources.
WORD_EMBEDDING_DIM = 100      # width of the 100d GloVe vectors in "pretrained_file"
CHAR_EMBEDDING_DIM = 16       # per-character embedding fed into the CNN
CHAR_CNN_NUM_FILTERS = 100    # filters for the single n-gram width used below

# Experiment configuration for the subject-recognition CRF tagger. The dict is
# wrapped in `Params` and handed to `train_model` in the training cell.
crf_tagger = {
    # Each token carries both a (lower-cased) word id and a character sequence.
    "dataset_reader": {
        "type": "sequence_tagging",
        "word_tag_delimiter": "/",
        "token_indexers": {
            "tokens": {"type": "single_id", "lowercase_tokens": True},
            "token_characters": {
                "type": "characters",
                "character_tokenizer": {
                    # NOTE(review): four padding tokens, presumably so short
                    # words still span the width-5 CNN filter -- confirm.
                    "end_tokens": ["@@PADDING@@"] * 4,
                },
            },
        },
    },
    # Subject-recognition train / dev splits (relative paths).
    "train_data_path": './../../data/subject_recognition/train.txt',
    "validation_data_path": './../../data/subject_recognition/dev.txt',
    "model": {
        "type": "crf_tagger",
        "text_field_embedder": {
            # Frozen pretrained word vectors.
            "tokens": {
                "type": "embedding",
                "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
                "embedding_dim": WORD_EMBEDDING_DIM,
                "trainable": False,
            },
            # Character-level CNN over each token.
            "token_characters": {
                "type": "character_encoding",
                "embedding": {"embedding_dim": CHAR_EMBEDDING_DIM},
                "encoder": {
                    "type": "cnn",
                    "embedding_dim": CHAR_EMBEDDING_DIM,
                    "num_filters": CHAR_CNN_NUM_FILTERS,
                    "ngram_filter_sizes": [5],
                },
                "dropout": 0.6,
            },
        },
        "encoder": {
            "type": "lstm",
            # 100 + 100 = 200. Assumes the word and char-CNN embeddings are
            # concatenated before the LSTM -- confirm against AllenNLP docs.
            "input_size": WORD_EMBEDDING_DIM + CHAR_CNN_NUM_FILTERS,
            "hidden_size": 300,
            "num_layers": 3,
            "dropout": 0.2,
            "bidirectional": True,
        },
    },
    "iterator": {
        "type": "bucket",
        "sorting_keys": [["tokens", "num_tokens"]],
        "batch_size": 16,
    },
    "trainer": {
        "optimizer": "adam",
        "num_epochs": 10,
        "grad_norm": 1.0,
        "patience": 3,
        # GPU index 0; NOTE(review): presumably -1 selects CPU -- confirm.
        "cuda_device": 0,
    },
}

In [3]:
import copy
import logging
# Configure the root logger: records at INFO and above are written to stdout,
# which is why the training log shows up in this cell's output.
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s',
                     level=logging.INFO, stream=sys.stdout)

from allennlp.commands.train import train_model
from lib.utils import get_log_directory_path

# AllenNLP's `Params` consumes the dict it wraps: every `pop` removes the key
# from the underlying (nested) dict. Wrapping a deep copy leaves `crf_tagger`
# intact, so this cell can be re-run without re-running the config cell.
params = Params(copy.deepcopy(crf_tagger))

# Directory for this run's logs and model files; the exact naming scheme is
# decided by the project helper (the printed value shows the result).
serialization_dir = get_log_directory_path('subject_recognition_crf_tagger', './../../logs')
print('Serialization Directory:', serialization_dir)
# Last expression of the cell: its return value is rendered as the cell output.
train_model(params=params, serialization_dir=serialization_dir)

Serialization Directory: ./../../logs/0000.01-15_21:34:34.subject_recognition_crf_tagger
2018-01-15 21:34:34,010 | INFO : random_seed = 13370
2018-01-15 21:34:34,011 | INFO : numpy_seed = 1337
2018-01-15 21:34:34,011 | INFO : pytorch_seed = 133
2018-01-15 21:34:34,013 | INFO : Pytorch version: 0.3.0.post4
2018-01-15 21:34:34,015 | INFO : dataset_reader.type = sequence_tagging
2018-01-15 21:34:34,016 | INFO : dataset_reader.token_indexers.tokens.type = single_id
2018-01-15 21:34:34,016 | INFO : dataset_reader.token_indexers.tokens.namespace = tokens
2018-01-15 21:34:34,016 | INFO : dataset_reader.token_indexers.tokens.lowercase_tokens = True
2018-01-15 21:34:34,017 | INFO : dataset_reader.token_indexers.token_characters.type = characters
2018-01-15 21:34:34,017 | INFO : dataset_reader.token_indexers.token_characters.namespace = token_characters
2018-01-15 21:34:34,018 | INFO : dataset_reader.token_indexers.token_characters.character_tokenizer.byte_encoding = None
2018-01-15 21:34:34,018

74520it [00:02, 33110.60it/s]


2018-01-15 21:34:36,338 | INFO : validation_data_path = ./../../data/subject_recognition/dev.txt
2018-01-15 21:34:36,339 | INFO : Reading validation data from ./../../data/subject_recognition/dev.txt
2018-01-15 21:34:36,339 | INFO : Reading instances from lines in file at: ./../../data/subject_recognition/dev.txt


10648it [00:00, 20577.30it/s]

2018-01-15 21:34:36,868 | INFO : test_data_path = None
2018-01-15 21:34:36,869 | INFO : Creating a vocabulary using validation, train data.





2018-01-15 21:34:36,962 | INFO : vocabulary.directory_path = None
2018-01-15 21:34:36,963 | INFO : vocabulary.min_count = 1
2018-01-15 21:34:36,964 | INFO : vocabulary.max_vocab_size = None
2018-01-15 21:34:36,966 | INFO : vocabulary.non_padded_namespaces = ('*tags', '*labels')
2018-01-15 21:34:36,966 | INFO : vocabulary.only_include_pretrained_words = False
2018-01-15 21:34:36,966 | INFO : Fitting token dictionary from dataset.


100%|##########| 85168/85168 [00:06<00:00, 12858.90it/s]


2018-01-15 21:34:43,766 | INFO : model.type = crf_tagger
2018-01-15 21:34:43,768 | INFO : model.text_field_embedder.type = basic
2018-01-15 21:34:43,769 | INFO : model.text_field_embedder.tokens.type = embedding
2018-01-15 21:34:43,769 | INFO : model.text_field_embedder.tokens.num_embeddings = None
2018-01-15 21:34:43,770 | INFO : model.text_field_embedder.tokens.vocab_namespace = tokens
2018-01-15 21:34:43,770 | INFO : model.text_field_embedder.tokens.embedding_dim = 100
2018-01-15 21:34:43,771 | INFO : model.text_field_embedder.tokens.pretrained_file = https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz
2018-01-15 21:34:43,771 | INFO : model.text_field_embedder.tokens.projection_dim = None
2018-01-15 21:34:43,772 | INFO : model.text_field_embedder.tokens.trainable = False
2018-01-15 21:34:43,772 | INFO : model.text_field_embedder.tokens.padding_index = None
2018-01-15 21:34:43,772 | INFO : model.text_field_embedder.tokens.max_norm = None
2018-01-15 21:34:4

100%|##########| 74520/74520 [00:09<00:00, 7767.49it/s]

2018-01-15 21:34:59,170 | INFO : Indexing dataset



100%|##########| 10648/10648 [00:01<00:00, 8253.81it/s]

2018-01-15 21:35:00,462 | INFO : trainer.patience = 3
2018-01-15 21:35:00,463 | INFO : trainer.validation_metric = -loss
2018-01-15 21:35:00,464 | INFO : trainer.num_epochs = 10
2018-01-15 21:35:00,465 | INFO : trainer.cuda_device = 0
2018-01-15 21:35:00,466 | INFO : trainer.grad_norm = 1.0
2018-01-15 21:35:00,467 | INFO : trainer.grad_clipping = None
2018-01-15 21:35:00,468 | INFO : trainer.learning_rate_scheduler = None





2018-01-15 21:35:02,521 | INFO : trainer.optimizer = adam
2018-01-15 21:35:02,522 | INFO : Converting Params object to dict; logging of default values will not occur when dictionary parameters are used subsequently.
2018-01-15 21:35:02,524 | INFO : CURRENTLY DEFINED PARAMETERS: 
2018-01-15 21:35:02,525 | INFO : trainer.no_tqdm = False
2018-01-15 21:35:02,530 | INFO : evaluate_on_test = False
2018-01-15 21:35:02,533 | INFO : Beginning training.
2018-01-15 21:35:02,533 | INFO : Epoch 0/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:35:02,535 | INFO : Training


token_accuracy: 0.96, accuracy: 0.85, loss: 12.45 ||: 100%|##########| 4658/4658 [01:56<00:00, 40.15it/s]

2018-01-15 21:36:58,562 | INFO : Validating



token_accuracy: 0.98, accuracy: 0.92, loss: 5.83 ||: 100%|##########| 666/666 [00:09<00:00, 73.63it/s]


2018-01-15 21:37:07,684 | INFO : Best validation performance so far. Copying weights to './../../logs/0000.01-15_21:34:34.subject_recognition_crf_tagger/best.th'.
2018-01-15 21:37:07,719 | INFO : Training token_accuracy : 0.963872    Validation token_accuracy : 0.976746 
2018-01-15 21:37:07,721 | INFO : Training accuracy : 0.848846    Validation accuracy : 0.918107 
2018-01-15 21:37:07,723 | INFO : Training loss : 12.448190    Validation loss : 5.828502 
2018-01-15 21:37:07,725 | INFO : Epoch duration: 00:02:05
2018-01-15 21:37:07,727 | INFO : Estimated training time remaining: 00:18:46
2018-01-15 21:37:07,730 | INFO : Epoch 1/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:37:07,733 | INFO : Training


token_accuracy: 0.98, accuracy: 0.93, loss: 5.37 ||: 100%|##########| 4658/4658 [01:55<00:00, 40.23it/s]

2018-01-15 21:39:03,531 | INFO : Validating



token_accuracy: 0.97, accuracy: 0.94, loss: 5.22 ||: 100%|##########| 666/666 [00:08<00:00, 74.27it/s]


2018-01-15 21:39:12,567 | INFO : Best validation performance so far. Copying weights to './../../logs/0000.01-15_21:34:34.subject_recognition_crf_tagger/best.th'.
2018-01-15 21:39:12,639 | INFO : Training token_accuracy : 0.975723    Validation token_accuracy : 0.969536 
2018-01-15 21:39:12,641 | INFO : Training accuracy : 0.930287    Validation accuracy : 0.937547 
2018-01-15 21:39:12,647 | INFO : Training loss : 5.374257    Validation loss : 5.222215 
2018-01-15 21:39:12,648 | INFO : Epoch duration: 00:02:04
2018-01-15 21:39:12,651 | INFO : Estimated training time remaining: 00:16:40
2018-01-15 21:39:12,652 | INFO : Epoch 2/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:39:12,655 | INFO : Training


token_accuracy: 0.97, accuracy: 0.95, loss: 4.13 ||: 100%|##########| 4658/4658 [01:55<00:00, 40.28it/s]

2018-01-15 21:41:08,290 | INFO : Validating



token_accuracy: 0.97, accuracy: 0.94, loss: 5.18 ||: 100%|##########| 666/666 [00:09<00:00, 73.24it/s]


2018-01-15 21:41:17,451 | INFO : Best validation performance so far. Copying weights to './../../logs/0000.01-15_21:34:34.subject_recognition_crf_tagger/best.th'.
2018-01-15 21:41:17,551 | INFO : Training token_accuracy : 0.973081    Validation token_accuracy : 0.970676 
2018-01-15 21:41:17,555 | INFO : Training accuracy : 0.947330    Validation accuracy : 0.941773 
2018-01-15 21:41:17,557 | INFO : Training loss : 4.133083    Validation loss : 5.181502 
2018-01-15 21:41:17,558 | INFO : Epoch duration: 00:02:04
2018-01-15 21:41:17,560 | INFO : Estimated training time remaining: 00:14:35
2018-01-15 21:41:17,563 | INFO : Epoch 3/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:41:17,567 | INFO : Training


token_accuracy: 0.97, accuracy: 0.96, loss: 3.32 ||: 100%|##########| 4658/4658 [01:55<00:00, 40.19it/s]

2018-01-15 21:43:13,461 | INFO : Validating



token_accuracy: 0.97, accuracy: 0.95, loss: 5.34 ||: 100%|##########| 666/666 [00:09<00:00, 73.92it/s]


2018-01-15 21:43:22,549 | INFO : Training token_accuracy : 0.973321    Validation token_accuracy : 0.974707 
2018-01-15 21:43:22,551 | INFO : Training accuracy : 0.958468    Validation accuracy : 0.948347 
2018-01-15 21:43:22,552 | INFO : Training loss : 3.323836    Validation loss : 5.343469 
2018-01-15 21:43:22,552 | INFO : Epoch duration: 00:02:04
2018-01-15 21:43:22,553 | INFO : Estimated training time remaining: 00:12:30
2018-01-15 21:43:22,553 | INFO : Epoch 4/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:43:22,554 | INFO : Training


token_accuracy: 0.98, accuracy: 0.97, loss: 2.76 ||: 100%|##########| 4658/4658 [01:54<00:00, 40.54it/s]

2018-01-15 21:45:17,468 | INFO : Validating



token_accuracy: 0.97, accuracy: 0.95, loss: 5.50 ||: 100%|##########| 666/666 [00:09<00:00, 70.08it/s]


2018-01-15 21:45:27,050 | INFO : Training token_accuracy : 0.975615    Validation token_accuracy : 0.968350 
2018-01-15 21:45:27,053 | INFO : Training accuracy : 0.967351    Validation accuracy : 0.947690 
2018-01-15 21:45:27,054 | INFO : Training loss : 2.760576    Validation loss : 5.500036 
2018-01-15 21:45:27,054 | INFO : Epoch duration: 00:02:04
2018-01-15 21:45:27,054 | INFO : Estimated training time remaining: 00:10:24
2018-01-15 21:45:27,055 | INFO : Epoch 5/9


  0%|          | 0/4658 [00:00<?, ?it/s]

2018-01-15 21:45:27,056 | INFO : Training


token_accuracy: 0.98, accuracy: 0.97, loss: 2.21 ||: 100%|##########| 4658/4658 [01:53<00:00, 41.04it/s]


2018-01-15 21:47:20,568 | INFO : Validating


token_accuracy: 0.98, accuracy: 0.95, loss: 5.90 ||: 100%|##########| 666/666 [00:08<00:00, 74.60it/s]

2018-01-15 21:47:29,499 | INFO : Ran out of patience.  Stopping training.
2018-01-15 21:47:29,500 | INFO : archiving weights and vocabulary to ./../../logs/0000.01-15_21:34:34.subject_recognition_crf_tagger/model.tar.gz





CrfTagger(
  (text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding(
    )
    (token_embedder_token_characters): TokenCharactersEncoder(
      (_embedding): TimeDistributed(
        (_module): Embedding(
        )
      )
      (_encoder): TimeDistributed(
        (_module): CnnEncoder(
          (_activation): ReLU()
          (conv_layer_0): Conv1d (16, 100, kernel_size=(5,), stride=(1,))
        )
      )
      (_dropout): Dropout(p=0.6)
    )
  )
  (encoder): PytorchSeq2SeqWrapper(
    (_module): LSTM(200, 300, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  )
  (tag_projection_layer): TimeDistributed(
    (_module): Linear(in_features=600, out_features=2)
  )
  (crf): ConditionalRandomField(
  )
)