In [2]:
from flair.data import Corpus, Sentence, Dictionary
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.embeddings import FlairEmbeddings, StackedEmbeddings
from flair.trainers import ModelTrainer
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus

In [3]:
# Column layout of the annotated files: column 0 is the token text,
# column 1 is its NER tag.
ner_column_format = {0:'text', 1:'ner'}

# File names of the three splits inside the data folder.
ner_split_files = {
  'train_file': 'train_set',
  'dev_file': 'eval_set',
  'test_file': 'test_set',
}

corpus = ColumnCorpus(
  data_folder = 'data/processed/NER/flair',
  column_format = ner_column_format,
  **ner_split_files
)

# Collect the tag vocabulary for the 'ner' column; the sequence tagger
# built later is parameterised with this dictionary.
tag_dict = corpus.make_tag_dictionary('ner')
print(tag_dict)

2021-01-14 17:58:05,270 Reading data from data/processed/NER/flair
2021-01-14 17:58:05,277 Train: data/processed/NER/flair/train_set
2021-01-14 17:58:05,278 Dev: data/processed/NER/flair/eval_set
2021-01-14 17:58:05,280 Test: data/processed/NER/flair/test_set
Dictionary with 13 tags: <unk>, O, S-dish, B-dish, E-dish, I-dish, B-restaurant, E-restaurant, S-occasion, I-restaurant, S-restaurant, <START>, <STOP>


In [4]:
# Fine-tune the pretrained forward character language model on the
# domain corpus. The dictionary and direction flag are taken from the
# pretrained model so the text corpus is encoded the way that model expects.
f_language_model = FlairEmbeddings('news-forward-fast').lm
f_is_forward_lm = f_language_model.is_forward_lm
f_dictionary = f_language_model.dictionary
f_corpus = TextCorpus(
    'data/processed/embeddings/flair/corpus/',
    f_dictionary,
    f_is_forward_lm,
    character_level=True
)
f_trainer = LanguageModelTrainer(f_language_model, f_corpus)
f_trainer.train(
    'tuned_forward_model',
    sequence_length=100,
    mini_batch_size=32,
    learning_rate=20,
    patience=10,
    max_epochs=5,
    num_workers=2,
    # This run takes hours; write a checkpoint so an interrupted run can be
    # resumed. Also keeps the settings identical to the backward-model cell.
    checkpoint=True
)

2021-01-14 17:58:08,049 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpulop_mrg


100%|██████████| 19689779/19689779 [00:01<00:00, 11540063.60B/s]

2021-01-14 17:58:10,128 copying /tmp/tmpulop_mrg to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2021-01-14 17:58:10,148 removing temp file /tmp/tmpulop_mrg





2021-01-14 17:58:24,843 read text file with 11398 lines
2021-01-14 17:59:08,650 read text file with 11398 lines
2021-01-14 17:59:52,858 read text file with 102577 lines
2021-01-14 17:59:52,928 shuffled
2021-01-14 18:06:24,342 Sequence length is 100
2021-01-14 18:06:24,824 Split 1	 - (18:06:24)
2021-01-14 18:06:30,058 | split   1 /  1 |   100/18432 batches | ms/batch 52.31 | loss  1.61 | ppl     5.01
2021-01-14 18:06:35,095 | split   1 /  1 |   200/18432 batches | ms/batch 50.36 | loss  1.41 | ppl     4.11
2021-01-14 18:06:40,129 | split   1 /  1 |   300/18432 batches | ms/batch 50.32 | loss  1.34 | ppl     3.83
2021-01-14 18:06:45,165 | split   1 /  1 |   400/18432 batches | ms/batch 50.32 | loss  1.31 | ppl     3.71
2021-01-14 18:06:50,202 | split   1 /  1 |   500/18432 batches | ms/batch 50.36 | loss  1.29 | ppl     3.63
2021-01-14 18:06:55,239 | split   1 /  1 |   600/18432 batches | ms/batch 50.36 | loss  1.26 | ppl     3.52
2021-01-14 18:07:00,281 | split   1 /  1 |   700/18432 ba

In [5]:
# Fine-tune the pretrained backward character language model on the
# domain corpus, reusing the pretrained model's own dictionary and
# direction flag when reading the text.
b_language_model = FlairEmbeddings('news-backward-fast').lm
b_dictionary = b_language_model.dictionary
b_is_forward_lm = b_language_model.is_forward_lm

b_corpus = TextCorpus(
    'data/processed/embeddings/flair/corpus/',
    b_dictionary,
    b_is_forward_lm,
    character_level=True,
)

# Training hyper-parameters, gathered in one place for readability.
b_train_options = dict(
    sequence_length=100,
    mini_batch_size=32,
    learning_rate=20,
    patience=10,
    max_epochs=5,
    num_workers=2,
    checkpoint=True,
)

b_trainer = LanguageModelTrainer(b_language_model, b_corpus)
b_trainer.train('tuned_backward_model', **b_train_options)

2021-01-14 19:54:49,678 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpioxc6m67


100%|██████████| 19689779/19689779 [00:01<00:00, 11192620.58B/s]

2021-01-14 19:54:51,811 copying /tmp/tmpioxc6m67 to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2021-01-14 19:54:51,837 removing temp file /tmp/tmpioxc6m67





2021-01-14 19:54:52,548 read text file with 11398 lines
2021-01-14 19:55:36,833 read text file with 11398 lines
2021-01-14 19:56:21,316 read text file with 102577 lines
2021-01-14 19:56:21,396 shuffled
2021-01-14 20:03:03,362 Sequence length is 100
2021-01-14 20:03:04,002 Split 1	 - (20:03:04)
2021-01-14 20:03:09,150 | split   1 /  1 |   100/18432 batches | ms/batch 51.45 | loss  1.57 | ppl     4.81
2021-01-14 20:03:14,212 | split   1 /  1 |   200/18432 batches | ms/batch 50.61 | loss  1.37 | ppl     3.95
2021-01-14 20:03:19,267 | split   1 /  1 |   300/18432 batches | ms/batch 50.48 | loss  1.33 | ppl     3.79
2021-01-14 20:03:24,321 | split   1 /  1 |   400/18432 batches | ms/batch 50.52 | loss  1.30 | ppl     3.68
2021-01-14 20:03:29,361 | split   1 /  1 |   500/18432 batches | ms/batch 50.38 | loss  1.28 | ppl     3.59
2021-01-14 20:03:34,401 | split   1 /  1 |   600/18432 batches | ms/batch 50.38 | loss  1.28 | ppl     3.59
2021-01-14 20:03:39,433 | split   1 /  1 |   700/18432 ba

In [None]:
# Load the best checkpoint of each fine-tuned language model and stack
# them (forward first, then backward) into one embedding.
tuned_forward_embedding = FlairEmbeddings('tuned_forward_model/best-lm.pt')
tuned_backward_embedding = FlairEmbeddings('tuned_backward_model/best-lm.pt')

embeddings = StackedEmbeddings(
    [tuned_forward_embedding, tuned_backward_embedding]
)

In [7]:
# Sequence tagger with a CRF output layer on top of the stacked
# fine-tuned embeddings, predicting the 'ner' tags collected in tag_dict.
tagger_options = dict(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dict,
    tag_type='ner',
    use_crf=True,
)
tagger = SequenceTagger(**tagger_options)

trainer = ModelTrainer(tagger, corpus)

# train_with_dev=True: the recorded result has empty dev_loss_history and
# dev_score_history, i.e. no dev-set evaluation is reported for this run.
ner_train_options = dict(
    learning_rate=.1,
    mini_batch_size=30,
    max_epochs=50,
    embeddings_storage_mode='gpu',
    train_with_dev=True,
)
trainer.train('custom_embeddings_ner', **ner_train_options)

2021-01-14 21:51:23,849 ----------------------------------------------------------------------------------------------------
2021-01-14 21:51:23,850 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2048, out_features=2048, bias=True)
  (rnn): LSTM(2048, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_feature

{'dev_loss_history': [],
 'dev_score_history': [],
 'test_score': 0.6632124352331605,
 'train_loss_history': [36.54604160494921,
  11.485774819443865,
  8.977585152881902,
  7.999804868930724,
  7.15531239276979,
  6.525738774276361,
  6.296117427872448,
  5.907643039052079,
  5.85638100926469,
  5.466031859560711,
  5.335527175810279,
  5.344755794943833,
  5.13518244464223,
  5.109814143762356,
  4.864025618971848,
  4.799987269610893,
  4.562519916673986,
  4.680666795591029,
  4.537707212494641,
  4.448484717345819,
  4.496600430186202,
  4.444590475501084,
  4.433365882896796,
  4.263942142812217,
  4.17296219453579,
  4.241308944981273,
  4.106546029811952,
  4.000650760604114,
  3.9277867427686366,
  4.107921705013368,
  3.963152937772797,
  3.824887258250539,
  3.89337108774883,
  3.739013183407667,
  3.7427748790601405,
  3.8409370794528868,
  3.5580726920104606,
  3.7467214857659688,
  3.5561163425445557,
  3.536944182907663,
  3.4637512913564357,
  3.5987718163467033,
  3.56

In [None]:
# Load the final tagger written to the 'custom_embeddings_ner' output folder.
final_model_path = 'custom_embeddings_ner/final-model.pt'
model = SequenceTagger.load(final_model_path)

In [None]:
# Smoke test: tag one restaurant review with the loaded model and print
# the sentence with the predicted tags inlined.
review_text = "Everybody was very friendly here. My kids loved the beef tacos and I had a chicken salad. I would probably order tacos or enchiladas next time instead. One of my sons didn't like what he got first and they were happy to exchange it for something else.  My kids had so much fun playing at the playground. I'm so glad to have somewhere close by where my kids can eat and play."
prueba = Sentence(review_text)
model.predict(prueba)

tagged_review = prueba.to_tagged_string()
print(tagged_review)

In [10]:
%%time
trainer.train(
    'custom_embeddings_ner',
    learning_rate=0.1,
    mini_batch_size=30,
    max_epochs=50,
    embeddings_storage_mode='gpu',
    train_with_dev=True
)

2021-01-14 22:21:54,542 ----------------------------------------------------------------------------------------------------
2021-01-14 22:21:54,544 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2048, out_features=2048, bias=True)
  (rnn): LSTM(2048, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_feature

{'dev_loss_history': [],
 'dev_score_history': [],
 'test_score': 0.6632124352331605,
 'train_loss_history': []}

In [11]:
!pip install memory_profiler
%load_ext memory_profiler

Collecting memory_profiler
  Downloading https://files.pythonhosted.org/packages/8f/fd/d92b3295657f8837e0177e7b48b32d6651436f0293af42b76d134c3bb489/memory_profiler-0.58.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-cp36-none-any.whl size=30181 sha256=58884e18e1cd68abc331c2f6c0db8f23c1f59d2ecf3cbb89df278dec3e7b108a
  Stored in directory: /root/.cache/pip/wheels/02/e4/0b/aaab481fc5dd2a4ea59e78bc7231bb6aae7635ca7ee79f8ae5
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0


In [12]:
%%memit
trainer.train(
    'custom_embeddings_ner',
    learning_rate=0.1,
    mini_batch_size=30,
    max_epochs=50,
    embeddings_storage_mode='gpu',
    train_with_dev=True
)

2021-01-14 22:22:06,627 ----------------------------------------------------------------------------------------------------
2021-01-14 22:22:06,631 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2048, out_features=2048, bias=True)
  (rnn): LSTM(2048, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_feature