In [1]:
import sys
from pathlib import Path

In [2]:
# Make the project root importable so that `src.*` modules can be imported
# from this notebook.
# NOTE(review): assumes the notebook is started two directory levels below the
# project root (cwd/../..) — confirm against the repository layout.
module_path = Path.cwd().parent.parent
# sys.path holds plain strings, so the membership test has to use the string
# form. Comparing the Path object itself never matches, which made the guard
# always true and appended a duplicate entry on every re-run of this cell.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

In [3]:
from flair.datasets import ColumnCorpus
from flair.data import Corpus, Sentence
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from src.loader import TextLoader

In [5]:
# Instantiate the project's TextLoader (imported from src.loader) with its
# default arguments; it is used later in the notebook to fetch a random text
# for a prediction check.
loader = TextLoader()

In [6]:
# Layout of the column-formatted data files: column 0 holds the token text,
# column 1 holds the "ner" label.
columns = {0: "text", 1: "ner"}

# File name of each split inside the data folder.
split_files = {
    "train_file": "NER_train.txt",
    "dev_file": "NER_dev.txt",
    "test_file": "NER_test.txt",
}

# Columns are separated by a single space; "<DOC>" is passed as the
# document separator token.
corpus: Corpus = ColumnCorpus(
    "data/",
    columns,
    column_delimiter=" ",
    document_separator_token="<DOC>",
    **split_files,
)

# Filter empty sentences out of the corpus before building the label
# dictionary and training.
corpus.filter_empty_sentences()


2022-12-03 02:34:37,301 Reading data from data
2022-12-03 02:34:37,302 Train: data/NER_train.txt
2022-12-03 02:34:37,302 Dev: data/NER_dev.txt
2022-12-03 02:34:37,302 Test: data/NER_test.txt
2022-12-03 02:34:43,597 Filtering empty sentences
2022-12-03 02:34:43,990 Corpus: 21569 train + 23175 dev + 2536 test sentences


In [7]:
# Build the dictionary of 'ner' labels found in the corpus; it is handed to
# the SequenceTagger as its tag dictionary.
label_dict = corpus.make_label_dictionary(label_type='ner')

2022-12-03 02:34:44,015 Computing label dictionary. Progress:


21569it [00:00, 95533.67it/s]

2022-12-03 02:34:44,244 Dictionary created for label 'ner' with 5 values: DS (seen 9129 times), DE (seen 9101 times), DE<DOC> (seen 79 times), DS<DOC> (seen 1 times)





In [8]:
# Embeddings to stack: currently only the 'glove' word embeddings. Further
# embedding objects can be added to this list.
embedding_types = [WordEmbeddings('glove')]

# Wrap the list in a StackedEmbeddings object, which is what the tagger receives.
embeddings: StackedEmbeddings = StackedEmbeddings(embedding_types)

In [9]:
# Sequence tagger over the stacked embeddings, predicting the "ner" labels
# collected in label_dict.
tagger = SequenceTagger(
    # Inputs: token embeddings and the label inventory to predict.
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    # Architecture: recurrent hidden size of 256, with a CRF layer enabled.
    hidden_size=256,
    use_crf=True,
)


2022-12-03 02:34:46,986 SequenceTagger predicts: Dictionary with 5 tags: <unk>, DS, DE, DE<DOC>, DS<DOC>


In [10]:
# Pair the tagger with the corpus in a ModelTrainer; training itself is started
# by the trainer.train(...) call in the next cell.
trainer = ModelTrainer(tagger, corpus)

In [11]:
# Hyper-parameters for this training run, kept together so they are easy to tune.
train_options = {
    "learning_rate": 0.1,
    "mini_batch_size": 32,
    "max_epochs": 10,
    "checkpoint": True,
}

# Train and write the model files under 'models/'. The call is the last
# expression of the cell so that the returned result is displayed.
trainer.train('models/', **train_options)


2022-12-03 02:34:48,748 ----------------------------------------------------------------------------------------------------
2022-12-03 02:34:48,748 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=7, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2022-12-03 02:34:48,749 ----------------------------------------------------------------------------------------------------
2022-12-03 02:34:48,749 Corpus: "Corpus: 21569 train + 23175 dev + 2536 test sentences"
2022-12-03 02:34:48,750 ----------------------------------------------------------------------------------------------------
2022-12-03 02:34:48,75

100%|██████████| 725/725 [00:57<00:00, 12.54it/s]


2022-12-03 02:36:21,386 Evaluating as a multi-label problem: False
2022-12-03 02:36:22,842 DEV : loss 0.11871852725744247 - f1-score (micro avg)  0.0164
2022-12-03 02:36:23,546 BAD EPOCHS (no improvement): 0
2022-12-03 02:36:23,876 saving best model
2022-12-03 02:36:24,186 ----------------------------------------------------------------------------------------------------
2022-12-03 02:36:27,252 epoch 2 - iter 67/675 - loss 0.11877753 - samples/sec: 701.13 - lr: 0.100000
2022-12-03 02:36:30,538 epoch 2 - iter 134/675 - loss 0.11805694 - samples/sec: 653.67 - lr: 0.100000
2022-12-03 02:36:33,569 epoch 2 - iter 201/675 - loss 0.11696686 - samples/sec: 708.56 - lr: 0.100000
2022-12-03 02:36:36,715 epoch 2 - iter 268/675 - loss 0.11595002 - samples/sec: 682.82 - lr: 0.100000
2022-12-03 02:36:39,767 epoch 2 - iter 335/675 - loss 0.11641897 - samples/sec: 703.70 - lr: 0.100000
2022-12-03 02:36:43,109 epoch 2 - iter 402/675 - loss 0.11619739 - samples/sec: 642.69 - lr: 0.100000
2022-12-03 02:

100%|██████████| 725/725 [00:52<00:00, 13.75it/s]


2022-12-03 02:37:48,760 Evaluating as a multi-label problem: False
2022-12-03 02:37:50,213 DEV : loss 0.11902952939271927 - f1-score (micro avg)  0.0167
2022-12-03 02:37:50,904 BAD EPOCHS (no improvement): 0
2022-12-03 02:37:51,264 saving best model
2022-12-03 02:37:51,633 ----------------------------------------------------------------------------------------------------
2022-12-03 02:37:54,757 epoch 3 - iter 67/675 - loss 0.10894779 - samples/sec: 687.72 - lr: 0.100000
2022-12-03 02:37:57,917 epoch 3 - iter 134/675 - loss 0.11124713 - samples/sec: 679.76 - lr: 0.100000
2022-12-03 02:38:01,105 epoch 3 - iter 201/675 - loss 0.11064897 - samples/sec: 673.64 - lr: 0.100000
2022-12-03 02:38:04,193 epoch 3 - iter 268/675 - loss 0.11025712 - samples/sec: 695.50 - lr: 0.100000
2022-12-03 02:38:07,372 epoch 3 - iter 335/675 - loss 0.11176428 - samples/sec: 675.82 - lr: 0.100000
2022-12-03 02:38:10,434 epoch 3 - iter 402/675 - loss 0.11039292 - samples/sec: 701.52 - lr: 0.100000
2022-12-03 02:

100%|██████████| 725/725 [00:53<00:00, 13.43it/s]


2022-12-03 02:39:17,643 Evaluating as a multi-label problem: False
2022-12-03 02:39:19,177 DEV : loss 0.11433745920658112 - f1-score (micro avg)  0.0005
2022-12-03 02:39:19,852 BAD EPOCHS (no improvement): 1
2022-12-03 02:39:20,234 ----------------------------------------------------------------------------------------------------
2022-12-03 02:39:23,466 epoch 4 - iter 67/675 - loss 0.10618117 - samples/sec: 664.49 - lr: 0.100000
2022-12-03 02:39:26,810 epoch 4 - iter 134/675 - loss 0.10738730 - samples/sec: 642.45 - lr: 0.100000
2022-12-03 02:39:30,069 epoch 4 - iter 201/675 - loss 0.10718083 - samples/sec: 658.90 - lr: 0.100000
2022-12-03 02:39:33,232 epoch 4 - iter 268/675 - loss 0.10690487 - samples/sec: 679.08 - lr: 0.100000
2022-12-03 02:39:36,534 epoch 4 - iter 335/675 - loss 0.10611978 - samples/sec: 650.46 - lr: 0.100000
2022-12-03 02:39:39,831 epoch 4 - iter 402/675 - loss 0.10625002 - samples/sec: 651.47 - lr: 0.100000
2022-12-03 02:39:43,088 epoch 4 - iter 469/675 - loss 0.

 20%|██        | 145/725 [00:12<00:48, 11.89it/s]

2022-12-03 02:40:05,410 ----------------------------------------------------------------------------------------------------
2022-12-03 02:40:05,410 Exiting from training early.
2022-12-03 02:40:05,410 Saving model ...





2022-12-03 02:40:05,745 Done.
2022-12-03 02:40:05,759 ----------------------------------------------------------------------------------------------------
2022-12-03 02:40:05,760 loading file models/best-model.pt
2022-12-03 02:40:06,111 SequenceTagger predicts: Dictionary with 7 tags: <unk>, DS, DE, DE<DOC>, DS<DOC>, <START>, <STOP>


100%|██████████| 80/80 [00:05<00:00, 14.41it/s]

2022-12-03 02:40:11,775 Evaluating as a multi-label problem: False





2022-12-03 02:40:11,948 0.0089	0.2397	0.0172	0.0089
2022-12-03 02:40:11,949 
Results:
- F-score (micro) 0.0172
- F-score (macro) 0.0894
- Accuracy 0.0089

By class:
              precision    recall  f1-score   support

       <unk>     0.0000    0.0000    0.0000         0
          DS     0.2771    0.4770    0.3505      1111
          DE     0.2857    0.0036    0.0071      1116
     DE<DOC>     0.0000    0.0000    0.0000         1

   micro avg     0.0089    0.2397    0.0172      2228
   macro avg     0.1407    0.1202    0.0894      2228
weighted avg     0.2813    0.2397    0.1783      2228

2022-12-03 02:40:11,949 ----------------------------------------------------------------------------------------------------


{'test_score': 0.017161864665520398,
 'dev_score_history': [0.016413184411706992,
  0.01672097971244224,
  0.00045399806858448786],
 'train_loss_history': [0.12469920030835922,
  0.1138496341959533,
  0.1100371816011539],
 'dev_loss_history': [0.11871852725744247,
  0.11902952939271927,
  0.11433745920658112]}

In [None]:
# Reload the trained tagger from the final-model.pt file in the training
# output directory ('models/').
# NOTE(review): the test evaluation in the training log loaded
# models/best-model.pt instead — confirm which checkpoint is intended here.
model = SequenceTagger.load("models/final-model.pt")

In [None]:
# Fetch a random text through the project loader.
# NOTE(review): the returned object is assumed to expose the raw string as
# `.text` — verify against src.loader.
text = loader.load_random_text()

# Wrap the raw text in a flair Sentence and run the tagger on it; `sent` is
# inspected in the next cell.
sent = Sentence(text.text)
model.predict(sent)

In [None]:
# Last expression of the cell: displays the tagged-string form of the sentence
# after prediction.
sent.to_tagged_string()