In [2]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, CharacterEmbeddings, FlairEmbeddings

In [1]:
import random

import flair
import torch

# Override flair's library-wide device so everything is placed on the CPU.
flair.device = torch.device('cpu')

# Seed Python's and torch's random number generators in this configuration
# cell, i.e. before any embedding, tagger or trainer is created, so that values
# drawn from these generators are the same on every Restart & Run All.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# 1. load the corpus from the .conll column files
#    (column 0 holds the token text, column 1 the 'ner' tag)
conll_columns = {0: 'text', 1: 'ner'}
corpus: Corpus = ColumnCorpus(
    "../../data/",
    column_format=conll_columns,
    train_file='train.conll',
    dev_file='dev.conll',
    test_file='test.conll',
)
print(corpus)

2021-02-09 10:19:27,356 Reading data from ../../data
2021-02-09 10:19:27,357 Train: ../../data/train.conll
2021-02-09 10:19:27,357 Dev: ../../data/dev.conll
2021-02-09 10:19:27,358 Test: ../../data/test.conll
Corpus: 6544 train + 728 dev + 1818 test sentences


In [6]:
# 2. name of the tag layer the model should predict
tag_type = 'ner'

# 3. build the tag dictionary for that layer from the corpus and show it
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. set up the embeddings: a single CharacterEmbeddings instance, wrapped in
#    a StackedEmbeddings container
embedding_types = [CharacterEmbeddings()]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

Dictionary with 14 tags: <unk>, O, B-BOOK, I-BOOK, B-SINGER, I-SINGER, B-COMPOSER, I-COMPOSER, B-FILM, B-SONG, I-SONG, I-FILM, <START>, <STOP>


In [7]:
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# 5. build the sequence tagger on top of the stacked embeddings (CRF enabled)
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=tag_type,
    tag_dictionary=tag_dictionary,
    use_crf=True,
)

# 6. pair the tagger with the corpus in a trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. run training with the baseline hyperparameters and keep the returned history
training_options = dict(learning_rate=0.1, mini_batch_size=32, max_epochs=20)
history = trainer.train('./models/baseline-charembeddings', **training_options)

2021-02-09 10:20:35,698 ----------------------------------------------------------------------------------------------------
2021-02-09 10:20:35,699 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): CharacterEmbeddings(
      (char_embedding): Embedding(275, 25)
      (char_rnn): LSTM(25, 25, bidirectional=True)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=50, out_features=50, bias=True)
  (rnn): LSTM(50, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=14, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2021-02-09 10:20:35,699 ----------------------------------------------------------------------------------------------------
2021-02-09 10:20:35,699 Corpus: "Corpus: 6544 train + 728 dev + 1818 test sentences"
2021-02-09 10:20:35,700 ------------------------------------------------------------------------------