In [None]:
import sys
from pathlib import Path

In [None]:
# Directory two levels above the current working directory; it is put on the
# import path, presumably so the `src.*` imports in a later cell resolve.
module_path = Path.cwd().parent.parent
# sys.path holds plain strings, so the membership test must use the string
# form. Comparing the Path object itself never matches, which made every
# re-run of this cell append another duplicate entry.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))

In [None]:
from flair.data import Corpus, Sentence
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import torch

In [None]:
from src.loader import TextLoader
from src.model import DatasetType

In [None]:
# Project text loader configured for the V1_WITH_PREDICTIONSTRING dataset type;
# a later cell pulls a sample text from it for prediction.
loader: TextLoader = TextLoader(DatasetType.V1_WITH_PREDICTIONSTRING)

In [None]:
# Column layout of the data files: column 0 is the token text, column 1 the
# NER tag.
columns = {
    0: "text",
    1: "ner",
}

# Read the train/dev/test splits from the "data/" folder (relative to the
# working directory). Columns are separated by a single space and documents
# are delimited by the "<DOC>" token.
corpus: Corpus = ColumnCorpus(
    data_folder="data/",
    column_format=columns,
    train_file="NER_train.txt",
    dev_file="NER_dev.txt",
    test_file="NER_test.txt",
    column_delimiter=" ",
    document_separator_token="<DOC>",
)

# Drop empty sentences from the corpus (modifies `corpus` in place).
corpus.filter_empty_sentences()


In [None]:
# Build the label dictionary for the "ner" label type from the corpus; it is
# handed to the SequenceTagger as its tag dictionary in a later cell.
label_dict = corpus.make_label_dictionary(label_type='ner')

In [None]:
# Embedding stack, in order: GloVe word embeddings, then the forward and
# backward contextual string embeddings.
embedding_types = [
    WordEmbeddings("glove"),
    *(FlairEmbeddings(name) for name in ("news-forward", "news-backward")),
]

# Combine the individual embeddings into the single stack used by the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# Sequence tagger for the "ner" label type: stacked embeddings as input,
# hidden size 256, CRF enabled, labels taken from `label_dict`.
tagger: SequenceTagger = SequenceTagger(
    # input representation and model size
    embeddings=embeddings,
    hidden_size=256,
    # label space
    tag_type="ner",
    tag_dictionary=label_dict,
    # decoding
    use_crf=True,
)


In [None]:
# Ask PyTorch to release its cached CUDA memory before the trainer is created
# (presumably to free GPU headroom left over from earlier runs in this kernel).
torch.cuda.empty_cache()

In [None]:
# Trainer that pairs the tagger with the corpus; training itself is started in
# the next cell.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [None]:
# Run training and write the outputs under "models/"; a later cell loads
# "models/final-model.pt" from that directory.
trainer.train(
    base_path="models/",
    # optimisation schedule
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=5,
    patience=4,
    # data usage: the dev split is also used for training
    train_with_dev=True,
    # runtime / persistence options
    embeddings_storage_mode="gpu",
    checkpoint=True,
)


In [None]:
# Load the tagger from the training output directory; the file is expected to
# have been written by the training cell above (TODO confirm it exists before
# running this cell on a fresh checkout).
model: SequenceTagger = SequenceTagger.load("models/final-model.pt")

In [None]:
# Fetch one sample from the loader (a random one, judging by the method name).
text = loader.load_random_text()

# Wrap the sample's raw text in a flair Sentence and run the loaded tagger on
# it; the predictions are read back from `sent` in the final cell.
sent: Sentence = Sentence(text.text)
model.predict(sent)

In [None]:
# Print every discourse entry attached to the sampled text, one per line, for
# comparison with the tagger output shown in the next cell.
for discourse in text.discourses:
    print(discourse)

In [None]:
# Display the sentence as a tagged string; as the cell's last expression its
# value is rendered as the cell output.
sent.to_tagged_string()