In [2]:
import os

from flair.data import Sentence, Corpus
from flair.datasets import UD_ENGLISH, ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [3]:
# Sample sentence that the following cell embeds.
sentence = Sentence('The grass is green .')

# Word-level embeddings backed by the 'bert-base-cased' transformer model.
bert_embedding = TransformerWordEmbeddings('bert-base-cased')

In [4]:
# Attach an embedding vector to every token of `sentence` (done in place:
# the vectors are read back from the tokens below).
bert_embedding.embed(sentence)

# Inspect the result: one line for the token, one line for its tensor.
for token in sentence:
    print(token, token.embedding, sep='\n')

Token: 1 The
tensor([ 0.2881, -0.6816,  0.5577,  ...,  0.8676,  0.0792,  0.8672])
Token: 2 grass
tensor([-0.0703, -0.0699,  0.0544,  ..., -0.2776, -1.0295, -0.3793])
Token: 3 is
tensor([ 0.0810, -0.3258,  0.4203,  ...,  0.2182,  0.3702,  0.9412])
Token: 4 green
tensor([ 0.0508, -0.4786,  0.3054,  ..., -0.6855, -0.0248,  0.6028])
Token: 5 .
tensor([ 0.6098, -0.1838, -0.0825,  ...,  0.2716, -0.0733,  0.3918])


In [5]:
# Second example: the words "grass" and "green" appear again, but in a
# different sentence, so their vectors can be compared with the earlier output.
sentence = Sentence('This is the grass green .')
bert_embedding.embed(sentence)

# Dump every token together with the embedding that embed() stored on it.
for token in sentence:
    print(f'{token}\n{token.embedding}')

Token: 1 This
tensor([ 0.4386, -0.5114,  0.6741,  ...,  1.2735, -0.1206,  0.6134])
Token: 2 is
tensor([ 0.3237, -0.2456,  0.4793,  ...,  0.3411,  0.1160,  1.2343])
Token: 3 the
tensor([-0.0832, -0.4757, -0.0042,  ...,  0.9476,  0.2747,  1.3109])
Token: 4 grass
tensor([-0.1270,  0.0168,  0.0254,  ..., -0.1931, -0.7082,  0.3254])
Token: 5 green
tensor([ 0.0683, -0.2390,  0.5181,  ..., -0.5370, -1.0446,  0.7060])
Token: 6 .
tensor([ 0.2875, -0.0406,  0.1617,  ...,  1.1258,  0.0067,  0.4776])


In [6]:
# Third example: the same words "grass" / "green" inside a question.
sentence = Sentence('Where did you put the grass green ?')

# embed() stores a vector on each token of the sentence; they are read back below.
# (A stray bare `gensim` identifier used to follow this call. gensim is never
# imported in this notebook, so the cell raised NameError on a fresh kernel.)
bert_embedding.embed(sentence)

# now check out the embedded tokens.
for token in sentence:
    print(token)
    print(token.embedding)

Token: 1 Where
tensor([ 0.5379, -0.8622,  0.0541,  ..., -0.2395, -1.8098, -0.9114])
Token: 2 did
tensor([ 0.2666, -0.4282, -0.4665,  ...,  1.4636, -1.7452,  1.0283])
Token: 3 you
tensor([ 0.8508, -0.2540, -0.0265,  ...,  0.7896, -1.5016,  0.5311])
Token: 4 put
tensor([ 0.3347, -0.0850, -0.6118,  ...,  0.8824, -0.6401,  0.7491])
Token: 5 the
tensor([ 0.1939, -0.4221, -0.2043,  ...,  0.9627,  0.0954,  1.2726])
Token: 6 grass
tensor([ 0.0489,  0.0358,  0.0538,  ..., -0.6686, -0.9376, -0.3929])
Token: 7 green
tensor([ 0.2135, -0.1704,  0.3763,  ..., -0.4998, -0.2303,  0.5111])
Token: 8 ?
tensor([ 0.4633, -0.5424, -0.0246,  ...,  0.1583, -1.2093,  1.3458])


In [17]:
# Column layout of the data files: column 0 holds the token text,
# column 1 holds the 'arg' tag.
columns = {0: 'text', 1: 'arg'}

# Folder in which the train, test and dev files reside.
# The location can be overridden with the ARG_DATA_FOLDER environment variable
# so the notebook also runs on other machines; when the variable is unset the
# original local path is used, so existing behaviour is unchanged.
data_folder = os.environ.get(
    'ARG_DATA_FOLDER',
    '/Users/talhindi/Documents/data_wm/merged_arg/',
)

# Build a corpus from the column-formatted train / dev / test files.
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')
print(corpus)

# Collect the tag inventory for the 'arg' column; the tagger is built from it.
# NOTE(review): newer flair releases appear to replace make_tag_dictionary with
# make_label_dictionary - confirm against the installed flair version.
tag_type = 'arg'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

2020-08-19 13:30:02,113 Reading data from /Users/talhindi/Documents/data_wm/merged_arg
2020-08-19 13:30:02,114 Train: /Users/talhindi/Documents/data_wm/merged_arg/train.txt
2020-08-19 13:30:02,114 Dev: /Users/talhindi/Documents/data_wm/merged_arg/dev.txt
2020-08-19 13:30:02,115 Test: /Users/talhindi/Documents/data_wm/merged_arg/test.txt
Corpus: 1862 train + 1266 dev + 1266 test sentences
Dictionary with 7 tags: <unk>, O, O-claim, B-claim, I-claim, <START>, <STOP>


In [19]:
# 4. initialize embeddings
# The tagger uses transformer word embeddings ('bert-base-cased').
# A stacked-embeddings alternative is kept here, commented out, for reference:
#
# embedding_types = [
#     WordEmbeddings('glove'),
#     # CharacterEmbeddings(),              # character embeddings
#     # FlairEmbeddings('news-forward'),    # flair embeddings
#     # FlairEmbeddings('news-backward'),
# ]
# embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
word_embeddings = TransformerWordEmbeddings('bert-base-cased')

# 5. initialize sequence tagger
# Uses the tag dictionary and tag type produced by the corpus-loading cell,
# with a CRF layer enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=word_embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

In [None]:
# 7. start training
# The first argument is the path handed to the trainer for this run.
# NOTE(review): the directory is named 'example-pos' although the tag type
# trained here is 'arg' - confirm the intended output location.
trainer.train(
    'resources/taggers/example-pos',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=150,
)