#### Učitavanje baze i pretprocesiranje

In [19]:
import flair.datasets
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models  import SequenceTagger
from flair.trainers import ModelTrainer

# Load the Serbian Universal Dependencies corpus and keep only a 0.1
# fraction of it, so the rest of the notebook runs quickly.
corpus = flair.datasets.UD_SERBIAN().downsample(percentage=0.1)

2022-08-25 17:32:24,517 Reading data from C:\Users\vladimir\.flair\datasets\ud_serbian
2022-08-25 17:32:24,518 Train: C:\Users\vladimir\.flair\datasets\ud_serbian\sr_set-ud-train.conllu
2022-08-25 17:32:24,518 Dev: C:\Users\vladimir\.flair\datasets\ud_serbian\sr_set-ud-dev.conllu
2022-08-25 17:32:24,519 Test: C:\Users\vladimir\.flair\datasets\ud_serbian\sr_set-ud-test.conllu


In [20]:
# Report the number of Sentences held by each split of the corpus
# (train, test, dev - in that order).
for split_caption, split_data in (
    ("Training: ", corpus.train),
    ("Test: ", corpus.test),
    ("Dev: ", corpus.dev),
):
    print(split_caption, len(split_data))

Training:  333
Test:  52
Dev:  54


In [21]:
# get the second Sentence (index 1) of the test split
sentence = corpus.test[1]

# print with all annotations
print(sentence)

# print only with POS annotations (better readability)
print(sentence.to_tagged_string('pos'))

Sentence: "Odluke o sprovođenju dogovora , koji nisu potpisivani , sada treba da usvoje vlade u Beogradu i Prištini ." → ["Odluke"/odluka/NOUN/Ncfpn/nsubj/Nom/Fem/Plur, "o"/o/ADP/Sl/case/Loc, "sprovođenju"/sprovođenje/NOUN/Ncnsl/nmod/Loc/Neut/Sing, "dogovora"/dogovor/NOUN/Ncmsg/obl/Gen/Masc/Sing, ","/,/PUNCT/Z/punct, "koji"/koji/DET/Pi-mpn/nsubj/Nom/Masc/Plur/Int,Rel, "nisu"/biti/AUX/Var3p/aux/Ind/Plur/3/Neg/Pres/Fin, "potpisivani"/potpisivati/ADJ/Appmpny/acl/Nom/Def/Pos/Masc/Plur/Part/Pass, ","/,/PUNCT/Z/punct, "sada"/sada/ADV/Rgp/advmod/Pos/Dem, "treba"/trebati/VERB/Vmr3s/root/Ind/Sing/3/Pres/Fin, "da"/da/SCONJ/Cs/mark, "usvoje"/usvojiti/VERB/Vmr3p/xcomp/Ind/Plur/3/Pres/Fin, "vlade"/vlada/NOUN/Ncfpn/nsubj/Nom/Fem/Plur, "u"/u/ADP/Sl/case/Loc, "Beogradu"/Beograd/PROPN/Npmsl/nmod/Loc/Masc/Sing, "i"/i/CCONJ/Cc/cc, "Prištini"/Priština/PROPN/Npfsl/conj/Loc/Fem/Sing, "."/./PUNCT/Z/punct]
Sentence: "Odluke o sprovođenju dogovora , koji nisu potpisivani , sada treba da usvoje vlade u Beogradu

### UPOS TAGS:
- ADJ: adjective
- ADP: adposition
- ADV: adverb
- AUX: auxiliary
- CCONJ: coordinating conjunction
- DET: determiner
- INTJ: interjection
- NOUN: noun
- NUM: numeral
- PART: particle
- PRON: pronoun
- PROPN: proper noun
- PUNCT: punctuation
- SCONJ: subordinating conjunction
- SYM: symbol
- VERB: verb
- X: other

In [32]:
# Name of the annotation layer the tagger will learn (universal POS tags).
label_type = 'upos'

# Collect every value of that layer seen in the corpus into a dictionary.
label_dict = corpus.make_label_dictionary(label_type=label_type)

# Show the resulting tag inventory.
print(label_dict)

2022-08-25 17:51:07,972 Computing label dictionary. Progress:


333it [00:00, 22200.55it/s]

2022-08-25 17:51:07,990 Dictionary created for label 'upos' with 17 values: NOUN (seen 1840 times), PUNCT (seen 978 times), ADJ (seen 892 times), ADP (seen 716 times), VERB (seen 640 times), PROPN (seen 564 times), AUX (seen 466 times), DET (seen 294 times), SCONJ (seen 270 times), ADV (seen 248 times), CCONJ (seen 240 times), PRON (seen 210 times), NUM (seen 94 times), PART (seen 54 times), X (seen 37 times), SYM (seen 1 times)
Dictionary with 17 tags: <unk>, NOUN, PUNCT, ADJ, ADP, VERB, PROPN, AUX, DET, SCONJ, ADV, CCONJ, PRON, NUM, PART, X, SYM





In [34]:
# Token representations fed to the tagger: one classic word embedding
# followed by a forward and a backward Flair embedding.
# NOTE(review): 'glove' and 'news-*' look like English-trained models while
# the corpus is Serbian - confirm whether Serbian/multilingual embeddings
# were intended here.
embedding_types = [WordEmbeddings('glove')]
embedding_types += [
    FlairEmbeddings(flair_model_name)
    for flair_model_name in ('news-forward', 'news-backward')
]

# Combine the individual embeddings into a single stacked embedding.
embeddings = StackedEmbeddings(embeddings=embedding_types)

In [35]:
# Sequence tagger over the stacked embeddings that predicts the `label_type`
# layer using the tags in `label_dict`; CRF decoding is switched on.
tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type=label_type,
    hidden_size=256,
    use_crf=True,
)

2022-08-25 17:51:20,598 SequenceTagger predicts: Dictionary with 17 tags: <unk>, NOUN, PUNCT, ADJ, ADP, VERB, PROPN, AUX, DET, SCONJ, ADV, CCONJ, PRON, NUM, PART, X, SYM


In [36]:
# Bind the tagger to the corpus it will be trained and evaluated on.
trainer = ModelTrainer(model=tagger, corpus=corpus)

In [28]:
# Train the tagger and write the model files under the given output folder.
# The call is left as the last expression of the cell so that the returned
# result (scores and loss histories) is displayed.
trainer.train(
    'resources/taggers/example-upos',
    max_epochs=10,
    mini_batch_size=32,
    learning_rate=0.1,
)

2022-08-25 17:33:45,138 ----------------------------------------------------------------------------------------------------
2022-08-25 17:33:45,139 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, b

100%|██████████| 2/2 [00:08<00:00,  4.09s/it]

2022-08-25 17:34:58,449 Evaluating as a multi-label problem: False
2022-08-25 17:34:58,463 DEV : loss 2.059873580932617 - f1-score (micro avg)  0.3797
2022-08-25 17:34:58,468 BAD EPOCHS (no improvement): 0
2022-08-25 17:34:58,469 saving best model





2022-08-25 17:34:59,846 ----------------------------------------------------------------------------------------------------
2022-08-25 17:35:00,850 epoch 2 - iter 1/11 - loss 2.07947641 - samples/sec: 31.93 - lr: 0.100000
2022-08-25 17:35:01,696 epoch 2 - iter 2/11 - loss 2.00762480 - samples/sec: 37.91 - lr: 0.100000
2022-08-25 17:35:02,780 epoch 2 - iter 3/11 - loss 2.00526522 - samples/sec: 29.56 - lr: 0.100000
2022-08-25 17:35:03,918 epoch 2 - iter 4/11 - loss 1.98824665 - samples/sec: 28.14 - lr: 0.100000
2022-08-25 17:35:05,404 epoch 2 - iter 5/11 - loss 1.98712863 - samples/sec: 21.55 - lr: 0.100000
2022-08-25 17:35:06,392 epoch 2 - iter 6/11 - loss 1.97747730 - samples/sec: 32.42 - lr: 0.100000
2022-08-25 17:35:07,774 epoch 2 - iter 7/11 - loss 1.96420633 - samples/sec: 23.17 - lr: 0.100000
2022-08-25 17:35:08,910 epoch 2 - iter 8/11 - loss 1.93366170 - samples/sec: 28.18 - lr: 0.100000
2022-08-25 17:35:10,053 epoch 2 - iter 9/11 - loss 1.91093913 - samples/sec: 28.04 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.41it/s]

2022-08-25 17:35:11,924 Evaluating as a multi-label problem: False
2022-08-25 17:35:11,931 DEV : loss 1.701104760169983 - f1-score (micro avg)  0.5115





2022-08-25 17:35:11,935 BAD EPOCHS (no improvement): 0
2022-08-25 17:35:11,936 saving best model
2022-08-25 17:35:13,486 ----------------------------------------------------------------------------------------------------
2022-08-25 17:35:14,880 epoch 3 - iter 1/11 - loss 1.78581494 - samples/sec: 22.97 - lr: 0.100000
2022-08-25 17:35:15,884 epoch 3 - iter 2/11 - loss 1.75570822 - samples/sec: 31.90 - lr: 0.100000
2022-08-25 17:35:17,037 epoch 3 - iter 3/11 - loss 1.75120617 - samples/sec: 27.78 - lr: 0.100000
2022-08-25 17:35:18,108 epoch 3 - iter 4/11 - loss 1.72670427 - samples/sec: 29.91 - lr: 0.100000
2022-08-25 17:35:19,141 epoch 3 - iter 5/11 - loss 1.70093046 - samples/sec: 31.04 - lr: 0.100000
2022-08-25 17:35:20,560 epoch 3 - iter 6/11 - loss 1.70018467 - samples/sec: 22.57 - lr: 0.100000
2022-08-25 17:35:21,583 epoch 3 - iter 7/11 - loss 1.67081020 - samples/sec: 31.34 - lr: 0.100000
2022-08-25 17:35:22,633 epoch 3 - iter 8/11 - loss 1.64109133 - samples/sec: 30.50 - lr: 0.1

100%|██████████| 2/2 [00:00<00:00,  3.41it/s]

2022-08-25 17:35:25,870 Evaluating as a multi-label problem: False
2022-08-25 17:35:25,877 DEV : loss 1.4447882175445557 - f1-score (micro avg)  0.5594
2022-08-25 17:35:25,882 BAD EPOCHS (no improvement): 0
2022-08-25 17:35:25,882 saving best model





2022-08-25 17:35:27,769 ----------------------------------------------------------------------------------------------------
2022-08-25 17:35:28,604 epoch 4 - iter 1/11 - loss 1.44918366 - samples/sec: 38.42 - lr: 0.100000
2022-08-25 17:35:29,621 epoch 4 - iter 2/11 - loss 1.44525030 - samples/sec: 31.49 - lr: 0.100000
2022-08-25 17:35:30,753 epoch 4 - iter 3/11 - loss 1.42942597 - samples/sec: 28.29 - lr: 0.100000
2022-08-25 17:35:31,721 epoch 4 - iter 4/11 - loss 1.43612336 - samples/sec: 33.13 - lr: 0.100000
2022-08-25 17:35:32,805 epoch 4 - iter 5/11 - loss 1.44841316 - samples/sec: 29.55 - lr: 0.100000
2022-08-25 17:35:33,731 epoch 4 - iter 6/11 - loss 1.45823451 - samples/sec: 34.59 - lr: 0.100000
2022-08-25 17:35:34,825 epoch 4 - iter 7/11 - loss 1.45347690 - samples/sec: 29.28 - lr: 0.100000
2022-08-25 17:35:35,893 epoch 4 - iter 8/11 - loss 1.45216255 - samples/sec: 29.99 - lr: 0.100000
2022-08-25 17:35:37,315 epoch 4 - iter 9/11 - loss 1.44480848 - samples/sec: 22.52 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.45it/s]

2022-08-25 17:35:39,710 Evaluating as a multi-label problem: False
2022-08-25 17:35:39,718 DEV : loss 1.2875179052352905 - f1-score (micro avg)  0.5714
2022-08-25 17:35:39,723 BAD EPOCHS (no improvement): 0
2022-08-25 17:35:39,724 saving best model





2022-08-25 17:35:41,644 ----------------------------------------------------------------------------------------------------
2022-08-25 17:35:42,649 epoch 5 - iter 1/11 - loss 1.39832083 - samples/sec: 31.93 - lr: 0.100000
2022-08-25 17:35:43,415 epoch 5 - iter 2/11 - loss 1.35170801 - samples/sec: 41.83 - lr: 0.100000
2022-08-25 17:35:44,503 epoch 5 - iter 3/11 - loss 1.38802144 - samples/sec: 29.44 - lr: 0.100000
2022-08-25 17:35:45,457 epoch 5 - iter 4/11 - loss 1.38996994 - samples/sec: 33.58 - lr: 0.100000
2022-08-25 17:35:46,539 epoch 5 - iter 5/11 - loss 1.36977868 - samples/sec: 29.60 - lr: 0.100000
2022-08-25 17:35:47,954 epoch 5 - iter 6/11 - loss 1.35401692 - samples/sec: 22.66 - lr: 0.100000
2022-08-25 17:35:48,826 epoch 5 - iter 7/11 - loss 1.34736567 - samples/sec: 36.70 - lr: 0.100000
2022-08-25 17:35:50,129 epoch 5 - iter 8/11 - loss 1.34620188 - samples/sec: 24.58 - lr: 0.100000
2022-08-25 17:35:51,154 epoch 5 - iter 9/11 - loss 1.33669574 - samples/sec: 31.25 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.38it/s]

2022-08-25 17:35:53,351 Evaluating as a multi-label problem: False
2022-08-25 17:35:53,362 DEV : loss 1.2213255167007446 - f1-score (micro avg)  0.577
2022-08-25 17:35:53,366 BAD EPOCHS (no improvement): 0
2022-08-25 17:35:53,367 saving best model





2022-08-25 17:35:55,334 ----------------------------------------------------------------------------------------------------
2022-08-25 17:35:56,441 epoch 6 - iter 1/11 - loss 1.27553634 - samples/sec: 28.93 - lr: 0.100000
2022-08-25 17:35:57,587 epoch 6 - iter 2/11 - loss 1.26016208 - samples/sec: 27.95 - lr: 0.100000
2022-08-25 17:35:58,933 epoch 6 - iter 3/11 - loss 1.27633540 - samples/sec: 23.79 - lr: 0.100000
2022-08-25 17:35:59,786 epoch 6 - iter 4/11 - loss 1.26721140 - samples/sec: 37.56 - lr: 0.100000
2022-08-25 17:36:00,761 epoch 6 - iter 5/11 - loss 1.24673765 - samples/sec: 32.85 - lr: 0.100000
2022-08-25 17:36:01,734 epoch 6 - iter 6/11 - loss 1.24251908 - samples/sec: 32.94 - lr: 0.100000
2022-08-25 17:36:02,801 epoch 6 - iter 7/11 - loss 1.24566535 - samples/sec: 29.99 - lr: 0.100000
2022-08-25 17:36:04,197 epoch 6 - iter 8/11 - loss 1.24065509 - samples/sec: 22.94 - lr: 0.100000
2022-08-25 17:36:05,162 epoch 6 - iter 9/11 - loss 1.24201442 - samples/sec: 33.19 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.40it/s]

2022-08-25 17:36:07,143 Evaluating as a multi-label problem: False
2022-08-25 17:36:07,151 DEV : loss 1.101011037826538 - f1-score (micro avg)  0.6479
2022-08-25 17:36:07,156 BAD EPOCHS (no improvement): 0
2022-08-25 17:36:07,156 saving best model





2022-08-25 17:36:08,756 ----------------------------------------------------------------------------------------------------
2022-08-25 17:36:09,669 epoch 7 - iter 1/11 - loss 1.22383970 - samples/sec: 35.09 - lr: 0.100000
2022-08-25 17:36:10,790 epoch 7 - iter 2/11 - loss 1.20190502 - samples/sec: 28.60 - lr: 0.100000
2022-08-25 17:36:11,765 epoch 7 - iter 3/11 - loss 1.16625123 - samples/sec: 32.89 - lr: 0.100000
2022-08-25 17:36:12,741 epoch 7 - iter 4/11 - loss 1.17622115 - samples/sec: 32.82 - lr: 0.100000
2022-08-25 17:36:14,111 epoch 7 - iter 5/11 - loss 1.16031711 - samples/sec: 23.39 - lr: 0.100000
2022-08-25 17:36:15,168 epoch 7 - iter 6/11 - loss 1.15162040 - samples/sec: 30.30 - lr: 0.100000
2022-08-25 17:36:16,514 epoch 7 - iter 7/11 - loss 1.14889418 - samples/sec: 23.79 - lr: 0.100000
2022-08-25 17:36:17,523 epoch 7 - iter 8/11 - loss 1.15475690 - samples/sec: 31.78 - lr: 0.100000
2022-08-25 17:36:18,409 epoch 7 - iter 9/11 - loss 1.15803508 - samples/sec: 36.12 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.47it/s]

2022-08-25 17:36:20,605 Evaluating as a multi-label problem: False
2022-08-25 17:36:20,613 DEV : loss 1.0925477743148804 - f1-score (micro avg)  0.6249
2022-08-25 17:36:20,617 BAD EPOCHS (no improvement): 1
2022-08-25 17:36:20,618 ----------------------------------------------------------------------------------------------------





2022-08-25 17:36:21,556 epoch 8 - iter 1/11 - loss 1.15353491 - samples/sec: 34.15 - lr: 0.100000
2022-08-25 17:36:22,559 epoch 8 - iter 2/11 - loss 1.15946730 - samples/sec: 31.97 - lr: 0.100000
2022-08-25 17:36:23,569 epoch 8 - iter 3/11 - loss 1.13588196 - samples/sec: 31.71 - lr: 0.100000
2022-08-25 17:36:24,695 epoch 8 - iter 4/11 - loss 1.14016710 - samples/sec: 28.47 - lr: 0.100000
2022-08-25 17:36:25,888 epoch 8 - iter 5/11 - loss 1.11859773 - samples/sec: 26.85 - lr: 0.100000
2022-08-25 17:36:27,283 epoch 8 - iter 6/11 - loss 1.11985962 - samples/sec: 22.96 - lr: 0.100000
2022-08-25 17:36:28,198 epoch 8 - iter 7/11 - loss 1.11038772 - samples/sec: 35.01 - lr: 0.100000
2022-08-25 17:36:29,133 epoch 8 - iter 8/11 - loss 1.09953967 - samples/sec: 34.26 - lr: 0.100000
2022-08-25 17:36:30,526 epoch 8 - iter 9/11 - loss 1.09482268 - samples/sec: 22.99 - lr: 0.100000
2022-08-25 17:36:31,578 epoch 8 - iter 10/11 - loss 1.10337527 - samples/sec: 30.45 - lr: 0.100000
2022-08-25 17:36:31

100%|██████████| 2/2 [00:00<00:00,  3.33it/s]

2022-08-25 17:36:32,576 Evaluating as a multi-label problem: False
2022-08-25 17:36:32,585 DEV : loss 0.9754950404167175 - f1-score (micro avg)  0.6793
2022-08-25 17:36:32,589 BAD EPOCHS (no improvement): 0
2022-08-25 17:36:32,590 saving best model





2022-08-25 17:36:34,252 ----------------------------------------------------------------------------------------------------
2022-08-25 17:36:35,341 epoch 9 - iter 1/11 - loss 1.04575430 - samples/sec: 29.41 - lr: 0.100000
2022-08-25 17:36:36,379 epoch 9 - iter 2/11 - loss 1.04986765 - samples/sec: 30.86 - lr: 0.100000
2022-08-25 17:36:37,433 epoch 9 - iter 3/11 - loss 1.04833873 - samples/sec: 30.41 - lr: 0.100000
2022-08-25 17:36:38,838 epoch 9 - iter 4/11 - loss 1.05044605 - samples/sec: 22.79 - lr: 0.100000
2022-08-25 17:36:40,197 epoch 9 - iter 5/11 - loss 1.06204714 - samples/sec: 23.58 - lr: 0.100000
2022-08-25 17:36:41,299 epoch 9 - iter 6/11 - loss 1.05337210 - samples/sec: 29.05 - lr: 0.100000
2022-08-25 17:36:42,139 epoch 9 - iter 7/11 - loss 1.05396866 - samples/sec: 38.14 - lr: 0.100000
2022-08-25 17:36:43,067 epoch 9 - iter 8/11 - loss 1.05429730 - samples/sec: 34.52 - lr: 0.100000
2022-08-25 17:36:44,026 epoch 9 - iter 9/11 - loss 1.05487380 - samples/sec: 33.40 - lr: 0.

100%|██████████| 2/2 [00:00<00:00,  3.54it/s]

2022-08-25 17:36:46,099 Evaluating as a multi-label problem: False
2022-08-25 17:36:46,108 DEV : loss 0.9628021717071533 - f1-score (micro avg)  0.6737
2022-08-25 17:36:46,113 BAD EPOCHS (no improvement): 1
2022-08-25 17:36:46,113 ----------------------------------------------------------------------------------------------------





2022-08-25 17:36:47,563 epoch 10 - iter 1/11 - loss 0.99082503 - samples/sec: 22.08 - lr: 0.100000
2022-08-25 17:36:48,559 epoch 10 - iter 2/11 - loss 0.98960826 - samples/sec: 32.16 - lr: 0.100000
2022-08-25 17:36:49,584 epoch 10 - iter 3/11 - loss 0.98159208 - samples/sec: 31.28 - lr: 0.100000
2022-08-25 17:36:50,445 epoch 10 - iter 4/11 - loss 1.01782635 - samples/sec: 37.21 - lr: 0.100000
2022-08-25 17:36:51,431 epoch 10 - iter 5/11 - loss 1.02345868 - samples/sec: 32.52 - lr: 0.100000
2022-08-25 17:36:52,770 epoch 10 - iter 6/11 - loss 1.02633066 - samples/sec: 23.92 - lr: 0.100000
2022-08-25 17:36:53,649 epoch 10 - iter 7/11 - loss 1.01899155 - samples/sec: 36.45 - lr: 0.100000
2022-08-25 17:36:54,724 epoch 10 - iter 8/11 - loss 1.01655217 - samples/sec: 29.79 - lr: 0.100000
2022-08-25 17:36:55,702 epoch 10 - iter 9/11 - loss 1.01030259 - samples/sec: 32.79 - lr: 0.100000
2022-08-25 17:36:56,644 epoch 10 - iter 10/11 - loss 1.01339764 - samples/sec: 34.00 - lr: 0.100000
2022-08-2

100%|██████████| 2/2 [00:00<00:00,  3.50it/s]

2022-08-25 17:36:57,782 Evaluating as a multi-label problem: False
2022-08-25 17:36:57,790 DEV : loss 0.921704888343811 - f1-score (micro avg)  0.6885
2022-08-25 17:36:57,794 BAD EPOCHS (no improvement): 0
2022-08-25 17:36:57,795 saving best model





2022-08-25 17:37:00,605 ----------------------------------------------------------------------------------------------------
2022-08-25 17:37:00,606 loading file resources\taggers\example-upos\best-model.pt
2022-08-25 17:37:01,161 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PUNCT, ADJ, ADP, VERB, PROPN, AUX, DET, SCONJ, ADV, CCONJ, PRON, NUM, PART, X, SYM, <START>, <STOP>


100%|██████████| 2/2 [00:07<00:00,  3.77s/it]

2022-08-25 17:37:08,971 Evaluating as a multi-label problem: False
2022-08-25 17:37:08,979 0.6839	0.6839	0.6839	0.6839
2022-08-25 17:37:08,979 
Results:
- F-score (micro) 0.6839
- F-score (macro) 0.5392
- Accuracy 0.6839

By class:
              precision    recall  f1-score   support

        NOUN     0.6679    0.5497    0.6031       322
        VERB     0.3131    0.6526    0.4232        95
         ADJ     0.4656    0.4485    0.4569       136
       PUNCT     0.9701    0.9924    0.9811       131
         ADP     0.9652    0.8880    0.9250       125
       PROPN     0.6899    0.8990    0.7807        99
         AUX     0.8939    0.8676    0.8806        68
       CCONJ     0.9556    0.9149    0.9348        47
       SCONJ     0.9756    0.9091    0.9412        44
        PRON     0.6111    0.8800    0.7213        25
         DET     0.6471    0.3333    0.4400        33
         ADV     0.0000    0.0000    0.0000        35
         NUM     0.0000    0.0000    0.0000         8
           




{'test_score': 0.6839422259983008,
 'dev_score_history': [0.3797235023041475,
  0.511520737327189,
  0.5594470046082949,
  0.5714285714285714,
  0.5769585253456221,
  0.647926267281106,
  0.6248847926267281,
  0.6792626728110599,
  0.6737327188940092,
  0.688479262672811],
 'train_loss_history': [2.527608566041463,
  1.9090732562327055,
  1.6193697010092871,
  1.4314656909930492,
  1.3297093412656167,
  1.2217663049950707,
  1.1453577165006958,
  1.102639369216223,
  1.060815141663698,
  1.0107488055750016],
 'dev_loss_history': [2.059873580932617,
  1.701104760169983,
  1.4447882175445557,
  1.2875179052352905,
  1.2213255167007446,
  1.101011037826538,
  1.0925477743148804,
  0.9754950404167175,
  0.9628021717071533,
  0.921704888343811]}

In [43]:
# Load the tagger that training saved as the final model.
model = SequenceTagger.load('resources/taggers/example-upos/final-model.pt')

# Example sentence to tag.
# TODO: confusion matrix for the predictions
sentence = Sentence('Ramkovski , bivši vlasnik televizijske stanice A1 , četiri novine i drugih preduzeća , osuđen je u krivičnom sudu u Skoplju 14. marta na 13 godina zatvora , zbog pranja novca , kriminalne zavere , zloupotrebe položaja i utaje poreza ; 19 saučesnika dobilo je zatvorske kazne u trajanju dve do sedam godina .')

# Run the tagger; the predictions are attached to `sentence` itself.
model.predict(sentence)

# Whole sentence with its predicted tags on one line.
print(sentence.to_tagged_string())

# One "form : tag" line per predicted 'upos' label of every token.
for token in sentence:
    for tag in token.get_labels('upos'):
        print(tag.data_point.form, ":", tag.value)

2022-08-25 18:28:19,593 loading file resources/taggers/example-upos/final-model.pt
2022-08-25 18:28:20,121 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PUNCT, ADJ, ADP, VERB, PROPN, AUX, DET, SCONJ, ADV, CCONJ, PRON, NUM, PART, X, SYM, <START>, <STOP>
Sentence: "Ramkovski , bivši vlasnik televizijske stanice A1 , četiri novine i drugih preduzeća , osuđen je u krivičnom sudu u Skoplju 14 . marta na 13 godina zatvora , zbog pranja novca , kriminalne zavere , zloupotrebe položaja i utaje poreza ; 19 saučesnika dobilo je zatvorske kazne u trajanju dve do sedam godina ." → ["Ramkovski"/PROPN, ","/PUNCT, "bivši"/VERB, "vlasnik"/VERB, "televizijske"/ADJ, "stanice"/NOUN, "A1"/PROPN, ","/PUNCT, "četiri"/ADJ, "novine"/NOUN, "i"/CCONJ, "drugih"/ADJ, "preduzeća"/NOUN, ","/PUNCT, "osuđen"/VERB, "je"/AUX, "u"/ADP, "krivičnom"/ADJ, "sudu"/NOUN, "u"/ADP, "Skoplju"/NOUN, "14"/NUM, "."/PUNCT, "marta"/VERB, "na"/ADP, "13"/NUM, "godina"/ADJ, "zatvora"/NOUN, ","/PUNCT, "zbog"/VERB, "pranj

In [None]:
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

# Token-level confusion matrix of gold vs. predicted UPOS tags on the dev split.
#
# Fixes over the previous version of this cell:
#   * the inner loop walked the stale global `sentence` instead of the
#     sentences of `corpus.dev`;
#   * `actual` / `predicted` were never filled, so the matrix was empty;
#   * the model was never run on the dev sentences;
#   * the display labels were [False, True] although this is a multi-class
#     problem.

# Predictions are stored under their own label name so that the gold 'upos'
# annotations of the corpus are not overwritten.
predicted_label_type = 'predicted'

# Materialise the split once so that exactly the same Sentence objects are
# used for prediction and for reading the labels back.
dev_sentences = list(corpus.dev)
model.predict(dev_sentences, label_name=predicted_label_type)

# Gold tag of every token, in corpus order.
actual = np.array([
    token.get_label('upos').value
    for dev_sentence in dev_sentences
    for token in dev_sentence
])

# Predicted tag of every token, aligned one-to-one with `actual`.
predicted = np.array([
    token.get_label(predicted_label_type).value
    for dev_sentence in dev_sentences
    for token in dev_sentence
])

# Every tag that occurs in either array, in a fixed (sorted) order, so the
# rows/columns of the matrix and the axis tick labels agree.
tag_names = sorted(set(actual) | set(predicted))

confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=tag_names)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=tag_names)

# A larger canvas and rotated tick labels keep the tag names readable.
fig, ax = plt.subplots(figsize=(10, 10))
cm_display.plot(ax=ax, xticks_rotation='vertical')
ax.set_title('UPOS confusion matrix (dev split)')
plt.show()