# Treniranje jezičkog modela

In [29]:
import collections
import glob
import os
import pickle

import flair.datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from flair.data import Sentence, Dictionary
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger, LanguageModel
from flair.trainers import ModelTrainer, LanguageModelTrainer, TextCorpus
from sklearn import metrics

### Generisanje rečnika

In [10]:
# Frequency table of every character seen in the corpus files.
counter = collections.Counter()
# Flair dictionary that later receives the characters collected in `counter`.
char_dictionary: Dictionary = Dictionary()

In [11]:
# Collect every file of corpus3 (test, valid and the train splits) for the character count.
# The recursive glob also matches directories; they are skipped with os.path.isfile
# rather than by removing a hard-coded, Windows-style 'corpus/corpus3\\train' entry,
# which raises ValueError wherever the path separator is '/' or the entry is missing.
files = [path
         for path in glob.glob('corpus/corpus3/**/*', recursive=True)
         if os.path.isfile(path)]
print(files)

['corpus/corpus3\\test.txt', 'corpus/corpus3\\valid.txt', 'corpus/corpus3\\train\\train_split_1', 'corpus/corpus3\\train\\train_split_2']


In [12]:
# Count how often each character occurs across all corpus files.
processed = 0  # running total of lines read from all files

for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        tokens = 0  # number of characters seen in the current file

        for line in f:
            # A line is split into its individual characters (newline included).
            chars = list(line)
            counter.update(chars)

            tokens += len(chars)
            processed += 1


In [13]:
# Show the distinct characters collected from the corpus files.
counter.keys()

dict_keys(['1', '9', '8', '4', 'K', 'a', 'o', ' ', 'i', 'b', 'č', 'n', ',', 'l', 'c', 'e', 'E', 'm', 'u', 'G', 'd', 'š', 't', 'j', 'N', 'r', 'g', 'p', '-', 's', 'k', '.', 'T', 'z', 'S', 'ž', 'h', 'đ', 'v', '(', ')', 'ć', 'P', 'V', 'B', 'D', 'O', ':', '–', 'f', '\n', '2', '0', 'I', 'M', 'A', 'U', 'L', 'Č', '’', '„', '!', '“', 'R', ';', 'J', 'Ž', 'Š', '3', 'Z', 'Ć', '5', '6', '7', '?', '/', 'H', '*', 'w', 'y', 'X', 'Y', 'ñ', '\xad', 'F', '=', 'â', '|', '`', '~'])

In [14]:
# Total number of characters counted across all corpus files.
total_count = sum(counter.values())

In [15]:
# Report the totals gathered while scanning the corpus files.
print("Total number of characters:",total_count)
print("Total number of processed lines:", processed)

Total number of characters: 52714
Total number of processed lines: 39


In [16]:
# Reference character sets used to check which expected symbols occur in the corpus.
# Latin letters in lower and upper case; the digraphs (dž, lj, nj) contribute no
# additional characters because a set keeps each character once.
latin = set("abcčćddžđefghijklljmnnjoprsštuvzžABCČĆDDŽĐEFGHIJKLLJMNNJOPRSŠTUVZŽ")
numbers = set("0123456789")
# The backslash is escaped explicitly: a bare "\/" is an invalid escape sequence that
# only yields backslash + slash by accident and emits a warning on recent Python versions.
punct = set(".?!,:;\"'\\/()[]{}_+-*^%#<>|&`~")

In [17]:
# Does the corpus contain every character of the reference Latin set?
latin <= set(counter)

False

In [18]:
# Does the corpus contain every digit?
numbers <= set(counter)

True

In [19]:
# Does the corpus contain every punctuation symbol of the reference set?
punct <= set(counter)

False

In [20]:
# Register every character in the Flair dictionary, most frequent first, and print a
# table: rank, character, count, cumulative count, cumulative share of all characters.
idx = 0
summ = 0
for idx, (letter, count) in enumerate(counter.most_common(), start=1):
    char_dictionary.add_item(letter)

    summ += count
    percentile = summ / total_count
    print('%d\t%s\t%7d\t%7d\t%f' % (idx, letter, count, summ, percentile))

1	 	   9587	   9587	0.181868
2	a	   4680	  14267	0.270649
3	o	   4178	  18445	0.349907
4	e	   3831	  22276	0.422582
5	i	   3653	  25929	0.491881
6	n	   2613	  28542	0.541450
7	t	   2028	  30570	0.579922
8	s	   1929	  32499	0.616516
9	r	   1847	  34346	0.651554
10	j	   1725	  36071	0.684277
11	u	   1647	  37718	0.715521
12	d	   1574	  39292	0.745381
13	l	   1458	  40750	0.773039
14	m	   1442	  42192	0.800395
15	v	   1382	  43574	0.826612
16	k	   1332	  44906	0.851880
17	p	   1111	  46017	0.872956
18	g	    702	  46719	0.886273
19	,	    662	  47381	0.898831
20	b	    619	  48000	0.910574
21	z	    619	  48619	0.922317
22	.	    416	  49035	0.930208
23	č	    392	  49427	0.937645
24	š	    389	  49816	0.945024
25	ž	    272	  50088	0.950184
26	c	    269	  50357	0.955287
27	h	    218	  50575	0.959423
28	ć	    200	  50775	0.963217
29	-	    117	  50892	0.965436
30	đ	     83	  50975	0.967011
31	A	     83	  51058	0.968585
32	O	     79	  51137	0.970084
33	f	     75	  51212	0.971507
34	–	     74	  5128

In [21]:
# Persist both directions of the character <-> index mapping; the same file is
# read back later with Dictionary.load_from_file.
mappings = {
    'idx2item': char_dictionary.idx2item,
    'item2idx': char_dictionary.item2idx,
}
with open('resources/char_mappings/latin_dict', 'wb') as f:
    pickle.dump(mappings, f)

### Treniranje embedding-a

In [26]:
# Train a forward, character-level language model on corpus3.
is_forward_lm = True

# Character dictionary saved in the dictionary-generation section.
dictionary: Dictionary = Dictionary.load_from_file('resources/char_mappings/latin_dict')

# The corpus is processed in the same direction as the model, character by character.
corpus = TextCorpus('corpus/corpus3', dictionary, is_forward_lm, character_level=True)

# Behind LanguageModel is a recurrent neural network from the PyTorch library
# (LSTM or GRU); hidden size and number of layers are set here.
language_model = LanguageModel(dictionary, is_forward_lm, hidden_size=128, nlayers=1)

trainer = LanguageModelTrainer(language_model, corpus)
trainer.train(
    'resources/taggers/language_model',
    sequence_length=50,
    mini_batch_size=100,
    max_epochs=128,
)

2022-08-29 23:15:44,135 read text file with 8 lines
2022-08-29 23:15:44,138 read text file with 8 lines
2022-08-29 23:15:46,166 Sequence length is 50
2022-08-29 23:15:46,167 Split 1	 - (23:15:46)
2022-08-29 23:15:46,569 best split so far
2022-08-29 23:15:46,570 best loss so far 3.65703276
2022-08-29 23:15:46,804 ('\nuiđo2d K  Žo–  =    č |Z 1XieBO  . t2K 5âćJm 3oa    “ o;s sAFioT-  ? Kev4   faoo.-  ( i3o i6)      8 o â XoČ / M`o:“ –e - K .– ’oŽcsub    a  K /o PL oŠY  j E  b 8,    `  aI : X–8 2 Un i og~pez0  – d<unk>o Dsmoauđi  M e,3 a eH  a G   s uV (i;I   u(!  s– d    )  96  =Ž5šo wooog mn)   j  af   h u U cktIi o v’a91u?–s`s e m AS „ =ee.O(  s0  mosL)U |  foD rjF5b 6u=  w2l oŽĆf YT T m  š~  Ss m   GdjF o\n   ;o Č bm?  at. V šPč   l a <unk>,bo z Fâ  wnw ču Ž   8 l„ FUoj    s `d9g     7aZ*b4 a š  1,o vhjvHA;   z?Y f4invu3o; 5o  `AñMy3o r8,A ni   nŽjco\xad–      VćIžos;a ;  YBMgot|Žofm)Mkka  ñn sĆ  ivm!X 2I U:G  U Ga  a\xad UNLcyĆ oetzn t   ––vdgši(ŽmŠcI.n yu APE  gu)j o R     p e â  en

In [27]:
# Sanity check: embed a sample sentence with the language model trained above.
char_lm_embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')

sentence = Sentence('Ja volim Beograd.')
char_lm_embeddings.embed(sentence)

# Print every token followed by its embedding vector.
for token in sentence:
    print(token)
    print(token.embedding)

Token[0]: "Ja"
tensor([ 4.4672e-04,  2.0592e-02,  1.1349e-02,  1.0647e-02,  5.3616e-02,
        -1.5402e-02,  4.2784e-02,  4.1853e-02, -2.9934e-01,  6.1621e-02,
        -3.0780e-02,  6.2406e-02,  5.0246e-01,  8.2877e-02, -3.2485e-02,
         1.9455e-02, -2.3220e-02,  4.0551e-02, -5.4064e-02, -4.5243e-01,
        -7.0002e-02, -3.7112e-01,  8.0851e-02,  1.1083e-01,  1.1769e-01,
        -3.0781e-02,  1.0171e-02,  2.3354e-02,  8.9468e-02,  4.9446e-02,
        -1.5761e-01, -4.5992e-01,  2.0249e-02, -6.4030e-02,  6.0617e-02,
        -4.7748e-02, -4.4009e-02, -2.8610e-02, -4.6191e-03, -2.0838e-02,
         2.6921e-02,  1.0086e-01, -4.7114e-03, -1.2954e-01,  6.1421e-02,
         5.0519e-02, -2.1571e-02,  2.0560e-03, -1.1609e-04,  1.2283e-02,
         5.2555e-01, -1.8959e-01,  2.5811e-01, -8.6976e-02,  8.3163e-02,
        -2.6978e-02, -2.5024e-01,  1.8236e-02,  6.3224e-01,  5.8730e-01,
         1.5536e-01, -1.5281e-01, -1.5044e-01,  1.0005e-01,  4.5478e-02,
        -9.3623e-02, -5.2791e-01, -1

### Podela postojećeg korpusa na dva dela
Koristili smo kao inicijalni korpus UD_SERBIAN, koji je podeljen po svojoj strukturi na train, test i dev skupove.
Treniranje SequenceTagger koristi validation skup korpusa, pa je bilo potrebno da dev skup podelimo na 2 dela:
1. Jedan će koristiti SequenceTagger za internu validaciju
2. Drugi ćemo mi koristiti za validaciju hiperparametara po kojima optimizujemo postojeći model

Sličnu stvar smo uradili i sa test skupom, iz istog razloga.

In [34]:
# corpus1 is the corpus handed to the SequenceTagger trainer; it uses the
# 'dev2' / 'test2' files for dev and test.
data_folder = 'corpus/corpus1'

# Column index -> annotation name for the column-formatted files.
columns = {0: 'id', 1: 'text', 2: 'ner', 3: 'upos'}

corpus1 = ColumnCorpus(
    data_folder,
    columns,
    train_file='sr_set-ud-train.conllu',
    dev_file='sr_set-ud-dev2.conllu',
    test_file='sr_set-ud-test2.conllu',
)


2022-08-29 23:32:40,746 Reading data from corpus\corpus1
2022-08-29 23:32:40,747 Train: corpus\corpus1\sr_set-ud-train.conllu
2022-08-29 23:32:40,747 Dev: corpus\corpus1\sr_set-ud-dev2.conllu
2022-08-29 23:32:40,747 Test: corpus\corpus1\sr_set-ud-test2.conllu


In [35]:
# corpus2 supplies the dev split used for our own evaluation; it uses the
# 'dev' / 'test' files for dev and test.
data_folder = 'corpus/corpus2'

# Column index -> annotation name for the column-formatted files.
columns = {0: 'id', 1: 'text', 2: 'ner', 3: 'upos'}

corpus2 = ColumnCorpus(
    data_folder,
    columns,
    train_file='sr_set-ud-train.conllu',
    dev_file='sr_set-ud-dev.conllu',
    test_file='sr_set-ud-test.conllu',
)

2022-08-29 23:32:50,700 Reading data from corpus\corpus2
2022-08-29 23:32:50,701 Train: corpus\corpus2\sr_set-ud-train.conllu
2022-08-29 23:32:50,701 Dev: corpus\corpus2\sr_set-ud-dev.conllu
2022-08-29 23:32:50,702 Test: corpus\corpus2\sr_set-ud-test.conllu


In [36]:
# Label layer the tagger predicts, and the dictionary of its values built from corpus1.
label_type = 'upos'
label_dict = corpus1.make_label_dictionary(label_type=label_type)

2022-08-29 23:32:55,261 Computing label dictionary. Progress:


3328it [00:00, 29714.73it/s]

2022-08-29 23:32:55,375 Dictionary created for label 'upos' with 18 values: NOUN (seen 18103 times), PUNCT (seen 9351 times), ADJ (seen 8835 times), ADP (seen 7130 times), VERB (seen 6406 times), PROPN (seen 5622 times), AUX (seen 4667 times), DET (seen 2848 times), SCONJ (seen 2713 times), ADV (seen 2543 times), CCONJ (seen 2541 times), PRON (seen 1859 times), NUM (seen 944 times), PART (seen 461 times), X (seen 232 times), INTJ (seen 3 times), SYM (seen 1 times)





### Primena istreniranog embedding-a
Iskoristićemo vektorske reprezentacije tokena iz embedding modela koji smo trenirali da bismo napravili model za predikciju UPOS tagova. Uvezali smo character embedding model koji smo istrenirali sa već postojećim word embedding modelom (koji funkcioniše na nivou reči) i tu strukturu prosledili modelu za predikciju UPOS tagova.

In [37]:
# Stack the GloVe word embeddings with the character language model trained above.
glove_embeddings = WordEmbeddings('glove')
char_lm = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')

embedding_types = [glove_embeddings, char_lm]
embeddings = StackedEmbeddings(embeddings=embedding_types)

In [38]:
# UPOS sequence tagger on top of the stacked embeddings, with a CRF layer enabled.
tagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict,
    use_crf=True,
)

# The tagger is trained (and internally validated) on corpus1.
trainer = ModelTrainer(tagger, corpus1)

2022-08-29 23:38:53,302 SequenceTagger predicts: Dictionary with 18 tags: <unk>, NOUN, PUNCT, ADJ, ADP, VERB, PROPN, AUX, DET, SCONJ, ADV, CCONJ, PRON, NUM, PART, X, INTJ, SYM


In [40]:
# Best validation accuracy seen so far and the hyperparameters that produced it.
# The mini-batch key is 'param_mini_batch_sizes' because that is the key the grid-search
# cell writes and the final training cell reads; initialising a different key would
# leave it missing (KeyError) whenever no run improves on `best_score`.
best_score = 0.0
best_params = {'learning_rate': 0, 'param_mini_batch_sizes': 0, 'max_epochs': 10}

# Hyperparameter grid: 4 learning rates x 5 mini-batch sizes x 3 epoch counts.
param_learning_rates = np.linspace(0.001, 0.6, num=4)
param_mini_batch_sizes = np.arange(10, 100, 20)
max_epochs = np.arange(10, 13, 1)

# One entry per trained configuration, appended by the grid-search cell.
model_history = []

In [None]:
# Grid search over learning rate, mini-batch size and number of epochs.
# Every configuration is trained, evaluated on corpus2.dev and recorded in model_history;
# the best-scoring configuration is kept in best_params / best_score.
# NOTE(review): the same `trainer` (and hence the same tagger object) is reused for every
# configuration, so each run presumably continues from the previous weights instead of
# starting fresh — confirm whether that is intended.
output_dir = 'resources/taggers/language_model_testing'
model_number = 0  # sequential, unique number of the configuration being trained

for lr in param_learning_rates:
    for mbs in param_mini_batch_sizes:
        for me in max_epochs:
            model_number += 1
            print('\n\n\n')
            print("#######################################################")
            print(f"################ MODEL NUMBER {model_number} #######################")
            print("#######################################################")
            trainer.train(output_dir,
                          learning_rate=lr,
                          mini_batch_size=int(mbs),
                          max_epochs=int(me),
                          write_weights=True)

            # Evaluate the model that was just written to output_dir, not a model
            # stored in some other directory by an earlier experiment.
            model = SequenceTagger.load(output_dir + '/final-model.pt')

            actual_tags = []
            predicted_tags = []

            # Validate on the other "half" of the validation set
            # (the one that was not passed to the trainer).
            for actualSentence in corpus2.dev:
                actual_tags.extend(token.get_label('upos').value
                                   for token in actualSentence)

                # Rebuild the sentence from the gold tokens so the tokenization matches.
                predictedSentence = Sentence([token.text for token in actualSentence.tokens])
                model.predict(predictedSentence)
                # Exactly one value per token keeps both sequences the same length.
                predicted_tags.extend(token.get_label('upos').value
                                      for token in predictedSentence)

            actual = np.array(actual_tags)
            predicted = np.array(predicted_tags)
            score = metrics.accuracy_score(actual, predicted)

            if score > best_score:
                # Remember the new best score so later, worse runs cannot overwrite it.
                best_score = score
                best_params['learning_rate'] = lr
                best_params['param_mini_batch_sizes'] = mbs
                best_params['max_epochs'] = me

            report = metrics.classification_report(actual, predicted)
            params = {'learning_rate': lr, 'param_mini_batch_sizes': mbs,
                      'max_epochs': me}
            model_history.append({'params': params, 'report': report, 'score': score})





#######################################################
################ MODEL NUMBER 1 #######################
#######################################################
2022-08-29 23:47:51,275 ----------------------------------------------------------------------------------------------------
2022-08-29 23:47:51,275 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.1, inplace=False)
        (encoder): Embedding(91, 100)
        (rnn): LSTM(100, 128)
        (decoder): Linear(in_features=128, out_features=91, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=228, out_features=228, bias=True)
  (rnn): LSTM(228, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512,

100%|██████████| 28/28 [00:01<00:00, 16.74it/s]

2022-08-29 23:48:42,424 Evaluating as a multi-label problem: False
2022-08-29 23:48:42,452 DEV : loss 2.808323383331299 - f1-score (micro avg)  0.1296
2022-08-29 23:48:42,474 BAD EPOCHS (no improvement): 0
2022-08-29 23:48:42,475 saving best model





2022-08-29 23:48:43,316 ----------------------------------------------------------------------------------------------------
2022-08-29 23:48:47,059 epoch 2 - iter 33/333 - loss 2.80339131 - samples/sec: 88.23 - lr: 0.001000
2022-08-29 23:48:51,056 epoch 2 - iter 66/333 - loss 2.76932634 - samples/sec: 83.73 - lr: 0.001000
2022-08-29 23:48:55,472 epoch 2 - iter 99/333 - loss 2.74628848 - samples/sec: 75.84 - lr: 0.001000
2022-08-29 23:48:59,286 epoch 2 - iter 132/333 - loss 2.72454003 - samples/sec: 87.98 - lr: 0.001000
2022-08-29 23:49:03,171 epoch 2 - iter 165/333 - loss 2.70512670 - samples/sec: 86.34 - lr: 0.001000
2022-08-29 23:49:07,252 epoch 2 - iter 198/333 - loss 2.68371348 - samples/sec: 82.09 - lr: 0.001000
2022-08-29 23:49:11,201 epoch 2 - iter 231/333 - loss 2.66860798 - samples/sec: 84.88 - lr: 0.001000
2022-08-29 23:49:15,423 epoch 2 - iter 264/333 - loss 2.65598594 - samples/sec: 79.31 - lr: 0.001000


In [39]:
# Final training run with the best hyperparameters found by the grid search.
trainer.train(
    'resources/taggers/language_model_testing',
    learning_rate=best_params['learning_rate'],
    mini_batch_size=int(best_params['param_mini_batch_sizes']),
    max_epochs=int(best_params['max_epochs']),
    write_weights=True,
)

NameError: name 'best_params' is not defined

In [None]:
# Load the final tagger written to the 'language_model_testing' training directory.
model = SequenceTagger.load('resources/taggers/language_model_testing/final-model.pt')

In [None]:
# Compare predicted and gold UPOS tags on one example sentence of corpus2.dev.
# TODO: confusion matrix for the prediction.
actualSentenceExample = corpus2.dev[17]

# Build the sentence from the gold tokens instead of the plain string: re-tokenizing
# the plain text can split tokens differently, which would pair predictions with the
# wrong gold tokens or run past the end of the gold sentence.
predictedSentenceExample = Sentence([token.text for token in actualSentenceExample.tokens])
model.predict(predictedSentenceExample)
print("\n\n")

# Print every token with its predicted and its gold tag.
for predictedToken, actualToken in zip(predictedSentenceExample, actualSentenceExample):
    predictedValue = predictedToken.get_label('upos').value
    actualValue = actualToken.get_label('upos').value

    print(predictedToken.text, "- predicted:", predictedValue, ", actual:", actualValue)

### Konstrukcija matrice konfuzije

In [None]:
# Collect gold and predicted UPOS tags for every token of corpus2.dev.
actual_tags = []
predicted_tags = []

for actualSentence in corpus2.dev:
    # Predict on the gold tokenization. Re-tokenizing the plain string can yield a
    # different token sequence, which previously needed an index-shifting workaround
    # for '.' tokens and could still pair tags of different tokens.
    predictedSentence = Sentence([token.text for token in actualSentence.tokens])
    model.predict(predictedSentence)

    for actualToken, predictedToken in zip(actualSentence, predictedSentence):
        actual_tags.append(actualToken.get_label('upos').value)
        predicted_tags.append(predictedToken.get_label('upos').value)

# Converted once at the end; appending to a NumPy array per token copies it every time.
actual = np.array(actual_tags)
predicted = np.array(predicted_tags)


In [None]:
# Confusion matrix of gold vs. predicted UPOS tags.
classes = np.union1d(np.unique(actual), np.unique(predicted))
# Pass the label order explicitly so rows/columns are guaranteed to match `classes`.
cm = metrics.confusion_matrix(actual, predicted, labels=classes)
cm_df = pd.DataFrame(cm, index=classes, columns=classes)

# Plotting the confusion matrix
fig, ax = plt.subplots(figsize=(15, 13))
# fmt='d' prints the integer counts in full instead of a shortened float notation.
sns.heatmap(cm_df, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Values')
ax.set_xlabel('Predicted Values')
plt.show()

In [None]:
# Tag a new, hand-written sentence with the trained model.
sentence = Sentence('Srbija je tokom bombardovanja devedesetih izgubila mnoge kulturne znamenitosti.')

# Run the prediction; the tags are attached to the sentence in place.
model.predict(sentence)

# Print the sentence together with its tags.
print(sentence.to_tagged_string())