In [None]:
import nltk
nltk.download("all")

# V06: Izgradnja označavatelja

In [14]:
import nltk
import random

# Definirajmo recenice nad kojima cemo trenirati i testirati modele
from nltk.corpus import brown

brown_sents = brown.sents(categories = "news")

#moramo ga konvertirati u listu
brown_tagged_sents = list(brown.tagged_sents(categories = "news", tagset = "universal"))

#podjela
size = int(0.9 * len(brown_tagged_sents))
random.shuffle(brown_tagged_sents)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

#testna neoznačena rečenica
test_sent = brown_sents[5]

In [24]:
#regex oznacavatelj
patterns = [
    (r".*ing$", "VERB"), #glagoli na ing
    (r".*ed$", "VERB"), #simple past
    (r".*e+s$", "VERB"), #3rd person present simple
    (r".*'s$", "NOUN"), #Posvojna imenica
    (r"\d+", "NUM"), #broj
    (r"\.", "."), #tocka je tocka :)
    (r".*", "NOUN"), #default, ako ne znas sta je, imenica je
]

#regex tagger
regexp_tagger = nltk.RegexpTagger(patterns)

#test
tagged_sent = regexp_tagger.tag(test_sent)

for token, tag in tagged_sent:
    print(token, tag)

#provjerimo preciznost
print(f"Preciznost: {regexp_tagger.accuracy(test_sents)}")

Preciznost: 0.3919931695285077


## N-gramski HMM označavatelji

In [40]:
# unigram označavatelj
unigram_tagger = nltk.UnigramTagger(train_sents, backoff=regexp_tagger)

#testiranje
tagged_sent = unigram_tagger.tag(test_sent)

#for token, tag in tagged_sent:
#    print(token, tag)

print(f"Preciznost: {unigram_tagger.accuracy(test_sents)}")

Preciznost: 0.9403282421022673


In [45]:
# matrica zbunjenosti
# predvidjene oznake
test_tags = [tag for sent in brown.sents(categories = "news")[:10] for (word, tag) in unigram_tagger.tag(sent)]

#prave oznake
gold_tags = [tag for sent in brown.tagged_sents(categories = "news", tagset="universal")[:10] for (word, tag) in sent]

#Confusion matrix
cm = nltk.ConfusionMatrix(gold_tags, test_tags)

print(cm)

     |              C     N     P     V |
     |     A  A  A  O  D  O  N  R  P  E |
     |     D  D  D  N  E  U  U  O  R  R |
     |  .  J  P  V  J  T  N  M  N  T  B |
-----+----------------------------------+
   . |<38> .  .  .  .  .  .  .  .  .  . |
 ADJ |  .<18> .  .  .  .  1  .  .  .  . |
 ADP |  .  .<29> .  .  .  .  .  .  2  1 |
 ADV |  .  1  . <5> .  .  .  .  .  .  . |
CONJ |  .  .  .  .<10> .  .  .  .  .  . |
 DET |  .  .  .  .  .<39> .  .  .  .  . |
NOUN |  .  .  .  .  .  .<81> .  .  .  . |
 NUM |  .  .  .  .  .  .  . <1> .  .  . |
PRON |  .  .  .  .  .  .  .  . <6> .  . |
 PRT |  .  .  .  .  .  .  .  .  . <3> . |
VERB |  .  .  .  .  .  .  1  .  .  .<48>|
-----+----------------------------------+
(row = reference; col = test)



## Izgradnja bigramskog modela

In [None]:
bigram_tagger = nltk.BigramTagger(train_sents)

#oznaci testni primjer
tagged_sent = bigram_tagger.tag(test_sent)

for token, tag in tagged_sent:
    print(token, tag)

print(f"Preciznost: {bigram_tagger.accuracy(test_sents)}")

In [49]:
trigram_tagger = nltk.TrigramTagger(train_sents)

#oznaci testni primjer
tagged_sent = trigram_tagger.tag(test_sent)

for token, tag in tagged_sent:
    print(token, tag)

print(f"Preciznost: {trigram_tagger.accuracy(test_sents)}")

It PRON
recommended VERB
that ADP
Fulton NOUN
legislators NOUN
act VERB
`` .
to PRT
have VERB
these DET
laws NOUN
studied VERB
and CONJ
revised VERB
to PRT
the DET
end None
of None
modernizing None
and None
improving None
them None
'' None
. None
Preciznost: 0.09211649748600702


Problem: Preciznost se smanjuje zbog nedostatka zagladivanja

In [50]:
#Pokusajmo kombinirati vise oznacavanja

t0 = regexp_tagger
t1 = nltk.UnigramTagger(train_sents, backoff = t0)
t2 = nltk.BigramTagger(train_sents, backoff = t1)
t3 = nltk.TrigramTagger(train_sents, backoff = t2)

print(f"Preciznost: {t3.accuracy(test_sents)}")

Preciznost: 0.9422255952945641


In [52]:
# matrica zbunjenosti
# predvidjene oznake
test_tags = [tag for sent in brown.sents(categories = "news")[:10] for (word, tag) in t3.tag(sent)]

#prave oznake
gold_tags = [tag for sent in brown.tagged_sents(categories = "news", tagset="universal")[:10] for (word, tag) in sent]

#Confusion matrix
cm = nltk.ConfusionMatrix(gold_tags, test_tags)

print(cm)

     |              C     N     P     V |
     |     A  A  A  O  D  O  N  R  P  E |
     |     D  D  D  N  E  U  U  O  R  R |
     |  .  J  P  V  J  T  N  M  N  T  B |
-----+----------------------------------+
   . |<38> .  .  .  .  .  .  .  .  .  . |
 ADJ |  .<19> .  .  .  .  .  .  .  .  . |
 ADP |  .  .<30> .  .  .  .  .  .  2  . |
 ADV |  .  .  . <6> .  .  .  .  .  .  . |
CONJ |  .  .  .  .<10> .  .  .  .  .  . |
 DET |  .  .  .  .  .<39> .  .  .  .  . |
NOUN |  .  .  .  .  .  .<81> .  .  .  . |
 NUM |  .  .  .  .  .  .  . <1> .  .  . |
PRON |  .  .  .  .  .  .  .  . <6> .  . |
 PRT |  .  .  .  .  .  .  .  .  . <3> . |
VERB |  .  .  .  .  .  .  .  .  .  .<49>|
-----+----------------------------------+
(row = reference; col = test)

