# ILN 2: POS Tagging

In [1]:
import pprint
import nltk
import matplotlib.pylab as plt
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import treebank

nltk.download('treebank')

tag_fd = nltk.FreqDist(tag for (word, tag) in treebank.tagged_words(tagset = 'universal')).max()
print(tag_fd)


[nltk_data] Downloading package treebank to
[nltk_data]     /Users/sergisanz/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


NOUN


In [13]:
import pprint
import nltk
import matplotlib.pylab as plt
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger,DefaultTagger
from nltk.corpus import treebank
from nltk.probability import FreqDist
import random

nltk.download('treebank')

# Se crean 10 particiones, y cada partición esta compuesta por una lista, por tanto será necesario recorrerla 
# para calcular las tasas de acierto para cada uno de los tres modelos

#random.seed(1000)

frases = treebank.tagged_sents(tagset='universal')
frasesAleatoria = [frases[i] for i in random.sample(range(0, len(frases)), len(frases))  ]

particiones = zip(*[iter(frasesAleatoria)]*int(len(frasesAleatoria)/10))

#Necesario, sino se crea una variable local en cada iteración del bucle
SumUnigrama = 0
SumBigrama = 0
SumBigramaBackoff = 0

# Etiquetador
etiquetadorDefecto = DefaultTagger(tag_fd)

for lista in particiones:

    numElementos = int(len(lista) * 0.9)
    muestrasTraining = lista[:numElementos]
    muestrasTest = lista[numElementos:]
    
    unigram_tagger = UnigramTagger(muestrasTraining)
    bigram_tagger = BigramTagger(muestrasTraining)
    backoff_tagger = BigramTagger(muestrasTraining, backoff= unigram_tagger)


    unigrama = unigram_tagger.evaluate(muestrasTest) * 100
    bigrama = bigram_tagger.evaluate(muestrasTest) * 100
    
    bigramaBackoff = backoff_tagger.evaluate(muestrasTest) * 100
    
    SumUnigrama += unigrama
    SumBigrama += bigrama
    SumBigramaBackoff += bigramaBackoff

print("Tasas de acierto de los distintos modelos:"+"\n"+"*"*50)  
print("{:<20} {:<5.5f} ".format("Bigramas - backoff:", SumBigramaBackoff/10) + "%")
print("{:<20} {:<5.6f} ".format("Bigramas:" ,SumBigrama/10) + "%")
print("{:<20} {:<5.5f} ".format("Unigramas",SumUnigrama/10) + "%")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/sergisanz/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Tasas de acierto de los distintos modelos:
**************************************************
Bigramas - backoff:  75.19675 %
Bigramas:            5.216710 %
Unigramas            74.80406 %


In [23]:
from nltk.tokenize import word_tokenize
fraseAnalisis = "Are you exaggerating or have you got the wrong figures?"

frase = word_tokenize(fraseAnalisis)

bigrama = bigram_tagger.tag(frase)
bigramaBackoff = backoff_tagger.tag(frase)
unigrama = unigram_tagger.tag(frase)

print("Unigrama:\n")
pprint.pprint(unigrama)
print("\nBigrama:\n")
pprint.pprint(bigrama)
print("\nBigrama Backoff:\n")
pprint.pprint(bigramaBackoff)

Unigrama:

[('Are', 'NOUN'),
 ('you', 'PRON'),
 ('exaggerating', 'NOUN'),
 ('or', 'CONJ'),
 ('have', 'VERB'),
 ('you', 'PRON'),
 ('got', 'NOUN'),
 ('the', 'DET'),
 ('wrong', 'NOUN'),
 ('figures', 'NOUN'),
 ('?', '.')]

Bigrama:

[('Are', None),
 ('you', None),
 ('exaggerating', None),
 ('or', None),
 ('have', None),
 ('you', None),
 ('got', None),
 ('the', None),
 ('wrong', None),
 ('figures', None),
 ('?', None)]

Bigrama Backoff:

[('Are', 'NOUN'),
 ('you', 'PRON'),
 ('exaggerating', 'NOUN'),
 ('or', 'CONJ'),
 ('have', 'VERB'),
 ('you', 'PRON'),
 ('got', 'NOUN'),
 ('the', 'DET'),
 ('wrong', 'NOUN'),
 ('figures', 'NOUN'),
 ('?', '.')]
