<a href="https://colab.research.google.com/github/SantanaC4/pos_tagging_treebank/blob/main/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part-of-Speech Tagging on the Penn Treebank Corpus Using N-gram Techniques

In [None]:
import pandas as pd

In [None]:
## Load the Penn Treebank corpus
# This function extracts the (word, tag) pairs from the data

def pair_filter(section):
  """Download one corpus split and return its (word, tag) pairs.

  Each line of the remote file is split on single spaces and every token is
  split into a pair at its last underscore, so a word that itself contains
  underscores still yields exactly two elements.

  Parameters
  ----------
  section : str
      Which split to load: "training", "development" or "testing".

  Returns
  -------
  list of tuple
      The pairs in file order. Closing-quote tokens are dropped as described
      in the loop below.

  Raises
  ------
  ValueError
      If `section` is not one of the three known splits.
  """
  urls = {
    "training": 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec0-18_training',
    "development": 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec-19-21_development',
    "testing": 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec-22-24_testing',
  }
  if section not in urls:
    # Previously an unknown name surfaced later as an UnboundLocalError on `url`.
    raise ValueError(
      "unknown section %r; expected one of %s" % (section, sorted(urls))
    )

  # Local import keeps this function self-contained within the notebook.
  import urllib.request

  # Read the file as plain text lines. Recent pandas releases reject
  # read_csv(..., sep='\n'), and this data is not tabular anyway.
  with urllib.request.urlopen(urls[section]) as response:
    text = response.read().decode("utf-8")

  extracting_pair = []
  for raw_line in text.split("\n"):
    line = raw_line.rstrip("\r")
    if not line:
      continue  # blank lines carry no tokens

    tokens = line.split(" ")
    # Filtering rules kept from the original implementation: a line holding a
    # single token only drops "''_''", while longer lines also drop "'_''".
    if len(tokens) == 1:
      skipped = ("''_''",)
    else:
      skipped = ("''_''", "'_''")

    for token in tokens:
      # Empty tokens (from repeated or trailing spaces) cannot form a pair.
      if not token or token in skipped:
        continue
      extracting_pair.append(tuple(token.rsplit("_", 1)))
  return extracting_pair

## Function to sort tags probability
def sort_tags_probability(tags_frequency, word_frequency):
    """Turn raw tag counts into (probability, tag) pairs, most probable first.

    Each probability is the tag count divided by `word_frequency`, rounded to
    two decimals. Pairs are ordered in descending tuple order, so ties on the
    probability are broken by the tag name, also descending.
    """
    scored = [
        (round(count / word_frequency, 2), tag)
        for tag, count in tags_frequency.items()
    ]
    scored.sort(reverse=True)
    return scored

## Function to evaluate the bigram model
def evaluating(model, pairs=None):
  """Compute the tagging accuracy of a bigram model keyed on the previous tag.

  For every (word, tag) pair the model is queried with the key
  "<context> <word>", where the context starts as "<s>" and is then always the
  gold tag of the previous token. If the model answers 'Unknown word' for that
  key, the prediction stored under "<context> <UNK>" is used instead.

  Parameters
  ----------
  model : mapping
      Maps "<context> <word>" to a list of (probability, tag) tuples whose
      first entry is the predicted tag. Missing keys are looked up with
      `model[key]`, so a defaultdict supplies (and stores) its default entry.
  pairs : sequence of (word, tag), optional
      Gold-standard data to score against. When omitted, the testing split is
      loaded with pair_filter("testing").

  Returns
  -------
  float
      Fraction of tokens whose predicted tag equals the gold tag; 0.0 when
      `pairs` is empty.
  """
  if pairs is None:
    pairs = pair_filter("testing")
  if not pairs:
    # Avoid a ZeroDivisionError on an empty evaluation set.
    return 0.0

  context = "<s>"
  correct = 0
  for (word, tag) in pairs:
    predicted = model[context + " " + word][0][1]
    if predicted == 'Unknown word':
      # Bigram not in the model: fall back to the pooled <UNK> entry.
      predicted = model[context + " " + '<UNK>'][0][1]
    if predicted == tag:
      correct += 1
    # The next context is the gold tag, not the predicted one.
    context = tag

  return correct / len(pairs)

##Unigram Tagger 
####Building a dictionary with probability of tags for each word based on the following equation.

$$
P(t_i|s_i) = \frac{C(t_i,s_i)}{C(s_i)}
$$
####This says that the emission probability of tag i given state i is the total number of times we observe state i emitting tag i divided by the total number of times we observe state i.

In [None]:
## Default dictionary 
## Assign default value to key that is not in dictionary
from collections import defaultdict

# Gather every tag observed for each word of the training split.
unigram = defaultdict(list)
for (word, tag) in pair_filter("training"):
  unigram[word].append(tag)

# ---------------- Optional: unknown-word treatment ----------------
# Words seen fewer than five times have their tags pooled under '<UNK>';
# all other words keep their own tag list.
unigram_with_unk = defaultdict(list)
for (word, tags) in unigram.items():
  target = '<UNK>' if len(tags) < 5 else word
  unigram_with_unk[target].extend(tags)

# Count how often each tag occurs among the pooled rare words.
count_unk_tags = defaultdict(int)
for unk_tag in unigram_with_unk['<UNK>']:
  count_unk_tags[unk_tag] += 1

# Words missing from the table default to the most frequent '<UNK>' tag.
# The first tuple element of that default is a raw count, not a probability.
max_key = max(count_unk_tags, key=count_unk_tags.get)
tags_probability = defaultdict(lambda: [(count_unk_tags[max_key], max_key)])
unigram = unigram_with_unk
# ---------------- End of optional section ----------------

# WARNING: if the optional section above is commented out, tags_probability
# must be created here instead so that unseen test words still get a tag:
#tags_probability = defaultdict(lambda: [(0, 'NNP')])

# For each word, store its tags as (relative frequency, tag), most likely first.
for (word, tags) in unigram.items():
  tag_counts = defaultdict(int)
  for seen_tag in tags:
    tag_counts[seen_tag] += 1
  total = len(tags)
  tags_probability[word] = sorted(
    [(count / total, seen) for (seen, count) in tag_counts.items()],
    reverse=True,
  )

print(tags_probability['<UNK>'])
print(tags_probability['is'])


[(0.25982905982905985, 'NNP'), (0.17251043530113297, 'NN'), (0.16189624329159213, 'JJ'), (0.11288014311270125, 'CD'), (0.10383621546412244, 'NNS'), (0.04184058835221626, 'VBG'), (0.031922083084873785, 'VBN'), (0.027807592923871995, 'VB'), (0.021904193997217252, 'RB'), (0.019618366129994038, 'VBD'), (0.01892267938779567, 'VBZ'), (0.009222818525144106, 'NNPS'), (0.0062413039157225205, 'VBP'), (0.0027429934406678594, 'JJR'), (0.0021864440469091632, 'JJS'), (0.0019479228781554363, 'FW'), (0.0015503875968992248, 'IN'), (0.0006758099781355595, 'UH'), (0.00035778175313059033, 'PRP'), (0.00033790498906777975, 'RBR'), (0.0003180282250049692, 'SYM'), (0.000278274696879348, 'WRB'), (0.0002385211687537269, 'MD'), (0.00019876764062810574, 'CC'), (0.00011926058437686345, 'DT'), (9.938382031405287e-05, 'LS'), (7.95070562512423e-05, 'WP'), (7.95070562512423e-05, 'WDT'), (7.95070562512423e-05, 'TO'), (5.9630292188431724e-05, 'RP'), (5.9630292188431724e-05, 'PDT'), (5.9630292188431724e-05, '$'), (3.9753

In [None]:
# Evaluate the unigram tagger on the testing split

tt = pair_filter("testing")
# Bare expression: nothing is displayed because it is not the cell's last
# line, but it raises IndexError right away if the split came back empty.
tt[0]

# Count the tokens whose most probable tag equals the gold tag.
accuracy = sum(1 for (word, tag) in tt if tags_probability[word][0][1] == tag)

accuracy/len(tt)

0.9001158550334736

Accuracy without grouping low-frequency words into `<UNK>`:
0.9164082866706773 (92%)

Accuracy with low-frequency words grouped into `<UNK>`: 0.9073474478924616 (90%)

##Bigram Tagger

In [None]:
###Using the previous word to compute the probability

def bigram_model(previous, with_unk=0, pairs=None, unk_threshold=5):
  """Build a bigram tagging table from (word, tag) training pairs.

  Every token is stored under the key "<context> <word>", where the context is
  "<s>" for the first token and afterwards either the previous word or the
  previous tag, depending on `previous`.

  Parameters
  ----------
  previous : str
      "word" to condition on the previous word, "tag" to condition on the
      previous tag.
  with_unk : int, optional
      When equal to 1, every bigram observed fewer than `unk_threshold` times
      is pooled under "<context> <UNK>", two summary lines are printed, and
      missing keys default to [(0, 'Unknown word')]. Otherwise missing keys
      default to [(0, 'NNP')].
  pairs : iterable of (word, tag), optional
      Training data. When omitted, pair_filter("training") is used.
  unk_threshold : int, optional
      Minimum number of observations a bigram needs to keep its own entry
      when `with_unk` is 1. Defaults to 5.

  Returns
  -------
  collections.defaultdict
      Maps each bigram key to the list produced by sort_tags_probability:
      (probability, tag) tuples, most probable first.

  Raises
  ------
  ValueError
      If `previous` is neither "word" nor "tag". Such a value used to leave
      the context stuck at "<s>" for every token without any warning.
  """
  if previous not in ("word", "tag"):
    raise ValueError("previous must be 'word' or 'tag', got %r" % (previous,))
  if pairs is None:
    pairs = pair_filter("training")

  # Collect the tags observed for each "<context> <word>" key.
  bigram = defaultdict(list)
  context = "<s>"
  for (word, tag) in pairs:
    bigram[context + " " + word].append(tag)
    context = word if previous == "word" else tag

  if (with_unk == 1):
    # Pool rare bigrams by context: "<context> <word>" -> "<context> <UNK>".
    bigram_with_unk = defaultdict(list)
    for (key, tags) in bigram.items():
      if (len(tags) < unk_threshold):
        key = key.split(' ')[0] + " " + '<UNK>'
      bigram_with_unk[key].extend(tags)

    print("Quantidade de bigramas sem <UNK>: ", len(bigram.items()))
    print("Quantidade de bigramas com <UNK>: ", len(bigram_with_unk.items()))

    # The 'Unknown word' marker tells the evaluator to retry with the <UNK> key.
    result = defaultdict(lambda: [(0, 'Unknown word')])
    bigram = bigram_with_unk
  else:
    result = defaultdict(lambda: [(0 ,'NNP')])

  # Convert each tag list into sorted (probability, tag) tuples.
  for (key, tags) in bigram.items():
    tags_frequency = defaultdict(int)
    for observed_tag in tags:
      tags_frequency[observed_tag] += 1
    result[key] = sort_tags_probability(tags_frequency, len(tags))
  return result

# Plain bigram model conditioned on the previous tag (unseen bigrams fall back to 'NNP')
bigram_without_unk = bigram_model(previous="tag")

# Same tag context, but bigrams seen fewer than five times are pooled under '<UNK>'
bigram_with_unk = bigram_model(previous="tag", with_unk=1)

Quantidade de bigramas sem <UNK>:  122034
Quantidade de bigramas com <UNK>:  20015


In [None]:
## Evaluating

# Report the test-set accuracy of each bigram model, plain model first.
reports = (
  ("Accuracy of bigram model without <UNK> technique: ", bigram_without_unk),
  ("Accuracy of bigram model with <UNK> technique: ", bigram_with_unk),
)
for (label, model) in reports:
  print(label, evaluating(model))

Accuracy of bigram model without <UNK> technique:  0.9067639123233988
Accuracy of bigram model with <UNK> technique:  0.8594344097224922
