<a href="https://colab.research.google.com/github/SantanaC4/pos_tagging_treebank/blob/main/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation Part of Speech Tagging from Treebank corpus using N-gram tecnique

In [91]:
import pandas as pd

In [96]:
##Upload Penn Treebank Corpus
# That function filter the pair (word, tag) of the data

def pair_filter(section):
  if (section == "training"):
    url = 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec0-18_training'
  if (section == "development"):
    url = 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec-19-21_development'
  if (section == "testing"):
    url = 'https://raw.githubusercontent.com/SantanaC4/pos_tagging_treebank/main/Sec0-18_training'

  df = pd.read_csv(url, header=None, sep='\n')
  spliting =  [i.split(" ") for i in df[0]]
  extracting_pair = []

  for i in spliting:
    if (len(i) == 1):
      extracting_pair.append(tuple(i[0].split("_")))
    else:
      for j in i:
        if (j != ''):
          extracting_pair.append(tuple(j.split("_")))
  return (extracting_pair)

## Function to sort tags probability
def sort_tags_probability(tags_frequency, word_frequency):
    result = []
    for (key, value) in tags_frequency.items():
      result.append((round(value/word_frequency, 2), key))
    return (sorted(result, reverse=True))

##Unigram Tagger 
####Building a dictionary with probability of tags for each word based on the following equation.

$$
P(t_i|s_i) = \frac{C(t_i,s_i)}{C(s_i)}
$$
####This says that the emission probability of tag i given state i is the total number of times we observe state i emitting tag i divided by the total number of times we observe state i.

In [93]:
## Default dictionary 
## Assign default value to key that is not in dictionary
from collections import defaultdict

unigram = defaultdict(list)
for (word, tag) in pair_filter("training"):
  unigram[word].append(tag)

aux = defaultdict(int)
# defaultdisct(lambda: 'NN') assign the 'NN' tag for unseend words in testing set
tags_probability = defaultdict(lambda: 'NN')
word_freq = 0

for (word, tags) in unigram.items():
  word_freq = len(tags)
  for i in tags:
    aux[i] += 1
  tags_probability[word] = sorted([(value/word_freq, key) for (key, value) in aux.items()], reverse=True)
  aux = defaultdict(int)
  word_freq = 0


print(tags_probability['the'])

[(0.9995952748336833, 'DT'), (0.00017706726026357725, 'JJ'), (0.0001517719373687805, 'NNP'), (2.529532289479675e-05, 'VBP'), (2.529532289479675e-05, 'NN'), (2.529532289479675e-05, 'CD')]


In [94]:
tt = pair_filter("development")
tt[0]
accuracy = 0
for (word, tag) in tt:
  if (tags_probability[word] == 'NN'):
    if (tag == 'NN'):
      accuracy += 1
  else:
    if (tags_probability[word][0][1] == tag):
      accuracy += 1

accuracy/len(tt)

0.913825815068909

##Bigram Tagger

In [98]:
###Using the approach with

bigram = defaultdict(list)
word_frequency = defaultdict(int)
aux = "<s>"

for (word, tag) in pair_filter("training"):
    bigram[aux + " " + word].append(tag)
    word_frequency[word] += 1
    aux = word

tags_frequency = defaultdict(int)

for (word, tags) in bigram.items():
  for i in tags:
    tags_frequency[i] += 1
  bigram[word] = sort_tags_probability(tags_frequency, word_frequency[word.split(" ")[1]])
  tags_frequency = defaultdict(int)

bigram


defaultdict(list,
            {'<s> Pierre': [(0.17, 'NNP')],
             'Pierre Vinken': [(0.5, 'NNP')],
             'Vinken ,': [(0.0, ',')],
             ', 61': [(0.24, 'CD')],
             '61 years': [(0.0, 'NNS')],
             'years old': [(0.45, 'JJ')],
             'old ,': [(0.0, ',')],
             ', will': [(0.07, 'MD'), (0.0, 'NN')],
             'will join': [(0.2, 'VB')],
             'join the': [(0.0, 'DT')],
             'the board': [(0.41, 'NN')],
             'board as': [(0.0, 'RB'), (0.0, 'IN')],
             'as a': [(0.03, 'DT')],
             'a nonexecutive': [(0.83, 'JJ')],
             'nonexecutive director': [(0.02, 'NN')],
             'director Nov.': [(0.0, 'NNP')],
             'Nov. 29': [(0.08, 'CD')],
             '29 .': [(0.0, '.')],
             '. Mr.': [(0.25, 'NNP')],
             'Mr. Vinken': [(0.5, 'NNP')],
             'Vinken is': [(0.0, 'VBZ')],
             'is chairman': [(0.02, 'NN')],
             'chairman of': [(0.01, 'IN')]