<a href="https://colab.research.google.com/github/Sujata018/NLP/blob/main/Postagger4Noisy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import brown    # importing brown corpus to be used as English Dictionary of words

# One-time NLTK resource downloads.
# The 'brown' corpus is read by buildDic() and buildProbDistBrown() below; uncomment
# the next line on a fresh environment where it has not been fetched yet.
#nltk.download('brown')
# 'punkt' is not needed by the code in this notebook, which uses its own tokenize().
#nltk.download('punkt')
# Tagger model presumably required by nltk.pos_tag in the final cell.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Module-level state shared by the cells below; all containers start empty and
# are filled by buildDic() and buildProbDistBrown().
dictWords = set()                    # vocabulary: lower-cased dictionary words
dic = dict()                         # consonant skeleton -> list of dictionary words sharing it
unigrams, bigrams, trigrams = dict(), dict(), dict()
                                     # unigrams: word -> frequency
                                     # bigrams:  (w1, w2) -> probability of w2 given w1
                                     # trigrams: (w1, w2, w3) -> probability of w3 given w1, w2

In [None]:
def get_consonants(word):
  '''
  Return the "consonant skeleton" of a word: its consonants in order of
  appearance, with immediately repeated entries collapsed.
  e.g.  'arrogance' -> 'rgnc'

  The skeleton is used as a rough phonetic key, so that a noisy spelling
  and the dictionary word share the same key.
  e.g.  'arognce' -> 'rgnc' -> 'arrogance'

  Vowels and the soft letters h, w, y are ignored first. If nothing is
  left (the word consists of a, e, i, o, u, h, w, y only), the skeleton is
  rebuilt keeping h, w, y, e.g. 'how' -> 'hw'. A word made of vowels only
  yields ''.

  Every character that is not an ignored letter is kept, so apostrophes and
  digits end up in the skeleton as well.
  '''
  lowered = word.lower()

  # Try the strict ignore set first, then fall back to vowels only.
  for ignored in ('aeiouhwy', 'aeiou'):
    kept = []
    for ch in lowered:
      if ch in ignored:
        continue
      # Skip a character equal to the one kept last (collapses repeats)
      if not kept or kept[-1] != ch:
        kept.append(ch)
    if kept:
      return ''.join(kept)

  return ''

In [None]:
def tokenize(text,stopwords):
  '''
  Tokenise text into sentences and words.

  Inputs: text      - the string to split
          stopwords - collection of single characters that terminate a
                      sentence (e.g. '?,.;'); despite the name these are
                      sentence delimiters, not linguistic stop words
  Output: list of sentences, each sentence being a list of word strings.
          A sentence closed by a terminator carries that terminator as its
          last element.

  Words are separated by blank spaces only. A run of terminators and
  blanks following a sentence is collapsed: only the first terminator is
  recorded. Terminators that appear before any word are ignored.

  Text after the last terminator is returned as a final sentence without a
  terminator element instead of being discarded.
  '''
  tokens=[]        # completed sentences
  sentence=[]      # words of the sentence being built
  word=''          # characters of the word being built

  for ch in text:
    if ch in stopwords:             # end of a sentence
      if word:
        sentence.append(word)
        word=''
      if sentence:                  # empty here means a repeated/leading terminator
        sentence.append(ch)
        tokens.append(sentence)
        sentence=[]
    elif ch == ' ':                 # end of a word
      if word:
        sentence.append(word)
        word=''
    else:                           # beginning or continuation of a word
      word += ch

  # Flush whatever is left when the text does not end with a terminator
  if word:
    sentence.append(word)
  if sentence:
    tokens.append(sentence)

  return tokens

In [None]:
def buildDic():
  '''
  Build the vocabulary and the dictionary of root pronunciations from the
  words of the Brown corpus.

  Side effects (module-level globals are rebound):
    dictWords - set of lower-cased corpus words, keeping only words that
                are longer than one character and contain no apostrophe,
                plus the one-letter words 'a' and 'i'
    dic       - maps each consonant skeleton (see get_consonants) to the
                list of vocabulary words sharing it

  Both containers are rebuilt from scratch on every call, so running the
  function again does not append duplicate entries to dic.
  '''
  global dic, dictWords

  # Lower-cased set of every token in the corpus
  lowered={w.lower() for w in brown.words()}

  # Drop single-character tokens (the corpus contains every letter on its own)
  # and tokens with apostrophes; 'a' and 'i' are genuine one-letter words.
  dictWords={w for w in lowered
             if (len(w)>1 and "'" not in w) or w in ('a','i')}

  # Group words by skeleton. Iterating in sorted order keeps the candidate
  # lists, and therefore tie-breaking in guessWord, identical between runs;
  # plain set iteration order for strings can change between interpreter runs.
  roots={}
  for word in sorted(dictWords):
    roots.setdefault(get_consonants(word),[]).append(word)
  dic=roots

In [None]:
def buildProbDistBrown():
  '''
  Build unigram, bigram and trigram statistics from the Brown corpus.

  Side effects (module-level globals are rebound):
    unigrams - word -> relative frequency: count / total number of counted
               tokens, with the '<START>' marker counted once per non-empty
               sentence
    bigrams  - (w1, w2) -> count(w1, w2) / count(w1)
    trigrams - (w1, w2, w3) -> count(w1, w2, w3) / count(w1, w2)

  All words are lower-cased. Every sentence is preceded by a single
  '<START>' marker, so the first word takes part in the bigram
  ('<START>', w1) and the second word in the trigram ('<START>', w1, w2).
  '''
  global unigrams, bigrams, trigrams

  uniCounts={}
  biCounts={}
  triCounts={}

  for sentence in brown.sents():
    word2=''                       # word before the previous one ('' = none yet)
    word1='<START>'                # previous word
    for word in sentence:
      word=word.lower()
      uniCounts[word]=uniCounts.get(word,0)+1
      if word1=='<START>':         # count the sentence-start marker once per sentence
        uniCounts[word1]=uniCounts.get(word1,0)+1
      if word1!='':
        bigram=(word1,word)
        biCounts[bigram]=biCounts.get(bigram,0)+1
      if word2!='':
        trigram=(word2,word1,word)
        triCounts[trigram]=triCounts.get(trigram,0)+1

      word2,word1=word1,word       # slide the context window

  # Conditional probabilities are computed from the raw counts, so the order
  # of the three steps below does not matter.
  trigrams={t: n/biCounts[t[:2]] for t,n in triCounts.items()}
  bigrams={b: n/uniCounts[b[0]] for b,n in biCounts.items()}

  # Normalise by the total number of counted tokens (not by the number of
  # distinct words) so that the unigram values sum to 1.
  total=sum(uniCounts.values())
  unigrams={w: n/total for w,n in uniCounts.items()}

In [None]:
def guessWord(sentence,i):
  '''
  Given a non-dictionary word, guess the most probable dictionary word.

  Inputs: sentence - list of tokens giving the context of the word
          i        - index in sentence of the word to be guessed
  Output: the guessed word, or sentence[i] unchanged when no candidate
          can be selected

  Logic:
    Compute the consonant skeleton of the noisy word and look up the
    dictionary words sharing it (similarly pronounced words).
      - no such word: return the noisy word as is
      - exactly one:  return it
      - several:      pick the candidate with the highest trigram
                      probability given the two preceding tokens
                      ('<START>' stands in for the missing token when the
                      word is second in the sentence). If the word is first
                      in the sentence, or no candidate has a known trigram,
                      fall back to the bigram probability given the
                      preceding token (or '<START>').
    Ties keep the candidate met first in the dictionary list.

  Context tokens are lower-cased before lookup, because all n-gram keys
  are built from lower-cased corpus words.
  '''
  word=sentence[i]
  candidates=dic.get(get_consonants(word))

  if not candidates:
    return word
  if len(candidates)==1:
    return candidates[0]

  prev1=sentence[i-1].lower() if i > 0 else '<START>'
  best=word
  maxProb=0

  if i > 0:                               # at least one real preceding token
    prev2=sentence[i-2].lower() if i > 1 else '<START>'
    for cword in candidates:
      prob=trigrams.get((prev2,prev1,cword),0)
      if prob > maxProb:
        best=cword
        maxProb=prob

  if maxProb==0:                          # first word, or no trigram matched
    for cword in candidates:
      prob=bigrams.get((prev1,cword),0)
      if prob > maxProb:
        best=cword
        maxProb=prob

  return best

In [None]:
# Driver cell: correct a short noisy text and compare POS tags before/after.

text="hw r yu ? i m gud, yu? What's up? so so."
stopwords='?,.;'                            # characters that end a sentence

# Split the noisy text into sentences of word tokens
noisy_text_tokenized=tokenize(text,stopwords)

# Prepare the lookup structures from the Brown corpus:
# consonant-skeleton dictionary first, then n-gram statistics
buildDic()
buildProbDistBrown()

# Work on a per-sentence copy so the noisy tokens stay untouched for the
# comparison below; corrections are written into the copy in place, which
# lets later guesses in the same sentence use already corrected context.
C_text_tokenized=[list(tokens) for tokens in noisy_text_tokenized]

corrected_words=[]                          # every output token, in order
for sentence in C_text_tokenized:
  for i,token in enumerate(sentence):
    word=token.lower()
    if not (word in dictWords or word in stopwords):
      word=guessWord(sentence,i)            # unknown word: guess from n-gram context
      sentence[i]=word
    corrected_words.append(word)

# Each token is followed by a single blank, including the last one
corrected_text=''.join(w + ' ' for w in corrected_words)

# Tag both versions with nltk's POS tagger and print them for verification
a=[nltk.pos_tag(tokens) for tokens in noisy_text_tokenized]
c=[nltk.pos_tag(tokens) for tokens in C_text_tokenized]

print('noisy_text=',text)
print('POS TAGs by nltk',a)
print()
print('corrected_text=',corrected_text)
print('POS TAGs by nltk',c)

noisy_text= hw r yu ? i m gud, yu? What's up? so so.
POS TAGs by nltk [[('hw', 'NN'), ('r', 'NN'), ('yu', 'NN'), ('?', '.')], [('i', 'JJ'), ('m', 'NN'), ('gud', 'NN'), (',', ',')], [('yu', 'NN'), ('?', '.')], [("What's", 'VB'), ('up', 'RP'), ('?', '.')], [('so', 'RB'), ('so', 'RB'), ('.', '.')]]

corrected_text= how are you ? i am gud , you ? What's up ? so so . 
POS TAGs by nltk [[('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')], [('i', 'NN'), ('am', 'VBP'), ('gud', 'NN'), (',', ',')], [('you', 'PRP'), ('?', '.')], [("What's", 'VB'), ('up', 'RP'), ('?', '.')], [('so', 'RB'), ('so', 'RB'), ('.', '.')]]
