In [2]:
# Source - Chapter - 7 

import nltk, re, pprint

docu = " Hi there is there an entity in here , if yes please New York it. Also consider the value of the  City when talking about Property which is worth more than a Million Dollars "

def ie_preprocess(docu):
    sentences = nltk.sent_tokenize(docu)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences1 = [nltk.pos_tag(sent) for sent in sentences]
    
    return sentences1

In [3]:
# 

ie_preprocess(docu)

[[('Hi', 'NNP'),
  ('there', 'EX'),
  ('is', 'VBZ'),
  ('there', 'EX'),
  ('an', 'DT'),
  ('entity', 'NN'),
  ('in', 'IN'),
  ('here', 'RB'),
  (',', ','),
  ('if', 'IN'),
  ('yes', 'UH'),
  ('please', 'VB'),
  ('New', 'NNP'),
  ('York', 'NNP'),
  ('it', 'PRP'),
  ('.', '.')],
 [('Also', 'RB'),
  ('consider', 'VBP'),
  ('the', 'DT'),
  ('value', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('City', 'NNP'),
  ('when', 'WRB'),
  ('talking', 'VBG'),
  ('about', 'IN'),
  ('Property', 'NNP'),
  ('which', 'WDT'),
  ('is', 'VBZ'),
  ('worth', 'IN'),
  ('more', 'JJR'),
  ('than', 'IN'),
  ('a', 'DT'),
  ('Million', 'NNP'),
  ('Dollars', 'NNP')]]

In [4]:
# NP Chunking 
# Also see -- https://ifarm.nl/erikt/research/np-chunking.html
#

sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),("the", "DT"), ("cat", "NN")]

grammar = "NP: {<DT>?<JJ>*<NN>}" 

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

result.draw() # All OK - Pop Up 

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [6]:
#  2.3   Chunking with Regular Expressions

'''
If a tag pattern matches at overlapping locations, the leftmost match takes precedence. 
For example, if we apply a rule that matches two consecutive nouns to a text 
containing three consecutive nouns, then only the first two nouns will be chunked:
'''


nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]

# Seen above in nouns - all three Tokens have been Tagged as NOUNS --- the grammar below states -- 
# Chunk together 2 Consecutive Nouns -- the Left pair shall be Chunked not the RIGHT Pair .. 

grammar = "NP: {<NN><NN>}  # Chunk two consecutive nouns"
cp = nltk.RegexpParser(grammar)
result =cp.parse(nouns)
print(result)

result.draw() # All OK - Pop Up 

(S (NP money/NN market/NN) fund/NN)


In [9]:
# 2.4   Exploring Text Corpora
# 
# In Chapter-5 we saw how we could interrogate a tagged corpus to extract phrases matching a particular sequence
# of part-of-speech tags. We can do the same work more easily with a chunker, as follows:

#cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
cp = nltk.RegexpParser('CHUNK: {<NN.*> <TO> <NN.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
     tree = cp.parse(sent)
     for subtree in tree.subtrees():
         if subtree.label() == 'CHUNK': 
                print(subtree)   #### Dhankar --- Dont Print Large Dump 
                
                '''
                
                #WITH --- #cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
                
                
                (CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)
(CHUNK like/VB to/TO see/VB)
(CHUNK designed/VBN to/TO provide/VB)
(CHUNK get/VB to/TO hear/VB)
(CHUNK expects/VBZ to/TO tell/VB)
(CHUNK expected/VBN to/TO give/VB)
(CHUNK prefer/VB to/TO pay/VB)
                
                
                '''

(CHUNK exposure/NN to/TO group/NN)
(CHUNK respite/NN to/TO hurry/NN)
(CHUNK urgings/NNS to/TO date/NN)


In [10]:
# 2.6   Representing Chunks: Tags vs Trees
# IOB Tags 

# A token is tagged as B if it marks the beginning of a chunk. Subsequent tokens within the chunk are tagged I. 
# All other tokens are tagged O. The B and I tags are suffixed with the chunk type, e.g. B-NP, I-NP. Of course, 
# it is not necessary to specify a chunk type for tokens that appear outside a chunk, so these are just labeled O.

from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])

# Seen below All Chunks in Braces 

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)


In [11]:
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

# Seen below only - NP in Braces 

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


In [12]:
## 3.2   Simple Evaluation and Baselines

# Baseline -- Minimum Acceptable Performnce --- We start off by establishing a baseline for the trivial 
# chunk parser cp that creates no chunks:
# No Chunks Defined 

from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [13]:
# Regular expression chunker that looks for tags beginning with letters that are characteristic 
# of noun phrase tags (e.g. CD, DT, and JJ).

grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [None]:
'''
Verbatim Source -- http://www.nltk.org/book/ch07.html

As you can see, this approach achieves decent results. However, we can improve on it by adopting a more data-driven
approach, where we use the training corpus to find the chunk tag (I, O, or B) that is most likely for each
part-of-speech tag. In other words, we can build a chunker using a unigram tagger (4). 
But rather than trying to determine the correct part-of-speech tag for each word, we are trying to determine
the correct chunk tag, given each word's part-of-speech tag.

In 3.1, we define the UnigramChunker class, which uses a unigram tagger to label sentences with chunk tags. 
Most of the code in this class is simply used to convert back and forth between the chunk tree representation
used by NLTK's ChunkParserI interface, and the IOB representation used by the embedded tagger. The class defines 
two methods: a constructor [1] which is called 
when we build a new UnigramChunker; and the parse method [3] which is used to chunk new sentences.
'''



In [14]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):     # INIT + Self + train Sentences from the TRAIN Data Set 
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)] ### It first converts training data to a form that is suitable for training the tagger, 
                                                                           ### using tree2conlltags to map each chunk tree to a list of word,tag,chunk triples
                      for sent in train_sents]         # Loops over each "Sent" in the "train_sents"
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence): 
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
    
# The parse method [3] takes a tagged sentence as its input, and begins by extracting the part-of-speech tags
# from that sentence. It then tags the part-of-speech tags with IOB chunk tags, using the tagger self.tagger
# that was trained in the constructor. Next, it extracts the chunk tags, and combines them with the original 
# sentence, to yield conlltags. Finally, it uses conlltags2tree to convert the result back into a chunk tree.    


In [21]:
# Named Entity Recognition 

sent1 = nltk.corpus.treebank.tagged_sents()[22]
print(type(sent1))
print(sent1)
#print(nltk.ne_chunk(sent1, binary=True)) #OK --- Binary True gives "NE" only 
#print(nltk.ne_chunk(sent1)) #OK --- Binary True gives "PERSON" "GPE" etc  


##
#


#print(nltk.ne_chunk(test)) #--- Binary True gives "NE" only 

<class 'list'>
[('The', 'DT'), ('U.S.', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('few', 'JJ'), ('industrialized', 'VBN'), ('nations', 'NNS'), ('that', 'WDT'), ('*T*-7', '-NONE-'), ('does', 'VBZ'), ("n't", 'RB'), ('have', 'VB'), ('a', 'DT'), ('higher', 'JJR'), ('standard', 'NN'), ('of', 'IN'), ('regulation', 'NN'), ('for', 'IN'), ('the', 'DT'), ('smooth', 'JJ'), (',', ','), ('needle-like', 'JJ'), ('fibers', 'NNS'), ('such', 'JJ'), ('as', 'IN'), ('crocidolite', 'NN'), ('that', 'WDT'), ('*T*-1', '-NONE-'), ('are', 'VBP'), ('classified', 'VBN'), ('*-5', '-NONE-'), ('as', 'IN'), ('amphobiles', 'NNS'), (',', ','), ('according', 'VBG'), ('to', 'TO'), ('Brooke', 'NNP'), ('T.', 'NNP'), ('Mossman', 'NNP'), (',', ','), ('a', 'DT'), ('professor', 'NN'), ('of', 'IN'), ('pathlogy', 'NN'), ('at', 'IN'), ('the', 'DT'), ('University', 'NNP'), ('of', 'IN'), ('Vermont', 'NNP'), ('College', 'NNP'), ('of', 'IN'), ('Medicine', 'NNP'), ('.', '.')]


In [35]:
# Own Experi -----

def ie_preprocess(docu):
    sentences = nltk.sent_tokenize(docu)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    #sentences1 = [nltk.pos_tag(sent) for sent in sentences]
    sentences2 = [nltk.UnigramTagger(sent) for sent in sentences]
    
    return sentences2


In [36]:
# Own Experi -----

docu1 = "George W Bush is a Fine Gentelman" # Just a STRING
docu2 = "The Lady is a TRAMP"
docu3 = " The first person who saw the dead men lying in the clearing was Captain Sprout"
docu4 = "A list of priority infrastructure projects prepared for Trump includes on green energy"

print(ie_preprocess(docu1))
print("  "*100)
print(ie_preprocess(docu2))
print("  "*100)
print(ie_preprocess(docu3))
print("  "*100)
print(ie_preprocess(docu4))

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# TAG's with POS Tagger 

[[('George', 'NNP'), ('W', 'NNP'), ('Bush', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('Fine', 'JJ'), ('Gentelman', 'NNP')]]
                                                                                                                                                                                                        
[[('The', 'DT'), ('Lady', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('TRAMP', 'NN')]]
                                                                                                                                                                                                        
[[('The', 'DT'), ('first', 'JJ'), ('person', 'NN'), ('who', 'WP'), ('saw', 'VBD'), ('the', 'DT'), ('dead', 'JJ'), ('men', 'NNS'), ('lying', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('clearing', 'NN'), ('was', 'VBD'), ('Captain', 'NNP'), ('Sprout', 'NNP')]]
                                                                                                                                                                                                        
[[('A', 'DT'), ('list', 'NN'), ('of', 'IN'), ('priority', 'NN'), ('infrastructure', 'NN'), ('projects', 'NNS'), ('prepared', 'VBD'), ('for', 'IN'), ('Trump', 'NNP'), ('includes', 'VBZ'), ('on', 'IN'), ('green', 'JJ'), ('energy', 'NN')]]

In [14]:
# 2.8   Exploring Tagged Corpora
# Finding words that APPEAR afer another Word 

from nltk.corpus import brown

brown_learned_text = brown.words(categories='learned')
#
print(type(brown_learned_text))  # http://www.nltk.org/api/nltk.corpus.reader.html
# <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
#
print(sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often')))


<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
[',', '.', 'accomplished', 'analytically', 'appear', 'apt', 'associated', 'assuming', 'became', 'become', 'been', 'began', 'call', 'called', 'carefully', 'chose', 'classified', 'colorful', 'composed', 'contain', 'differed', 'difficult', 'encountered', 'enough', 'equate', 'extremely', 'found', 'happens', 'have', 'ignored', 'in', 'involved', 'more', 'needed', 'nightly', 'observed', 'of', 'on', 'out', 'quite', 'represent', 'responsible', 'revamped', 'seclude', 'set', 'shortened', 'sing', 'sounded', 'stated', 'still', 'sung', 'supported', 'than', 'to', 'when', 'work']


In [12]:
# Experi own -- Fail as we dont have a -- <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>

text1 = "we are often told that often is a word which will appear often in this text also after often shall occur many other words  "

text2 = text1.words(categories='learned')  ### AttributeError: 'str' object has no attribute 'words'
sorted(set(b for (a, b) in nltk.bigrams(text1) if a == 'often'))

AttributeError: 'str' object has no attribute 'words'

In [17]:
# Next, let's look at some larger context, and find words involving particular sequences 
# of tags and words (in this case "<Verb> to <Verb>"). In code-three-word-phrase we consider
# each three-word window in the sentence [1], and check if they meet our criterion [2]. 
# If the tags match, we print the corresponding words [3].

from nltk.corpus import brown
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):             ### [1]
        if (t1.startswith('V') and t2 == 'TO' and t3.startswith('V')):    ### [2]
            #print(w1, w2, w3)                ### [3] --- Works OK -- Dont Uncomment Large Dump .... #### DHANK 

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
    
    # 
    '''
    combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become
expected to approve
expected to make
intends to make
seek to set
like to see
designed to provide
get to hear
expects to tell
expected to give
prefer to pay
required to obtain
permitted to teach
designed to reduce
Asked to elaborate

'''
    
    

IndentationError: expected an indented block (<ipython-input-17-0dcf545f4eb6>, line 12)

In [18]:
# Noun to Noun 

from nltk.corpus import brown
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk.trigrams(sentence):             ### [1]
        if (t1.startswith('NN') and t2 == 'TO' and t3.startswith('NN')):    ### [2]
            print(w1, w2, w3)                ### [3] --- Works OK -- Dont Uncomment Large Dump .... #### DHANK 

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
    
    #  3 Only ? 
    

exposure to group
respite to hurry
urgings to date


In [None]:
#  Once we start doing part-of-speech tagging, we will be creating programs 
# that assign a tag to a word, the tag which is most likely in a given context #
## CONTEXT is Important 



In [30]:
# Py DICTS 

pos = {}
print(pos)
print(type(pos))            # <class 'dict'>
pos['colorless'] = 'ADJ'
print(pos)
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
print(pos)
#
print(list(pos))           # Dict to LIST 
pos_list = list(pos)
print(type(pos_list))      # <class 'list'>
print(sorted(pos))
print([w for w in pos if w.endswith('as')])
print([w for w in pos if w.startswith('fu')])
# 
# dictionary methods keys(), values() and items() 
# 
print(list(pos.keys()))
#
print(list(pos.values()))
#
print(list(pos.items()))
#
pos['sleep'] = ['N', 'V'] # Sleep == Noun + Verb , here the ['N', 'V'] IS LIST ASSIGNED AS VALUES IN KEY VALUES for DICT
#
print(list(pos.items()))
#In fact, this is what we saw in 4 for the CMU Pronouncing Dictionary, which stores multiple pronunciations for a single word.

{}
<class 'dict'>
{'colorless': 'ADJ'}
{'colorless': 'ADJ', 'sleep': 'V', 'ideas': 'N', 'furiously': 'ADV'}
['colorless', 'sleep', 'ideas', 'furiously']
<class 'list'>
['colorless', 'furiously', 'ideas', 'sleep']
['ideas']
['furiously']
['colorless', 'sleep', 'ideas', 'furiously']
['ADJ', 'V', 'N', 'ADV']
[('colorless', 'ADJ'), ('sleep', 'V'), ('ideas', 'N'), ('furiously', 'ADV')]
[('colorless', 'ADJ'), ('sleep', ['N', 'V']), ('ideas', 'N'), ('furiously', 'ADV')]


In [31]:
# We have seen above that DICT - VALUES can be LISTS like --  ['N', 'V']
# BUT we cant have LISTS as KEY's . LISTS are MUTABLE ...Any Data Structure that is MUTABLE is NOT HASHABLE
# Note that dictionary keys must be immutable types, such as strings and tuples. 
# If we try to define a dictionary using a mutable key, we get a TypeError: unhashable type: 'list'


pos = {['ideas', 'blogs', 'adventures']: 'N'}

# http://stackoverflow.com/questions/6754102/typeerror-unhashable-type
# http://stackoverflow.com/questions/6754102/typeerror-unhashable-type


TypeError: unhashable type: 'list'

In [None]:
# TBD -- 

In [None]:
# TBD -- 

In [None]:
# TBD -- 

In [None]:
# TBD -- 

In [None]:
# TBD -- 

In [33]:
# 4   Automatic Tagging
#
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
# 
# We find the Most Frequent TAG is - NN ( NOUN) - noun, singular ... and Assign it as the DEFAULT TAG 
#
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()

'NN'

In [36]:
from nltk import pos_tag, word_tokenize

raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [38]:
# 4.2   The Regular Expression Tagger
# 
# Note that these are processed in order, and the first one that matches is applied.
#

patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
     (r'.*es$', 'VBZ'),                # 3rd singular present
     (r'.*ould$', 'MD'),               # modals
     (r'.*\'s$', 'NN$'),               # possessive nouns
     (r'.*s$', 'NNS'),                 # plural nouns
     (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'.*', 'NN')                     # nouns (default)
 ]

## DHANKAR--- 
# Seen above -- The final regular expression «.*» is a catch-all that tags everything as a noun.
# Similary in ELIZA or any other BOT the Last Regex Search is a CATCH ALL -- after all other possibilities have been 
# explored --- we cant have the CATCH All come earlier as it will CATCH All and leave nothing for others . 

In [39]:
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245

In [40]:
# 4.3   The Lookup Tagger

'''
A lot of high-frequency words do not have the NN tag. Let's find the hundred most frequent words and store their most likely tag. 
We can then use this information as the model for a "lookup tagger" (an NLTK UnigramTagger):
'''

fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)

likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)     ####  HOW ??? 

baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)      # Eval tagger against the BROWN Corpus - tagged Sentences 


0.45578495136941344

In [2]:
%matplotlib inline

from matplotlib import pyplot as plt

def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [w for (w, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
    
    # DHANK -- Check why not plotting ?? 

In [6]:
# N Gram -- 5.1   Unigram Tagging
#

from nltk.corpus import brown

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(brown_sents[2007]))

#

unigram_tagger.evaluate(brown_tagged_sents)
#
# 0.9349006503968017 --- As both TRAIN and TEST are TRAIN ONLY -- We need to create a TEST from BROWN CORPUS

[('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ('apartments', 'NNS'), ('are', 'BER'), ('of', 'IN'), ('the', 'AT'), ('terrace', 'NN'), ('type', 'NN'), (',', ','), ('being', 'BEG'), ('on', 'IN'), ('the', 'AT'), ('ground', 'NN'), ('floor', 'NN'), ('so', 'QL'), ('that', 'CS'), ('entrance', 'NN'), ('is', 'BEZ'), ('direct', 'JJ'), ('.', '.')]


0.9349006503968017

In [8]:
size = int(len(brown_tagged_sents) * 0.9)
print("Length of the Brown Tagged Sentences =",size)

print(type(brown_tagged_sents)) # NLTK Specific Data Struct:- <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>

train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))


Length of the Brown Tagged Sentences = 4160
<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
0.8130170437556065


In [None]:
# TBD -------- 
#
# Backoff is a method for combining models: when a more specialized model (such as a bigram tagger) cannot 
# assign a tag in a given context, we backoff to a more general model (such as a unigram tagger).