### Exploring the in-built tagger

In [2]:
import nltk
simpleSentence = "Bangalore is the capital of Karnataka."
wordsInSentence =nltk.word_tokenize(simpleSentence)
print(wordsInSentence)
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)
print(partsOfSpeechTags)

['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka', '.']
[('Bangalore', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Karnataka', 'NNP'), ('.', '.')]


### Writing your own tagger

In [4]:
def learnDefaultTagger(simpleSentence):
    wordsInSentence =nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)


In [5]:
def learnRETagger(simpleSentence):
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),
        (r'.*ly$', 'ADVERB'),
        (r'.*ion$', 'NOUN'),
        (r'(.*ate|.*en|is)$', 'VERB'),
        (r'^an$', 'INDEFINITE-ARTICLE'),
        (r'^(with|on|at)$', 'PREPOSITION'),
        (r'^\-?[0-9]+(\.[0-9]+)$', 'NUMBER'),
        (r'.*$', None),        
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)
    

In [6]:
def learnLookupTagger(simpleSentence):
    mapping = {
        '.':'.',
        'place': 'NN',
        'on': 'IN',
        'earth': 'NN',
        'Mysore' : 'NNP',
        'is': 'VBZ',
        'an': 'DT',
        'amazing': 'JJ'
    }
    tagger = nltk.UnigramTagger(model = mapping)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [8]:
testSentence = "Mysore is an amazing place on earth. I have visited Mysore 10 times."
learnDefaultTagger(testSentence)
print("\n")
learnRETagger(testSentence)
print("\n")
learnLookupTagger(testSentence)


[('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]


[('Mysore', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', None), ('times', None), ('.', None)]


[('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]


### Training your own tagger

In [9]:
import pickle


In [19]:
def sampleData():
    return [
        "Bangalore is the capital of Karnataka.",
        "Steve Jobs was the CEO of Apple",
        "iPhone was Invented by Apple",
        "Books can be purchased in Market",
    ]

In [20]:
def buildDictionary():
    dict = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag [1]
            dict[value] = pos
    return dict


In [21]:
def saveMyTagger(tagger, fileName):
    fileHandle = open(fileName, "wb")
    pickle.dump(tagger, fileHandle)
    fileHandle.close()

In [22]:
def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger(model = buildDictionary())
    saveMyTagger(tagger, fileName)

In [25]:
def loadMyTagger(fileName):
    return pickle.load(open(fileName, "rb"))

In [26]:
sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'
fileName = 'myTagger.pickle'

saveMyTraining(fileName)
myTagger = loadMyTagger(fileName)
print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NN')]


### Learning to write your own grammar

In [49]:
import string
from nltk.parse.generate import generate
import sys

In [50]:
productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER",
]

In [51]:
digits = list(string.digits)
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w = digit))

letters = "' | '".join(list(string.ascii_lowercase[:4]))
productions.append("LETTER -> '{}'".format(letters))


In [52]:
grammarString = "\n".join(productions)
grammar = nltk.CFG.fromstring(grammarString)
print(grammar)

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'


In [53]:
for sentence in generate(grammar, n=5, depth = 5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Generated Word: {} , size : {}".format(palindrome,len(palindrome)))

Generated Word:  , size : 0
Generated Word: 0a , size : 2
Generated Word: 0b , size : 2
Generated Word: 0c , size : 2
Generated Word: 0d , size : 2


### Writing a probabilistic CFG

In [54]:
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]

In [55]:
grammarString = '\n'.join(productions)
grammar = nltk.PCFG.fromstring(grammarString)
print(grammar)

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]


In [57]:
for sentence in generate(grammar, n=10, depth = 5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Generated Word: {} , size : {}".format(palindrome,len(palindrome)))

Generated Word: A , size : 1
Generated Word: AB , size : 2
Generated Word: AC , size : 2
Generated Word: ABD , size : 3
Generated Word: ABE , size : 3
Generated Word: ABF , size : 3
Generated Word: ACD , size : 3
Generated Word: ACE , size : 3
Generated Word: ACF , size : 3
Generated Word: ABDG , size : 4
