### Exploring the in-built tagger

In [1]:
import nltk

In [2]:
simpleSentence = "Bangalore is the capital of Karnataka."

In [3]:
wordsInSentence = nltk.word_tokenize(simpleSentence)

In [4]:
print(wordsInSentence)

['Bangalore', 'is', 'the', 'capital', 'of', 'Karnataka', '.']


In [5]:
partsOfSpeechTags = nltk.pos_tag(wordsInSentence)

In [6]:
print(partsOfSpeechTags)

[('Bangalore', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('Karnataka', 'NNP'), ('.', '.')]


### Writing your own tagger

In [7]:
def learnDefaultTagger(simpleSentence):
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    tagger = nltk.DefaultTagger("NN")
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [8]:
def learnRETagger(simpleSentence):
    customPatterns = [
        (r'.*ing$', 'ADJECTIVE'),
        (r'.*ly$', 'ADVERB'),
        (r'.*ion$', 'NOUN'),
        (r'(.*ate|.*en|is)$', 'VERB'),
        (r'^an$', 'INDEFINITE-ARTICLE'),
        (r'^(with|on|at)$', 'PREPOSITION'),
        (r'^\-?[0-9]+(\.[0-9]+)$', 'NUMBER'),
        (r'.*$', None),
    ]
    tagger = nltk.RegexpTagger(customPatterns)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [9]:
def learnLookupTagger(simpleSentence):
    mapping = {
        '.': '.', 'place': 'NN', 'on': 'IN',
        'earth': 'NN', 'Mysore' : 'NNP', 'is': 'VBZ',
        'an': 'DT', 'amazing': 'JJ'
    }
    tagger = nltk.UnigramTagger(model=mapping)
    wordsInSentence = nltk.word_tokenize(simpleSentence)
    posEnabledTags = tagger.tag(wordsInSentence)
    print(posEnabledTags)

In [10]:
if __name__ == '__main__':
    testSentence = "Mysore is an amazing place on earth. I have visited Mysore 10 times."
    learnDefaultTagger(testSentence)
    learnRETagger(testSentence)
    learnLookupTagger(testSentence)

[('Mysore', 'NN'), ('is', 'NN'), ('an', 'NN'), ('amazing', 'NN'), ('place', 'NN'), ('on', 'NN'), ('earth', 'NN'), ('.', 'NN'), ('I', 'NN'), ('have', 'NN'), ('visited', 'NN'), ('Mysore', 'NN'), ('10', 'NN'), ('times', 'NN'), ('.', 'NN')]
[('Mysore', None), ('is', 'VERB'), ('an', 'INDEFINITE-ARTICLE'), ('amazing', 'ADJECTIVE'), ('place', None), ('on', 'PREPOSITION'), ('earth', None), ('.', None), ('I', None), ('have', None), ('visited', None), ('Mysore', None), ('10', None), ('times', None), ('.', None)]
[('Mysore', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('place', 'NN'), ('on', 'IN'), ('earth', 'NN'), ('.', '.'), ('I', None), ('have', None), ('visited', None), ('Mysore', 'NNP'), ('10', None), ('times', None), ('.', '.')]


### Training your own tagger

In [11]:
import pickle

In [12]:
def sampleData():
    return [
        "Bangalore is the capital of Karnataka.",
        "Steve Jobs was the CEO of Apple.",
        "iPhone was Invented by Apple.",
        "Books can be purchased in Market.",
    ]

In [13]:
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary

In [14]:
def saveMyTagger(tagger, fileName):
    fileHandle = open(fileName, "wb")
    pickle.dump(tagger, fileHandle)
    fileHandle.close()

In [15]:
def saveMyTraining(fileName):
    tagger = nltk.UnigramTagger(model=buildDictionary())
    saveMyTagger(tagger, fileName)

In [16]:
def loadMyTagger(fileName):
    return pickle.load(open(fileName, "rb"))

In [17]:
sentence = 'Iphone is purchased by Steve Jobs in Bangalore Market'
fileName = "myTagger.pickle"

In [18]:
saveMyTraining(fileName)

In [19]:
myTagger = loadMyTagger(fileName)

In [20]:
print(myTagger.tag(nltk.word_tokenize(sentence)))

[('Iphone', None), ('is', 'VBZ'), ('purchased', 'VBN'), ('by', 'IN'), ('Steve', 'NNP'), ('Jobs', 'NNP'), ('in', 'IN'), ('Bangalore', 'NNP'), ('Market', 'NNP')]


### Learning to write your own grammar

In [21]:
import string

In [22]:
from nltk.parse.generate import generate

In [23]:
productions = [
    "ROOT -> WORD",
    "WORD -> ' '",
    "WORD -> NUMBER LETTER",
    "WORD -> LETTER NUMBER",
]

In [24]:
digits = list(string.digits)
for digit in digits[:4]:
    productions.append("NUMBER -> '{w}'".format(w=digit))

In [25]:
letters = "' | '".join(list(string.ascii_lowercase)[:4])
productions.append("LETTER -> '{w}'".format(w=letters))

In [26]:
grammarString = "\n".join(productions)

In [27]:
grammar = nltk.CFG.fromstring(grammarString)

In [28]:
print(grammar)

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> NUMBER LETTER
    WORD -> LETTER NUMBER
    NUMBER -> '0'
    NUMBER -> '1'
    NUMBER -> '2'
    NUMBER -> '3'
    LETTER -> 'a'
    LETTER -> 'b'
    LETTER -> 'c'
    LETTER -> 'd'


In [29]:
for sentence in generate(grammar, n=5, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Generated Word: {}, Size : {}".format(palindrome, len(palindrome)))

Generated Word: , Size : 0
Generated Word: 0a, Size : 2
Generated Word: 0b, Size : 2
Generated Word: 0c, Size : 2
Generated Word: 0d, Size : 2


### Writing a probabilistic CFG

In [30]:
from nltk.parse.generate import generate

In [32]:
productions = [
    "ROOT -> WORD [1.0]",
    "WORD -> P1 [0.25]",
    "WORD -> P1 P2 [0.25]",
    "WORD -> P1 P2 P3 [0.25]",
    "WORD -> P1 P2 P3 P4 [0.25]",
    "P1 -> 'A' [1.0]",
    "P2 -> 'B' [0.5]",
    "P2 -> 'C' [0.5]",
    "P3 -> 'D' [0.3]",
    "P3 -> 'E' [0.3]",
    "P3 -> 'F' [0.4]",
    "P4 -> 'G' [0.9]",
    "P4 -> 'H' [0.1]",
]

In [33]:
grammarString = "\n".join(productions)

In [34]:
grammar = nltk.PCFG.fromstring(grammarString)

In [35]:
print(grammar)

Grammar with 13 productions (start state = ROOT)
    ROOT -> WORD [1.0]
    WORD -> P1 [0.25]
    WORD -> P1 P2 [0.25]
    WORD -> P1 P2 P3 [0.25]
    WORD -> P1 P2 P3 P4 [0.25]
    P1 -> 'A' [1.0]
    P2 -> 'B' [0.5]
    P2 -> 'C' [0.5]
    P3 -> 'D' [0.3]
    P3 -> 'E' [0.3]
    P3 -> 'F' [0.4]
    P4 -> 'G' [0.9]
    P4 -> 'H' [0.1]


In [36]:
for sentence in generate(grammar, n=10, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("String : {}, Size : {}".format(palindrome, len(palindrome)))

String : A, Size : 1
String : AB, Size : 2
String : AC, Size : 2
String : ABD, Size : 3
String : ABE, Size : 3
String : ABF, Size : 3
String : ACD, Size : 3
String : ACE, Size : 3
String : ACF, Size : 3
String : ABDG, Size : 4


### Writing a recursive CFG

In [37]:
productions = [
    "ROOT -> WORD",
    "WORD -> ' '"
]

In [38]:
alphabets = list(string.digits)

In [39]:
for alphabet in alphabets:
    productions.append("WORD -> '{w}' WORD '{w}'".format(w=alphabet))

In [40]:
grammarString = "\n".join(productions)

In [41]:
grammar = nltk.CFG.fromstring(grammarString)

In [42]:
print(grammar)

Grammar with 12 productions (start state = ROOT)
    ROOT -> WORD
    WORD -> ' '
    WORD -> '0' WORD '0'
    WORD -> '1' WORD '1'
    WORD -> '2' WORD '2'
    WORD -> '3' WORD '3'
    WORD -> '4' WORD '4'
    WORD -> '5' WORD '5'
    WORD -> '6' WORD '6'
    WORD -> '7' WORD '7'
    WORD -> '8' WORD '8'
    WORD -> '9' WORD '9'


In [43]:
for sentence in generate(grammar, n=5, depth=5):
    palindrome = "".join(sentence).replace(" ", "")
    print("Palindrome : {}, Size : {}".format(palindrome, len(palindrome)))

Palindrome : , Size : 0
Palindrome : 00, Size : 2
Palindrome : 0000, Size : 4
Palindrome : 0110, Size : 4
Palindrome : 0220, Size : 4
