### Using inbuilt NERs

In [1]:
import nltk

In [2]:
def sampleNE():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent))

In [3]:
def sampleNE2():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent, binary=True))

In [4]:
if __name__ == '__main__':
    sampleNE()
    sampleNE2()

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)
(S
  (NE Pierre/NNP Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


### Creating, inversing, and using dictionaries

In [5]:
class LearningDictionary():
    def __init__(self, sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary()

    def buildDictionary(self):
        self.dictionary = {}
        for (word, pos) in self.tagged:
            self.dictionary[word] = pos

    def buildReverseDictionary(self):
        self.rdictionary = {}
        for key in self.dictionary.keys():
            value = self.dictionary[key]
            if value not in self.rdictionary:
                self.rdictionary[value] = [key]
            else:
                self.rdictionary[value].append(key)

    def isWordPresent(self, word):
        return 'Yes' if word in self.dictionary else 'No'


    def getPOSForWord(self, word):
        return self.dictionary[word] if word in self.dictionary else None

    def getWordsForPOS(self, pos):
        return self.rdictionary[pos] if pos in self.rdictionary else None

In [6]:
sentence = "All the flights got delayed due to bad weather"

In [7]:
learning = LearningDictionary(sentence)

In [8]:
words = ["chair", "flights", "delayed", "pencil", "weather"]

In [9]:
pos = ["NN", "VBS", "NNS"]

In [10]:
for word in words:
    status = learning.isWordPresent(word)
    print("Is '{}' present in dictionary ? : '{}'".format(word, status))
    if status is True:
        print("\tPOS For '{}' is '{}'".format(word, learning.getPOSForWord(word)))

Is 'chair' present in dictionary ? : 'No'
Is 'flights' present in dictionary ? : 'Yes'
Is 'delayed' present in dictionary ? : 'Yes'
Is 'pencil' present in dictionary ? : 'No'
Is 'weather' present in dictionary ? : 'Yes'


In [11]:
for pword in pos:
    print("POS '{}' has '{}' words".format(pword, learning.getWordsForPOS(pword)))

POS 'NN' has '['weather']' words
POS 'VBS' has 'None' words
POS 'NNS' has '['flights']' words


### Choosing the feature set

In [12]:
import random

In [13]:
sampledata = [
    ('KA-01-F 1034 A', 'rtc'),
    ('KA-02-F 1030 B', 'rtc'),
    ('KA-03-FA 1200 C', 'rtc'),
    ('KA-01-G 0001 A', 'gov'),
    ('KA-02-G 1004 A', 'gov'),
    ('KA-03-G 0204 A', 'gov'),
    ('KA-04-G 9230 A', 'gov'),
    ('KA-27 1290', 'oth')
]

In [14]:
random.shuffle(sampledata)

In [15]:
testdata = [
    'KA-01-G 0109',
    'KA-02-F 9020 AC',
    'KA-02-FA 0801',
    'KA-01 9129'
]

In [16]:
def learnSimpleFeatures():
    def vehicleNumberFeature(vnumber):
        return {'vehicle_class': vnumber[6]}
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn, cls) in sampledata]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(simple) %s is of type %s" %(num, classifier.classify(feature)))

In [17]:
def learnFeatures():
    def vehicleNumberFeature(vnumber):
        return {
            'vehicle_class': vnumber[6],
            'vehicle_prev': vnumber[5]
        }
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn, cls) in sampledata]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(dual) %s is of type %s" %(num, classifier.classify(feature)))

In [18]:
learnSimpleFeatures()
learnFeatures()

(simple) KA-01-G 0109 is of type gov
(simple) KA-02-F 9020 AC is of type rtc
(simple) KA-02-FA 0801 is of type rtc
(simple) KA-01 9129 is of type gov
(dual) KA-01-G 0109 is of type gov
(dual) KA-02-F 9020 AC is of type rtc
(dual) KA-02-FA 0801 is of type rtc
(dual) KA-01 9129 is of type oth


### Segmenting sentences using classification

In [19]:
def featureExtractor(words, i):
    return ({'current-word': words[i], 'next-is-upper': words[i+1][0].isupper()}, words[i+1][0].isupper())

In [20]:
def getFeaturesets(sentence):
    words = nltk.word_tokenize(sentence)
    featuresets = [featureExtractor(words, i) for i in range(1, len(words) - 1) if words[i] == '.']
    return featuresets

In [21]:
def segmentTextAndPrintSentences(data):
    words = nltk.word_tokenize(data)
    for i in range(0, len(words) - 1):
        if words[i] == '.':
            if classifier.classify(featureExtractor(words, i)[0]) == True:
                print(".")
            else:
                print(words[i], end='')
        else:
            print("{} ".format(words[i]), end='')
    print(words[-1])

In [22]:
traindata = "India, officially the Republic of India (Bhārat Gaṇarājya),[e] is a country in South Asia. it is the seventh-largest country by area, the second-most populous country (with over 1.2 billion people), and the most populous democracy in the world. It is bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast. It shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the northeast; and Myanmar (Burma) and Bangladesh to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives. India's Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."
testdata = "The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE. In the following millennium, the oldest scriptures associated with Hinduism began to be composed. Social stratification, based on caste, emerged in the first millennium BCE, and Buddhism and Jainism arose. Early political consolidations took place under the Maurya and Gupta empires; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia. In the medieval era, Judaism, Zoroastrianism, Christianity, and Islam arrived, and Sikhism emerged, all adding to the region's diverse culture. Much of the north fell to the Delhi sultanate; the south was united under the Vijayanagara Empire. The economy expanded in the 17th century in the Mughal Empire. In the mid-18th century, the subcontinent came under British East India Company rule, and in the mid-19th under British crown rule. A nationalist movement emerged in the late 19th century, which later, under Mahatma Gandhi, was noted for nonviolent resistance and led to India's independence in 1947."

In [23]:
traindataset = getFeaturesets(traindata)

In [24]:
classifier = nltk.NaiveBayesClassifier.train(traindataset)

In [25]:
segmentTextAndPrintSentences(testdata)

The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE .
In the following millennium , the oldest scriptures associated with Hinduism began to be composed .
Social stratification , based on caste , emerged in the first millennium BCE , and Buddhism and Jainism arose .
Early political consolidations took place under the Maurya and Gupta empires ; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia .
In the medieval era , Judaism , Zoroastrianism , Christianity , and Islam arrived , and Sikhism emerged , all adding to the region 's diverse culture .
Much of the north fell to the Delhi sultanate ; the south was united under the Vijayanagara Empire .
The economy expanded in the 17th century in the Mughal Empire .
In the mid-18th century , the subcontinent came under British East India Company rule , and in the mid-19th under British crown rule .
A nationalist movement emerged in the late 19th century , which later , u

### Writing a POS tagger with context

In [26]:
sentences = [
    "What is your address when you're in Bangalore?",
    "the president's address on the state of the economy.",
    "He addressed his remarks to the lawyers in the audience.",
    "In order to address an assembly, we should be ready",
    "He laughed inwardly at the scene.",
    "After all the advance publicity, the prizefight turned out to be a laugh.",
    "We can learn to laugh a little at even our most serious foibles."
]

In [27]:
def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    return sentwords

In [28]:
def noContextTagger():
    tagger = nltk.UnigramTagger(getSentenceWords())
    print(tagger.tag('the little remarks towards assembly are laughable'.split()))

In [29]:
def withContextTagger():
    def wordFeatures(words, wordPosInSentence):
        endFeatures = {
            'last(1)': words[wordPosInSentence][-1],
            'last(2)': words[wordPosInSentence][-2:],
            'last(3)': words[wordPosInSentence][-3:],
        }
        if wordPosInSentence > 1:
            endFeatures['prev'] = words[wordPosInSentence - 1]
        else:
            endFeatures['prev'] = '|NONE|'
        return endFeatures
    allsentences = getSentenceWords()
    featureddata = []
    for sentence in allsentences:
        untaggedSentence = nltk.tag.untag(sentence)
        featuredsentence = [(wordFeatures(untaggedSentence, index), tag) for index, (word, tag) in enumerate(sentence)]
        featureddata.extend(featuredsentence)
    breakup = int(len(featureddata) * 0.5)
    traindata = featureddata[breakup:]
    testdata = featureddata[:breakup]
    classifier = nltk.NaiveBayesClassifier.train(traindata)
    print("Accuracy of the classifier : {}".format(nltk.classify.accuracy(classifier, testdata)))

In [30]:
noContextTagger()
withContextTagger()

[('the', 'DT'), ('little', 'JJ'), ('remarks', 'NNS'), ('towards', None), ('assembly', 'NN'), ('are', None), ('laughable', None)]
Accuracy of the classifier : 0.46153846153846156
