### Using inbuilt NERs

In [1]:
import nltk


In [2]:
def sampleNE():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent))

In [None]:
def sampleNE2():
    sent = nltk.corpus.treebank.tagged_sents()[0]
    print(nltk.ne_chunk(sent, binary = True))

In [4]:
sampleNE()

(S
  (PERSON Pierre/NNP)
  (ORGANIZATION Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


In [5]:
sampleNE2()

(S
  (NE Pierre/NNP Vinken/NNP)
  ,/,
  61/CD
  years/NNS
  old/JJ
  ,/,
  will/MD
  join/VB
  the/DT
  board/NN
  as/IN
  a/DT
  nonexecutive/JJ
  director/NN
  Nov./NNP
  29/CD
  ./.)


### Creating, inversing, and using dictionaries

In [6]:
class LearningDictionary():
    def __init__(self,sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary()
        
    def buildDictionary(self):
        self.dictionary = {}
        for (word,pos) in self.tagged:
            self.dictionary[word] = pos
    
    def buildReverseDictionary(self):
        self.rdictionary = {}
        for key in self.dictionary.keys():
            value = self.dictionary[key]
            if value not in self.rdictionary:
                self.rdictionary[value] = [key]
            else:
                self.rdictionary[value].append(key)
    
    def isWordPresent(self,word):
        return 'Yes' if word in self.dictionary else 'No'
    
    def getPOSForWord(self,word):
        return self.dictionary[word] if word in self.dictionary else None

    def getWordsForPos(self, pos):
        return self.rdictionary[pos] if pos in self.rdictionary else None

In [7]:
sentence = "All the flights got delayed due to bad weather"
learning = LearningDictionary(sentence)
words = ["chair", "flights", "delayed", "pencil", "weather"]
pos = ["NN","VBS", "NNS"]

In [18]:
for word in words:
    status = learning.isWordPresent(word)
    print("Is '{}' present in dictionary? : '{}'".format(word,status))
    if status == 'Yes':
        print("\tPOS for '{}' is {}".format(word, learning.getPOSForWord(word)))
    print("\n")

Is 'chair' present in dictionary? : 'No'


Is 'flights' present in dictionary? : 'Yes'
	POS for 'flights' is NNS


Is 'delayed' present in dictionary? : 'Yes'
	POS for 'delayed' is VBN


Is 'pencil' present in dictionary? : 'No'


Is 'weather' present in dictionary? : 'Yes'
	POS for 'weather' is NN




In [19]:
for pword in pos:
    print("POS '{}' has '{}' words".format(pword,learning.getWordsForPos(pword)))

POS 'NN' has '['weather']' words
POS 'VBS' has 'None' words
POS 'NNS' has '['flights']' words


### Choosing the feature set

In [20]:
import random

In [23]:
sampledata =[
    ('KA-01-F 1034 A' , 'rtc'),
    ('KA-02-F 1030 B' , 'rtc'),
    ('KA-03-FA 1200 C' , 'rtc'),
    ('KA-01-G 0001 A' , 'gov'),
    ('KA-02-G 1004 A' , 'gov'),
    ('KA-03-G 0204 A' , 'gov'),
    ('KA-04-G 9230 A' , 'gov'),
    ('KA-27-F 1290' , 'oth')
]

In [24]:
random.shuffle(sampledata)

In [26]:
testdata =[
    'KA-01-G 0109',
    'KA-02-F 9020 AC',
    'KA-02-FA 0801',
    'KA-01 9129'
]

In [33]:
def learnSimpleFeatures():
    def vehicleNumberFeature(vnumber):
        return {'vehicle_class': vnumber[6]}
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn,cls) in sampledata]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(simple) %s is of type %s" %(num, classifier.classify(feature)))

In [34]:
def learnFeatures():
    def vehicleNumberFeature(vnumber):
        return{
            'vehicle_class' : vnumber[6],
            'vehicle_prev': vnumber[5]
        }
    featuresets = [(vehicleNumberFeature(vn), cls) for (vn,cls) in sampledata]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    for num in testdata:
        feature = vehicleNumberFeature(num)
        print("(dual) %s is of type %s" %(num, classifier.classify(feature)))

In [35]:
learnSimpleFeatures()

[({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': 'G'}, 'gov'), ({'vehicle_class': 'F'}, 'oth'), ({'vehicle_class': 'F'}, 'rtc'), ({'vehicle_class': 'G'}, 'gov')]
(simple) KA-01-G 0109 is of type gov
(simple) KA-02-F 9020 AC is of type rtc
(simple) KA-02-FA 0801 is of type rtc
(simple) KA-01 9129 is of type gov


In [36]:
learnFeatures()

(dual) KA-01-G 0109 is of type gov
(dual) KA-02-F 9020 AC is of type rtc
(dual) KA-02-FA 0801 is of type rtc
(dual) KA-01 9129 is of type oth


### Segmenting sentences using classification

In [44]:
def featureExtractor(words, i):
    return({'current-word': words[i],
           'next-is-upper': words[i+1][0].isupper()}, words[i+1][0].isupper())

In [38]:
def getFeaturessets(sentence):
    words = nltk.word_tokenize(sentence)
    featuresets = [featureExtractor(words, i) for i in range(1,len(words)-1) if words[i] == '.']
    return featuresets

In [48]:
def segmentTextAndPrintSentences(data):
    words = nltk.word_tokenize(data)
    for i in range(0, len(words)-1):
        if words[i] == '.':
            if classifier.classify(featureExtractor(words, i)[0]) == True:
                print(".")
            else:
                print(words[i], end = ' ')
        else:
            print("{}".format(words[i]), end = ' ')
    print(words[-1])

In [57]:
traindata = "India, officially the Republic of India (Bhārat Gaṇarājya),[e] is a country in South Asia. it is the seventh-largest country by area, the second-most populous country (with over 1.2 billion people), and the most populous democracy in the world. It is bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast. It shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the northeast; and Myanmar (Burma) and Bangladesh to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives. India's Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."

In [58]:
testdata = "The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE. In the following millennium, the oldest scriptures associated with Hinduism began to be composed. Social stratification, based on caste, emerged in the first millennium BCE, and Buddhism and Jainism arose. Early political consolidations took place under the Maurya and Gupta empires; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia. In the medieval era, Judaism, Zoroastrianism, Christianity, and Islam arrived, and Sikhism emerged, all adding to the region's diverse culture. Much of the north fell to the Delhi sultanate; the south was united under the Vijayanagara Empire. The economy expanded in the 17th century in the Mughal Empire. In the mid-18th century, the subcontinent came under British East India Company rule, and in the mid-19th under British crown rule. A nationalist movement emerged in the late 19th century, which later, under Mahatma Gandhi, was noted for nonviolent resistance and led to India's independence in 1947."

In [59]:
traindataset = getFeaturessets(traindata)

In [60]:
classifier = nltk.NaiveBayesClassifier.train(traindataset)

In [61]:
segmentTextAndPrintSentences(testdata)

The Indian subcontinent was home to the urban Indus Valley Civilisation of the 3rd millennium BCE .
In the following millennium , the oldest scriptures associated with Hinduism began to be composed .
Social stratification , based on caste , emerged in the first millennium BCE , and Buddhism and Jainism arose .
Early political consolidations took place under the Maurya and Gupta empires ; the later peninsular Middle Kingdoms influenced cultures as far as southeast Asia .
In the medieval era , Judaism , Zoroastrianism , Christianity , and Islam arrived , and Sikhism emerged , all adding to the region 's diverse culture .
Much of the north fell to the Delhi sultanate ; the south was united under the Vijayanagara Empire .
The economy expanded in the 17th century in the Mughal Empire .
In the mid-18th century , the subcontinent came under British East India Company rule , and in the mid-19th under British crown rule .
A nationalist movement emerged in the late 19th century , which later , u

### Classifying documents

In [62]:
import feedparser

In [64]:
urls = {
    'mlb': 'https://sports.yahoo.com/mlb/rss.xml',
    'nfl': 'https://sports.yahoo.com/nfl/rss.xml'
}

In [66]:
feedmap = {}
stopwords = nltk.corpus.stopwords.words('english')

In [67]:
def featureExtractor(words):
    features = {}
    for word in words:
        if word not in stopwords:
            features["word({})".format(word)] = True
    return features

In [72]:
sentences = []

In [74]:
for category in urls.keys():
    feedmap[category] = feedparser.parse(urls[category])
    print("downloading {}".format(urls[category]))
    for entry in feedmap[category]['entries']:
        data = entry['summary']
        words = data.split()
        sentences.append((category,words))

downloading https://sports.yahoo.com/mlb/rss.xml
downloading https://sports.yahoo.com/nfl/rss.xml


In [76]:
featuresets = [(featureExtractor(words), category) for category, words in sentences]
random.shuffle(featuresets)

In [77]:
total = len(featuresets)
off = total // 2
trainset = featuresets[off:]
testset = featuresets[:off]


In [78]:
classifier = nltk.NaiveBayesClassifier.train(trainset)

In [79]:
print(nltk.classify.accuracy(classifier,testset))

0.8823529411764706


In [80]:
classifier.show_most_informative_features(5)

Most Informative Features
            word(Eagles) = True              nfl : mlb    =     14.0 : 1.0
             word(Aaron) = True              mlb : nfl    =      7.4 : 1.0
               word(NFL) = True              nfl : mlb    =      6.4 : 1.0
              word(2022) = True              nfl : mlb    =      5.3 : 1.0
           word(season.) = True              nfl : mlb    =      5.3 : 1.0


In [82]:
for (i,entry) in enumerate(feedmap['nfl']['entries']):
    if i < 4:
        features = featureExtractor(entry['title'].split())
        category = classifier.classify(features)
        print('{} -> {}'.format(category, entry['summary']))

nfl -> Philadelphia came out flat as a two-touchdown favorite in Houston, but it pulled away in the second half behind a dominant defense to improve to 8-0.
mlb -> On this week's Bears Wire podcast, we discuss how Ryan Poles' moves at the trade deadline show he's prioritizing Justin Fields.
nfl -> As we enter Week 9, the midpoint of the season provides a checkpoint for those first-half surprises – from the good, to the bad, to the ugly.
nfl -> Coming off their loss to the San Francisco 49ers, the Rams face another stiff test against Tom Brady and the Tampa Bay Buccaneers. Who's the favorite?


In [83]:
for (i,entry) in enumerate(feedmap['mlb']['entries']):
    if i < 4:
        features = featureExtractor(entry['title'].split())
        category = classifier.classify(features)
        print('{} -> {}'.format(category, entry['summary']))

mlb -> Justin Verlander finally earned his first World Series victory as Houston held on for a 3-2 win in Philadelphia. The Astros are going home with a 3-2 lead in the series.
mlb -> Justin Verlander's teammates gave him the rookie treatment after the game and doused him with all sorts of stuff after his first Fall Classic win.
mlb -> Albert Pujols is set to fulfill his personal-services contract with the Angels. What does that mean for the Angels and his association with the Cardinals?
mlb -> Justin Verlander, Jeremy Peña could be forever linked in Astros' history after leading Houston to a Game 5 win vs. the Phillies.


### Writing a POS tagger with context

In [85]:
sentences = [
"What is your address when you're in Bangalore?",
"the president's address on the state of the economy.",
"He addressed his remarks to the lawyers in the audience.",
"In order to address an assembly, we should be ready",
"He laughed inwardly at the scene.",
"After all the advance publicity, the prizefight turned out to be a laugh.",
"We can learn to laugh a little at even our most serious foibles."
]

In [87]:
def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    return sentwords

In [89]:
def noContextTagger():
    tagger = nltk.UnigramTagger(getSentenceWords())
    print(tagger.tag('the little remarks towards assembly are laughable'.split()))

In [90]:
def withContextTagger():
    def wordFeatures(words, wordPosInSentence):
        endFeatures = {
            'last(1)': words[wordPosInSentence][-1],
            'last(2)': words[wordPosInSentence][-2:],
            'last(3)': words[wordPosInSentence][-3:],
        }
        if wordPosInSentence > 1:
            endFeatures['prev'] = words[wordPosInSentence - 1]
        else:
            endFeatures['prev'] = '|NONE|'
        return endFeatures
    allsentences = getSentenceWords()
    featureddata = []
    for sentence in allsentences:
        untaggedSentence = nltk.tag.untag(sentence)
        featuredsentence = [(wordFeatures(untaggedSentence, index), tag) for
        index, (word, tag) in enumerate(sentence)]
        featureddata.extend(featuredsentence)
    breakup = int(len(featureddata) * 0.5)
    traindata = featureddata[breakup:]
    testdata = featureddata[:breakup]
    classifier = nltk.NaiveBayesClassifier.train(traindata)
    print("Accuracy of the classifier : {}".format(nltk.classify.accuracy(classifier, testdata)))

In [92]:
noContextTagger()


[('the', 'DT'), ('little', 'JJ'), ('remarks', 'NNS'), ('towards', None), ('assembly', 'NN'), ('are', None), ('laughable', None)]


In [93]:
withContextTagger()

Accuracy of the classifier : 0.46153846153846156
