### Creating an NLP pipeline

In [1]:
import nltk
import threading
import queue
import feedparser
import uuid

In [3]:
# Thread objects created by the pipeline cells are collected here so they can
# all be joined at the end.
threads = []
# queues[0]: extractWords -> extractPOS, queues[1]: extractPOS -> extractNE.
queues = [queue.Queue(), queue.Queue()]

In [9]:
def extractWords():
    """Tokenize feed titles and push them onto queues[0].

    Only the first five feed entries are looked at, and a title containing
    the substring 'ex' is skipped. Every queued item is a dict holding a
    fresh 'uuid' and the token list under 'input'; after queuing, the uuid
    and the raw title are echoed to stdout.
    """
    feed_url = 'https://timesofindia.indiatimes.com/rssfeeds/1081479906.cms'
    parsed = feedparser.parse(feed_url)
    titles = (entry['title'] for entry in parsed['entries'][:5])
    for title in titles:
        if 'ex' not in title:
            tokens = nltk.word_tokenize(title)
            item = {'uuid': uuid.uuid4(), 'input': tokens}
            queues[0].put(item, True)
            print(">> {} : {}".format(item["uuid"], title))

In [10]:
def extractPOS():
    """Move every item from queues[0] to queues[1], POS-tagging it on the way.

    Runs until queues[0] is found empty. Each forwarded item keeps its
    'uuid'; its 'input' becomes the result of nltk.pos_tag on the tokens.
    """
    inbox = queues[0]
    while not inbox.empty():
        item = inbox.get()
        tagged = nltk.pos_tag(item['input'])
        # The item is marked done before it is handed to the next stage.
        inbox.task_done()
        queues[1].put({'uuid': item['uuid'], 'input': tagged}, True)
            

In [11]:
def extractNE():
    """Drain queues[1] and print the labelled chunks found in each item.

    Each item is a dict with 'uuid' and 'input' (the POS-tagged tokens put
    there by extractPOS). For every item one line is printed: the uuid,
    followed by each labelled chunk returned by nltk.ne_chunk, each
    terminated by a comma. Returns once queues[1] is found empty.
    """
    while not queues[1].empty():
        data = queues[1].get()
        postags = data['input']
        queues[1].task_done()
        chunks = nltk.ne_chunk(postags, binary=False)
        print(" << {} : ".format(data['uuid']), end='')
        for node in chunks:
            # Only chunk nodes expose label(); everything else is skipped.
            # An explicit check replaces the former bare `except: pass`, which
            # also hid unrelated errors (including KeyboardInterrupt).
            if hasattr(node, 'label'):
                print(node, end=',')
        print()

In [12]:
# Pass the function object itself. Writing extractWords() would run the stage
# in the calling thread and hand its return value (None) to Thread as target.
e = threading.Thread(target=extractWords)
e.start()
threads.append(e)
# extractPOS stops as soon as it finds queues[0] empty, so wait until every
# title has been queued before the next stage is started.
e.join()

>> 7c6ac89e-beb6-4001-b51d-4591f5634528 : How much fanaticism is too much? - #BigStory
>> 2a9e7866-b8d2-4810-b282-2ead4c6c2730 : Live: Phone Bhoot, Mili, Double XL BO Day 1
>> 8c9805c2-2d70-4ecc-8836-3897140dd09f : Bomma Blockbuster- Review: 2/5
>> c6acd8c5-26b0-4b5e-94f1-1ab7adb698b6 : Actresses who gained weight for films
>> 453e6313-34ac-4a79-826b-10cbefbb9707 : Movie Review: 'Mili'- 2.5/5


In [13]:
# Pass the function object itself. Writing extractPOS() would run the stage
# in the calling thread and hand its return value (None) to Thread as target.
p = threading.Thread(target=extractPOS)
p.start()
threads.append(p)
# extractNE stops as soon as it finds queues[1] empty, so wait until every
# item has been tagged and forwarded before the next stage is started.
p.join()

In [14]:
# Pass the function object itself. Writing extractNE() would run the stage
# in the calling thread and hand its return value (None) to Thread as target.
n = threading.Thread(target=extractNE)
n.start()
# No join here: the closing cell waits on both queues and on every thread
# collected in `threads`.
threads.append(n)

 << 7c6ac89e-beb6-4001-b51d-4591f5634528 : 
 << 2a9e7866-b8d2-4810-b282-2ead4c6c2730 : (PERSON Phone/NN Bhoot/NNP),(PERSON Mili/NNP),(PERSON Double/NNP XL/NNP BO/NNP),
 << 8c9805c2-2d70-4ecc-8836-3897140dd09f : (PERSON Bomma/NNP),
 << c6acd8c5-26b0-4b5e-94f1-1ab7adb698b6 : 
 << 453e6313-34ac-4a79-826b-10cbefbb9707 : (PERSON Movie/NNP),


In [15]:
# Block until every item put on each queue has been acknowledged with
# task_done() by its consumer stage.
queues[0].join()
queues[1].join()

# Then wait for every pipeline thread that was started to finish.
for t in threads:
    t.join()

### Solving the text similarity problem

In [17]:
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [19]:
class TextSimilarityExample:
    """TF-IDF scoring and cosine similarity over a small fixed set of statements.

    The TF, IDF and TF_IDF methods compute the scores by hand using NLTK
    tokenization; cosineSimilarity uses scikit-learn's TfidfVectorizer.
    """

    def __init__(self):
        # The corpus; the first statement doubles as the query in demo().
        self.statements = [
            'ruled india',
            'Chalukyas ruled Badami',
            'So many kingdoms ruled India',
            'Lalbagh is a botanical garden in India'
        ]

    def TF(self, sentence):
        """Return {token: count / number of tokens} for the lower-cased sentence."""
        words = nltk.word_tokenize(sentence.lower())
        freq = nltk.FreqDist(words)
        total = float(len(words))
        # With no tokens there are no keys, so the division is never reached.
        return {key: freq[key] / total for key in freq.keys()}

    def IDF(self):
        """Return {token: 1 + ln(N / df)} over all statements.

        N is the number of statements and df the number of statements that
        contain the token at least once.
        """
        def idf(TotalNumberOfDocuments, NumberOfDocumentsWithThisWord):
            return 1.0 + math.log(TotalNumberOfDocuments/NumberOfDocumentsWithThisWord)
        numDocuments = len(self.statements)
        documentFrequency = {}
        for sentence in self.statements:
            tokens = nltk.word_tokenize(sentence.lower())
            # dict.fromkeys de-duplicates while keeping first-seen order: a
            # statement must count once per token no matter how often it
            # repeats that token, otherwise df is inflated.
            for word in dict.fromkeys(tokens):
                documentFrequency[word] = documentFrequency.get(word, 0) + 1
        return {word: idf(numDocuments, count)
                for word, count in documentFrequency.items()}

    def TF_IDF(self, query):
        """Return {query token: [tf * idf for each statement, in order]}.

        A token missing from a statement (or from the whole corpus) scores
        0.0. A token repeated in the query is considered once, so every
        vector has exactly one entry per statement.
        """
        words = list(dict.fromkeys(nltk.word_tokenize(query.lower())))
        idf = self.IDF()
        vectors = {}
        for sentence in self.statements:
            tf = self.TF(sentence)
            for word in words:
                score = tf.get(word, 0.0) * idf.get(word, 0.0)
                vectors.setdefault(word, []).append(score)
        return vectors

    def displayVectors(self, vectors):
        """Print the statements followed by one 'token -> vector' line per token."""
        print(self.statements)
        for word in vectors:
            print("{} -> {}".format(word, vectors[word]))

    def cosineSimilarity(self):
        """Print each statement's cosine similarity against all statements."""
        vec = TfidfVectorizer()
        matrix = vec.fit_transform(self.statements)
        # Iterate over the actual number of rows instead of a hard-coded 4.
        for i in range(matrix.shape[0]):
            print("\tsimilarity of document {} with others".format(i))
            similarity = cosine_similarity(matrix[i:i + 1], matrix)
            print(similarity)

    def demo(self):
        """Score the first statement as a query, print the vectors, then the similarities."""
        inputQuery = self.statements[0]
        vectors = self.TF_IDF(inputQuery)
        self.displayVectors(vectors)
        self.cosineSimilarity()

In [20]:
# Build the example and print its hand-computed TF-IDF vectors followed by the
# scikit-learn cosine similarities.
similarity = TextSimilarityExample()
similarity.demo()

['ruled india', 'Chalukyas ruled Badami', 'So many kingdoms ruled India', 'Lalbagh is a botanical garden in India']
ruled -> [0.6438410362258904, 0.42922735748392693, 0.2575364144903562, 0.0]
india -> [0.6438410362258904, 0.0, 0.2575364144903562, 0.18395458177882582]
	similarity of document 0 with others
[[1.         0.29088811 0.46216171 0.19409143]]
	similarity of document 1 with others
[[0.29088811 1.         0.13443735 0.        ]]
	similarity of document 2 with others
[[0.46216171 0.13443735 1.         0.08970163]]
	similarity of document 3 with others
[[0.19409143 0.         0.08970163 1.        ]]


### Identifying topics

In [21]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models


In [28]:
class IdentifyingTopicExample:
    """Fetch a few RSS headlines, clean them and fit a two-topic LDA model."""

    def getDocuments(self):
        """Store up to five feed titles in self.documents, echoing each one.

        Titles containing the substring 'ex' are skipped.
        """
        url = 'https://sports.yahoo.com/mlb/rss.xml'
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:5]:
            text = entry['title']
            if 'ex' in text:
                continue
            self.documents.append(text)
            print("-- {}".format(text))
        print("INFO: Fetchng documents from {} completed".format(url))

    def cleanDocuments(self):
        """Lower-case, tokenize and stop-word-filter self.documents into self.cleaned."""
        # [a-zA-Z], not [a-zA-z]: the latter range also matches the ASCII
        # characters between 'Z' and 'a' ([, \, ], ^, _ and `).
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            words = tokenizer.tokenize(doc.lower())
            self.cleaned.append([word for word in words if word not in en_stop])

    def doLDA(self):
        """Fit a 2-topic LDA model on self.cleaned and print its topics."""
        # Nothing to model when no document produced any token (for example
        # when the feed returned no entries); say so instead of failing
        # inside the model constructor.
        if not any(self.cleaned):
            print("INFO: no tokens available, skipping LDA")
            return
        dictionary = corpora.Dictionary(self.cleaned)
        corpus = [dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned]
        ldamodel = models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary)
        print(ldamodel.print_topics(num_topics=2, num_words=4))

    def run(self):
        """Fetch, clean and model the documents, in that order."""
        self.getDocuments()
        self.cleanDocuments()
        self.doLDA()
        
        

In [29]:
# Fetch the headlines, clean them and print the two LDA topics.
topicExample = IdentifyingTopicExample()
topicExample.run()

-- Verlander's first win in World Series wasn't easy
-- Detroit Tigers LHP Joey Wentz is breakout candidate for 2023 after strong finish
-- Yanks’ Cashman plans to push ahead on Aaron Judge talks
-- Will free-agent starter Chris Bassitt be one and done with the Mets? | Baseball Night in NY
INFO: Fetchng documents from https://sports.yahoo.com/mlb/rss.xml completed
[(0, '0.050*"lhp" + 0.050*"detroit" + 0.050*"breakout" + 0.050*"finish"'), (1, '0.038*"one" + 0.038*"done" + 0.038*"ny" + 0.038*"chris"')]


### Resolving anaphora

In [35]:
from nltk.chunk import tree2conlltags
from nltk.corpus import names
import random

In [41]:
class AnaphoraExample:
    """Lists gender-tagged proper nouns next to conjunctions and pronouns.

    A Naive Bayes classifier keyed on a name's last letter supplies the
    gender guess for each proper noun.
    """

    def __init__(self):
        """Train the gender classifier on the NLTK names corpus."""
        labelled = [(name, 'male') for name in names.words('male.txt')]
        labelled += [(name, 'female') for name in names.words('female.txt')]
        random.shuffle(labelled)
        featuresets = [(self.feature(name), gender) for name, gender in labelled]
        self._classifier = nltk.NaiveBayesClassifier.train(featuresets)

    def feature(self, word):
        """Return the feature dict for a word: its final character."""
        last_char = word[-1]
        return {'last(1)': last_char}

    def gender(self, word):
        """Return the classifier's label for the given word."""
        features = self.feature(word)
        return self._classifier.classify(features)

    def learnAnaphora(self):
        """Print, per sample sentence, its proper nouns, conjunctions and pronouns.

        Proper nouns (tag NNP with IOB tag B-PERSON or O) are emitted as
        (word, gender) pairs; CC and PRP tokens are emitted as bare words.
        """
        sentences = [
            "John is a man. He walks",
            "John and Mary are married. They have two kids",
            "In order for Ravi to be successful, he should follow John",
            "John met Mary in Barista. She asked him to order a Pizza"
        ]

        for sent in sentences:
            tagged = nltk.pos_tag(nltk.word_tokenize(sent))
            tree = nltk.ne_chunk(tagged, binary=False)
            print(sent)
            mentions = []
            for token, pos, iob in tree2conlltags(tree):
                if pos == 'NNP' and iob in ('B-PERSON', 'O'):
                    mentions.append((token, self.gender(token)))
                elif pos in ('CC', 'PRP'):
                    mentions.append(token)
            print("\t {}".format(mentions))

In [42]:
# Constructing the example trains the gender classifier; learnAnaphora() then
# prints the tagged tokens for each sample sentence.
anaphora = AnaphoraExample()
anaphora.learnAnaphora()

John is a man. He walks
	 [('John', 'male'), 'He']
John and Mary are married. They have two kids
	 [('John', 'male'), 'and', ('Mary', 'female'), 'They']
In order for Ravi to be successful, he should follow John
	 [('Ravi', 'female'), 'he', ('John', 'male')]
John met Mary in Barista. She asked him to order a Pizza
	 [('John', 'male'), ('Mary', 'female'), 'She', 'him']


### Disambiguating word sense

In [44]:
def understandWordSenseExample():
    """Print up to two usage examples for each of the first two WordNet synsets
    of 'wind', 'date' and 'left'."""
    print("--examples--")
    for word in ('wind', 'date', 'left'):
        for sense in nltk.corpus.wordnet.synsets(word)[:2]:
            for usage in sense.examples()[:2]:
                print("{} -> {} -> {}".format(word, sense.name(), usage))

In [45]:
def understandBuiltinWSD():
    """Print the sense nltk.wsd.lesk picks for a target word in sample sentences.

    Each sample is (sentence, ambiguous word, part-of-speech letter).
    """
    print("--built in wsd")
    maps = [
        ('Is it the fish net that you are using to catch fish ?', 'fish',
        'n'),
        ('Please dont point your finger at others.', 'point', 'n'),
        ('I went to the river bank to see the sun rise', 'bank', 'n'),
    ]
    for sentence, word, pos in maps:
        # lesk() builds its context with set(context_sentence), so it needs a
        # sequence of words; a raw string would become a set of characters.
        context = nltk.word_tokenize(sentence)
        sense = nltk.wsd.lesk(context, word, pos)
        print("sense '{}' for '{}' -> '{}'".format(sentence, word, sense))

In [46]:
# First list WordNet usage examples, then run the built-in Lesk disambiguation.
understandWordSenseExample()
understandBuiltinWSD()

--examples--
wind -> wind.n.01 -> trees bent under the fierce winds
wind -> wind.n.01 -> when there is no wind, row
wind -> wind.n.02 -> the winds of change
date -> date.n.01 -> what is the date today?
date -> date.n.02 -> his date never stopped talking
left -> left.n.01 -> she stood on the left
--built in wsd
sense 'Is it the fish net that you are using to catch fish ?' for 'fish' -> 'Synset('pisces.n.02')'
sense 'Please dont point your finger at others.' for 'point' -> 'Synset('point.n.25')'
sense 'I went to the river bank to see the sun rise' for 'bank' -> 'Synset('savings_bank.n.02')'


### Performing sentiment analysis

In [81]:
import nltk.sentiment.util
import nltk.sentiment.sentiment_analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [90]:
def mySentimentAnalyzer():
    """Score each line of a fixed feedback text with a tiny rule-based scorer
    and print the score next to the line."""

    def score_feedback(text):
        """Return -1 if negation marking fires, 1 if a positive word occurs, else 0."""
        negated = nltk.sentiment.util.mark_negation(text.split())
        if '_NEG' in ''.join(negated):
            return -1
        positive_words = ['love', 'genuine', 'liked']
        feats = nltk.sentiment.util.extract_unigram_feats(text.split(), positive_words)
        return 1 if True in feats.values() else 0

    feedback = """I love the items in this shop, very genuine and quality is well maintained. 
    I have visited this shop and had samosa, my friends liked it very much. ok average food in this shop. 
    Fridays are very busy in this shop, do not place orders during this day."""

    print(' -- custom scorer --')
    for line in feedback.split("\n"):
        verdict = score_feedback(line)
        print("score = {} for >> {}".format(verdict, line))

In [91]:
def advancedSentimentAnalyzer():
    """Print the SentimentIntensityAnalyzer polarity scores for a few fixed
    sample sentences, one line per sentence."""
    samples = (
        ':)',
        ':(',
        'She is so :(',
        'I love the way cricket is played by the champions',
        'She neither likes coffee nor tea',
    )
    analyzer = SentimentIntensityAnalyzer()
    print(' -- built-in intensity analyser --')
    for sample in samples:
        scores = analyzer.polarity_scores(sample)
        rendered = ''.join('{} = {}, '.format(key, value) for key, value in scores.items())
        print('[{}]'.format(sample), end=' --> ')
        print(rendered)

In [92]:
# Run the VADER-based analyser first, then the hand-written scorer.
advancedSentimentAnalyzer()
mySentimentAnalyzer()


 -- built-in intensity analyser --
[:)] --> neg = 0.0, neu = 0.0, pos = 1.0, compound = 0.4588, 
[:(] --> neg = 1.0, neu = 0.0, pos = 0.0, compound = -0.4404, 
[She is so :(] --> neg = 0.555, neu = 0.445, pos = 0.0, compound = -0.5777, 
[I love the way cricket is played by the champions] --> neg = 0.0, neu = 0.375, pos = 0.625, compound = 0.875, 
[She neither likes coffee nor tea] --> neg = 0.318, neu = 0.682, pos = 0.0, compound = -0.3252, 
 -- custom scorer --
score = 1 for >> I love the items in this shop, very genuine and quality is well maintained. 
score = 1 for >>     I have visited this shop and had samosa, my friends liked it very much. ok average food in this shop. 
score = -1 for >>     Fridays are very busy in this shop, do not place orders during this day.


### Creating a conversational assistant or chatbot

In [93]:
def builtinEngines(whichOne):
    """Run the demo() of the named nltk.chat engine.

    Recognised names are 'eliza', 'iesha', 'rude', 'suntsu' and 'zen'; any
    other value only prints an "unknown" message.
    """
    known = ('eliza', 'iesha', 'rude', 'suntsu', 'zen')
    if whichOne not in known:
        print("unknown built-in chat engine {}".format(whichOne))
        return
    # Each recognised name is also the name of the nltk.chat submodule.
    getattr(nltk.chat, whichOne).demo()

In [95]:
def myEngine():
    """Print a banner and start a console conversation driven by custom
    pattern/response pairs via nltk.chat.util.Chat."""
    # Each rule is (regex, candidate responses); the catch-all r".*" is last.
    stock_rule = (
        r"(.*?)Stock price(.*)",
        ("Today stock price is 100",
         "I am unable to find out the stock price."),
    )
    unwell_rule = (
        r"(.*?)not well(.*)",
        ("Oh, take care. May be you should visit a doctor",
         "Did you take some medicine ?"),
    )
    rain_rule = (
        r"(.*?)raining(.*)",
        ("Its monsoon season, what more do you expect ?",
         "Yes, its good for farmers"),
    )
    health_rule = (
        r"How(.*?)health(.*)",
        ("I am always healthy.",
         "I am a program, super healthy!"),
    )
    fallback_rule = (
        r".*",
        ("I am good. How are you today ?",
         "What brings you here ?"),
    )
    rules = (stock_rule, unwell_rule, rain_rule, health_rule, fallback_rule)

    banner = (
        "!" * 80,
        " >> my Engine << ",
        "Talk to the program using normal english",
        "=" * 80,
        "Enter 'quit' when done",
    )
    for banner_line in banner:
        print(banner_line)
    bot = nltk.chat.util.Chat(rules, nltk.chat.util.reflections)
    bot.converse()

In [None]:
# Run every built-in chat demo in turn, then the custom engine.
# The demos are interactive: per the recorded output each one keeps prompting
# until "quit" is entered, so this cell does not finish unattended.
for engine in ['eliza', 'iesha', 'rude', 'suntsu', 'zen']:
    print("=== demo of {} ===".format(engine))
    builtinEngines(engine)
    print()
myEngine()

=== demo of eliza ===
Therapist
---------
Talk to the program by typing in plain English, using normal upper-
and lower-case letters and punctuation.  Enter "quit" when done.
Hello.  How are you feeling today?
