# Libraries

In [1]:
import wikipedia
import numpy as np
import pandas as pd

import re, nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix

from math import log

# Text Cleaning

In [2]:
def spell_correct(text):
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\0s", "0", text)    
    return text

def remove_url(text):
    URL_REGEX = re.compile(r'''((http[s]?://)[^ <>'"{}|\\^`[\]]*)''')
    return URL_REGEX.sub(r' ', text)

def remove_handles(text):
    HANDLES_REGEX = re.compile(r'@\S+')
    return HANDLES_REGEX.sub(r' ', text)

def remove_incomplete_last_word(text):
    INCOMPLETE_LAST_WORD_REGEX = re.compile(r'\S+…')
    return INCOMPLETE_LAST_WORD_REGEX.sub(r' ', text )
    
def remove_hashtags(text):
    HASHTAGS_REGEX = re.compile(r'#\S+')
    return HASHTAGS_REGEX.sub(r' ', text)

remove_punc = lambda x : re.sub(r"\W", ' ', x)

remove_num = lambda x : re.sub(r"\d", ' ', x)

remove_extra_spaces = lambda x : re.sub(r"\s+", ' ', x)

lower_case = lambda x : x.lower()

remove_shortwords = lambda x: ' '.join(word for word in x.split() if len(word) > 2)

with open('stopwords.txt') as f:
    stop_words = f.readlines()
stop_words = list(map(lambda x :x.strip(), stop_words))
stop_words = list(set(stop_words))
remove_stopwords = lambda x: ' '.join(word for word in x.split() if word not in stop_words)

ps = PorterStemmer()
ps_stem = lambda x: ' '.join(ps.stem(word) for word in x.split())

wnl = WordNetLemmatizer()
wnl_lemmatize = lambda x: ' '.join(wnl.lemmatize(word) for word in x.split())

def tag_pos(x):
    tag_list =  nltk.pos_tag(nltk.word_tokenize(x))
    pos = ""
    for t in tag_list:
        pos += t[0] +'(' + t[1] +')' + ' '
    return pos

def cleanText(x, rsw, stm, lem, tgps):
    x = str(x)
    x = remove_url(x)
    x = remove_handles(x)
    x = remove_incomplete_last_word(x)
    x = remove_hashtags(x)
    x = lower_case(x)
    x = spell_correct(x)
    x = remove_punc(x)
    x = remove_num(x)
    x = remove_extra_spaces(x)
    x = remove_shortwords(x)
    
    if rsw:
        x = remove_stopwords(x)
    if stm:
        x = ps_stem(x)
    if lem:
        x = wnl_lemmatize(x)
    if tgps:
        x = tag_pos(x)
    return x

def cleanSentence(x):
    WIKI_REGEX = re.compile(r"\[\d+\]")
    x =  WIKI_REGEX.sub(' ', str(x))
    x = remove_extra_spaces(x)
    return x

# KeyWords Extractor

In [3]:
#Function for sorting tf_idf in descending order
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

def extractKeyWords(data, K, index):
    # Text Cleaning
    corpus = data.apply(lambda x: cleanText(str(x), True, False, False, False))
    
    # creating a vector of words
    cv=CountVectorizer(stop_words=stop_words)
    X=cv.fit_transform(corpus)
    
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(X)

    # get feature names
    feature_names=cv.get_feature_names()

    # fetch document for which keywords needs to be extracted
    doc = cleanText(str(index), True, False, False, False)

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))
    
    #sort the tf-idf vectors by descending order of scores

    sorted_items=sort_coo(tf_idf_vector.tocoo())

    keywords=extract_topn_from_vector(feature_names,sorted_items,K)

    # now print the results
    return keywords.keys()

# Summarizer

In [4]:
def idf(word, text):
    sentences = sent_tokenize(text)
    sentences = list(map(lambda x : cleanText(str(x), True, False, True, False), sentences))
    N = len(sentences)
    ni = 0
    for sent in sentences:
        if word in sent:
            ni += 1
    return log(N/ni)

def diffWords(sent1, sent2):
    sent1 = cleanText(str(sent1), True, False, True, False)
    sent2 = cleanText(str(sent2), True, False, True, False)
    n1 = len(sent1.split())
    n2 = len(sent2.split())
    n = (n1+n2)/2
    ni = 0
    for word in sent1.split():
        if word in sent2.split():
            ni += 1
    return (n-ni)

def minDiffWords(sent, lst):
    if len(lst) == 0:
        return 100
    diffwords = []
    for s in lst:
        diffwords.append(diffWords(sent, s))
    return min(diffwords)

In [5]:
def textSummarize(n, text):
    clean_text = cleanText(str(text), True, False, True, False)     

    sentences = list(sent_tokenize(str(text)))
    
    sentences = list(map(cleanSentence, sentences))
    
    
    word2count ={}

    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            if word not in word2count.keys():
                word2count[word] = 0
            word2count[word] += 1

    for key in word2count.keys():
        word2count[key] = word2count[key]/max(word2count.values())

    word2idf = {}
    for word in nltk.word_tokenize(clean_text):
        if word not in stop_words:
            if word not in word2idf.keys():
                word2idf[word] = idf(word, text)
        
    sent2score = {}

    for sentence in sentences:
        for word in nltk.word_tokenize(str(sentence).lower()):
            if word in word2count.keys():
                if sentence not in sent2score.keys():
                    sent2score[sentence] = 0
                sent2score[sentence] += word2count[word]*word2idf[word]

    sorted_sentences = [(k, sent2score[k]) for k in sorted(sent2score, key=sent2score.get, reverse=True)]

    summary = []
    i = 1
    for k, v in sorted_sentences:
        if minDiffWords(k, summary) > 4:
            summary.append(str(k))
            i += 1
        if i > n:
            break
    summary_indices = {}
    for s in summary:
        summary_indices[s] = sentences.index(s)
        
    summary_ordered = [k for k in sorted(summary_indices, key=summary_indices.get)]
    return summary_ordered

In [14]:
def matches(query, text):
    count = 0
    for q in query:
        if q in text:
            count += 1
    return count

In [24]:
botname = 'Maester Pymon'
query = input(f'{botname}: What do you want to know about?\nYou: ')
search_results = wikipedia.search(query.lower())
if len(search_results) > 0:
    print(f'{botname}: I will answer your queries about {search_results[0]}')
    input('You: ')
    print(f'{botname}:\n{wikipedia.summary(search_results[0], sentences=3)}')
    
    fetched_content = wikipedia.page(query).content

    cleaned = re.split(r"=+", fetched_content)
    structured_result = []
    for c in cleaned[1:]:
        structured_result.append(' '.join(c.split()))

    all_contents = []
    for i, e in enumerate(structured_result):
        if len(e.strip().split()) > 9:
            all_contents.append(e.strip())

    all_contents = pd.DataFrame(all_contents)

    all_contents.columns = ['Text']
    

    all_contents['Tokens'] = all_contents['Text'].apply(lambda x :cleanText(str(x), True, False, True, False))
    
    q = ''
    while q != []:
        q = cleanText(input('You: '), True, False, True, False).split()
        print(q)
        all_contents['scores'] = 0
        for i in range(len(all_contents)):
            all_contents['scores'][i] = matches(['lost', 'hand'],all_contents.loc[i]['Tokens'])
        print(f'{botname}:\n{all_contents.sort_values("scores", ascending=False).iloc[0]["Text"]}')
else:
    print('Not Found')

Maester Pymon: What do you want to know about?
You: jaime lannister
Maester Pymon: I will answer your queries about Jaime Lannister
You: ok
Maester Pymon:
Jaime Lannister is a fictional character in the A Song of Ice and Fire series of fantasy novels by American author George R. R. Martin, and its television adaptation Game of Thrones. He becomes a prominent point of view character in the novels beginning in A Storm of Swords (2000).
Introduced in A Game of Thrones (1996), Jaime is a knight of the Kingsguard and a member of House Lannister, the wealthiest and one of the most powerful families in the kingdom of Westeros.
You: who chopped his hand
['chopped', 'hand']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Maester Pymon:
In A Game of Thrones (1996), Jaime is introduced as one of the Kingsguard, the royal security detail, and the son of wealthy and powerful Tywin Lannister, the former Hand of the King. Jaime's twin is Cersei, the Queen of Westeros by virtue of her marriage to King Robert Baratheon. Perhaps the greatest swordsman in the kingdom, Jaime is derisively referred to as "the Kingslayer" because he killed the "Mad King" Aerys Targaryen in the coup that put Robert on the Iron Throne.Eric Dodds of Time described Jaime as "handsome, an incomparably skilled fighter and disarmingly witty", with The New Yorker calling the Lannisters "a crowd of high-cheekboned beauties ... who form a family constellation so twisted, charismatic, and cruel that it rivals Flowers in the Attic for blond dysfunction". Lev Grossman wrote for Time that while Jaime and Cersei's younger brother Tyrion is a grotesque dwarf, "the rest of the Lannisters are stunted too, but on the inside." The Los Angeles Times ca

In [25]:
all_contents

Unnamed: 0,Text,Tokens,scores
0,"In A Game of Thrones (1996), Jaime is introduc...",game throne jaime introduced kingsguard royal ...,1
1,Darren Franich of Entertainment Weekly noted t...,darren franich entertainment weekly noted nove...,1
2,Jaime Lannister accompanies the royal family t...,jaime lannister accompanies royal family winte...,1
3,"Tyrion makes several attempts to free Jaime, f...",tyrion make attempt free jaime disguised lanni...,0
4,Jaime is freed by Catelyn and sent to King's L...,jaime freed catelyn sent king landing exchange...,1
5,Jaime and Cersei's relationship breaks down af...,jaime cersei relationship break repeatedly dec...,1
6,Jaime travels to Raventree Hall and negotiates...,jaime travel raventree hall negotiates lord ty...,0
7,Jaime is portrayed by Danish actor Nikolaj Cos...,jaime portrayed danish actor nikolaj coster wa...,1
8,"Jaime's storyline in the first season remains,...",jaime storyline season remains identical book ...,0
9,Robb brings a captive Jaime with his camp as t...,robb brings captive jaime camp march westerlan...,0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
all_contents.loc[10]['Tokens']

'jaime brienne captured squad bolton soldier jaime manages convince rape brienne leader locke take umbrage jaime try use status secure release chop jaime sword hand taken harrenhal maester qyburn treat jaime wound jaime reveals brienne killed aery roose bolton let jaime return king landing insists keeping brienne prisoner abetting treason jaime ultimately return rescue brienne killed bear locke amusement return king landing jaime reunited cersei'

**fuzzywuzzy**

In [None]:
a/0

In [None]:
query = "mahendra singh dhoni"

In [None]:
fetched_content = wikipedia.page(query).content

cleaned = re.split(r"=+", fetched_content)
structured_result = []
for c in cleaned[1:]:
    structured_result.append(' '.join(c.split()))

all_contents = []
for i, e in enumerate(structured_result):
    if len(e.strip().split()) > 9:
        all_contents.append(e.strip())

all_contents = pd.DataFrame(all_contents)

all_contents.columns = ['Text']

all_contents['Keywords'] = all_contents['Text'].apply(lambda x :extractKeyWords(all_contents['Text'], 10, x))

In [None]:
all_contents

In [None]:
q = cleanText(input(' '), True, False, True, False).split()

In [None]:
q

In [None]:
all_contents['scores'] = all_contents['Keywords'].apply(lambda x : matcher(q, x))

In [None]:
all_contents['Text'][17]

In [None]:
all_contents.sort_values('scores', ascending=False).iloc[0]['Text']