In [None]:
from whoosh.analysis import Filter
from whoosh import index as Index
from whoosh import writing
from whoosh.fields import Schema, TEXT, ID, STORED
from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter, StemFilter
from whoosh import qparser
from whoosh.qparser import QueryParser, GtLtPlugin, PhrasePlugin, SequencePlugin
from whoosh import scoring

import nltk
from nltk.data import find
from nltk.tag import PerceptronTagger
from nltk.corpus import stopwords

import os, os.path # os - portable way of using operating system dependent functionality
import shutil #High-level file operations

import tkinter
from tkinter import *
import tkinter.font as tkFont
import tkinter.ttk as ttk
from tkinter.ttk import Combobox,Treeview,Scrollbar

import datetime
import sqlite3

import pandas

import webbrowser

from collections import Counter

In [None]:
# Fetch the NLTK resources this notebook relies on.
# `quiet` is a boolean flag: the string 'true' only worked because any
# non-empty string is truthy ('false' would have silenced output as well).
nltk.download('punkt', quiet=True)
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("brown", quiet=True)
# get_KeyPhrases calls stopwords.words('english'), which needs this corpus.
nltk.download("stopwords", quiet=True)

def get_KeyPhrases(textInput, k = 15, version = 'Summary'):
    """Return the most frequent multi-word noun phrases found in ``textInput``.

    The text is tokenized and POS-tagged, then chunked with a small
    regular-expression grammar. Each word of a chunk is lowercased, stemmed
    and lemmatized; stopwords and words of two characters or fewer are
    dropped. Phrases left with fewer than two words are discarded and the
    remaining phrases are counted.

    Args:
        textInput: Raw text to analyse.
        k: Maximum number of key phrases to return.
        version: ``'Summary'`` or ``'PMI'`` (case-insensitive).

    Returns:
        ``'Summary'``: a pandas DataFrame with columns ``Term`` and
        ``Frequency``, ordered by descending frequency, at most ``k`` rows.
        ``'PMI'``: the ``(phrase, count)`` list from ``Counter.most_common(k)``.

    Raises:
        ValueError: if ``version`` is neither ``'Summary'`` nor ``'PMI'``
            (the original silently returned ``None`` in that case).
    """
    mode = version.lower()
    if mode not in ('summary', 'pmi'):
        raise ValueError("version must be 'Summary' or 'PMI', got %r" % (version,))

    # The default constructor loads the pretrained averaged-perceptron model,
    # so the pickle does not have to be located by hand.
    tagger = PerceptronTagger()
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    # Must not be named `stopwords`: assigning to that name inside this
    # function makes it a local and the lookup of the imported corpus module
    # on the right-hand side then fails with UnboundLocalError.
    stop_words = set(stopwords.words('english'))

    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
            yield subtree.leaves()

    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        return 2 < len(word) and word.lower() not in stop_words

    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        return lemmatizer.lemmatize(stemmer.stem(word.lower()))

    def get_terms(tree):
        """Yield each noun phrase as a list of normalised, acceptable words."""
        for leaf in leaves(tree):
            # swap normalise(w) for w.lower() to keep the surface form
            yield [normalise(w) for w, _tag in leaf if acceptable_word(w)]

    def get_nounPhrases(text, minWordLength = 2):
        """Return the noun phrases of ``text`` having at least ``minWordLength`` words."""
        # This grammar is described in the paper by S. N. Kim,
        # T. Baldwin, and M.-Y. Kan.
        # Evaluating n-gram based evaluation metrics for automatic
        # keyphrase extraction.
        # Technical report, University of Melbourne, Melbourne 2010.
        grammar = r"""

        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
                  """
        chunker = nltk.RegexpParser(grammar)
        tagged_tokens = tagger.tag(nltk.word_tokenize(text))
        tree = chunker.parse(tagged_tokens)

        phrases = (" ".join(term) for term in get_terms(tree))
        return [phrase for phrase in phrases if len(phrase.split()) >= minWordLength]

    counter = Counter(get_nounPhrases(textInput))

    if mode == 'pmi':
        return counter.most_common(k)

    # Report how many phrases are actually returned when fewer than k exist.
    print("\n \nTop", min(k, len(counter)), "key phrases (minimum phrase length = 2): \n")

    # most_common(k) is already sorted by descending count, so the frame
    # comes out ordered and with a clean 0..n-1 index.
    return pandas.DataFrame(counter.most_common(k), columns=['Term', 'Frequency'])

In [None]:
# Demo: run the key-phrase extractor over every comment body in df2, joined
# into one text with '. ' between comments. df2 is built in a later cell, so
# that cell has to be executed first.
get_KeyPhrases(textInput='. '.join(df2['body'].tolist()))

In [None]:
class CustomFilter(Filter):
    """Token filter that rewrites each token's text with a user-supplied function.

    ``filterFunc`` is called as ``filterFunc(token.text, *args, **kwargs)`` and
    its return value replaces ``token.text``. The token is always passed on,
    so this filter never drops tokens.
    """

    # This filter will run for both the index and the query
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        """Store the text-transforming callable and the extra arguments passed to it."""
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Two filters are equal when they are exactly the same class.

        The original signature was ``__eq__(self)``, so any ``==`` comparison
        involving an instance raised TypeError. The wrapped function is
        deliberately not compared, matching the original class-only check.
        """
        return self.__class__ is other.__class__

    def __call__(self, tokens):
        """Yield every incoming token with its text transformed.

        The original code branched on ``t.mode`` ('query' vs. 'index') but both
        branches were identical, so a single code path is used.
        """
        for t in tokens:
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [None]:
def whooshSearch(df, userQuery,
                 index_dir="C:/UofT/4th_year/Capstone/Python_directory/schema"):
    """Index the comments in ``df`` with Whoosh and return the rows matching ``userQuery``.

    A fresh index is created in ``index_dir`` on every call, every row of
    ``df`` is added to it, and the query is then run with BM25F weighting.

    Args:
        df: DataFrame with at least the columns ``name`` (used as the comment
            id), ``subreddit`` and ``body``.
        userQuery: Query string in Whoosh's default query language. Terms are
            searched in the comment body unless a field is given as
            ``<fieldname>:<query>``; use double quotes for a phrase search.
        index_dir: Directory holding the index files. Defaults to the path
            that used to be hard-coded here; created if missing.

    Returns:
        DataFrame with the columns ``name``, ``subreddit`` and ``body`` holding
        the rows of ``df`` whose ``name`` equals a hit's comment id, in the
        order the hits are returned. Empty (but with those columns) when
        nothing matches.
    """
    result_columns = ['name', 'subreddit', 'body']

    #BUILD SCHEMA ****
    # An analyzer is a tokenizer piped through zero or more filters: here the
    # text is tokenized, lowercased, stemmed and then lemmatized.
    customWordFilter = (RegexTokenizer()
                        | LowercaseFilter()
                        | CustomFilter(nltk.stem.porter.PorterStemmer().stem)
                        | CustomFilter(nltk.WordNetLemmatizer().lemmatize))

    # Each field is one piece of information stored per indexed document.
    ixSchema = Schema(comment_ID = ID(stored=True),
                      comment_Subreddit = ID(stored=True),
                      comment_Content = TEXT(analyzer = customWordFilter))

    #BUILD INDEX ****
    # makedirs also creates missing parent directories and tolerates an
    # existing directory, unlike the previous exists()/mkdir() pair.
    os.makedirs(index_dir, exist_ok=True)
    ix = Index.create_in(index_dir, ixSchema)

    #INDEX COMMENTS ****
    # period: maximum number of seconds between commits.
    # limit: maximum number of buffered documents between commits.
    writer = writing.BufferedWriter(ix, period=20, limit=1000)
    try:
        for _, data in df.iterrows():
            writer.add_document(comment_ID = data['name'],
                                comment_Subreddit = data['subreddit'],
                                comment_Content = data['body'])
    finally:
        # close() must be called explicitly to release the write lock and to
        # make sure uncommitted changes are saved.
        writer.close()

    #PARSE USER QUERY ****
    # The parser gets the DEFAULT field to search and the schema of the index.
    qp = QueryParser("comment_Content", schema=ix.schema)
    qp.add_plugin(qparser.GtLtPlugin)
    qp.add_plugin(qparser.PhrasePlugin)
    query = qp.parse(userQuery)
    print("\n\n Query: ")
    print(query)
    print("\n\n")

    #IMPLEMENT SEARCHER ****
    # Collect the matching slices and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    matches = []
    with ix.searcher(weighting = scoring.BM25F()) as searcher:
        queryResults = searcher.search(query, limit = None)
        print("Total Number of Results:",len(queryResults))
        print("Number of scored and sorted docs in this Results object:",queryResults.scored_length())
        for result in queryResults:
            matches.append(df.loc[df['name'] == result['comment_ID'], result_columns])

    if not matches:
        # pandas.concat refuses an empty list; return an empty frame that
        # still has the expected columns.
        return pandas.DataFrame(columns=result_columns)
    return pandas.concat(matches)

In [None]:
# Toy comment table used as the search corpus and as the initial table content.
# All values are strings: 'name' is the comment id, 'link' the URL to open.
df2 = pandas.DataFrame(
    [
        {'name': '1', 'subreddit': '0.75', 'body': "Awesome room service", 'link': 'http://google.com'},
        {'name': '0.5', 'subreddit': '0.75', 'body': "Good view", 'link': 'http://google.com'},
        {'name': '0', 'subreddit': '0.5', 'body': "Ok view", 'link': 'http://google.com'},
        {'name': '-0.5', 'subreddit': '0', 'body': "Bad room service", 'link': 'http://google.com'},
        {'name': '-1', 'subreddit': '-0.75', 'body': "Terrible prices", 'link': 'http://google.com'},
        {'name': '-1', 'subreddit': '-1', 'body': "Disastrous prices", 'link': 'http://google.com'},
    ],
    columns=['name', 'subreddit', 'body', 'link'],
    index=[0, 1, 2, 3, 4, 5],
)
df2

In [None]:
# Toy key-phrase table with one MI and one PMI score per phrase.
# dtype=object keeps every cell as the plain Python value it was given
# instead of letting pandas coerce the score columns.
df = pandas.DataFrame(
    [
        {'text': "Awesome room service", 'MI': 1, 'PMI': 0.75},
        {'text': "Good view", 'MI': 0.5, 'PMI': 0.75},
        {'text': "Ok view", 'MI': 0, 'PMI': 0.5},
        {'text': "Bad room service", 'MI': -0.5, 'PMI': 0},
        {'text': "Terrible prices", 'MI': -1, 'PMI': -0.75},
        {'text': "Disastrous prices", 'MI': -1, 'PMI': -1},
    ],
    columns=['text', 'MI', 'PMI'],
    index=[0, 1, 2, 3, 4, 5],
    dtype=object,
)
df

In [None]:
###Create Table
class TableApp(Frame):
    """Frame holding a Treeview that lists one row per DataFrame record.

    Clicking a row opens the link stored for that row's text in the
    module-level ``df2`` table.
    """

    def __init__(self, parent, dataframe):
        """Build the table from ``dataframe`` and let it stretch inside ``parent``."""
        Frame.__init__(self, parent)
        self.LoadTable(dataframe)
        self.grid(sticky = (N,S,W,E))
        parent.grid_rowconfigure(0, weight = 1)
        parent.grid_columnconfigure(0, weight = 1)

    ###Get Table Values
    def LoadTable(self, df):
        """Create the Treeview and insert one item per row of ``df``.

        ``df`` must provide the columns ``body``, ``name`` and ``subreddit``.
        NOTE(review): the column headings read PMI / MI but are filled from
        ``name`` / ``subreddit`` -- confirm which data is meant to be shown.
        """
        tv = Treeview(self)
        tv['columns'] = ('pointwisemutual', 'mutual', 'viewfull')
        tv.heading("#0", text='Key Phrase')
        tv.column("#0", anchor="w", width=300)
        tv.heading('pointwisemutual', text='PMI')
        tv.column('pointwisemutual', anchor='center', width=50)
        tv.heading('mutual', text='MI')
        tv.column('mutual', anchor='center', width=50)
        tv.heading('viewfull', text='See Full Thread')
        tv.column('viewfull', anchor='center', width=100)
        tv.grid(sticky = (N,S,W,E))
        self.treeview = tv
        self.grid_rowconfigure(0, weight = 1)
        self.grid_columnconfigure(0, weight = 1)

        self.MItable = df
        for _, row in self.MItable.iterrows():
            self.treeview.insert('', 'end', text=row["body"],
                                 values=(row["name"], row["subreddit"], 'View'))

        self.treeview.bind("<Button-1>", self.OnClick)

    def OnClick(self, event):
        """Open the link belonging to the clicked row, if there is one."""
        item = self.treeview.identify('item',event.x,event.y)
        if not item:
            # Click landed on a heading or on empty space: nothing to open.
            return
        body = self.treeview.item(item,"text")
        print("you clicked on", body)
        # The link is looked up in the global df2 because the frame shown in
        # the table does not necessarily carry a 'link' column.
        links = df2.loc[df2['body'] == body, "link"].tolist()
        if not links:
            # No record with this text; previously this raised IndexError.
            return
        webbrowser.open(links[0])

In [None]:
###Create Timeline Bar Graph
class TimelineApp(Frame):
    """Frame that draws a bar chart of comment totals per month on a Canvas."""

    def __init__(self, parent):
        """Draw the chart and let the frame stretch inside ``parent``."""
        Frame.__init__(self, parent)
        self.CreateGraph()
        self.grid(sticky = (N,S,W,E))
        parent.grid_rowconfigure(0, weight = 1)
        parent.grid_columnconfigure(0, weight = 1)

    def CreateGraph(self):
        """Create the canvas and draw one labelled bar per month.

        The monthly totals are placeholder values defined in this method.
        """
        monthly_totals = (
            ("Jan", 50), ("Feb", 35), ("Mar", 29), ("Apr", 15),
            ("May", 10), ("Jun", 47), ("Jul", 38), ("Aug", 7),
            ("Sep", 23), ("Oct", 19), ("Nov", 28), ("Dec", 33),
        )

        # The canvas was 55 px wide while the axis and the twelve bars span
        # roughly 550 px, so nearly all of the chart was clipped.
        c_width = 550
        c_height = 180
        c = Canvas(self, width=c_width, height=c_height, bg= 'gray')
        c.pack()

        ###sizing the graph
        ###y height = max value * y_stretch
        y_height = max(total for _, total in monthly_totals)
        y_stretch = 140/y_height
        # gap between lower canvas edge and x axis
        y_gap = 20
        # horizontal space between two bars
        x_stretch = 15
        # width of one bar
        x_width = 28
        # gap between left canvas edge and y axis
        x_gap = 20

        # x axis across the full canvas width
        c.create_line(0, c_height - y_gap, c_width, c_height - y_gap)

        for i, (month, total) in enumerate(monthly_totals):
            # calculate rectangle coordinates for each bar
            x0 = i * (x_stretch + x_width) + x_gap
            y0 = c_height - (total * y_stretch + y_gap)
            x1 = x0 + x_width
            y1 = c_height - y_gap
            # draw the bar
            c.create_rectangle(x0, y0, x1, y1, fill="midnightblue")
            # put the y value above each bar and the month below the axis
            c.create_text(x1, y0, anchor=SE, text=total)
            c.create_text(x1, c_height-2, anchor=SE, text=month)

In [None]:
###Main Application
class App(tkinter.Tk):
    """Main window: search box, summary statistics, result table and timeline."""

    ###Initialize things
    def __init__(self, parent):
        """Create the Tk root and build all widgets."""
        tkinter.Tk.__init__(self, parent)
        self.parent = parent
        self.initialize()

    ###Actually initialize the program
    def initialize(self):
        """Create and lay out every frame and widget of the main window."""

        ###Create the main container frames
        firstframe = Frame(self, bg='yellowgreen', width = 1000, height=50, padx=3, pady=3)
        secondframe = Frame(self, bg='gray', width=1000, height=100, padx=3, pady=3)
        thirdframe = Frame(self, bg='white', width = 1000, height = 100, padx=3, pady=3)
        self.fourthframe = self._new_table_frame()
        fifthframe = Frame(self, bg='yellowgreen', width = 1000, height = 300, padx=3, pady=3)
        sixthframe = Frame(self, bg='gray', width = 1000, height = 300, padx=3, pady=3)

        ###layout all of the main containers (the table frame is already placed)
        firstframe.grid(row=0, sticky="ew")
        secondframe.grid(row=1, sticky="nsew")
        thirdframe.grid(row=3, sticky="ew")
        fifthframe.grid(row=5, sticky="ew")
        sixthframe.grid(row=6, sticky="ew")

        ###first frame = search box
        submit = Button(firstframe, text ="Search", background="yellowgreen", command=self.OnButtonClick)

        ###self.entryVariable is the search item
        self.entryVariable = tkinter.StringVar()
        self.entry = tkinter.Entry(firstframe,textvariable=self.entryVariable)
        self.entry.bind("<Return>", self.OnPressEnter)
        self.entryVariable.set("")

        ###top frame layout
        submit.grid(row=0, column=1)
        self.entry.grid(row=0, column=0, sticky='EW')

        ###second frame = summary stats
        statslabel1= Label(secondframe, text="Total Comments", bg="gray")
        statslabel2= Label(secondframe, text="Total Subreddits", bg="gray")
        statslabel3= Label(secondframe, text="Total Positive", bg="gray", fg="darkgreen")
        statslabel4= Label(secondframe, text="Total Neutral", bg="gray", fg="yellow")
        statslabel5= Label(secondframe, text="Total Negative", bg="gray", fg="red")

        self.statResult1 = tkinter.StringVar()
        self.statResult2 = tkinter.StringVar()
        self.statResult3 = tkinter.StringVar()
        self.statResult4 = tkinter.StringVar()
        self.statResult5 = tkinter.StringVar()

        for stat in (self.statResult1, self.statResult2, self.statResult3,
                     self.statResult4, self.statResult5):
            stat.set(u"0")

        resultlabel1 = Label(secondframe,textvariable=self.statResult1, bg="gray")
        resultlabel2 = Label(secondframe,textvariable=self.statResult2, bg="gray")
        resultlabel3 = Label(secondframe,textvariable=self.statResult3, bg="gray")
        resultlabel4 = Label(secondframe,textvariable=self.statResult4, bg="gray")
        resultlabel5 = Label(secondframe,textvariable=self.statResult5, bg="gray")

        ###second frame layout
        statslabel1.grid(row=0, column=0)
        statslabel2.grid(row=0, column=2)
        statslabel3.grid(row=1, column=0, sticky="w")
        statslabel4.grid(row=1, column=2)
        statslabel5.grid(row=1, column=4)

        resultlabel1.grid(row=0, column=1)
        resultlabel2.grid(row=0, column=3)
        resultlabel3.grid(row=1, column=1)
        resultlabel4.grid(row=1, column=3)
        resultlabel5.grid(row=1, column=5)

        ###third frame = Key Phrase with PMI and MI headers
        keyphraselabel = Label(thirdframe, text="Key Phrases for")
        keyphrasesearch = Label(thirdframe, textvariable=self.entryVariable)
        keyphraseshow = Label(thirdframe, text="Show: ")

        # Kept on self: a tkinter variable held only by a local is destroyed
        # when this method returns, which unsets it on the Tcl side and leaves
        # the radio buttons without their shared state.
        self.choice = StringVar()
        self.choice.set("all")
        self.RB1 = Radiobutton(thirdframe, text="all", variable=self.choice, value="all", state="disabled")
        self.RB2 = Radiobutton(thirdframe, text="pos", variable=self.choice, value="pos", state="disabled")
        self.RB3 = Radiobutton(thirdframe, text="neg", variable=self.choice, value="neg", state="disabled")

        showallcomments = Button(thirdframe, text ="Show All Comments", command=self.SecondWindow)

        ###third frame layout
        keyphraselabel.grid(row=0,column=0)
        keyphrasesearch.grid(row=0,column=1)
        keyphraseshow.grid(row=0,column=3)
        self.RB1.grid(row=0, column=4)
        self.RB2.grid(row=0, column=5)
        self.RB3.grid(row=0, column=6)
        showallcomments.grid(row=0, column=7)

        ###fourthframe = Table, initially showing the whole df2 table
        self.table = TableApp(self.fourthframe, df2)
        # Defined from the start so it can be read before the first search.
        self.currentTable = df2

        ###fifth frame = Timeline Header
        timelinehead = Label(fifthframe, text="Comment Frequency Timeline", bg="yellowgreen")

        timelinehead.grid(row=0, column=0)

        ###sixth frame = Actual Timeline
        TimelineApp(sixthframe)

    def _new_table_frame(self):
        """Create the lavender frame that hosts the table and place it in grid row 4."""
        frame = Frame(self, bg='lavender', width = 1000, height = 100, padx=3, pady=3)
        frame.grid(row=4, sticky="ew")
        return frame

    def _run_search(self, echo=False):
        """Search df2 for the text in the entry box and show the hits in the table.

        Enables the all/pos/neg radio buttons, selects "all", and replaces the
        table frame with a fresh one holding the search result. When ``echo``
        is true the result frame is also printed.
        """
        searchTable = whooshSearch(df2,self.entryVariable.get())
        if echo:
            print(searchTable)
        for button in (self.RB1, self.RB2, self.RB3):
            button.configure(state="normal")
        self.RB1.select()
        # Throw away the old table together with its frame and rebuild both.
        self.fourthframe.destroy()
        self.fourthframe = self._new_table_frame()
        # Keep self.table pointing at the live widget, not the destroyed one.
        self.table = TableApp(self.fourthframe, searchTable)
        self.currentTable = searchTable

    ###Click Submit Button and show all search
    def OnButtonClick(self):
        """Handler for the Search button."""
        print("OnButtonClick")
        self._run_search(echo=True)

    ###search function to be implemented

    ###summary statistics

    ###Click Enter and show all search
    def OnPressEnter(self,event):
        """Handler for <Return> in the search entry."""
        print("OnPressEnter")
        self._run_search()

    ###Create Second Window
    def SecondWindow(self):
        """Open the "Comments" window with its (still empty) placeholder frames."""

        child = tkinter.Toplevel(self)
        child.title("Comments")
        ###Create the main container frames
        windowfirst = Frame(child, bg='yellowgreen', width = 1000, height=50, padx=3, pady=3)
        windowsecond = Frame(child, bg='gray', width=1000, height=100, padx=3, pady=3)

        ###layout all of the main containers
        windowfirst.pack()
        windowsecond.pack()

        ###left and right comment panes
        ctr_left = Frame(windowsecond, bg='red', width=500, height=300, padx=3, pady=3)
        ctr_right = Frame(windowsecond, bg='brown', width=500, height=300, padx=3, pady=3)

        ctr_left.grid(row=0, column = 0, sticky="ns")
        ctr_right.grid(row=0, column = 2, sticky="ns")

In [None]:
# Open a Reddit thread in the browser; the URL is assembled from the
# subreddit name and the thread's link_id.
# TODO: find a way to scroll down to the comment once the page is open.
webbrowser.open("".join(('https://www.reddit.com/r/', 'shopify', '/comments/', '558i3t')))

In [None]:
# Open a single comment in the browser: site root + thread permalink + comment id.
webbrowser.open("".join((
    'https://www.reddit.com',
    '/r/cscareerquestions/comments/4h6zt6/waterloo_mechatronics_vs_cs_coop/',
    'd2nxm5t',
)))

In [None]:
# Start the GUI: build the main window, set its title and hand control to
# the Tk event loop (this call blocks until the window is closed).
root = App(None)
root.wm_title('Shopify Reddit Analyzer')
root.mainloop()