# Whoosh Search Function

In [1]:
# The function won't work if this import statement is placed inside it; it must stay outside.
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh token filter that rewrites each token's text with a callable.

    The callable is invoked as ``filterFunc(token_text, *args, **kwargs)`` and
    its return value replaces the token's text. The same transformation is
    applied regardless of the token's mode, so index-time and query-time text
    are normalised identically.

    Parameters
    ----------
    filterFunc : callable
        Function applied to every token's text (e.g. a stemmer's ``stem``).
    *args, **kwargs
        Extra positional / keyword arguments forwarded to ``filterFunc`` on
        every call.
    """

    # This filter will run for both the index and the query
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        """Return True when ``other`` is an instance of exactly this class.

        Equality is deliberately class-based only: the wrapped callable and
        its arguments are not compared.
        """
        # The original signature omitted ``other``, so every ``==`` comparison
        # raised TypeError instead of returning a result.
        return other is not None and self.__class__ is other.__class__

    def __call__(self, tokens):
        """Yield each token from ``tokens`` after transforming its text in place."""
        for t in tokens:
            # Query-mode and index-mode tokens receive the same treatment, so
            # no branching on the token's mode is needed.
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [2]:
def ShopifySearchEngine(df, userQuery, indexDir=None):
    """Index the comments in ``df`` with Whoosh and return the rows matching ``userQuery``.

    A new index is created in the index directory on every call, every row of
    ``df`` is added to it, the query is parsed against the ``comment_Content``
    field and the matching rows of ``df`` are returned in ranked (BM25F) order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'name'`` (comment id), ``'subreddit'`` and
        ``'body'`` (comment text).
    userQuery : str
        Query in Whoosh's default query language. Upper-case ``AND`` / ``OR``
        act as operators, terms without an operator are AND-ed together, and a
        double-quoted string is searched as a phrase.
    indexDir : str, optional
        Directory in which the index is stored. Defaults to the original
        hard-coded location; pass a path to run on another machine.

    Returns
    -------
    pandas.DataFrame
        The ``['name', 'subreddit', 'body']`` columns of every row of ``df``
        whose ``'name'`` equals the id of a hit, in hit order. Empty (but with
        those three columns) when nothing matches.

    Notes
    -----
    Prints the parsed query and the hit counts as a side effect.
    """
    #!pip --quiet install whoosh
    from whoosh import index, writing, qparser, scoring
    from whoosh.fields import Schema, TEXT, ID
    from whoosh.analysis import RegexTokenizer, LowercaseFilter
    from whoosh.qparser import QueryParser
    import os  # portable access to operating-system functionality
    import pandas
    import nltk

    # Defining constants for the data paths ***** MODIFY ACCORDINGLY (or pass indexDir) *****
    INDEX_DIR = indexDir if indexDir is not None else r"C:\Users\Jason\MIE490 - Capstone - Shopify\Data\Index2"
    RESULT_COLUMNS = ['name', 'subreddit', 'body']

    # BUILD SCHEMA ****
    # An analyzer wraps a tokenizer and zero or more filters. Here each token is
    # lower-cased, Porter-stemmed and then passed through the WordNet lemmatizer.
    customWordFilter = (RegexTokenizer()
                        | LowercaseFilter()
                        | CustomFilter(nltk.stem.porter.PorterStemmer().stem)
                        | CustomFilter(nltk.WordNetLemmatizer().lemmatize))

    # The schema lists the fields stored/indexed for each document.
    ixSchema = Schema(comment_ID=ID(stored=True),
                      comment_Subreddit=ID(stored=True),
                      comment_Content=TEXT(analyzer=customWordFilter))

    # BUILD INDEX ****
    # makedirs also creates missing parent directories and tolerates an
    # already-existing directory.
    os.makedirs(INDEX_DIR, exist_ok=True)

    # Initiate the index - takes the index directory and the schema.
    ix = index.create_in(INDEX_DIR, ixSchema)

    # INDEX COMMENTS ****
    # BufferedWriter params:
    #   period - the maximum amount of time (in seconds) between commits.
    #   limit  - the maximum number of documents to buffer between commits.
    writer = writing.BufferedWriter(ix, period=20, limit=1000)
    try:
        # Iterate the three columns in lock-step. The loop variables are named
        # so that they do not shadow the imported ``index`` module.
        for commentId, subreddit, body in zip(df['name'], df['subreddit'], df['body']):
            writer.add_document(comment_ID=commentId,
                                comment_Subreddit=subreddit,
                                comment_Content=body)
    finally:
        # *** Note *** -> close() must be called explicitly on the writer to
        # release the write lock and make sure uncommitted changes are saved.
        writer.close()

    # PARSE USER QUERY ****
    # The parser receives the DEFAULT field to search and the schema of the index.
    # NOTE: users can still target another field via <fieldname>:<query>.
    qp = QueryParser("comment_Content", schema=ix.schema)
    qp.add_plugin(qparser.GtLtPlugin())
    qp.add_plugin(qparser.PhrasePlugin())
    query = qp.parse(userQuery)
    print("\n\n Query: ")
    print(query)
    print("\n\n")

    # IMPLEMENT SEARCHER ****
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        queryResults = searcher.search(query, limit=None)
        print("Total Number of Results:",len(queryResults))
        print("Number of scored and sorted docs in this Results object:",queryResults.scored_length())
        # Read the stored ids while the searcher is still open.
        hitIds = [hit['comment_ID'] for hit in queryResults]

    # Collect the matching rows once and concatenate a single time;
    # DataFrame.append was removed in pandas 2.0 and re-copied the frame per hit.
    matchedFrames = [df.loc[df['name'] == hitId, RESULT_COLUMNS] for hitId in hitIds]
    if not matchedFrames:
        return pandas.DataFrame(columns=RESULT_COLUMNS)
    return pandas.concat(matchedFrames)
    


# Example

In [3]:
import pandas
# One-row sample frame with the three columns the search function reads.
d = {
    'name': 't12345',
    'subreddit': 'shopify',
    'body': 'this is a test comment for shopify',
}
df = pandas.DataFrame(d, index=range(1))

In [4]:
df

Unnamed: 0,body,name,subreddit
0,this is a test comment for shopify,t12345,shopify


In [5]:
ShopifySearchEngine(df,"\"this is a test\"")
## Right now, the query string supports basic boolean search with AND and/or OR,
## e.g. "Jakob AND and AND Jack OR or Frank".
## Only the capitalized forms are treated as the operators AND and OR.
# Default: "Jack Jones" --> treated as Jack AND Jones
## A phrase can be specified, as above, by wrapping it in escaped quotes \" \"
## A more advanced approach may be to build the user query from the bottom up, but there are no good examples online.



 Query: 
comment_Content:"thi is a test"



Total Number of Results: 1
Number of scored and sorted docs in this Results object: 1


Unnamed: 0,name,subreddit,body
0,t12345,shopify,this is a test comment for shopify


In [6]:
ShopifySearchEngine(df,"Jack AND and AND Jones OR Frank OR Shopify")
## Right now, the query string supports basic boolean search with AND and/or OR,
## e.g. "Jakob AND and AND Jack OR or Frank".
## Only the capitalized forms are treated as the operators AND and OR.
# Default: "Jack Jones" --> treated as Jack AND Jones
## A phrase can be specified by wrapping it in escaped quotes \" \"
## A more advanced approach may be to build the user query from the bottom up, but there are no good examples online.



 Query: 
((comment_Content:jack AND comment_Content:and AND comment_Content:jone) OR comment_Content:frank OR comment_Content:shopifi)



Total Number of Results: 1
Number of scored and sorted docs in this Results object: 1


Unnamed: 0,name,subreddit,body
0,t12345,shopify,this is a test comment for shopify


In [7]:
import nltk
# Standalone instances of the NLTK WordNet lemmatizer and Porter stemmer
# (the same two classes ShopifySearchEngine wires into its analyzer).
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
    