In [1]:
from whoosh.qparser import *
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer
from whoosh import index
from whoosh.index import create_in
import os, os.path
import json

In [2]:
def _text_field(stored):
    """Return a TEXT field analyzed by StandardAnalyzer with the stop list disabled.

    `stored` controls whether the original field value is kept in the index
    so it can be read back from search hits.
    """
    return TEXT(analyzer=StandardAnalyzer(stoplist=None), stored=stored)


# Index layout for one paper record: free-text fields go through
# _text_field(), date parts are numeric. Only author, link, abstract,
# title and year are stored for retrieval from hits.
schema1 = Schema(
    author=_text_field(stored=True),
    day=NUMERIC(stored=False),
    id=_text_field(stored=False),
    link=_text_field(stored=True),
    month=NUMERIC(stored=False),
    abstract=_text_field(stored=True),
    tag=_text_field(stored=False),
    title=_text_field(stored=True),
    year=NUMERIC(stored=True),
)

In [3]:
# Directory that holds the raw dataset; printing the number of entries is a
# quick sanity check of its contents.
# NOTE(review): this is a machine-specific absolute path - adjust it (or make
# it configurable) before running the notebook elsewhere.
file_dir = '/Users/tony/Desktop/search_engine/filedir/'
file_name = os.listdir(file_dir)
print(len(file_name))

# Read the dataset. The path is derived from file_dir instead of being
# repeated, and the `with` block guarantees the file handle is closed once the
# JSON has been parsed. The encoding is pinned so parsing does not depend on
# the platform's locale default.
with open(os.path.join(file_dir, 'arxivData.json'), 'r', encoding='utf-8') as data_file:
    file_load = json.load(data_file)




1


In [4]:

# Inspect the Python type of the first record's 'author' value before indexing it.
print(type(file_load[0]['author']))

<class 'str'>


In [5]:
# Make sure the index directory exists, then create a new index in it using
# schema1. create_in() already returns the Index object, so there is no need
# to reopen the directory with open_dir() afterwards.
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema1)

# Add one document per record. Using the writer as a context manager commits
# the documents when the block finishes normally and cancels the writer if an
# exception is raised, so a failed run does not leave the index write lock
# held (which would make the next ix.writer() call fail).
with ix.writer() as writer:
    for i, record in enumerate(file_load):
        # Progress message every 1000 records.
        if i % 1000 == 0:
            print("%s files loaded" % (i))

        # The record's 'summary' key is indexed under the schema's 'abstract' field.
        writer.add_document(author=record['author'],
                            day=record['day'],
                            id=record['id'],
                            link=record['link'],
                            month=record['month'],
                            abstract=record['summary'],
                            tag=record['tag'],
                            title=record['title'],
                            year=record['year'])

0 files loaded
1000 files loaded
2000 files loaded
3000 files loaded
4000 files loaded
5000 files loaded
6000 files loaded
7000 files loaded
8000 files loaded
9000 files loaded
10000 files loaded
11000 files loaded
12000 files loaded
13000 files loaded
14000 files loaded
15000 files loaded
16000 files loaded
17000 files loaded
18000 files loaded
19000 files loaded
20000 files loaded
21000 files loaded
22000 files loaded
23000 files loaded
24000 files loaded
25000 files loaded
26000 files loaded
27000 files loaded
28000 files loaded
29000 files loaded
30000 files loaded
31000 files loaded
32000 files loaded
33000 files loaded
34000 files loaded
35000 files loaded
36000 files loaded
37000 files loaded
38000 files loaded
39000 files loaded
40000 files loaded


In [6]:
# Build a query parser whose unfielded terms are searched in the "abstract"
# field. To search unfielded terms in several fields at once, use
# MultifieldParser(["title", "abstract"], schema=schema1) instead.
parser = QueryParser(fieldname="abstract", schema=schema1)

# parse() turns the query string into a query object; printing it shows how
# the individual terms were combined.
result = parser.parse("apple company department")
print(result)

(abstract:apple AND abstract:company AND abstract:department)


In [21]:
# FuzzyTermPlugin enables "fuzzy" terms (written term~): a fuzzy term matches
# any indexed term within a small number of edits of the given one.
# The plugin is registered only if the parser does not have it yet, so
# re-running this cell does not keep stacking duplicate plugins on `parser`.
if not any(isinstance(plugin, FuzzyTermPlugin) for plugin in parser.plugins):
    parser.add_plugin(FuzzyTermPlugin())

# "author:Young~" searches the author field for terms within one edit of
# "young", e.g. "yong" (one deletion) or "kyoung" (one insertion).
result = parser.parse(u"author:Young~")
print(result)

# The searcher is opened in a `with` block so it is closed automatically;
# `ix` is the index built earlier in the notebook.
with ix.searcher() as searcher:
    # The Results object behaves like a list of the matched documents.
    results = searcher.search(result)
    for hit in results:
        # Show the matched terms in context within the stored author field.
        # Other stored fields, such as hit["title"], can be read the same way.
        print(hit.highlights("author"))

author:young~
2D-3D Pose Consistency-based Conditional Random Fields for 3D Human Pose
  Estimation
name': 'Ju <b class="match term0">Yong</b> Chang'}, {'name': '<b class="match term1">Kyoung</b> Mu Lee
V2V-PoseNet: Voxel-to-Voxel Prediction Network for Accurate 3D Hand and
  Human Pose Estimation from a Single Depth Map
Moon'}, {'name': 'Ju <b class="match term0">Yong</b> Chang'}, {'name': '<b class="match term1">Kyoung</b> Mu Lee
Holistic Planimetric prediction to Local Volumetric prediction for 3D
  Human Pose Estimation
Moon'}, {'name': 'Ju <b class="match term0">Yong</b> Chang'}, {'name': 'Yumin Suh...name': '<b class="match term1">Kyoung</b> Mu Lee
Hadamard Product for Low-rank Bilinear Pooling
Hwa Kim'}, {'name': '<b class="match term1">Kyoung</b>-Woon On'}, {'name': 'Woosang...Woo Ha'}, {'name': '<b class="match term2">Byoung</b>-Tak Zhang
The Evolution of Neural Network-Based Chart Patterns: A Preliminary
  Study
name': '<b class="match term3">Myoung</b> Hoon Ha'}, {'name': 'B