In [1]:
import pandas as pd
import json

In [2]:
# Load the business records (one row per business, including its `tip_text`)
# from a JSON file located next to the notebook.
rest_info = pd.read_json('./business_tip.json')

In [3]:
# Drop businesses whose tip text is the empty string, then renumber the
# remaining rows 0..n-1 so later cells can address them by position.
rest_info = (
    rest_info
    .loc[lambda frame: frame['tip_text'].ne('')]
    .reset_index(drop=True)
)
rest_info.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,tip_text
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",Good beers. Bad bartender. Stay away from Bria...
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",Employees have always been very friendly and h...
2,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",,The Best. Go here all the time use the same pe...
3,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'...",A perfect place to grow as an athlete. Quality...
4,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",Papaya salad with a good spicy kick and tasty ...


### Import packages

In [4]:
import sys, os, lucene, threading, time 
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer 
from org.apache.lucene.analysis.standard import StandardAnalyzer 
from org.apache.lucene.document import \
Document, Field, FieldType ,TextField,StringField,LatLonPoint,FloatPoint,IntPoint,StoredField
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig ,DirectoryReader,IndexReader
from org.apache.lucene.store import SimpleFSDirectory 
from org.apache.lucene.util import Version


### Initialization and Config

In [5]:
# Start the embedded JVM that PyLucene runs on; Lucene classes cannot be
# instantiated until this has been called.
lucene.initVM()
PATH = './data1/index' # directory (relative to the notebook) that holds the index files
analyzer = StandardAnalyzer() # Lucene StandardAnalyzer, used to tokenize the analyzed text fields
directory =  SimpleFSDirectory(Paths.get(PATH))
config = IndexWriterConfig(analyzer)
# OpenMode.CREATE: per the Lucene docs this creates a new index or overwrites
# an existing one at PATH, so running this cell starts from an empty index.
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

# The writer is used (and closed) by the indexing cell below; re-run this cell
# to obtain a fresh writer before indexing again.
index_writer = IndexWriter(directory, config)


### Build index for each Field

In [6]:
# Index every row of `rest_info` as one Lucene document, then close the writer.
#
# Changes relative to the naive per-cell lookup loop:
#   * rows are read positionally with `itertuples`, so the loop no longer
#     depends on the DataFrame index being exactly 0..n-1 and avoids one
#     Series lookup per field per row;
#   * the writer is closed in a `finally` block, so a failure on any row
#     still closes the writer (releasing its write lock) instead of leaving
#     it open in the kernel.
#
# Field names, field types, stored values and the order in which fields are
# added to each document are unchanged.
#
# NOTE: `index_writer` is closed when this cell finishes; re-run the
# initialization cell before running this cell again.

# Columns indexed as analyzed + stored text, in the order they are added.
# The Lucene field name is the same as the DataFrame column name.
_TEXT_COLUMNS = (
    "name",
    "address",
    "categories",
    "attributes",
    "city",
    "state",
    "postal_code",
    "hours",
)

try:
    for row in rest_info.itertuples(index=False):
        doc = Document()

        # Identifier: stored and indexed as a single un-analyzed token.
        doc.add(Field("business_id", str(row.business_id), StringField.TYPE_STORED))

        # Free-text attributes. Non-string values (dicts, None, numbers) are
        # stringified with str(), exactly as before.
        for column in _TEXT_COLUMNS:
            doc.add(Field(column, str(getattr(row, column)), TextField.TYPE_STORED))

        # Coordinates are kept twice: as stored strings for display, and as a
        # LatLonPoint (not stored) for geo queries.
        doc.add(StringField("lat", str(row.latitude), Field.Store.YES))
        doc.add(StringField("long", str(row.longitude), Field.Store.YES))
        doc.add(LatLonPoint("location", float(row.latitude), float(row.longitude)))

        # Point fields are searchable but not retrievable, so each numeric
        # value is paired with a StoredField of the same name.
        stars = float(row.stars)
        doc.add(FloatPoint("stars", stars))
        doc.add(StoredField("stars", stars))

        review_count = int(row.review_count)
        doc.add(IntPoint("review_count", review_count))
        doc.add(StoredField("review_count", review_count))

        # The tip text is exposed under the field name "review".
        doc.add(Field("review", str(row.tip_text), TextField.TYPE_STORED))

        index_writer.addDocument(doc)
finally:
    # Runs on success and on failure; the original exception (if any)
    # still propagates after the writer has been closed.
    index_writer.close()