In [140]:
import pandas as pd
import requests
import json
from user_token import USER_TOKEN


def is_float(x):
    try:
        float(x)
    except ValueError:
        return False
    return True


In [141]:
df = pd.read_csv("restaurants.csv", on_bad_lines='skip')
df.head()

Unnamed: 0,RestaurantID,RestaurantName,City,Location.Longitude,Location.Latitude,AverageCostForTwo,AggregateRating,RatingText,Votes,Date
0,6317637,Le Petit Souffle,Makati City,121.027535,14.565443,1100,4.8,Excellent,314.0,2017-07-22 22:28:09
1,6300781,Buffet 101,Pasay City,120.979667,14.531333,2000,4.0,Very Good,520.0,2018-04-22 22:28:09
2,6301290,Vikings,Pasay City,120.979333,14.54,2000,4.2,Very Good,677.0,2018-12-03 04:07:25
3,6300010,Spiral - Sofitel Philippine Plaza Manila,Pasay City,120.98009,14.55299,6000,4.9,Excellent,621.0,2018-05-19 23:10:50
4,6314987,Locavore,Pasig City,121.056532,14.572041,1100,4.8,Excellent,532.0,2018-07-23 05:35:42


In [142]:
df = df.loc[~df.isnull().any(axis=1)]
df = df[df["AggregateRating"].apply(lambda x: is_float(x))]
df = df[df["AverageCostForTwo"].apply(lambda x: is_float(x))]

df['AggregateRating'] = df['AggregateRating'].astype('float64')
df['AverageCostForTwo'] = df['AverageCostForTwo'].astype('float64')
df = df[df['AggregateRating'] >= 0]
df = df[df['AggregateRating'] <= 5]
df["Location"] = df[['Location.Longitude', 'Location.Latitude']].values.tolist()
df = df.drop(['Location.Latitude', 'Location.Longitude'], axis=1)

df.head()

Unnamed: 0,RestaurantID,RestaurantName,City,AverageCostForTwo,AggregateRating,RatingText,Votes,Date,Location
0,6317637,Le Petit Souffle,Makati City,1100.0,4.8,Excellent,314.0,2017-07-22 22:28:09,"[121.027535, 14.565443]"
1,6300781,Buffet 101,Pasay City,2000.0,4.0,Very Good,520.0,2018-04-22 22:28:09,"[120.9796667, 14.53133333]"
2,6301290,Vikings,Pasay City,2000.0,4.2,Very Good,677.0,2018-12-03 04:07:25,"[120.9793333, 14.54]"
3,6300010,Spiral - Sofitel Philippine Plaza Manila,Pasay City,6000.0,4.9,Excellent,621.0,2018-05-19 23:10:50,"[120.98009, 14.55299]"
4,6314987,Locavore,Pasig City,1100.0,4.8,Excellent,532.0,2018-07-23 05:35:42,"[121.056532, 14.572041]"


In [143]:
df.shape

(9499, 9)

In [144]:


# del index
url = "http://localhost:9200/rs1/"
headers = {"Authorization": "Bearer " + USER_TOKEN}
res = requests.delete(url=url, headers=headers)
res.json()

{'acknowledged': True}

In [145]:
url = "http://localhost:9200/rs1/"
headers = {"Authorization": "Bearer " + USER_TOKEN}
res = requests.put(url=url, headers=headers)
res.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'rs1'}

In [146]:
# add index mapping
mappings = {
    "properties": {
        "AggregateRating": {
            "type": "float"
        },
        "AverageCostForTwo": {
            "type": "float"
        },
        "City": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "Date": {
            "type": "date",
            "format": "yyyy-MM-dd HH:mm:ss"
        },
        "Location": {
            "type": "geo_point"
        },
        "RatingText": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "RestaurantID": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "RestaurantName": {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256
                }
            }
        },
        "Votes": {
            "type": "float"
        }
    }
}

body = json.dumps(mappings, indent=4)

headers = {"Content-Type": "application/json", "Authorization": "Bearer " + USER_TOKEN}
url = "http://localhost:9200/rs1/_mapping"
res = requests.put(data=body, headers=headers, url=url)
res.json()

{'acknowledged': True}

In [147]:
body = "\n".join(['{"index":{}}' + '\n' + row.to_json() for i, row in df.iterrows()]) + '\n'
print(body)

{"index":{}}
{"RestaurantID":"6317637","RestaurantName":"Le Petit Souffle","City":"Makati City","AverageCostForTwo":1100.0,"AggregateRating":4.8,"RatingText":"Excellent","Votes":314.0,"Date":"2017-07-22 22:28:09","Location":[121.027535,14.565443]}
{"index":{}}
{"RestaurantID":"6300781","RestaurantName":"Buffet 101","City":"Pasay City","AverageCostForTwo":2000.0,"AggregateRating":4.0,"RatingText":"Very Good","Votes":520.0,"Date":"2018-04-22 22:28:09","Location":[120.9796667,14.53133333]}
{"index":{}}
{"RestaurantID":"6301290","RestaurantName":"Vikings","City":"Pasay City","AverageCostForTwo":2000.0,"AggregateRating":4.2,"RatingText":"Very Good","Votes":677.0,"Date":"2018-12-03 04:07:25","Location":[120.9793333,14.54]}
{"index":{}}
{"RestaurantID":"6300010","RestaurantName":"Spiral - Sofitel Philippine Plaza Manila","City":"Pasay City","AverageCostForTwo":6000.0,"AggregateRating":4.9,"RatingText":"Excellent","Votes":621.0,"Date":"2018-05-19 23:10:50","Location":[120.98009,14.55299]}
{"in

In [148]:
body = "\n".join(['{"index":{}}' + '\n' + row.to_json() for i, row in df.iterrows()]) + '\n'
headers = {"Content-Type": "application/json", "Authorization": "Bearer " + USER_TOKEN}
url = "http://localhost:9200/rs1/_bulk"
res = requests.post(data=body, headers=headers, url=url)
res.json()

{'took': 1202,
 'errors': False,
 'items': [{'index': {'_index': 'rs1',
    '_type': '_doc',
    '_id': 'e5irZYABikYBi-qbixiU',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'rs1',
    '_type': '_doc',
    '_id': 'fJirZYABikYBi-qbixiV',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'rs1',
    '_type': '_doc',
    '_id': 'fZirZYABikYBi-qbixiV',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'rs1',
    '_type': '_doc',
    '_id': 'fpirZYABikYBi-qbixiV',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no'