In [1]:
# setup library imports
import csv
import io, time, json
import pickle
import re
import sqlite3
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [3]:
def read_api_key(filepath):
    """
    Read an API key from a text file.

    Parameters:
        filepath (string): path of the file that contains the key.

    Returns:
        (string): the file content with every newline removed and any
        leading/trailing whitespace trimmed.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(filepath, 'r') as f:
        # Newlines are dropped wherever they occur; surrounding spaces/tabs
        # are trimmed as well so they do not end up inside the HTTP
        # Authorization header built from this value.
        return f.read().replace('\n', '').strip()

In [4]:
def check_restaurant_before_2015(url):
    """
    Check whether the first dated review found on a Yelp page is from 2015 or earlier.

    The page is requested with reviews sorted by ascending date, so the first
    dated review is used as a proxy for when the restaurant opened. Note that
    the comparison is inclusive: a first review dated in 2015 yields True.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.

    Returns:
        (boolean): True if the first dated review has a year <= 2015,
        False if it is later or if no dated review is found on any page.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
        (including when the request times out).
    """
    # Use '?' when the URL has no query string yet; unconditionally appending
    # '&sort_by=...' would turn the parameter into part of the path.
    separator = "&" if "?" in url else "?"
    page_url = url + separator + "sort_by=date_asc"

    while page_url is not None:
        html = requests.get(page_url, timeout=30).content
        soup = BeautifulSoup(html, 'html.parser')

        for div in soup.find_all("div", class_="review review--with-sidebar"):
            date_span = div.find("span", class_='rating-qualifier')
            if date_span is None:
                # Review block without a date element: try the next one.
                continue
            # Require at least one digit per component so that the captured
            # year is always a valid integer (no exception handling needed).
            match = re.search(r"[0-9]+/[0-9]+/([0-9]+)", date_span.get_text())
            if match is None:
                continue
            # The first dated review decides the answer.
            return int(match.group(1)) <= 2015

        next_link = soup.find("a", class_="u-decoration-none next pagination-links_anchor")
        if next_link is None or not next_link.get('href'):
            break
        # Resolve the link against the current page so that a relative href
        # still yields a fetchable URL; absolute hrefs are returned unchanged.
        page_url = urljoin(page_url, next_link['href'])

    return False

In [5]:
# res = check_restaurant_before_2015("https://www.yelp.com/biz/the-porch-at-schenley-pittsburgh")

In [6]:
def all_restaurants(api_key, zip_code):
    """
    Collect the businesses returned by the Yelp business-search endpoint for a zip code.

    Results are requested page by page, sorted by review count, until the
    reported total has been covered or the API stops returning businesses.
    No filtering by opening date is done here.

    Parameters:
        api_key (string): Yelp API key, sent as a Bearer token.
        zip_code (string): zip code used as the search location.

    Returns:
        (list): the business dictionaries from every fetched page, in the
        order the API returned them.

    Raises:
        requests.exceptions.RequestException: if a request fails or times out.
        ValueError: if a response body is not valid JSON.
    """
    page_size = 50  # used both as the request limit and as the offset step
    headers = {
        "authorization": 'Bearer %s' % api_key, # for yelp API
    }
    offset = 0
    total_num = 1  # placeholder so that the first request is always issued
    dic_list = []
    while offset < total_num:
        params = {
            "location": zip_code,
            # NOTE(review): Yelp's published category alias appears to be
            # "restaurants" (plural) -- confirm this filter value is intended.
            "categories": "restaurant",
            "offset": offset,
            "limit": page_size,
            "sort_by": "review_count",
        }
        result = requests.get('https://api.yelp.com/v3/businesses/search',
                              headers=headers, params=params, timeout=30).json()
        page = result.get('businesses') if isinstance(result, dict) else None
        if not page:
            # Either the payload has no 'businesses' key (e.g. an error
            # response) or the page is empty; further requests would add nothing.
            break
        dic_list += page
        total_num = result.get('total', 0)
        offset += page_size
        if offset < total_num:
            # Pause between consecutive requests only; no need after the last page.
            time.sleep(0.2)
    return dic_list


In [7]:
# %load_ext line_profiler

In [8]:

# data = all_restaurants(read_api_key('/Users/Lena/api/yelp_api_key.txt'), '15213')
# print(len(data))

In [9]:


# %lprun -f all_restaurants all_restaurants(read_api_key('/Users/Lena/api/yelp_api_key.txt'), '15213')



In [15]:
def get_data_from_yelp(zipcode):
    """
    Build one row per usable Yelp business for the given zip code.

    The API key is read from 'api_key.txt' and the businesses are fetched with
    all_restaurants. A business is kept only if it carries every field listed
    in required_fields.

    Parameters:
        zipcode (string): zip code to query.

    Returns:
        (list): rows of the form
        [zipcode, name, rating, review_count, price, str(transactions)].
    """
    required_fields = ('name', 'rating', 'review_count', 'price', 'transactions')
    businesses = all_restaurants(read_api_key('api_key.txt'), zipcode)

    rows = [
        [
            zipcode,
            business['name'],
            business['rating'],
            business['review_count'],
            business['price'],
            # transactions is stored as the string form of the value
            str(business['transactions']),
        ]
        for business in businesses
        if all(field in business for field in required_fields)
    ]

    print("final length is: ", len(rows))
    return rows

In [21]:
# Load Yelp data for every zip code listed in zipcode.txt into yelp.db.

# Entries whose index is <= this value are skipped (presumably they were
# already loaded by an earlier, interrupted run -- confirm). Use -1 to
# process the whole file.
RESUME_AFTER_INDEX = 458

con = sqlite3.connect("yelp.db")
try:
    cur = con.cursor()
    # IF NOT EXISTS makes the cell runnable against both a fresh and an
    # already-populated database.
    cur.execute("CREATE TABLE IF NOT EXISTS yelp (zipcode TEXT,name TEXT,rating REAL,review_count INT,price TEXT,transactions TEXT)")

    with open("zipcode.txt") as f:
        zipcode = f.read().splitlines()
    for i, zc in enumerate(zipcode):
        if i <= RESUME_AFTER_INDEX:
            continue
        print(i)
        print(zc)
        res = get_data_from_yelp(zc)

        cur.executemany("INSERT INTO yelp (zipcode,name,rating,review_count,price,transactions) VALUES (?, ?, ?, ?, ?, ?);", res)
        # Commit after each zip code so finished work survives a later failure.
        con.commit()
finally:
    # Always release the database handle, even if a request or insert fails.
    con.close()


---