In [24]:
# setup library imports
import io, time, json
import requests
import pickle
import csv
import re
from bs4 import BeautifulSoup

### Library Documentation

* Standard Library:
    * [io](https://docs.python.org/3/library/io.html)
    * [time](https://docs.python.org/3/library/time.html)
    * [json](https://docs.python.org/3/library/json.html)

* Third Party
    * [requests](http://docs.python-requests.org/en/master/)
    * [Beautiful Soup (version 4)](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
    * [yelp-fusion](https://www.yelp.com/developers/documentation/v3/get_started)

In [20]:
def retrieve_html(url):
    """Fetch ``url`` with an HTTP GET request.

    Parameters:
        url (string): address to request.

    Returns:
        (tuple): ``(status_code, text)`` read from the ``requests`` response.
    """
    page = requests.get(url)
    status, body = page.status_code, page.text
    return status, body


In [21]:
def read_api_key(filepath):
    """Read an API key from a text file.

    Parameters:
        filepath (string): path of the file holding the key.

    Returns:
        (string): the file's contents with every newline character removed;
        other whitespace is left untouched.
    """
    with open(filepath, 'r') as key_file:
        pieces = key_file.read().split('\n')
    # Joining the newline-separated pieces drops every '\n' in the file.
    return ''.join(pieces)

In [22]:
def check_restaurant_before_2015(url):
    """
    Check whether a restaurant's oldest dated review is from 2015 or earlier.

    The Yelp page is requested with reviews sorted oldest-first. The first
    review whose date can be parsed decides the result; following pages are
    only visited when the current page has no parseable review date.

    Parameters:
        url (string): Yelp URL corresponding to the restaurant of interest.
            It may or may not already carry a query string.

    Returns:
        (boolean): True when the first parseable review date has a year of
        2015 or earlier; False when that year is later, or when no dated
        review is found on any page.
    """
    # Local import keeps this cell self-contained; urljoin is standard library.
    from urllib.parse import urljoin

    # A URL without a query string needs '?', not '&', before the first
    # parameter; blindly appending '&sort_by=...' produced an invalid path.
    separator = "&" if "?" in url else "?"
    page_url = url + separator + "sort_by=date_asc"

    # Requiring at least one digit per component means a stray "//" is never
    # mistaken for a date, so int() below cannot fail and no blanket
    # try/except is needed.
    date_pattern = re.compile(r"[0-9]+/[0-9]+/([0-9]+)")

    while page_url is not None:
        html = requests.get(page_url).content
        soup = BeautifulSoup(html, 'html.parser')

        for div in soup.find_all("div", class_="review review--with-sidebar"):
            qualifier = div.find("span", class_='rating-qualifier')
            if qualifier is None:
                # Review block without a date element: try the next one.
                continue
            match = date_pattern.search(qualifier.get_text())
            if match is None:
                continue
            # NOTE(review): the comparison includes 2015 itself although the
            # function name says "before 2015" -- kept as-is; confirm intent.
            return int(match.group(1)) <= 2015

        next_link = soup.find("a", class_="u-decoration-none next pagination-links_anchor")
        if next_link is None:
            page_url = None
        else:
            # urljoin leaves absolute hrefs unchanged and resolves relative ones.
            page_url = urljoin(page_url, next_link['href'])

    return False

In [6]:
# res = check_restaurant_before_2015("https://www.yelp.com/biz/the-porch-at-schenley-pittsburgh")

In [32]:
def all_restaurants(api_key, zip_code):
    """Collect the businesses returned by the Yelp business-search endpoint.

    Results are requested 50 at a time, sorted by review count, until the
    offset reaches the ``total`` reported by the API or a response arrives
    without a ``businesses`` key. No filtering by establishment year happens
    here.

    Parameters:
        api_key (string): token sent as ``Bearer <api_key>``.
        zip_code (string): value passed as the search ``location``.

    Returns:
        (list): the concatenated ``businesses`` entries of every page fetched.
    """
    endpoint = 'https://api.yelp.com/v3/businesses/search'
    auth_headers = {"authorization": 'Bearer %s' % api_key}
    page_size = 50

    businesses = []
    offset, reported_total = 0, 1  # total starts at 1 so the first request is made
    while offset < reported_total:
        query = {
            "location": zip_code,
            # NOTE(review): confirm "restaurant" is the category alias the API expects.
            "categories": "restaurant",
            "offset": offset,
            "limit": page_size,
            "sort_by": "review_count",
        }
        payload = requests.get(endpoint, headers=auth_headers, params=query).json()
        if 'businesses' not in payload:
            # Responses without results (e.g. error payloads) end the paging.
            break
        businesses.extend(payload['businesses'])
        reported_total = payload['total']
        offset += page_size
        time.sleep(0.2)  # short pause between consecutive requests
    return businesses


In [18]:
# Load the line_profiler IPython extension; it provides the %lprun magic used below.
%load_ext line_profiler

In [33]:

# Fetch the search results for zip code 15213 and report how many were returned.
# NOTE(review): the key path is an absolute, user-specific path.
data = all_restaurants(read_api_key('/Users/Lena/api/yelp_api_key.txt'), '15213')
print(len(data))

1000


In [30]:


# Line-by-line profile of all_restaurants; this re-issues every API request.
%lprun -f all_restaurants all_restaurants(read_api_key('/Users/Lena/api/yelp_api_key.txt'), '15213')



In [47]:
import csv, sqlite3

# Load Yelp results for every zip code listed in zipcode.txt into yelp.db.
# get_data_from_yelp must already be defined (its cell appears further down
# in this notebook).
con = sqlite3.connect("yelp.db")
try:
    cur = con.cursor()
    # IF NOT EXISTS lets the cell be re-run; a plain CREATE TABLE raised
    # "table yelp already exists" on the second execution.
    cur.execute("CREATE TABLE IF NOT EXISTS yelp (zipcode TEXT,name TEXT,rating REAL,review_count INT,price TEXT,transactions TEXT)")

    # Rows are committed one zip code at a time, so any zip code already in
    # the table was stored completely and can be skipped when resuming after
    # an interruption.
    already_loaded = {row[0] for row in cur.execute("SELECT DISTINCT zipcode FROM yelp")}

    with open("zipcode.txt") as f:
        zipcode = f.read().splitlines()
    for i, zc in enumerate(zipcode):
        print(i)
        print(zc)
        if zc in already_loaded:
            continue
        res = get_data_from_yelp(zc)

        cur.executemany("INSERT INTO yelp (zipcode,name,rating,review_count,price,transactions) VALUES (?, ?, ?, ?, ?, ?);", res)
        con.commit()
finally:
    # Always release the database, including on KeyboardInterrupt.
    con.close()


0
01085
final length is:  95
1
01201


KeyboardInterrupt: 

In [44]:
def get_data_from_yelp(zipcode):
    """Build database-ready rows from the Yelp search results of one zip code.

    Businesses missing any of ``name``, ``rating``, ``review_count``,
    ``price`` or ``transactions`` are left out.

    Parameters:
        zipcode (string): zip code passed to ``all_restaurants``.

    Returns:
        (list): one ``[zipcode, name, rating, review_count, price,
        str(transactions)]`` list per retained business. The number of rows
        is also printed.
    """
    required_keys = ('name', 'rating', 'review_count', 'price', 'transactions')
    businesses = all_restaurants(read_api_key('/Users/Lena/api/yelp_api_key.txt'), zipcode)

    rows = [
        [
            zipcode,
            business['name'],
            business['rating'],
            business['review_count'],
            business['price'],
            str(business['transactions']),  # stored as the text form of the value
        ]
        for business in businesses
        if all(key in business for key in required_keys)
    ]
    print("final length is: ", len(rows))
    return rows

---

### Exclude zip codes where the number of returns for any of total income (N02650), salaries (N00200), business income (N00900), unemployment compensation (N02300), pension (N01700), or capital gain (N01000) is less than 200
### Exclude rows whose zip code (zipcode) is 99999

In [80]:
def preprocess_income_data(filename, threshold=2500):
    """Aggregate the income CSV per zip code and keep well-populated zip codes.

    Consecutive rows sharing the same ``zipcode`` value are summed into one
    group. Rows whose ``zipcode`` is ``'99999'`` or ``'0'`` are ignored. A
    group is kept only when every one of its summed ``N*`` columns is at
    least ``threshold``; the returned numbers are the summed ``A*`` columns.

    Parameters:
        filename (string): path of the UTF-8 CSV file. It must provide the
            columns ``zipcode`` and ``N``/``A`` + 02650, 00200, 00900, 02300,
            01700, 01000.
        threshold (int): minimum summed ``N*`` value each column must reach
            for a zip code to be kept. Defaults to 2500, the value that was
            previously hard-coded.
            NOTE(review): the notebook's markdown mentions 200 -- confirm.

    Returns:
        (dict): keys ``zip_code``, ``total_income``, ``salary``,
        ``business_income``, ``unemployment_compensation``, ``pension`` and
        ``capital_gain``; each maps to a list with one entry per kept zip
        code, in file order.

    Raises:
        KeyError: when an expected column is missing.
        ValueError: when a numeric column cannot be parsed as an integer.
    """
    # (result key, column suffix): 'N' + suffix feeds the threshold test,
    # 'A' + suffix is what gets returned.
    fields = (
        ('total_income', '02650'),
        ('salary', '00200'),
        ('business_income', '00900'),
        ('unemployment_compensation', '02300'),
        ('pension', '01700'),
        ('capital_gain', '01000'),
    )

    result = {'zip_code': []}
    for key, _ in fields:
        result[key] = []

    def _flush(zip_code, counts, amounts):
        """Store a finished group unless it is empty or below the threshold."""
        if counts is None or any(count < threshold for count in counts):
            return
        result['zip_code'].append(zip_code)
        for (key, _), amount in zip(fields, amounts):
            result[key].append(amount)

    current_zip = None
    counts = amounts = None  # None means no group has been started yet

    with open(filename, encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            zip_code = row['zipcode']
            if zip_code == '99999' or zip_code == '0':
                continue

            row_counts = [int(row['N' + suffix]) for _, suffix in fields]
            row_amounts = [int(row['A' + suffix]) for _, suffix in fields]

            if counts is None or zip_code != current_zip:
                # A new zip code starts: settle the previous group first.
                _flush(current_zip, counts, amounts)
                current_zip, counts, amounts = zip_code, row_counts, row_amounts
            else:
                counts = [total + value for total, value in zip(counts, row_counts)]
                amounts = [total + value for total, value in zip(amounts, row_amounts)]

    # The last group is only complete once the file is exhausted.
    _flush(current_zip, counts, amounts)
    return result

In [81]:
# Aggregate the income CSV, then show the kept zip codes, how many there are,
# and the full result dictionary.
income_dict = preprocess_income_data('15zpallagi.csv')
print(income_dict['zip_code'])
print(len(income_dict['zip_code']))
print(income_dict)

# Optional cache (disabled): write the result to a pickle file ...
# with open('preprocessed_income_data_2015.pickle', 'wb') as file:
#     pickle.dump(income_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and read it back later. Only unpickle files you created yourself.
# with open('preprocessed_income_data_2015.pickle', 'rb') as file:
#     income_dict = pickle.load(file)

['99507', '92336', '94112', '95076']
4
{'zip_code': ['99507', '92336', '94112', '95076'], 'total_income': [19570, 40660, 44190, 37210], 'salary': [17150, 36290, 37490, 32880], 'business_income': [2700, 7380, 8530, 4540], 'unemployment_compensation': [4700, 2620, 2600, 6120], 'pension': [2980, 4700, 5410, 4150], 'capital_gain': [2850, 2510, 5930, 3300]}


In [1]:
# List the files in the current working directory (shell command).
!ls

business.json          final.ipynb            read_dataset.py
business_comma.json    irs.db                 zipcode200ge.txt
filter_zipcodes.py     load_data_to_sqlite.py


In [3]:
# Parse business_comma.json into memory.
# Note: this rebinds `data`, which an earlier cell used for the Yelp API results.
import json
with open("business_comma.json") as f:
    data = json.load(f)

    

In [5]:
from collections import Counter

# Tally how many records in `data` carry each postal_code value.
post_codes = [business['postal_code'] for business in data]
cnter = Counter()
cnter.update(post_codes)

In [6]:
# Number of distinct postal_code values found in `data`.
len(cnter)

16005

In [11]:
# Keep only postal codes that are exactly five characters long
# (presumably the US-style ones -- the length is the only check made here).
all_codes = (business['postal_code'] for business in data)
us_post_codes = [code for code in all_codes if len(code) == 5]
us_cnter = Counter()
us_cnter.update(us_post_codes)

In [12]:
# Number of distinct five-character postal codes.
len(us_cnter)

971

In [13]:
# Prints every entry of the counter, most frequent first; the output is long.
print(us_cnter)

Counter({'89109': 2965, '85251': 2044, '85281': 1777, '85260': 1741, '89119': 1700, '89102': 1497, '89103': 1379, '85308': 1377, '85032': 1326, '85016': 1298, '89117': 1285, '85254': 1263, '89118': 1254, '85282': 1158, '89123': 1158, '89052': 1151, '89014': 1135, '89146': 1105, '89101': 1087, '85226': 1073, '89147': 1052, '85224': 970, '85018': 926, '89104': 912, '28277': 868, '85225': 836, '89121': 830, '85210': 797, '89128': 793, '85258': 771, '53703': 768, '85255': 728, '61820': 724, '85027': 724, '85234': 713, '85004': 713, '89074': 699, '28202': 695, '85206': 694, '85201': 687, '85204': 685, '85283': 683, '15222': 681, '85202': 681, '89148': 669, '85233': 661, '28203': 622, '85374': 615, '85284': 613, '89113': 604, '85295': 603, '85296': 599, '89130': 596, '85382': 592, '85022': 590, '28205': 587, '85301': 580, '85020': 575, '28078': 568, '89139': 568, '53704': 565, '89120': 565, '28105': 561, '85044': 554, '44113': 550, '15237': 547, '85034': 542, '28027': 535, '85014': 532, '891