In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import json

In [2]:
def fix_zipcodes(zipcode):
    if len(zipcode) < 5:
        to_fill = 5 - len(zipcode)
        return (to_fill * '0') + zipcode

    else:
        return zipcode

def fix_county_name(county_name):
    if 'county' in county_name:
        return county_name[:county_name.find('county')].rstrip()
    else:
        return county_name


In [3]:
BUSINESS_DATA_FILEPATH = 'data/yelp_academic_dataset_business.json'
REVIEWS_DATA_FILEPATH = 'data/yelp_academic_dataset_review.json'
USERS_DATA_FILEPATH = 'data/yelp_academic_dataset_user.json'
COUNTY_DATA_FILEPATH = 'data/county_data.csv'
STATES_DATA_FILEPATH = 'data/states_data.csv'
ZIPCODES_DATA_FILEPATH = 'data/zip_code_database.csv'

# Guide

The goal is to put everything into one table in tabular format.

# Plan

## Location tracking

1. Query each review for the business ID to get long and lat values.
2. Take average of long and lat values for a centered location.
3. Take values by users.
4. Assign political ideologies by county and state.
5. Count helpfulness by user.

## Scrap the location tracking old plan

_1. For each user, count the zip code of the business reviewed
2. Assign it to a county
3. The most common county is the user's base location

## Political ideology

1. Match with county & state levels
2. Cook Partisan Voting Index - review data and revisit

## Business & Reviews

1. Keep zip code
2. Assign categorization: restaurants, home services, auto services, other
3. Assign a size
4. Assign the state
5. Business mean rating
6. Helpfulness count
7. Review rating

In [4]:
COUNTY_DF = pd.read_csv(COUNTY_DATA_FILEPATH, index_col=0)
STATES_DF = pd.read_csv(STATES_DATA_FILEPATH, index_col=0)
ZIPS_DF = pd.read_csv(ZIPCODES_DATA_FILEPATH)

In [5]:
ZIPS_DF = ZIPS_DF[['zip', 'state', 'county']]
ZIPS_DF['zip'] = ZIPS_DF['zip'].astype(str).apply(fix_zipcodes)
ZIPS_DF['county'] = ZIPS_DF['county'].str.lower()
ZIPS_DF.loc[len(ZIPS_DF)] = ['02101', 'MA', 'suffolk county']
ZIPS_DF.head()

Unnamed: 0,zip,state,county
0,501,NY,suffolk county
1,544,NY,suffolk county
2,601,PR,adjuntas municipio
3,602,PR,aguada municipio
4,603,PR,aguadilla municipio


In [6]:
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].str.lower()
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].apply(fix_county_name)
COUNTY_DF.head()

Unnamed: 0,state,state_po,county_name,perc_diffs
50526,ALABAMA,AL,autauga,44.42
50529,ALABAMA,AL,baldwin,53.76
50532,ALABAMA,AL,barbour,7.66
50535,ALABAMA,AL,bibb,57.73
50538,ALABAMA,AL,blount,80.0


In [7]:
COUNTY_DF[COUNTY_DF['state_po'] == 'CA'].sample(10)

Unnamed: 0,state,state_po,county_name,perc_diffs
52799,CALIFORNIA,CA,nevada,-14.79
52814,CALIFORNIA,CA,plumas,16.73
52744,CALIFORNIA,CA,lassen,51.48
52899,CALIFORNIA,CA,sonoma,-51.48
52904,CALIFORNIA,CA,stanislaus,-0.79
52699,CALIFORNIA,CA,el dorado,8.8
52939,CALIFORNIA,CA,yolo,-41.43
52719,CALIFORNIA,CA,imperial,-24.37
52884,CALIFORNIA,CA,sierra,21.34
52674,CALIFORNIA,CA,butte,-1.67


In [8]:
STATES_DF.head()

Unnamed: 0,state,perc_diffs
1,ALABAMA,25.46
3,ALASKA,10.11
5,ARIZONA,-0.1
7,ARKANSAS,6.91
9,CALIFORNIA,-29.16


In [9]:
STATES_LIST = list(COUNTY_DF['state_po'].unique())
print(f"Number of states: {len(STATES_LIST)}")

51
['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']


In [10]:
USER_COLS = ['user_id', 'review_count']
BUSINESS_COLS = ['business_id', 'state', 'city', 'postal_code', 'categories', 'stars', 'review_count']
REVIEW_COLS = ['review_id', 'user_id', 'business_id', 'date', 'stars', 'useful']
ZIPCODES_COLS = ['zip', 'county']

USER_DTYPES = {
    'user_id': np.str,
    'review_count': np.int
}

BUSINESS_DTYPES = {
    'business_id': np.str,
    'state': np.str,
    'city': np.str,
    'postal_code': np.str,
    'categories': np.str,
    'review_count': np.int,
    'stars': np.float
}

REVIEW_DTYPES = {
    'review_id': np.str,
    'user_id': np.str,
    'business_id': np.str,
    'stars': np.int,
    'useful': np.int
}

ZIPS_DTYPES = {
    'zip': np.str,
    'county': np.str
}

In [11]:
# Load 20,000 reviews from reviews
REVIEWS = list()
with open(REVIEWS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, nrows=300000, chunksize=1000, dtype=REVIEW_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[REVIEW_COLS]
        # Only keep US
        REVIEWS.append(reduced_chunk)
    REVIEWS = pd.concat(REVIEWS, ignore_index=True)

300it [00:04, 68.29it/s]


In [12]:
REVIEWS.sample(15)

Unnamed: 0,review_id,user_id,business_id,date,stars,useful
16828,1_DNlZtvcWb2RgZ8CRh1JA,SEi2JT8moDeKuKNU-jmxLA,5fAhoG03Qy99lI0v7jGFYg,2014-11-30 07:41:09,5,0
91171,L0IVeLLRkQrQP9jXrxojmg,7tjW1rcBuaRfHdtJB4-sNQ,fM44ZS_KDUZHNK18f3dnsQ,2015-04-16 20:21:38,2,1
159560,r_bPKlXfPZrSf2R3PYJpag,R6RiIWs6oBKIEWCtKiMNHA,QdzRS1s0tSltIokm2xV-kA,2017-07-21 23:58:35,4,0
20967,0La042kN4Vs6NnsKq-NkqA,qi9his34QNZs0IajjOCzgQ,7qSjRrEf64ySrVc19bgofw,2013-10-24 17:46:13,1,1
126993,VFeUAjQOnkjj-V1tIjSs0w,_L2AEsnER-6j3DjouA8T-A,8GusTJDbsUChxDYP677npg,2017-07-12 02:57:25,5,0
41438,oLeznMvPdUIteQlb3PPRFw,59QMw6nUynxZjIfBIxP5NQ,neYslCKmOIZMfrY0aqs-lg,2015-03-19 23:05:48,5,0
195814,ZSEQwnpdlhAMvwqvLz6Fyg,O0eezF0nDiDs8tHjMatz5A,MP12KzsczEWxqCCf95Zdew,2012-10-15 22:56:49,5,0
129481,fE6DDn89KDi_tp_-cJndIw,bJOHUJPCusrX-Xhn4agELg,mT-SkVfrxjEIOXnHaYZu-g,2017-12-05 21:24:46,5,0
275360,edRTVj4ddmlpu8jl4AReQw,lBrnbXrgw8jCBtIgkqzvqg,Irp5sgl7XASH5ZTw2D47qw,2012-03-18 15:35:55,2,2
259351,fHC_A0FR7GnWWzutvZRk3w,GcRReISCooKwajFzqlFY0Q,6H313b3O6gTULpR7fV2FsQ,2018-03-08 04:37:50,4,0


In [13]:
USERS_LIST = list(REVIEWS['user_id'].unique())
BUSINESS_LIST = list(REVIEWS['business_id'].unique())

In [14]:
# print("Users data loading starting...")

# USERS = list()
#
# with open(USERS_DATA_FILEPATH, 'r') as f:
#     reader = pd.read_json(f, orient='records', lines=True, nrows=200000, chunksize=1000, dtype=USER_DTYPES)
#
#     for chunk in tqdm(reader):
#         reduced_chunk = chunk[USER_COLS]
#         reduced_chunk = reduced_chunk[reduced_chunk['user_id'].isin(USERS_LIST)]
#
#         USERS.append(reduced_chunk)
#
#     USERS = pd.concat(USERS, ignore_index=True)
#
# print("Users loaded.")

In [15]:
BUSINESSES = list()

with open(BUSINESS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, chunksize=1000, dtype=BUSINESS_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[BUSINESS_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['business_id'].isin(BUSINESS_LIST)]
        BUSINESSES.append(reduced_chunk)

    BUSINESSES = pd.concat(BUSINESSES, ignore_index=True)

print("Businesses loaded.")

161it [00:03, 40.41it/s]

Businesses loaded.





In [16]:
BUSINESSES.sample(10)


Unnamed: 0,business_id,state,city,postal_code,categories,stars,review_count
12680,U0ekeaed4uUdrMe00Vn4ew,WA,Vancouver,98683,"Restaurants, Chicken Wings, Fast Food, Food, A...",1.5,110
7729,oLPwkVXfHk0srRyKh-dK-Q,MA,Boston,2116,"Shopping, Bridal, Event Planning & Services, F...",3.5,62
169,YiwC9uu9RUKqCaeTueNW7w,MA,Peabody,1960,"Internet Service Providers, Shopping, Mobile P...",2.0,13
4390,5P3b82EqZqR7lCqAYj-zaQ,CO,Boulder,80301,"American (New), Mexican, Restaurants, Cafes, A...",4.5,277
1518,D8wLWZiMlw12B2ftoQAm3Q,GA,Marietta,30067,"Nightlife, Sports Bars, Cafes, Beer, Wine & Sp...",3.5,120
10667,gNppUet6K3p7CX_eX1_r_g,GA,Atlanta,30324,"Restaurants, Chinese",3.0,67
5229,oSCkr_n69Gkg8UUdBaNhLQ,OR,Portland,97225,"Boot Camps, Active Life, Fitness & Instruction...",4.0,8
3265,JuJSU3kaPUgvm3SwqFKdkA,TX,Austin,78756,"Makeup Artists, Hair Stylists, Beauty & Spas, ...",5.0,16
11770,p3QUmEM3oz4KITABtNAfYw,MA,Wakefield,1880,"Nightlife, Sushi Bars, Bars, Restaurants, Japa...",4.0,161
12812,JeLd87zP0AMYWDSo6vtiqA,GA,Atlanta,30309,"Elementary Schools, Private Schools, Preschool...",4.0,6


In [17]:
# add business zip code in first
# add business state

for business_id in tqdm(BUSINESS_LIST):
    business_zipcode = BUSINESSES[BUSINESSES['business_id'] == business_id]['postal_code'].iloc[0]
    business_state = BUSINESSES[BUSINESSES['business_id'] == business_id]['state'].iloc[0]

    REVIEWS.loc[REVIEWS['business_id'] == business_id, ['business_state', 'business_zipcode']] = business_state, business_zipcode

REVIEWS = REVIEWS[REVIEWS['business_state'].isin(STATES_LIST)]

100%|██████████| 13706/13706 [08:39<00:00, 26.37it/s]


In [18]:
# check state
# then match county
def match_loc_to_ideology(zipcode):
    state = ZIPS_DF[ZIPS_DF['zip'] == zipcode]['state'].iloc[0]
    county_name = fix_county_name(ZIPS_DF[ZIPS_DF['zip'] == zipcode]['county'].iloc[0])

    state_slice = COUNTY_DF[COUNTY_DF['state_po'] == state]
    ideology_metric = state_slice.loc[state_slice['county_name']==county_name]['perc_diffs'].iloc[0]

    return ideology_metric


In [19]:
# assign business political ideologies by zip code
BUSINESS_ZIPS = list(REVIEWS['business_zipcode'].unique())

for business_zip in tqdm(BUSINESS_ZIPS):
    try:
        business_ideology = match_loc_to_ideology(business_zip)
        REVIEWS.loc[REVIEWS['business_zipcode'] == business_zip, 'business_ideology'] = business_ideology
    except:
        print(business_zip)

  4%|▍         | 21/533 [00:00<00:10, 49.44it/s]




100%|██████████| 533/533 [00:11<00:00, 46.95it/s]

97226





In [20]:
len(USERS_LIST)
# Store users zip code frequencies in a separate data type
# Store user zip counts to county
# Assign each user a political ideology based on county frequency
# store zip code counts per user in a dict from reviews


204997

In [21]:
REVIEWS.to_csv('data/yelp_dataset.csv')
REVIEWS

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_ideology
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,2015-07-03 20:38:25,4,1,MA,01701,-44.89
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,2013-05-28 20:38:06,5,0,OR,97210,-61.31
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57
...,...,...,...,...,...,...,...,...,...
299995,I0mRlZcCGAyntyGclLHSIw,vso8sQvCRVUnegGY9fx5rQ,Q4DRuxHhKuxP-01rYiRJnQ,2017-02-03 00:45:08,5,0,OR,97086,-11.07
299996,f_dYCOA9_zadlLtzWA2DWw,suv1g1uAHm0DQu_BMmPznA,sLkK9Ofmk9UFDrjXxh3iLA,2014-12-07 17:26:02,5,0,OR,97227,-61.31
299997,O0SG6BTN0FHSZ7PJ1wgmzg,mVxg_vBXcUAAQJ03uHxiIQ,La_xmePMtWwjxKUqbGvqGQ,2010-05-30 11:06:52,2,0,TX,78703,-45.11
299998,vTovPnBesDco238QpKtAvA,uVpNSrc3IEpR42VGdZ709w,4jOreXu9ctLBWOJWMtNv1w,2017-01-07 05:55:39,4,0,WA,98665,-5.09
