In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
def fix_zipcodes(zipcode):
    if len(zipcode) < 5:
        to_fill = 5 - len(zipcode)
        return (to_fill * '0') + zipcode

    else:
        return zipcode

def fix_county_name(county_name):
    if 'county' in county_name:
        return county_name[:county_name.find('county')].rstrip()
    else:
        return county_name

def match_loc_to_ideology(zipcode):
    zipcode_state = ZIPS_DF[ZIPS_DF['zip'] == zipcode]['state'].iloc[0]
    county_name = fix_county_name(ZIPS_DF[ZIPS_DF['zip'] == zipcode]['county'].iloc[0])

    state_slice = COUNTY_DF[COUNTY_DF['state_po'] == zipcode_state]
    ideology_metric = state_slice.loc[state_slice['county_name']==county_name]['perc_diffs'].iloc[0]

    return ideology_metric

def process_cpvi(cpvi):
    if cpvi.startswith('R'):
        return int(cpvi.split('+')[1])
    elif cpvi.startswith('D'):
        cpvi_metric = int(cpvi.split('+')[1])
        return np.negative(cpvi_metric)
    else:
        return 0

In [3]:
BUSINESS_DATA_FILEPATH = 'data/yelp_academic_dataset_business.json'
REVIEWS_DATA_FILEPATH = 'data/yelp_academic_dataset_review.json'
USERS_DATA_FILEPATH = 'data/yelp_academic_dataset_user.json'
COUNTY_DATA_FILEPATH = 'data/county_data.csv'
STATES_DATA_FILEPATH = 'data/states_data.csv'
ZIPCODES_DATA_FILEPATH = 'data/zip_code_database.csv'
CPI_URL = 'https://en.wikipedia.org/wiki/Cook_Partisan_Voting_Index'
POP_DENSITY_FILEPATH = 'data/apportionment.csv'


In [4]:
COUNTY_DF = pd.read_csv(COUNTY_DATA_FILEPATH, index_col=0)
STATES_DF = pd.read_csv(STATES_DATA_FILEPATH, index_col=0)
ZIPS_DF = pd.read_csv(ZIPCODES_DATA_FILEPATH)

In [5]:
ZIPS_DF = ZIPS_DF[['zip', 'state', 'county']]
ZIPS_DF['zip'] = ZIPS_DF['zip'].astype(str).apply(fix_zipcodes)
ZIPS_DF['county'] = ZIPS_DF['county'].str.lower()
ZIPS_DF.loc[len(ZIPS_DF)] = ['02101', 'MA', 'suffolk county']
ZIPS_DF.head()

Unnamed: 0,zip,state,county
0,501,NY,suffolk county
1,544,NY,suffolk county
2,601,PR,adjuntas municipio
3,602,PR,aguada municipio
4,603,PR,aguadilla municipio


In [6]:
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].str.lower()
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].apply(fix_county_name)
COUNTY_DF.head()

Unnamed: 0,state,state_po,county_name,perc_diffs
50526,ALABAMA,AL,autauga,44.42
50529,ALABAMA,AL,baldwin,53.76
50532,ALABAMA,AL,barbour,7.66
50535,ALABAMA,AL,bibb,57.73
50538,ALABAMA,AL,blount,80.0


In [7]:
COUNTY_DF[COUNTY_DF['state_po'] == 'CA'].sample(10)

Unnamed: 0,state,state_po,county_name,perc_diffs
52769,CALIFORNIA,CA,mendocino,-35.8
52869,CALIFORNIA,CA,santa clara,-47.41
52779,CALIFORNIA,CA,modoc,45.11
52904,CALIFORNIA,CA,stanislaus,-0.79
52889,CALIFORNIA,CA,siskiyou,15.75
52724,CALIFORNIA,CA,inyo,-0.15
52874,CALIFORNIA,CA,santa cruz,-60.28
52669,CALIFORNIA,CA,amador,24.36
52704,CALIFORNIA,CA,fresno,-7.83
52854,CALIFORNIA,CA,san luis obispo,-13.07


In [8]:
STATES_DF.head()

Unnamed: 0,state,perc_diffs
1,ALABAMA,25.46
3,ALASKA,10.11
5,ARIZONA,-0.1
7,ARKANSAS,6.91
9,CALIFORNIA,-29.16


In [9]:
STATES_LIST = list(COUNTY_DF['state_po'].unique())
print(f"Number of states: {len(STATES_LIST)}")

Number of states: 51


In [10]:
CPVI = pd.read_html(CPI_URL)[1]
CPVI.drop(CPVI.tail(1).index, inplace=True)
CPVI['pvi'] = CPVI['PVI'].apply(process_cpvi)
CPVI['state'] = CPVI['State'].str.upper()
CPVI = CPVI[['state', 'pvi']]

In [11]:
POP_DENSITY = pd.read_csv(POP_DENSITY_FILEPATH)

In [12]:
POP_DENSITY.dtypes
# POP_DENSITY

Name                                                    object
Geography Type                                          object
Year                                                     int64
Resident Population                                     object
Percent Change in Resident Population                  float64
Resident Population Density                             object
Resident Population Density Rank                       float64
Number of Representatives                              float64
Change in Number of Representatives                    float64
Average Apportionment Population Per Representative     object
dtype: object

In [13]:
USER_COLS = ['user_id', 'review_count']
BUSINESS_COLS = ['business_id', 'state', 'city', 'postal_code', 'categories', 'stars', 'review_count']
REVIEW_COLS = ['review_id', 'user_id', 'business_id', 'date', 'stars', 'useful']
ZIPCODES_COLS = ['zip', 'county']

USER_DTYPES = {
    'user_id': np.str,
    'review_count': np.int
}

BUSINESS_DTYPES = {
    'business_id': np.str,
    'state': np.str,
    'city': np.str,
    'postal_code': np.str,
    'categories': np.str,
    'review_count': np.int,
    'stars': np.float,
}

REVIEW_DTYPES = {
    'review_id': np.str,
    'user_id': np.str,
    'business_id': np.str,
    'stars': np.int,
    'useful': np.int
}

ZIPS_DTYPES = {
    'zip': np.str,
    'county': np.str
}

In [14]:
df = pd.read_json(BUSINESS_DATA_FILEPATH, lines=True, orient='records')
df.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [15]:
df = df[df['categories'].notnull()].reset_index(drop=True)
df = df[df['categories'].str.contains('Restaurants')]
df.sample(10)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
63930,aQ6AtEeeVC8VJ4iLwaezCw,Jimmy John's,"1558 N High St, Ste. D1",Columbus,OH,43201,39.993978,-83.005862,2.5,19,1,"{'RestaurantsAttire': 'u'casual'', 'NoiseLevel...","Delis, Restaurants, Sandwiches, Fast Food","{'Monday': '10:30-3:30', 'Tuesday': '10:30-3:3..."
13404,5XGN_j6Yj9N-369sh1E2uA,The Sausage Guy,49 Lansdowne St,Boston,MA,2215,42.34739,-71.097108,3.5,31,1,"{'BikeParking': 'False', 'RestaurantsAttire': ...","Hot Dogs, Food Stands, Restaurants, Street Ven...",
57366,KdQkawE7JuIb03_qwP3AlA,W XYZ Bar,1705 NW Amberglen Ct,Hillsboro,OR,97006,45.53275,-122.883765,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Caters...","Nightlife, Food, Beer, Wine & Spirits, Tapas/S...","{'Monday': '16:0-23:0', 'Tuesday': '16:0-23:0'..."
49536,rn2Ka8dII-WH6hXG82F1GA,Quezzas,82 Rainey St,Austin,TX,78701,30.259373,-97.738846,4.5,23,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Restaurants, Mexican, Food, Food Trucks, Pizza","{'Monday': '0:0-0:0', 'Thursday': '17:0-2:30',..."
108299,_S4j-74ZR4MUkWdiMlBoFQ,La Scala Restaurant,1070 N Main St,Randolph,MA,2368,42.192437,-71.060037,3.5,216,1,"{'Alcohol': 'u'full_bar'', 'GoodForKids': 'Fal...","Restaurants, Italian, Wine Bars, Nightlife, Bars","{'Monday': '11:30-23:0', 'Tuesday': '11:30-23:..."
103843,RwvhnuyVObpgJHx8Gpz-Bg,Wawa,2184 E Irlo Bronson Memorial Hwy,Kissimmee,FL,34744,28.286183,-81.355668,3.5,21,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Convenience Stores, Gas Stations, Food, Automo...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
54477,g4eVzBGRR47egXlUt9wuiw,Nathan's Hot Dogs & Bruster's Ice Cream,"Cherokee Plaza, 3857 Peachtree Rd NE",Atlanta,GA,30319,33.881247,-84.333801,3.5,11,0,"{'OutdoorSeating': 'True', 'RestaurantsGoodFor...","Ice Cream & Frozen Yogurt, Fast Food, Restaura...",
101993,vsxNLpgHX5hv0fWAI9XyHA,The Driskill Grill,604 Brazos St,Austin,TX,78701,30.268661,-97.741715,3.5,219,1,"{'BusinessParking': '{'garage': False, 'street...","Caterers, Event Planning & Services, Hotels & ...","{'Monday': '0:0-0:0', 'Tuesday': '17:30-22:0',..."
133497,blDM0UCr0oYZzfrQn_GG5Q,Arboretum Pizza and Grill,4025 Washington St,Roslindale,MA,2131,42.291942,-71.122501,3.0,65,1,"{'HasTV': 'True', 'RestaurantsDelivery': 'True...","Fast Food, Seafood, Pizza, Restaurants, Greek,...","{'Monday': '12:0-21:45', 'Tuesday': '12:0-21:4..."
153215,2SZKV3w6GsLjclavdFfcJQ,Georgia Beer Garden,420 Edgewood Ave NE,Atlanta,GA,30312,33.754526,-84.37387,4.0,76,1,"{'GoodForDancing': 'False', 'OutdoorSeating': ...","Beer Bar, Nightlife, Restaurants, Southern, Ba...","{'Monday': '11:0-2:30', 'Tuesday': '11:0-2:30'..."


In [16]:
BUSINESSES = list()

print('Loading businesses...')

with open(BUSINESS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, chunksize=1000, dtype=BUSINESS_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[BUSINESS_COLS]
        #reduced_chunk['postal_code'] = reduced_chunk['postal_code'].apply(fix_zipcodes)
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].notnull()]
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].str.contains('Restaurants')]
        BUSINESSES.append(reduced_chunk)

    BUSINESSES = pd.concat(BUSINESSES, ignore_index=True)

Loading businesses...


161it [00:04, 33.72it/s]


In [17]:
BUSINESS_LIST = list(BUSINESSES['business_id'].unique())

In [18]:
# Load 300,000 reviews from reviews
REVIEWS = list()
print('Loading reviews...')
with open(REVIEWS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, nrows=300000, chunksize=1000, dtype=REVIEW_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[REVIEW_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['business_id'].isin(BUSINESS_LIST)]
        # Only keep US
        REVIEWS.append(reduced_chunk)
    REVIEWS = pd.concat(REVIEWS, ignore_index=True)

3it [00:00, 28.00it/s]

Loading reviews...


300it [00:09, 31.36it/s]


In [19]:
BUSINESS_LIST = list(REVIEWS['business_id'].unique()) # rename to get effective list
print(f"Number of businssess: {len(BUSINESS_LIST)}")
USERS_LIST = list(REVIEWS['user_id'].unique())


Number of businssess: 4583


In [20]:
# add business zip code in first
# add business state
print('Adding business state')
for business_id in tqdm(BUSINESS_LIST):
    business_zipcode = BUSINESSES[BUSINESSES['business_id'] == business_id]['postal_code'].iloc[0]
    business_state = BUSINESSES[BUSINESSES['business_id'] == business_id]['state'].iloc[0]

    REVIEWS.loc[REVIEWS['business_id'] == business_id, ['business_state', 'business_zipcode']] = business_state, business_zipcode

REVIEWS = REVIEWS[REVIEWS['business_state'].isin(STATES_LIST)]

  0%|          | 0/4583 [00:00<?, ?it/s]

Adding business state


100%|██████████| 4583/4583 [03:29<00:00, 21.89it/s]


In [21]:
REVIEWS.sample(15)

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode
57791,ee3OL9JpN6jYDA2J10B62Q,YukItNsLqcZoNNN8BsU20Q,WCQ9YGdpS97t6wAoRAGhiA,2016-07-15 18:18:44,5,0,FL,32714
105632,y-KcpDhfy0q5V4vKl_rqSg,bKD-b6Zaxx-CGyLexn1KFg,A5vCjdmIJg4UHfvYMremGQ,2015-12-09 07:46:27,4,1,OR,97086
114577,-U0_ahrm4AbtHeb2e1l7tw,2wo8-L8EY7ClcmKaIh4wlQ,7CXy5h3u_aWsK9qov6B4rg,2015-06-14 16:44:13,5,0,TX,78723
63362,PogK0GQzpAa5cfqHqmaEZQ,MJtbTc8WYvqZLf_D6OLhfQ,IEoxvVxtMpqHDyom4Ad6Tw,2016-10-29 18:14:28,5,5,OR,97266
35678,gXBwO5Va2xhmQoCe7I0FTw,xvdkIO99jl6enTzDM1zYCQ,bZiIIUcpgxh8mpKMDhdqbA,2013-05-08 13:42:53,4,0,TX,78704
164893,66QYerA_in60kdmLi8VOfQ,Boytzi_nxvFLzpyAIqswmg,E5yQc7ujyi_euHtEki_djQ,2016-09-12 01:34:13,5,1,OR,97212
89691,cQWK35fxGg6WNri3Wkwoow,LV_FXhQ9oLVr8_KM5JpmDw,arnbjQR2n0-6-iqdBB_gHw,2017-07-10 05:06:50,5,0,MA,2108
130173,La-dYPMr1dmvYR-sPH0AhQ,NbpulXOzalrDcJ-BNPiCKw,1BvysshfkDS2eJ0k8XiDjQ,2018-07-12 01:01:09,5,0,FL,34747
44942,dFTfc1_Vo-m1afEYMQUdNA,lgVLBf_kloHe2mphcWx79Q,R87N6i-a92vJSSMpu_VYPQ,2014-06-28 15:48:17,3,1,OR,97239
59318,k1RQi8OSHrzcmUAlemkynQ,LnP7yQeyqKfGhnQ65np6jQ,aHrKrskayskrfeZB-wTI1Q,2018-06-21 00:55:15,2,0,GA,30345


In [22]:
# assign business political ideologies by zip code
BUSINESS_ZIPS = list(REVIEWS['business_zipcode'].unique())
print('Adding business zipcode ideology')
for business_zip in tqdm(BUSINESS_ZIPS):
    try:
        business_ideology = match_loc_to_ideology(business_zip)
        REVIEWS.loc[REVIEWS['business_zipcode'] == business_zip, 'business_county_ideology'] = business_ideology
    except:
        print(business_zip)

  2%|▏         | 7/432 [00:00<00:06, 64.25it/s]

Adding business zipcode ideology


 91%|█████████ | 391/432 [00:06<00:00, 66.26it/s]




100%|██████████| 432/432 [00:06<00:00, 62.89it/s]


In [23]:
len(USERS_LIST)

148940

In [24]:
STATES_ABBR = dict()

for state in list(STATES_DF['state']):
    STATES_ABBR[state] = COUNTY_DF[COUNTY_DF['state'] == state]['state_po'].unique()[0]

print(STATES_ABBR)

{'ALABAMA': 'AL', 'ALASKA': 'AK', 'ARIZONA': 'AZ', 'ARKANSAS': 'AR', 'CALIFORNIA': 'CA', 'COLORADO': 'CO', 'CONNECTICUT': 'CT', 'DELAWARE': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'GEORGIA': 'GA', 'HAWAII': 'HI', 'IDAHO': 'ID', 'ILLINOIS': 'IL', 'INDIANA': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KENTUCKY': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA', 'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 'MISSOURI': 'MO', 'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN', 'TEXAS': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY'}


In [25]:
print('Processing business state ideology and state pvi')
for state in tqdm(list(STATES_ABBR.keys())):
    try:
        state_ideology = STATES_DF[STATES_DF['state'] == state]['perc_diffs'].iloc[0]

        if state == 'DISTRICT OF COLUMBIA':
            state_pvi = 0
        else:
            state_pvi = CPVI[CPVI['state'] == state]['pvi'].iloc[0]

        REVIEWS.loc[REVIEWS['business_state'] == STATES_ABBR[state],
                    ['business_state_ideology', 'business_state_pvi']] = state_ideology, state_pvi
    except:
        print(state)
        print('bad output')

 18%|█▊        | 9/51 [00:00<00:00, 81.87it/s]

Processing business state ideology and state pvi


100%|██████████| 51/51 [00:00<00:00, 82.54it/s]


In [26]:
print('Adding total reviews and average stars per business')
REVIEWS['count'] = 1
BUSINESS_COUNTS = REVIEWS.groupby('business_id').sum().reset_index()[['business_id', 'count']]
BUSINESS_STARS = REVIEWS[['business_id', 'stars']].groupby('business_id').sum().reset_index()
BUSINESSES_LIST = list(BUSINESS_COUNTS['business_id'].unique())

for business in tqdm(BUSINESSES_LIST):
    tot_reviews = BUSINESS_COUNTS[BUSINESS_COUNTS['business_id'] == business]['count'].iloc[0]
    avg_stars = round((BUSINESS_STARS[BUSINESS_STARS['business_id'] == business]['stars'].iloc[0] / tot_reviews), 3)
    REVIEWS.loc[REVIEWS['business_id'] == business, ['business_review_total', 'avg_star_rating']] = tot_reviews, avg_stars


  0%|          | 0/3901 [00:00<?, ?it/s]

Adding total reviews and average stars per business


100%|██████████| 3901/3901 [01:38<00:00, 39.61it/s]


In [27]:
# add state pop density

In [28]:
# split date into separate columns

REVIEWS['date'] = pd.to_datetime(REVIEWS['date'])
REVIEWS['year'] = REVIEWS['date'].dt.year
REVIEWS['month'] = REVIEWS['date'].dt.month
REVIEWS['day'] = REVIEWS['date'].dt.day

AttributeError: 'Series' object has no attribute 'year'

In [33]:
REVIEWS.drop(columns=['count'], inplace=True)
REVIEWS

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82,-33.21,-14.0,44.0,3.614,2014,10,11
1,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12,3.36,3.0,121.0,3.446,2010,1,8
2,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57,-13.50,-3.0,472.0,4.282,2011,7,28
3,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,2018-01-21 04:41:03,1,2,MA,02144,-44.89,-33.21,-14.0,248.0,3.472,2018,1,21
4,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2006-04-16 02:58:44,2,0,MA,02215,-62.84,-33.21,-14.0,11.0,3.000,2006,4,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208209,_X0ZvwmsQ_CRpGeCFFc_-Q,FwGqxJ_Z5BTKamqxWfD8LQ,eZE0FdYB9rUvbuPS4fNyYA,2015-01-24 23:12:37,2,5,TX,78701,-45.11,5.58,5.0,252.0,4.266,2015,1,24
208210,F6fenNd1vkH3ckuXFPRB6g,u0bwW1sf97hhNzTvkagEsA,enSXnvEKjDCit9A2_vyugg,2011-10-09 20:24:16,4,0,FL,32771,-2.79,3.36,3.0,29.0,2.966,2011,10,9
208211,O0SG6BTN0FHSZ7PJ1wgmzg,mVxg_vBXcUAAQJ03uHxiIQ,La_xmePMtWwjxKUqbGvqGQ,2010-05-30 11:06:52,2,0,TX,78703,-45.11,5.58,5.0,108.0,3.944,2010,5,30
208212,vTovPnBesDco238QpKtAvA,uVpNSrc3IEpR42VGdZ709w,4jOreXu9ctLBWOJWMtNv1w,2017-01-07 05:55:39,4,0,WA,98665,-5.09,-19.20,-8.0,18.0,3.111,2017,1,7


In [34]:
REVIEWS.to_csv('data/yelp_dataset.csv')