In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from tqdm import tqdm
import os

In [2]:
def fix_zipcodes(zipcode):
    if len(zipcode) < 5:
        to_fill = 5 - len(zipcode)
        return (to_fill * '0') + zipcode

    else:
        return zipcode

def fix_county_name(county_name):
    if 'county' in county_name:
        return county_name[:county_name.find('county')].rstrip()
    else:
        return county_name

def match_loc_to_ideology(zipcode):
    zipcode_state = ZIPS_DF[ZIPS_DF['zip'] == zipcode]['state'].iloc[0]
    county_name = fix_county_name(ZIPS_DF[ZIPS_DF['zip'] == zipcode]['county'].iloc[0])

    state_slice = COUNTY_DF[COUNTY_DF['state_po'] == zipcode_state]
    ideology_metric = state_slice.loc[state_slice['county_name']==county_name]['perc_diffs'].iloc[0]

    return ideology_metric

def process_cpvi(cpvi):
    if cpvi.startswith('R'):
        return int(cpvi.split('+')[1])
    elif cpvi.startswith('D'):
        cpvi_metric = int(cpvi.split('+')[1])
        return np.negative(cpvi_metric)
    else:
        return 0

In [32]:
BUSINESS_DATA_FILEPATH = 'data/yelp_academic_dataset_business.json'
REVIEWS_DATA_FILEPATH = 'data/yelp_academic_dataset_review.json'
USERS_DATA_FILEPATH = 'data/yelp_academic_dataset_user.json'
COUNTY_DATA_FILEPATH = 'data/county_data.csv'
STATES_DATA_FILEPATH = 'data/states_data.csv'
ZIPCODES_DATA_FILEPATH = 'data/zip_code_database.csv'
CPI_URL = 'https://en.wikipedia.org/wiki/Cook_Partisan_Voting_Index'
POP_DENSITY_FILEPATH = 'data/apportionment.csv'
INCOME_FILEPATH = 'data/zipcode2019/19zpallagi.csv'

In [56]:
INCOME_COLS = ['STATE', 'zipcode', 'N1', 'N2', 'ELDERLY', 'A00100', 'N02650', 'N02650', 'A02650']

INCOME_DTYPES = {
    'STATE': np.str,
    'zipcode': np.str,
    'ELDERLY': np.int,
    'N1': np.int,
    'N2': np.int,
    'A00100': np.float,
    'N02550': np.float,
    'N02650': np.float,
    'A02650': np.float
}

ZIPCODE_COLS = ['zip', 'state', 'county']

ZIPCODE_DTYPES = {
    'zip': np.str
}

In [59]:
COUNTY_DF = pd.read_csv(COUNTY_DATA_FILEPATH, index_col=0)
STATES_DF = pd.read_csv(STATES_DATA_FILEPATH, index_col=0)
ZIPS_DF = pd.read_csv(ZIPCODES_DATA_FILEPATH, dtype=ZIPCODE_DTYPES, usecols = ZIPCODE_COLS)
INCOME_DF = pd.read_csv(INCOME_FILEPATH, usecols=INCOME_COLS, dtype=INCOME_DTYPES)


In [61]:
ZIPS_DF['county'] = ZIPS_DF['county'].str.lower()
ZIPS_DF.loc[len(ZIPS_DF)] = ['02101', 'MA', 'suffolk county']
ZIPS_DF.head()

Unnamed: 0,zip,state,county
0,501,NY,suffolk county
1,544,NY,suffolk county
2,601,PR,adjuntas municipio
3,602,PR,aguada municipio
4,603,PR,aguadilla municipio


In [35]:
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].str.lower()
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].apply(fix_county_name)
COUNTY_DF.sample(8)


Unnamed: 0,state,state_po,county_name,perc_diffs
65681,TEXAS,TX,dickens,73.18
69511,SOUTH DAKOTA,SD,walworth,54.3
52352,ARKANSAS,AR,pulaski,20.42
57200,IOWA,IA,ida,10.73
55716,IDAHO,ID,madison,63.66
70020,UTAH,UT,iron,0.0
51813,ARKANSAS,AR,izard,-27.21
68429,OKLAHOMA,OK,atoka,-5.64


In [36]:
STATES_DF.head()

Unnamed: 0,state,perc_diffs
1,ALABAMA,25.46
3,ALASKA,10.11
5,ARIZONA,-0.1
7,ARKANSAS,6.91
9,CALIFORNIA,-29.16


In [37]:
STATES_LIST = list(COUNTY_DF['state_po'].unique())
print(f"Number of states: {len(STATES_LIST)}")

Number of states: 51


In [38]:
CPVI = pd.read_html(CPI_URL)[1]
CPVI.drop(CPVI.tail(1).index, inplace=True)
CPVI['pvi'] = CPVI['PVI'].apply(process_cpvi)
CPVI['state'] = CPVI['State'].str.upper()
CPVI = CPVI[['state', 'pvi']]

In [39]:
POP_DENSITY = pd.read_csv(POP_DENSITY_FILEPATH)
POP_DENSITY.head()

Unnamed: 0,Name,Geography Type,Year,Resident Population,Percent Change in Resident Population,Resident Population Density,Resident Population Density Rank,Number of Representatives,Change in Number of Representatives,Average Apportionment Population Per Representative
0,Alabama,State,1910,2138093,16.9,42.2,25.0,10.0,1.0,213809.0
1,Alaska,State,1910,64356,1.2,0.1,52.0,,,
2,Arizona,State,1910,204354,66.2,1.8,49.0,,,
3,Arkansas,State,1910,1574449,20.0,30.3,30.0,7.0,0.0,224921.0
4,California,State,1910,2377549,60.1,15.3,38.0,11.0,3.0,216051.0


In [40]:
POP_DENSITY.dtypes
# POP_DENSITY

Name                                                    object
Geography Type                                          object
Year                                                     int64
Resident Population                                     object
Percent Change in Resident Population                  float64
Resident Population Density                             object
Resident Population Density Rank                       float64
Number of Representatives                              float64
Change in Number of Representatives                    float64
Average Apportionment Population Per Representative     object
dtype: object

In [52]:
INCOME_DF['income_per_return'] = INCOME_DF['A02650'] / INCOME_DF['N1']

# rename cols

In [53]:
INCOME_DF.head()

Unnamed: 0,STATE,zipcode,N1,N2,ELDERLY,A00100,N02650,A02650,income_per_return
0,AL,0,778210,1161150,144610.0,10158838.0,778140.0,10311099.0,13.249764
1,AL,0,525940,992420,113810.0,18974967.0,525940.0,19145621.0,36.402671
2,AL,0,285700,595680,82410.0,17535801.0,285700.0,17690402.0,61.919503
3,AL,0,179070,432180,57970.0,15546951.0,179070.0,15670456.0,87.510225
4,AL,0,257010,692450,85030.0,34974856.0,257010.0,35286228.0,137.295156


In [52]:
USER_COLS = ['user_id', 'review_count']
BUSINESS_COLS = ['business_id', 'state', 'city', 'postal_code', 'categories', 'stars', 'review_count']
REVIEW_COLS = ['review_id', 'user_id', 'business_id', 'date', 'stars', 'useful']
ZIPCODES_COLS = ['zip', 'county']

USER_DTYPES = {
    'user_id': np.str,
    'review_count': np.int
}

BUSINESS_DTYPES = {
    'business_id': np.str,
    'state': np.str,
    'city': np.str,
    'postal_code': np.str,
    'categories': np.str,
    'review_count': np.int,
    'stars': np.float,
}

REVIEW_DTYPES = {
    'review_id': np.str,
    'user_id': np.str,
    'business_id': np.str,
    'stars': np.int,
    'useful': np.int
}

ZIPS_DTYPES = {
    'zip': np.str,
    'county': np.str
}

In [53]:
BUSINESSES = list()

print('Loading businesses...')

with open(BUSINESS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, chunksize=1000, dtype=BUSINESS_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[BUSINESS_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['state'].isin(STATES_LIST)]
        reduced_chunk['postal_code'] = reduced_chunk['postal_code'].apply(fix_zipcodes)
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].notnull()]
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].str.contains('Restaurants')]
        BUSINESSES.append(reduced_chunk)

    BUSINESSES = pd.concat(BUSINESSES, ignore_index=True)

Loading businesses...


161it [00:04, 40.04it/s]


In [54]:
BUSINESS_LIST = list(BUSINESSES['business_id'].unique())

In [55]:
# import dask.dataframe as dd
# print('reviews loading')
# reviews_df = dd.read_json('data/yelp_academic_dataset_review.json', orient='records', lines=True)
# reviews_df = reviews_df[REVIEW_COLS]
# REVIEWS = reviews_df[reviews_df['business_id'].isin(BUSINESS_LIST)].compute()
# reviews_df = None
# print('reviews loaded')

In [56]:
import sys
print(sys.maxsize)

9223372036854775807


In [57]:
#Load 300,000 reviews from reviews
REVIEWS = list()
print('Loading reviews...')
with open(REVIEWS_DATA_FILEPATH, 'r') as f:
    #reader = pd.read_json(f, orient='records', lines=True, chunksize=100, nrows=8635403, dtype=REVIEW_DTYPES)
    reader = pd.read_json(f, orient='records', lines=True, chunksize=100, nrows=100000, dtype=REVIEW_DTYPES)
    #reader = dd.read_json('data/yelp_academic_dataset_review.json', orient='records', lines=True)
    for chunk in tqdm(reader):
        reduced_chunk = chunk[REVIEW_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['business_id'].isin(BUSINESS_LIST)]
        # Only keep US
        REVIEWS.append(reduced_chunk)
    REVIEWS = pd.concat(REVIEWS, ignore_index=True)
#REVIEWS = REVIEWS.compute()
print('Reviews loaded')

8it [00:00, 79.17it/s]

Loading reviews...


3000it [00:40, 73.83it/s]


Reviews loaded


In [58]:
BUSINESS_LIST = list(REVIEWS['business_id'].unique()) # rename to get effective list
print(f"Number of businssess: {len(BUSINESS_LIST)}")
USERS_LIST = list(REVIEWS['user_id'].unique())


Number of businssess: 3901


In [59]:
# add business zip code in first
# add business state
print('Adding business state and zip codes...')
for business_id in tqdm(BUSINESS_LIST):
    business_zipcode = BUSINESSES[BUSINESSES['business_id'] == business_id]['postal_code'].iloc[0]
    business_state = BUSINESSES[BUSINESSES['business_id'] == business_id]['state'].iloc[0]

    REVIEWS.loc[REVIEWS['business_id'] == business_id, ['business_state', 'business_zipcode']] = business_state, business_zipcode

REVIEWS = REVIEWS[REVIEWS['business_state'].isin(STATES_LIST)]

  0%|          | 0/3901 [00:00<?, ?it/s]

Adding business state and zip codes...


100%|██████████| 3901/3901 [02:24<00:00, 27.03it/s]


In [60]:
STATES_LIST = list(REVIEWS['business_state'].unique())
list(REVIEWS['business_state'].unique())

['MA', 'FL', 'CO', 'WA', 'GA', 'TX', 'OR', 'OH', 'KS']

In [61]:
REVIEWS.sample(15)

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode
14356,Ips8AB_O9jEINRFh6ayxkQ,QtBNUljlZOPd13kVN0Souw,F6b_O4-wtd3956YOvAoerg,2015-07-28 18:36:04,5,0,TX,78702
111841,BNYWgIhg4JMpy6BTKQDODQ,mf5wrQTAG3G4KGcqiMsdNQ,rXxC_ZH58EItO3m5PASzEA,2016-05-23 17:02:19,5,0,MA,2118
67906,i_1mvUU_H280tKuAEg0ujg,A4fCX866CHbL0ysZxDM4Lw,xRvbrDDE32cBN8ehgv-UGQ,2016-07-27 03:12:04,4,0,OR,97229
23260,6B59KkHANofkD9REVTVxsg,-4UbwSOzbZSt2geQoyHvkA,lW5tYUBXmLPD7w6gcVojNw,2017-05-14 16:21:07,1,0,MA,2474
91312,7Ylg1Fn8Ti0C01jZ7TQqug,G-CvKh82PJ7S9vAW_WBC-g,Ln-8CbKGZGmF-GCqMoMcpA,2015-03-22 20:02:14,5,1,GA,30318
14189,j-3X4wXl18erUDX17Z1inQ,blSjTheMk-QrX3ze6PmZJw,RabRnRZeCMu_uGTnAUUTaA,2016-12-19 16:39:37,4,1,TX,78757
56321,JrymWYPzxrm0w8xrrEG7iQ,8cvp_IjFGoGPq5RU51KRAg,6H313b3O6gTULpR7fV2FsQ,2011-08-27 04:08:10,3,2,MA,2110
130443,4_aX_KeNEipTiuBeQjzHIA,LtdgPLafXSCUclRWZzYp2Q,J1uidHIL7nE_noUuvFXj0A,2013-11-04 01:26:53,4,0,MA,2114
132766,RHBhHK466e-o9RbS1WZhag,37jfN0-f64TPcnPVRsJTKw,R8fLQ6TLz06MQR69KQJ83g,2012-03-11 15:56:04,3,0,WA,98661
3052,mT69vSVaqCrJlvI-00QSIw,IIhkjj0WZIdAMa0uyHMLgA,xdpH27x6qGSG21LLa6TaXQ,2011-03-20 13:23:58,5,0,MA,2043


In [62]:
# assign business political ideologies by zip code
BUSINESS_ZIPS = list(REVIEWS['business_zipcode'].unique())
print('Adding business zipcode ideology')
for business_zip in tqdm(BUSINESS_ZIPS):
    try:
        business_ideology = match_loc_to_ideology(business_zip)
        REVIEWS.loc[REVIEWS['business_zipcode'] == business_zip, 'business_county_ideology'] = business_ideology
    except:
        print(business_zip)

  1%|▏         | 6/432 [00:00<00:08, 52.16it/s]

Adding business zipcode ideology


 90%|████████▉ | 387/432 [00:07<00:00, 54.28it/s]

00000


100%|██████████| 432/432 [00:08<00:00, 52.07it/s]


In [63]:
len(USERS_LIST)

137515

In [64]:
STATES_ABBR = dict()

for state in list(STATES_DF['state']):
    STATES_ABBR[state] = COUNTY_DF[COUNTY_DF['state'] == state]['state_po'].unique()[0]

print(STATES_ABBR)

{'ALABAMA': 'AL', 'ALASKA': 'AK', 'ARIZONA': 'AZ', 'ARKANSAS': 'AR', 'CALIFORNIA': 'CA', 'COLORADO': 'CO', 'CONNECTICUT': 'CT', 'DELAWARE': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'GEORGIA': 'GA', 'HAWAII': 'HI', 'IDAHO': 'ID', 'ILLINOIS': 'IL', 'INDIANA': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KENTUCKY': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA', 'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 'MISSOURI': 'MO', 'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN', 'TEXAS': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY'}


In [65]:
STATES_ABBR_REVERSE = {v:k for k, v in STATES_ABBR.items()}
print(STATES_ABBR_REVERSE)

{'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS', 'CA': 'CALIFORNIA', 'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'DC': 'DISTRICT OF COLUMBIA', 'FL': 'FLORIDA', 'GA': 'GEORGIA', 'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS', 'IN': 'INDIANA', 'IA': 'IOWA', 'KS': 'KANSAS', 'KY': 'KENTUCKY', 'LA': 'LOUISIANA', 'ME': 'MAINE', 'MD': 'MARYLAND', 'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN', 'MN': 'MINNESOTA', 'MS': 'MISSISSIPPI', 'MO': 'MISSOURI', 'MT': 'MONTANA', 'NE': 'NEBRASKA', 'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY', 'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA', 'ND': 'NORTH DAKOTA', 'OH': 'OHIO', 'OK': 'OKLAHOMA', 'OR': 'OREGON', 'PA': 'PENNSYLVANIA', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA', 'SD': 'SOUTH DAKOTA', 'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH', 'VT': 'VERMONT', 'VA': 'VIRGINIA', 'WA': 'WASHINGTON', 'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING'}


In [66]:
print('Processing business state ideology and state pvi')
for state in tqdm(list(STATES_ABBR.keys())):
    try:
        state_ideology = STATES_DF[STATES_DF['state'] == state]['perc_diffs'].iloc[0]

        if state == 'DISTRICT OF COLUMBIA':
            state_pvi = 0
        else:
            state_pvi = CPVI[CPVI['state'] == state]['pvi'].iloc[0]

        REVIEWS.loc[REVIEWS['business_state'] == STATES_ABBR[state],
                    ['business_state_ideology', 'business_state_pvi']] = state_ideology, state_pvi
    except:
        print(state)
        print('bad output')

 18%|█▊        | 9/51 [00:00<00:00, 83.88it/s]

Processing business state ideology and state pvi


100%|██████████| 51/51 [00:00<00:00, 85.83it/s]


In [67]:
print('Adding total reviews and average stars per business')
REVIEWS['count'] = 1
BUSINESS_COUNTS = REVIEWS.groupby('business_id').sum().reset_index()[['business_id', 'count']]
BUSINESS_STARS = REVIEWS[['business_id', 'stars']].groupby('business_id').sum().reset_index()
BUSINESSES_LIST = list(BUSINESS_COUNTS['business_id'].unique())

for business in tqdm(BUSINESSES_LIST):
    tot_reviews = BUSINESS_COUNTS[BUSINESS_COUNTS['business_id'] == business]['count'].iloc[0]
    avg_stars = round((BUSINESS_STARS[BUSINESS_STARS['business_id'] == business]['stars'].iloc[0] / tot_reviews), 3)
    REVIEWS.loc[REVIEWS['business_id'] == business, ['business_review_total', 'avg_star_rating']] = tot_reviews, avg_stars


  0%|          | 0/3901 [00:00<?, ?it/s]

Adding total reviews and average stars per business


100%|██████████| 3901/3901 [01:24<00:00, 46.30it/s]


In [None]:
# add state pop density
POP_DENSITY = POP_DENSITY[POP_DENSITY['Year'] >= 2000]
POP_DENSITY_COLS = ['Name', 'Year', 'Resident Population', 'Geography Type', 'Resident Population Density']
POP_DENSITY = POP_DEN1SITY[POP_DENSITY_COLS]
POP_DENSITY = POP_DENSITY[POP_DENSITY['Geography Type'] == 'State']
POP_DENSITY.drop(columns=['Geography Type'], inplace=True)
POP_DENSITY_RENAMED_COLS = ['state', 'year', 'pop', 'pop_density']
POP_DENSITY.columns = POP_DENSITY_RENAMED_COLS
POP_DENSITY['state'] = POP_DENSITY['state'].str.upper()
POP_DENSITY = POP_DENSITY[POP_DENSITY['state'].isin(list(STATES_ABBR.keys()))]


In [69]:
POP_DENSITY.head()

Unnamed: 0,state,year,pop,pop_density
513,ALABAMA,2000,4447100,87.8
514,ALASKA,2000,626932,1.1
515,ARIZONA,2000,5130632,45.2
516,ARKANSAS,2000,2673400,51.4
517,CALIFORNIA,2000,33871648,217.4


In [70]:
POP_DENSITY['year'].unique()

array([2000, 2010, 2020])

In [71]:
# split date into separate columns
REVIEWS['date'] = pd.to_datetime(REVIEWS['date'])
REVIEWS['year'] = REVIEWS['date'].dt.year
REVIEWS['month'] = REVIEWS['date'].dt.month
REVIEWS['day'] = REVIEWS['date'].dt.day

In [72]:
sorted(list(REVIEWS['year'].unique()))

[2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018]

In [73]:
# Add population density measures
print("Adding population density...")

Adding population density...


In [74]:
def add_pop_density_backwards(decade):
    upper_year_limit = decade+10
    lower_year_limit = decade

    #pop_slice = POP_DENSITY[POP_DENSITY['year'] == decade]

    for state in STATES_LIST:
        state_pop_density = POP_DENSITY[
            (POP_DENSITY['year'] == decade) &
            (POP_DENSITY['state'] == STATES_ABBR_REVERSE[state])
        ]['pop_density'].iloc[0]

        REVIEWS.loc[
            (REVIEWS['year'].between(lower_year_limit, upper_year_limit, inclusive='right')) &
            (REVIEWS['business_state'] == state),
            ['population_density_backwards']
        ] = state_pop_density

        REVIEWS.loc[
            (REVIEWS['year'] > 2020), ['population_density_backwards']
        ] = state_pop_density

In [75]:
def add_pop_density_forwards(decade):
    upper_year_limit = decade
    lower_year_limit = decade-10

    #pop_slice = POP_DENSITY[POP_DENSITY['year'] == decade]

    for state in STATES_LIST:
        state_pop_density = POP_DENSITY[
            (POP_DENSITY['year'] == decade) &
            (POP_DENSITY['state'] == STATES_ABBR_REVERSE[state])
        ]['pop_density'].iloc[0]

        REVIEWS.loc[
            (REVIEWS['year'].between(lower_year_limit, upper_year_limit, inclusive='right')) &
            (REVIEWS['business_state'] == state),
            ['population_density_forwards']
        ] = state_pop_density

In [83]:
add_pop_density_backwards(2000)
add_pop_density_backwards(2010)
add_pop_density_backwards(2020)

In [84]:
add_pop_density_forwards(2000)
add_pop_density_forwards(2010)
add_pop_density_forwards(2020)

In [None]:
# add income and elderly to reviews

In [None]:
# remove user_id, business_id
# reset index

In [86]:
REVIEWS

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day,population_density_backwards,population_density_forwards
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82,-33.21,-14.0,44.0,3.614,2014,10,11,839.4,901.2
1,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12,3.36,3.0,121.0,3.446,2010,1,8,350.6,401.4
2,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57,-13.50,-3.0,472.0,4.282,2011,7,28,48.5,55.7
3,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,2018-01-21 04:41:03,1,2,MA,02144,-44.89,-33.21,-14.0,248.0,3.472,2018,1,21,839.4,901.2
4,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2006-04-16 02:58:44,2,0,MA,02215,-62.84,-33.21,-14.0,11.0,3.000,2006,4,16,814,839.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190288,_X0ZvwmsQ_CRpGeCFFc_-Q,FwGqxJ_Z5BTKamqxWfD8LQ,eZE0FdYB9rUvbuPS4fNyYA,2015-01-24 23:12:37,2,5,TX,78701,-45.11,5.58,5.0,252.0,4.266,2015,1,24,96.3,111.6
190289,F6fenNd1vkH3ckuXFPRB6g,u0bwW1sf97hhNzTvkagEsA,enSXnvEKjDCit9A2_vyugg,2011-10-09 20:24:16,4,0,FL,32771,-2.79,3.36,3.0,29.0,2.966,2011,10,9,350.6,401.4
190290,O0SG6BTN0FHSZ7PJ1wgmzg,mVxg_vBXcUAAQJ03uHxiIQ,La_xmePMtWwjxKUqbGvqGQ,2010-05-30 11:06:52,2,0,TX,78703,-45.11,5.58,5.0,108.0,3.944,2010,5,30,96.3,111.6
190291,vTovPnBesDco238QpKtAvA,uVpNSrc3IEpR42VGdZ709w,4jOreXu9ctLBWOJWMtNv1w,2017-01-07 05:55:39,4,0,WA,98665,-5.09,-19.20,-8.0,18.0,3.111,2017,1,7,101.2,115.9


In [85]:
# check REVIEWS for na in pop density cols
REVIEWS[REVIEWS['population_density_backwards'].isna()]

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day,population_density_backwards,population_density_forwards


In [87]:
REVIEWS[REVIEWS['population_density_forwards'].isna()]

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day,population_density_backwards,population_density_forwards


In [79]:
REVIEWS.drop(columns=['count'], inplace=True)
REVIEWS

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day,population_density_backwards,population_density_forwards
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82,-33.21,-14.0,44.0,3.614,2014,10,11,839.4,901.2
1,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12,3.36,3.0,121.0,3.446,2010,1,8,350.6,401.4
2,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57,-13.50,-3.0,472.0,4.282,2011,7,28,48.5,55.7
3,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,2018-01-21 04:41:03,1,2,MA,02144,-44.89,-33.21,-14.0,248.0,3.472,2018,1,21,839.4,901.2
4,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2006-04-16 02:58:44,2,0,MA,02215,-62.84,-33.21,-14.0,11.0,3.000,2006,4,16,,839.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190288,_X0ZvwmsQ_CRpGeCFFc_-Q,FwGqxJ_Z5BTKamqxWfD8LQ,eZE0FdYB9rUvbuPS4fNyYA,2015-01-24 23:12:37,2,5,TX,78701,-45.11,5.58,5.0,252.0,4.266,2015,1,24,96.3,111.6
190289,F6fenNd1vkH3ckuXFPRB6g,u0bwW1sf97hhNzTvkagEsA,enSXnvEKjDCit9A2_vyugg,2011-10-09 20:24:16,4,0,FL,32771,-2.79,3.36,3.0,29.0,2.966,2011,10,9,350.6,401.4
190290,O0SG6BTN0FHSZ7PJ1wgmzg,mVxg_vBXcUAAQJ03uHxiIQ,La_xmePMtWwjxKUqbGvqGQ,2010-05-30 11:06:52,2,0,TX,78703,-45.11,5.58,5.0,108.0,3.944,2010,5,30,96.3,111.6
190291,vTovPnBesDco238QpKtAvA,uVpNSrc3IEpR42VGdZ709w,4jOreXu9ctLBWOJWMtNv1w,2017-01-07 05:55:39,4,0,WA,98665,-5.09,-19.20,-8.0,18.0,3.111,2017,1,7,101.2,115.9


In [80]:
REVIEWS.to_csv('data/yelp_dataset.csv')
print('File exported.')

File exported.
