In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from tqdm import tqdm

In [2]:
def fix_zipcodes(zipcode):
    if len(zipcode) < 5:
        to_fill = 5 - len(zipcode)
        return (to_fill * '0') + zipcode

    else:
        return zipcode

def fix_county_name(county_name):
    if 'county' in county_name:
        return county_name[:county_name.find('county')].rstrip()
    else:
        return county_name

def match_loc_to_ideology(zipcode):
    zipcode_state = ZIPS_DF[ZIPS_DF['zip'] == zipcode]['state'].iloc[0]
    county_name = fix_county_name(ZIPS_DF[ZIPS_DF['zip'] == zipcode]['county'].iloc[0])

    state_slice = COUNTY_DF[COUNTY_DF['state_po'] == zipcode_state]
    ideology_metric = state_slice.loc[state_slice['county_name']==county_name]['perc_diffs'].iloc[0]

    return ideology_metric

def process_cpvi(cpvi):
    if cpvi.startswith('R'):
        return int(cpvi.split('+')[1])
    elif cpvi.startswith('D'):
        cpvi_metric = int(cpvi.split('+')[1])
        return np.negative(cpvi_metric)
    else:
        return 0

In [3]:
BUSINESS_DATA_FILEPATH = 'data/yelp_academic_dataset_business.json'
REVIEWS_DATA_FILEPATH = 'data/yelp_academic_dataset_review.json'
USERS_DATA_FILEPATH = 'data/yelp_academic_dataset_user.json'
COUNTY_DATA_FILEPATH = 'data/county_data.csv'
STATES_DATA_FILEPATH = 'data/states_data.csv'
ZIPCODES_DATA_FILEPATH = 'data/zip_code_database.csv'
CPI_URL = 'https://en.wikipedia.org/wiki/Cook_Partisan_Voting_Index'
POP_DENSITY_FILEPATH = 'data/apportionment.csv'


In [4]:
COUNTY_DF = pd.read_csv(COUNTY_DATA_FILEPATH, index_col=0)
STATES_DF = pd.read_csv(STATES_DATA_FILEPATH, index_col=0)
ZIPS_DF = pd.read_csv(ZIPCODES_DATA_FILEPATH)

In [5]:
ZIPS_DF = ZIPS_DF[['zip', 'state', 'county']]
ZIPS_DF['zip'] = ZIPS_DF['zip'].astype(str).apply(fix_zipcodes)
ZIPS_DF['county'] = ZIPS_DF['county'].str.lower()
ZIPS_DF.loc[len(ZIPS_DF)] = ['02101', 'MA', 'suffolk county']
ZIPS_DF.head()

Unnamed: 0,zip,state,county
0,501,NY,suffolk county
1,544,NY,suffolk county
2,601,PR,adjuntas municipio
3,602,PR,aguada municipio
4,603,PR,aguadilla municipio


In [6]:
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].str.lower()
COUNTY_DF['county_name'] = COUNTY_DF['county_name'].apply(fix_county_name)
COUNTY_DF.sample(8)


Unnamed: 0,state,state_po,county_name,perc_diffs
69114,OKLAHOMA,OK,payne,1.78
63227,NORTH CAROLINA,NC,warren,6.93
54664,GEORGIA,GA,jones,-20.17
58026,KANSAS,KS,wichita,68.08
71539,VIRGINIA,VA,bristol city,36.92
54601,GEORGIA,GA,jasper,10.54
51932,ARKANSAS,AR,lee,25.63
56045,ILLINOIS,IL,mclean,-3.95


In [7]:
STATES_DF.head()

Unnamed: 0,state,perc_diffs
1,ALABAMA,25.46
3,ALASKA,10.11
5,ARIZONA,-0.1
7,ARKANSAS,6.91
9,CALIFORNIA,-29.16


In [8]:
STATES_LIST = list(COUNTY_DF['state_po'].unique())
print(f"Number of states: {len(STATES_LIST)}")

Number of states: 51


In [9]:
CPVI = pd.read_html(CPI_URL)[1]
CPVI.drop(CPVI.tail(1).index, inplace=True)
CPVI['pvi'] = CPVI['PVI'].apply(process_cpvi)
CPVI['state'] = CPVI['State'].str.upper()
CPVI = CPVI[['state', 'pvi']]

In [10]:
POP_DENSITY = pd.read_csv(POP_DENSITY_FILEPATH)
POP_DENSITY.head()

Unnamed: 0,Name,Geography Type,Year,Resident Population,Percent Change in Resident Population,Resident Population Density,Resident Population Density Rank,Number of Representatives,Change in Number of Representatives,Average Apportionment Population Per Representative
0,Alabama,State,1910,2138093,16.9,42.2,25.0,10.0,1.0,213809.0
1,Alaska,State,1910,64356,1.2,0.1,52.0,,,
2,Arizona,State,1910,204354,66.2,1.8,49.0,,,
3,Arkansas,State,1910,1574449,20.0,30.3,30.0,7.0,0.0,224921.0
4,California,State,1910,2377549,60.1,15.3,38.0,11.0,3.0,216051.0


In [11]:
POP_DENSITY.dtypes
# POP_DENSITY

Name                                                    object
Geography Type                                          object
Year                                                     int64
Resident Population                                     object
Percent Change in Resident Population                  float64
Resident Population Density                             object
Resident Population Density Rank                       float64
Number of Representatives                              float64
Change in Number of Representatives                    float64
Average Apportionment Population Per Representative     object
dtype: object

In [12]:
USER_COLS = ['user_id', 'review_count']
BUSINESS_COLS = ['business_id', 'state', 'city', 'postal_code', 'categories', 'stars', 'review_count']
REVIEW_COLS = ['review_id', 'user_id', 'business_id', 'date', 'stars', 'useful']
ZIPCODES_COLS = ['zip', 'county']

USER_DTYPES = {
    'user_id': np.str,
    'review_count': np.int
}

BUSINESS_DTYPES = {
    'business_id': np.str,
    'state': np.str,
    'city': np.str,
    'postal_code': np.str,
    'categories': np.str,
    'review_count': np.int,
    'stars': np.float,
}

REVIEW_DTYPES = {
    'review_id': np.str,
    'user_id': np.str,
    'business_id': np.str,
    'stars': np.int,
    'useful': np.int
}

ZIPS_DTYPES = {
    'zip': np.str,
    'county': np.str
}

In [13]:
BUSINESSES = list()

print('Loading businesses...')

with open(BUSINESS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, chunksize=1000, dtype=BUSINESS_DTYPES)

    for chunk in tqdm(reader):
        reduced_chunk = chunk[BUSINESS_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['state'].isin(STATES_LIST)]
        reduced_chunk['postal_code'] = reduced_chunk['postal_code'].apply(fix_zipcodes)
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].notnull()]
        reduced_chunk = reduced_chunk[reduced_chunk['categories'].str.contains('Restaurants')]
        BUSINESSES.append(reduced_chunk)

    BUSINESSES = pd.concat(BUSINESSES, ignore_index=True)

Loading businesses...


161it [00:08, 19.81it/s]


In [14]:
BUSINESS_LIST = list(BUSINESSES['business_id'].unique())

In [15]:
# import dask.dataframe as dd
# print('reviews loading')
# reviews_df = dd.read_json('data/yelp_academic_dataset_review.json', orient='records', lines=True)
# reviews_df = reviews_df[REVIEW_COLS]
# REVIEWS = reviews_df[reviews_df['business_id'].isin(BUSINESS_LIST)].compute()
# reviews_df = None
# print('reviews loaded')

In [16]:
import sys
print(sys.maxsize)

9223372036854775807


In [17]:
#Load 300,000 reviews from reviews
REVIEWS = list()
print('Loading reviews...')
with open(REVIEWS_DATA_FILEPATH, 'r') as f:
    reader = pd.read_json(f, orient='records', lines=True, chunksize=100, nrows=8635403, dtype=REVIEW_DTYPES)
    #reader = dd.read_json('data/yelp_academic_dataset_review.json', orient='records', lines=True)
    for chunk in tqdm(reader):
        reduced_chunk = chunk[REVIEW_COLS]
        reduced_chunk = reduced_chunk[reduced_chunk['business_id'].isin(BUSINESS_LIST)]
        # Only keep US
        REVIEWS.append(reduced_chunk)
    REVIEWS = pd.concat(REVIEWS, ignore_index=True)
#REVIEWS = REVIEWS.compute()
print('Reviews loaded')

3it [00:00, 27.14it/s]

Loading reviews...


86355it [14:51, 96.85it/s] 


Reviews loaded


In [18]:
BUSINESS_LIST = list(REVIEWS['business_id'].unique()) # rename to get effective list
print(f"Number of businssess: {len(BUSINESS_LIST)}")
USERS_LIST = list(REVIEWS['user_id'].unique())


Number of businssess: 43256


In [19]:
# add business zip code in first
# add business state
print('Adding business state and zip codes...')
for business_id in tqdm(BUSINESS_LIST):
    business_zipcode = BUSINESSES[BUSINESSES['business_id'] == business_id]['postal_code'].iloc[0]
    business_state = BUSINESSES[BUSINESSES['business_id'] == business_id]['state'].iloc[0]

    REVIEWS.loc[REVIEWS['business_id'] == business_id, ['business_state', 'business_zipcode']] = business_state, business_zipcode

REVIEWS = REVIEWS[REVIEWS['business_state'].isin(STATES_LIST)]

  0%|          | 0/43256 [00:00<?, ?it/s]

Adding business state and zip codes...


100%|██████████| 43256/43256 [8:00:22<00:00,  1.50it/s]  


In [20]:
STATES_LIST = list(REVIEWS['business_state'].unique())
list(REVIEWS['business_state'].unique())

['MA',
 'FL',
 'CO',
 'WA',
 'GA',
 'TX',
 'OR',
 'OH',
 'KS',
 'WY',
 'MN',
 'VA',
 'KY',
 'NH']

In [21]:
REVIEWS.sample(15)

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode
968516,y6a6ALnowZ7e6zE2rby4CQ,NGBAz-5GCWAv6Em7LtrBeQ,1gat4FrYE9TX3d6ZqOROTA,2017-07-25 13:12:05,3,4,FL,32953
2322656,zjDwUJysQkjIPZmzYdWnkw,se70GX-FKRpWXAlYxLwXqA,HhX8Gv5riCptJXMa2Mn1RA,2018-01-21 01:58:32,5,0,MA,2127
1988075,L42HsdHArmB5JyVVXp-NcA,6V01soR8WqU9CgUy1O65Tw,QW3anlt1nJ9aNkdQAV-Ifw,2019-01-24 23:02:31,5,0,GA,30313
3049253,mM1yZBakcLnkcjiCpI3gLg,L2TWgyCzI5-UtoAXrc3UxA,NGB3HCZYRr3Yu-KhgkqGUw,2019-12-29 23:08:41,4,1,OR,97230
57872,lg9PmJ_m_1ebu8UgnHYzSg,YTIYhAQtfGOhIBiMWzXbjA,QwxkPvSZtyIb0hLBwhBzAg,2017-10-26 05:57:41,1,0,OR,97229
2115542,sxuKHEiuzhmNjZCNAhtAmg,uQ48NhIwO8ogz46m8LsQYA,OUTYLhTgXcZFBeRw3WSuFA,2015-12-08 15:42:48,3,1,OH,43229
5020012,CUvLrAOHsIfm38vTG2txxw,rJtjiBHJmpPrqtidwwTbfQ,lIwaafufo1VMDyhh0YUqLQ,2019-09-03 18:52:58,3,0,FL,32828
788591,Eon69hTHzz01maDCYeTXzw,kwauCaApMbrW4IeliIGqFQ,BOXmNc9VykHjj4bYZ4z2hQ,2010-11-15 04:33:06,5,6,TX,78704
4205898,BgCn4Qs_F3l_vb5klXZSLg,U1VoGIrts29wRyAoIGJ7AQ,1qwxzGRcU1y3tJrsoYQ4Rw,2016-08-07 20:26:57,4,0,MA,2116
2909279,UhzRHth7HYmzzmXIhzj-zQ,nkExS83hy0kk3lMmow6WSw,mAW0poOKFdoaBFXAE-xjJA,2008-04-15 00:47:12,5,3,GA,30305


In [22]:
# assign business political ideologies by zip code
BUSINESS_ZIPS = list(REVIEWS['business_zipcode'].unique())
print('Adding business zipcode ideology')
for business_zip in tqdm(BUSINESS_ZIPS):
    try:
        business_ideology = match_loc_to_ideology(business_zip)
        REVIEWS.loc[REVIEWS['business_zipcode'] == business_zip, 'business_county_ideology'] = business_ideology
    except:
        print(business_zip)

  0%|          | 0/604 [00:00<?, ?it/s]

Adding business zipcode ideology


 63%|██████▎   | 380/604 [01:28<00:45,  4.98it/s]

00000


 73%|███████▎  | 438/604 [01:42<00:33,  5.02it/s]

78792


 75%|███████▌  | 454/604 [01:46<00:35,  4.24it/s]

75749


 77%|███████▋  | 464/604 [01:48<00:27,  5.05it/s]

31132


 83%|████████▎ | 500/604 [01:56<00:20,  5.00it/s]

80604


 84%|████████▍ | 510/604 [01:58<00:22,  4.25it/s]

03017


 87%|████████▋ | 527/604 [02:02<00:15,  4.99it/s]

00961


 88%|████████▊ | 532/604 [02:03<00:16,  4.32it/s]

02154


 91%|█████████ | 550/604 [02:07<00:10,  4.98it/s]

78707
32866


 97%|█████████▋| 583/604 [02:14<00:04,  5.04it/s]

32787


 97%|█████████▋| 586/604 [02:15<00:03,  4.52it/s]

02413


 97%|█████████▋| 588/604 [02:15<00:03,  5.25it/s]

02010
01710


 99%|█████████▉| 599/604 [02:17<00:00,  5.20it/s]

02011


100%|██████████| 604/604 [02:18<00:00,  4.36it/s]


In [23]:
len(USERS_LIST)

1461366

In [24]:
STATES_ABBR = dict()

for state in list(STATES_DF['state']):
    STATES_ABBR[state] = COUNTY_DF[COUNTY_DF['state'] == state]['state_po'].unique()[0]

print(STATES_ABBR)

{'ALABAMA': 'AL', 'ALASKA': 'AK', 'ARIZONA': 'AZ', 'ARKANSAS': 'AR', 'CALIFORNIA': 'CA', 'COLORADO': 'CO', 'CONNECTICUT': 'CT', 'DELAWARE': 'DE', 'DISTRICT OF COLUMBIA': 'DC', 'FLORIDA': 'FL', 'GEORGIA': 'GA', 'HAWAII': 'HI', 'IDAHO': 'ID', 'ILLINOIS': 'IL', 'INDIANA': 'IN', 'IOWA': 'IA', 'KANSAS': 'KS', 'KENTUCKY': 'KY', 'LOUISIANA': 'LA', 'MAINE': 'ME', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA', 'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 'MISSOURI': 'MO', 'MONTANA': 'MT', 'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW HAMPSHIRE': 'NH', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM', 'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'NORTH DAKOTA': 'ND', 'OHIO': 'OH', 'OKLAHOMA': 'OK', 'OREGON': 'OR', 'PENNSYLVANIA': 'PA', 'RHODE ISLAND': 'RI', 'SOUTH CAROLINA': 'SC', 'SOUTH DAKOTA': 'SD', 'TENNESSEE': 'TN', 'TEXAS': 'TX', 'UTAH': 'UT', 'VERMONT': 'VT', 'VIRGINIA': 'VA', 'WASHINGTON': 'WA', 'WEST VIRGINIA': 'WV', 'WISCONSIN': 'WI', 'WYOMING': 'WY'}


In [25]:
STATES_ABBR_REVERSE = {v:k for k, v in STATES_ABBR.items()}
print(STATES_ABBR_REVERSE)

{'AL': 'ALABAMA', 'AK': 'ALASKA', 'AZ': 'ARIZONA', 'AR': 'ARKANSAS', 'CA': 'CALIFORNIA', 'CO': 'COLORADO', 'CT': 'CONNECTICUT', 'DE': 'DELAWARE', 'DC': 'DISTRICT OF COLUMBIA', 'FL': 'FLORIDA', 'GA': 'GEORGIA', 'HI': 'HAWAII', 'ID': 'IDAHO', 'IL': 'ILLINOIS', 'IN': 'INDIANA', 'IA': 'IOWA', 'KS': 'KANSAS', 'KY': 'KENTUCKY', 'LA': 'LOUISIANA', 'ME': 'MAINE', 'MD': 'MARYLAND', 'MA': 'MASSACHUSETTS', 'MI': 'MICHIGAN', 'MN': 'MINNESOTA', 'MS': 'MISSISSIPPI', 'MO': 'MISSOURI', 'MT': 'MONTANA', 'NE': 'NEBRASKA', 'NV': 'NEVADA', 'NH': 'NEW HAMPSHIRE', 'NJ': 'NEW JERSEY', 'NM': 'NEW MEXICO', 'NY': 'NEW YORK', 'NC': 'NORTH CAROLINA', 'ND': 'NORTH DAKOTA', 'OH': 'OHIO', 'OK': 'OKLAHOMA', 'OR': 'OREGON', 'PA': 'PENNSYLVANIA', 'RI': 'RHODE ISLAND', 'SC': 'SOUTH CAROLINA', 'SD': 'SOUTH DAKOTA', 'TN': 'TENNESSEE', 'TX': 'TEXAS', 'UT': 'UTAH', 'VT': 'VERMONT', 'VA': 'VIRGINIA', 'WA': 'WASHINGTON', 'WV': 'WEST VIRGINIA', 'WI': 'WISCONSIN', 'WY': 'WYOMING'}


In [26]:
print('Processing business state ideology and state pvi')
for state in tqdm(list(STATES_ABBR.keys())):
    try:
        state_ideology = STATES_DF[STATES_DF['state'] == state]['perc_diffs'].iloc[0]

        if state == 'DISTRICT OF COLUMBIA':
            state_pvi = 0
        else:
            state_pvi = CPVI[CPVI['state'] == state]['pvi'].iloc[0]

        REVIEWS.loc[REVIEWS['business_state'] == STATES_ABBR[state],
                    ['business_state_ideology', 'business_state_pvi']] = state_ideology, state_pvi
    except:
        print(state)
        print('bad output')

  0%|          | 0/51 [00:00<?, ?it/s]

Processing business state ideology and state pvi


100%|██████████| 51/51 [00:12<00:00,  4.12it/s]


In [27]:
print('Adding total reviews and average stars per business')
REVIEWS['count'] = 1
BUSINESS_COUNTS = REVIEWS.groupby('business_id').sum().reset_index()[['business_id', 'count']]
BUSINESS_STARS = REVIEWS[['business_id', 'stars']].groupby('business_id').sum().reset_index()
BUSINESSES_LIST = list(BUSINESS_COUNTS['business_id'].unique())

for business in tqdm(BUSINESSES_LIST):
    tot_reviews = BUSINESS_COUNTS[BUSINESS_COUNTS['business_id'] == business]['count'].iloc[0]
    avg_stars = round((BUSINESS_STARS[BUSINESS_STARS['business_id'] == business]['stars'].iloc[0] / tot_reviews), 3)
    REVIEWS.loc[REVIEWS['business_id'] == business, ['business_review_total', 'avg_star_rating']] = tot_reviews, avg_stars


Adding total reviews and average stars per business


100%|██████████| 43256/43256 [5:47:57<00:00,  2.07it/s]  


In [28]:
# add state pop density
POP_DENSITY = POP_DENSITY[POP_DENSITY['Year'] >= 2000]
POP_DENSITY_COLS = ['Name', 'Year', 'Resident Population', 'Geography Type', 'Resident Population Density']
POP_DENSITY = POP_DENSITY[POP_DENSITY_COLS]
POP_DENSITY = POP_DENSITY[POP_DENSITY['Geography Type'] == 'State']
POP_DENSITY.drop(columns=['Geography Type'], inplace=True)
POP_DENSITY_RENAMED_COLS = ['state', 'year', 'pop', 'pop_density']
POP_DENSITY.columns = POP_DENSITY_RENAMED_COLS
POP_DENSITY['state'] = POP_DENSITY['state'].str.upper()
POP_DENSITY = POP_DENSITY[POP_DENSITY['state'].isin(list(STATES_ABBR.keys()))]


In [29]:
POP_DENSITY.head()

Unnamed: 0,state,year,pop,pop_density
513,ALABAMA,2000,4447100,87.8
514,ALASKA,2000,626932,1.1
515,ARIZONA,2000,5130632,45.2
516,ARKANSAS,2000,2673400,51.4
517,CALIFORNIA,2000,33871648,217.4


In [30]:
POP_DENSITY['year'].unique()

array([2000, 2010, 2020])

In [31]:
# split date into separate columns
REVIEWS['date'] = pd.to_datetime(REVIEWS['date'])
REVIEWS['year'] = REVIEWS['date'].dt.year
REVIEWS['month'] = REVIEWS['date'].dt.month
REVIEWS['day'] = REVIEWS['date'].dt.day

In [32]:
sorted(list(REVIEWS['year'].unique()))

[2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021]

In [33]:
# Add population density measures
print("Adding population density...")

Adding population density...


In [34]:
def add_pop_density(decade):
    upper_year_limit = decade
    lower_year_limit = decade-10

    #pop_slice = POP_DENSITY[POP_DENSITY['year'] == decade]

    for state in STATES_LIST:
        state_pop_density = POP_DENSITY[
            (POP_DENSITY['year'] == decade) &
            (POP_DENSITY['state'] == STATES_ABBR_REVERSE[state])
        ]['pop_density'].iloc[0]

        REVIEWS.loc[
            (REVIEWS['year'].between(lower_year_limit, upper_year_limit, inclusive='right')) &
            (REVIEWS['business_state'] == state),
            ['population_density']
        ] = state_pop_density


In [35]:
add_pop_density(2010)
add_pop_density(2020)

In [36]:
REVIEWS


Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,count,business_review_total,avg_star_rating,year,month,day,population_density
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82,-33.21,-14.0,1,84.0,3.655,2014,10,11,901.2
1,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12,3.36,3.0,1,297.0,3.562,2010,1,8,401.4
2,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57,-13.50,-3.0,1,1009.0,4.280,2011,7,28,55.7
3,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,2018-01-21 04:41:03,1,2,MA,02144,-44.89,-33.21,-14.0,1,512.0,3.371,2018,1,21,901.2
4,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2006-04-16 02:58:44,2,0,MA,02215,-62.84,-33.21,-14.0,1,14.0,2.714,2006,4,16,839.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5147986,1LrBZbLNfkBsFrHwEBlfSg,fYkURme6Piqxu4qUjQV3PQ,gEQxTJDoJYaW0l_6FYtf8g,2020-12-05 21:32:45,5,1,MA,02478,-44.89,-33.21,-14.0,1,88.0,2.989,2020,12,5,901.2
5147987,F5eAqFzDEyU6q5bBuCjglg,7Ey9Xc2hC4obn7spjp1mHw,PrTS_wxKNxf0kRTOcboeIw,2020-10-23 23:28:09,1,1,MA,02180,-44.89,-33.21,-14.0,1,154.0,3.442,2020,10,23,901.2
5147988,FfhmA0G0zrRjHskp-7O8UQ,IlxM3NGJOtNXPz5cupqNDQ,dmkDZKPsK8lmwFuLiFQ0Zw,2021-01-25 14:53:13,5,0,OR,97219,-61.31,-16.08,-6.0,1,25.0,4.720,2021,1,25,
5147989,7vNXRIClt-9rFzMXlrtMXA,tr13Jb83h2itjyXVwaO5eA,rbuj2X4SXIc3MDul4dcxIA,2020-06-13 02:39:26,5,39,OR,97266,-61.31,-16.08,-6.0,1,70.0,4.000,2020,6,13,44.1


In [37]:
REVIEWS.drop(columns=['count'], inplace=True)
REVIEWS

Unnamed: 0,review_id,user_id,business_id,date,stars,useful,business_state,business_zipcode,business_county_ideology,business_state_ideology,business_state_pvi,business_review_total,avg_star_rating,year,month,day,population_density
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,2014-10-11 03:34:02,4,3,MA,01915,-28.82,-33.21,-14.0,84.0,3.655,2014,10,11,901.2
1,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2010-01-08 02:29:15,2,1,FL,32821,-23.12,3.36,3.0,297.0,3.562,2010,1,8,401.4
2,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,2011-07-28 18:05:01,4,0,CO,80302,-56.57,-13.50,-3.0,1009.0,4.280,2011,7,28,55.7
3,J4a2TuhDasjn2k3wWtHZnQ,RNm_RWkcd02Li2mKPRe7Eg,xGXzsc-hzam-VArK6eTvtw,2018-01-21 04:41:03,1,2,MA,02144,-44.89,-33.21,-14.0,512.0,3.371,2018,1,21,901.2
4,28gGfkLs3igtjVy61lh77Q,Q8c91v7luItVB0cMFF_mRA,EXOsmAB1s71WePlQk0WZrA,2006-04-16 02:58:44,2,0,MA,02215,-62.84,-33.21,-14.0,14.0,2.714,2006,4,16,839.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5147986,1LrBZbLNfkBsFrHwEBlfSg,fYkURme6Piqxu4qUjQV3PQ,gEQxTJDoJYaW0l_6FYtf8g,2020-12-05 21:32:45,5,1,MA,02478,-44.89,-33.21,-14.0,88.0,2.989,2020,12,5,901.2
5147987,F5eAqFzDEyU6q5bBuCjglg,7Ey9Xc2hC4obn7spjp1mHw,PrTS_wxKNxf0kRTOcboeIw,2020-10-23 23:28:09,1,1,MA,02180,-44.89,-33.21,-14.0,154.0,3.442,2020,10,23,901.2
5147988,FfhmA0G0zrRjHskp-7O8UQ,IlxM3NGJOtNXPz5cupqNDQ,dmkDZKPsK8lmwFuLiFQ0Zw,2021-01-25 14:53:13,5,0,OR,97219,-61.31,-16.08,-6.0,25.0,4.720,2021,1,25,
5147989,7vNXRIClt-9rFzMXlrtMXA,tr13Jb83h2itjyXVwaO5eA,rbuj2X4SXIc3MDul4dcxIA,2020-06-13 02:39:26,5,39,OR,97266,-61.31,-16.08,-6.0,70.0,4.000,2020,6,13,44.1


In [38]:
REVIEWS.to_csv('data/yelp_dataset.csv')
print('File exported.')

File exported.
