# Popularity Metric Analysis - #1

In [1]:
import json
import pandas as pd
import numpy as np
import math
import datetime

## Reading the Data

In [2]:
business_df = pd.read_json('../yelp/yelp_academic_dataset_business.json', lines=True)

In [3]:
business_df.head(3)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,921 Pearl St,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",6iYb2HFDywm3zjuRg0shjw,"Gastropubs, Food, Beer Gardens, Restaurants, B...",Boulder,"{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1,40.017544,-105.283348,Oskar Blues Taproom,80302,86,4.0,CO
1,7000 NE Airport Way,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",tCbdrRPZA0oiIYSmHG3J0w,"Salad, Soup, Sandwiches, Delis, Restaurants, C...",Portland,"{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",1,45.588906,-122.593331,Flying Elephants at PDX,97218,126,4.0,OR
2,4720 Hawthorne Ave,"{'BusinessAcceptsCreditCards': 'True', 'Restau...",bvN78flM8NLprQ1a1y5dRg,"Antiques, Fashion, Used, Vintage & Consignment...",Portland,"{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0...",1,45.511907,-122.613693,The Reclaimory,97214,13,4.5,OR


In [4]:
checkins_df = pd.read_json('../yelp/yelp_academic_dataset_checkin.json', lines=True)

In [5]:
checkins_df.head(3)

Unnamed: 0,business_id,date
0,--0r8K_AQ4FZfLsX3ZYRDA,2017-09-03 17:13:59
1,--0zrn43LEaB4jUWTQH_Bg,"2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010..."
2,--164t1nclzzmca7eDiJMw,"2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010..."


In [6]:
#reviews_df = pd.read_json('../yelp/yelp_academic_dataset_review.json', lines=True)

In [7]:
#reviews_df.head(3)

In [8]:
tips_df = pd.read_json('../yelp/yelp_academic_dataset_tip.json', lines=True)

In [9]:
tips_df.head(3)

Unnamed: 0,business_id,compliment_count,date,text,user_id
0,ENwBByjpoa5Gg7tKgxqwLg,0,2011-07-22 19:07:35,Carne asada chips...,WCjg0jdHXMlwbqS9tZUx8Q
1,jKO4Og6ucdX2-YCTKQVYjg,0,2014-09-10 07:33:29,Best happy hour from 3pm to 6pm! $1 off martin...,42-Z02y9bABShAGZhuSzrQ
2,9Bto7mky640ocgezVKSfVg,0,2013-12-13 23:23:41,"Nice people, skilled staff, clean location - b...",5u7E3LYp_3eB8dLuUBazXQ


In [10]:
print(f"There are {len(business_df)} businesses, {len(checkins_df)} checkins " +
      f"and {len(tips_df)} tips")

There are 160585 businesses, 138876 checkins and 1162119 tips


## Processing the Data

#### Removing unnecessary features and creating restaurants dataframe

In [11]:
# Drop every business row that has a missing value in ANY column (dropna defaults).
# NOTE(review): this also discards businesses whose only gap is in a column that is
# removed a few cells later (e.g. `attributes`, `hours`) — confirm this is intended;
# otherwise restrict the check with `subset=[...]`.
business_df = business_df.dropna()

In [12]:
# Only keeping restaurants.
# `na=False` makes a missing category list count as "not a restaurant" instead of
# putting NaN in the boolean mask (which `.loc` rejects); `regex=False` states that
# "Restaurants" is a literal substring, not a pattern.
restaurants_df = business_df.loc[
    business_df["categories"].str.contains("Restaurants", na=False, regex=False)
]

In [13]:
# Remove every column the popularity analysis does not use
restaurants_df = restaurants_df.drop(
    ['address', 'postal_code', 'state', 'name', 'city',
     'attributes', 'latitude', 'longitude', 'hours', 'categories'],
    axis="columns",
)

In [14]:
restaurants_df.head(2)

Unnamed: 0,business_id,is_open,review_count,stars
0,6iYb2HFDywm3zjuRg0shjw,1,86,4.0
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0


In [15]:
print(f"There are {len(restaurants_df)} restaurants")

There are 42646 restaurants


#### Attaching checkins to restaurants (restaurants without any checkin are dropped)

In [16]:
# Inner join on business_id, then give the check-in column a descriptive name
restaurants_df = (
    restaurants_df
    .merge(checkins_df, on="business_id")
    .rename(columns={"date": "checkins_dates"})
)

In [17]:
print(len(restaurants_df))

42089


In [18]:
def get_checkins_count(row):
    """Return the number of check-ins recorded for one restaurant row.

    Parameters
    ----------
    row : pandas.Series
        A row of the restaurants frame; its ``checkins_dates`` entry is a
        comma-separated string of timestamps.

    Returns
    -------
    int
        Number of comma-separated entries, or 0 when the value is missing or blank.
    """
    # Look the column up by label: integer keys on a label-indexed Series rely on a
    # deprecated positional fallback and silently break if the column order changes.
    dates = row["checkins_dates"]
    # "".split(",") yields [""], which would wrongly count as one check-in.
    if not isinstance(dates, str) or not dates.strip():
        return 0
    return len(dates.split(","))

In [19]:
# Row-wise apply (axis=1): each row is handed to get_checkins_count and the
# returned count is stored in a new `checkins_count` column.
restaurants_df["checkins_count"] = restaurants_df.apply(get_checkins_count, axis=1)

In [20]:
restaurants_df.head(3)

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count
0,6iYb2HFDywm3zjuRg0shjw,1,86,4.0,"2017-09-10 04:48:12, 2017-09-10 04:49:28, 2017...",184
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288


#### Analyzing Tips distribution

In [21]:
def get_year(row):
    """Return the calendar year of a tip row's ``date`` entry.

    Parameters
    ----------
    row : pandas.Series
        A row of the tips frame; ``date`` is either a datetime-like object or a
        string formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns
    -------
    int
        The year of the tip.

    Raises
    ------
    ValueError
        If ``date`` is a string that does not match the expected format.
    """
    # Access by label: integer keys on a label-indexed Series use a deprecated
    # positional fallback and depend on the column order.
    value = row["date"]
    # Already-parsed values (pd.Timestamp subclasses datetime) need no string round-trip,
    # which would also fail on sub-second precision.
    if isinstance(value, datetime.datetime):
        return value.year
    return datetime.datetime.strptime(str(value), "%Y-%m-%d %H:%M:%S").year
    

# Vectorised year extraction: avoids a Python-level strptime call per row.
# pd.to_datetime leaves an already-parsed datetime column unchanged.
tips_df["year"] = pd.to_datetime(tips_df["date"]).dt.year

In [22]:
tips_df["year"].value_counts()

2012    181139
2011    149316
2014    134835
2013    133550
2017    112507
2016    109153
2015    105454
2018     77223
2019     62195
2010     60043
2020     33073
2021      2212
2009      1419
Name: year, dtype: int64

#### Adding tips to restaurants

In [23]:
print(f"There are {len(tips_df)} tips")

There are 1162119 tips


In [24]:
# Accumulator filled by group_tips: business_id (str) -> list of tip date strings.
tips = {}

In [25]:
def group_tips(row, store=None):
    """Record one tip's date under its business id.

    Parameters
    ----------
    row : pandas.Series
        A row of the tips frame; ``business_id`` and ``date`` are read by label.
    store : dict, optional
        Mapping to update in place. Defaults to the notebook-level ``tips`` dict,
        which is the original behaviour.

    Returns
    -------
    None
        The mapping is mutated: ``str(business_id) -> [str(date), ...]``.
    """
    target = tips if store is None else store
    # Label-based access instead of positional integer keys, which are deprecated on
    # label-indexed Series and depend on the column order.
    business_id = str(row["business_id"])
    date = str(row["date"])
    target.setdefault(business_id, []).append(date)

In [26]:
# Start from an empty mapping so that re-running this cell does not append every
# tip a second time (which would double all per-business counts).
tips.clear()
for _, row in tips_df.iterrows():
    group_tips(row)

In [27]:
# Spot-check: tip totals for the first ten business ids in insertion order
first_ten_ids = list(tips)[:10]
for k in first_ten_ids:
    print(f"{k}: {len(tips[k])}")

ENwBByjpoa5Gg7tKgxqwLg: 67
jKO4Og6ucdX2-YCTKQVYjg: 64
9Bto7mky640ocgezVKSfVg: 17
XWFjKtRGZ9khRGtGg2ZvaA: 97
mkrx0VhSMU3p3uhyJGCoWA: 58
VQftVUvHfMQdDTmnO0iQqg: 8
2PxZ-fICnd432NJHefXrcA: 1499
oQyf1788YWsiDLupGva6sw: 6
OQ2oHkcWA8KNC1Lsvj1SBA: 949
Wqetc51pFQzz04SXh_AORA: 50


In [28]:
# Checking the total matches the total number of tips
counts = [len(dates) for dates in tips.values()]
# Enforce the invariant instead of relying on a visual comparison: grouping must
# neither lose nor duplicate tips (e.g. if the grouping cell was run twice).
assert sum(counts) == len(tips_df), "grouped tip total does not match len(tips_df)"
sum(counts)

1162119

In [29]:
# One [business_id, tip count, comma-joined tip dates] record per business
tips_processed = [
    [business_id, len(dates), ",".join(dates)]
    for business_id, dates in tips.items()
]

In [30]:
tips_count_df = pd.DataFrame(tips_processed, columns=["business_id", "tips_count", "tips_dates"])

In [31]:
tips_count_df.head()

Unnamed: 0,business_id,tips_count,tips_dates
0,ENwBByjpoa5Gg7tKgxqwLg,67,"2011-07-22 19:07:35,2012-06-16 22:16:18,2016-0..."
1,jKO4Og6ucdX2-YCTKQVYjg,64,"2014-09-10 07:33:29,2013-08-19 06:17:35,2013-0..."
2,9Bto7mky640ocgezVKSfVg,17,"2013-12-13 23:23:41,2012-08-06 20:55:31,2011-0..."
3,XWFjKtRGZ9khRGtGg2ZvaA,97,"2017-07-11 23:07:16,2013-02-17 18:33:55,2012-1..."
4,mkrx0VhSMU3p3uhyJGCoWA,58,"2016-11-30 08:46:36,2012-07-26 23:32:45,2011-1..."


In [32]:
len(tips_count_df)

110915

In [33]:
restaurants_df = pd.merge(restaurants_df, tips_count_df, how="left", on='business_id')

In [34]:
len(restaurants_df)

42089

In [35]:
restaurants_df.head()

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates
0,6iYb2HFDywm3zjuRg0shjw,1,86,4.0,"2017-09-10 04:48:12, 2017-09-10 04:49:28, 2017...",184,7.0,"2019-06-07 22:24:44,2019-03-06 22:53:59,2017-0..."
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0..."
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1..."
3,HPA_qyMEddpAEtFof02ixg,1,39,4.0,"2010-07-28 21:12:50, 2010-08-01 23:58:02, 2010...",36,4.0,"2014-03-15 17:16:33,2013-04-27 23:57:33,2014-0..."
4,ufCxltuh56FF4-ZFZ6cVhg,1,135,4.5,"2012-08-29 22:10:36, 2012-09-11 18:11:11, 2012...",246,19.0,"2014-06-13 16:58:37,2017-03-15 14:53:27,2015-0..."


In [36]:
restaurants_df["tips_count"].isna().sum()

2795

In [37]:
# Restaurants with no tip come out of the left merge as NaN: they have zero tips.
# Those NaNs forced the column to float (7.0, 47.0, ...), so cast back to an integer
# count, consistent with `checkins_count`.
restaurants_df["tips_count"] = restaurants_df["tips_count"].fillna(0).astype("int64")

In [38]:
restaurants_df["tips_dates"] = restaurants_df["tips_dates"].fillna("")

## Saving the Data

In [39]:
restaurants_df.to_csv("./restaurants.csv", index=False)