# Exploring Metrics for Business Popularity

In [1]:
import json
import pandas as pd
import numpy as np
import math
import datetime

In [2]:
business_df = pd.read_json('./yelp/yelp_academic_dataset_business.json', lines=True)
business_df.drop(columns=['address', 'postal_code', 'name', 'stars', 'review_count'], inplace=True)
# business_df.dropna(inplace=True)  # the nans are for attributes and opening hours: may not be integral part
business_df.set_index("business_id", inplace=True)
print(len(business_df))

checkins_df = pd.read_json('./yelp/yelp_academic_dataset_checkin.json', lines=True)
print(len(checkins_df))

reviews_df = pd.read_json('yelp/yelp_academic_dataset_review.json', lines=True)
reviews_df.drop(columns=["text", "user_id", "review_id"], inplace=True)
reviews_df["date"] = pd.to_datetime(reviews_df["date"])

tips_df = pd.read_json('yelp/yelp_academic_dataset_tip.json', lines=True)
print(len(tips_df))

YEAR_RANGE = (datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2019, 12, 31, 23, 59), None)

160585
138876
1162119


In [3]:
# Consider business from 2017-2018 dataset
df_2017 = pd.read_csv("datasets/2017-2018_restaurants.csv")
df_2017.set_index("business_id", inplace=True)

business_df = business_df.loc[df_2017.index]

In [4]:
def filter_date(date):
    if date < YEAR_RANGE[0] or date > YEAR_RANGE[1]:
        return False
    return True

def calculate_checkin_count(row):
    dates = [datetime.datetime.strptime(d.strip(), "%Y-%m-%d %H:%M:%S") for d in row.split(",")]
    dates = list(filter(filter_date, dates))

    return len(dates)

checkins_df.set_index("business_id", inplace=True)
checkins_df['checkin_count'] = checkins_df["date"].apply(calculate_checkin_count)
checkins_df.head()

Unnamed: 0_level_0,date,checkin_count
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
--0r8K_AQ4FZfLsX3ZYRDA,2017-09-03 17:13:59,0
--0zrn43LEaB4jUWTQH_Bg,"2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010...",0
--164t1nclzzmca7eDiJMw,"2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010...",0
--2aF9NhXnNVpDV0KS3xBQ,"2014-11-03 16:35:35, 2015-01-30 18:16:03, 2015...",1
--2mEJ63SC_8_08_jGgVIg,"2010-12-15 17:10:46, 2013-12-28 00:27:54, 2015...",0


In [5]:
checkins_df.drop(columns="date", inplace=True)

In [6]:
# merge checkin
business_df = pd.merge(business_df, checkins_df, on='business_id', how="left")

len(business_df)

30094

## Process Reviews

In [7]:
reviews_data = reviews_df.groupby(by='business_id').agg(
    review_count=("date",
                  lambda x: x[(x >= YEAR_RANGE[0]) & (x <= YEAR_RANGE[1])].count()))

reviews_data.head()

Unnamed: 0_level_0,review_count
business_id,Unnamed: 1_level_1
--0DF12EMHYI8XIgoFha6A,0
--0r8K_AQ4FZfLsX3ZYRDA,2
--0zrn43LEaB4jUWTQH_Bg,0
--164t1nclzzmca7eDiJMw,0
--2aF9NhXnNVpDV0KS3xBQ,1


In [8]:
business_df = pd.merge(business_df, reviews_data, on='business_id', how="left")
len(business_df)

30094

In [9]:
def round_star(raw_star):

    i = raw_star // 1
    f = raw_star % 1
    if f >= 0.75:
        ans = i + 1
    elif f >=0.25:
        ans = i + 0.5
    else:
        ans = i

    return ans

ranged_reviews_df = reviews_df[(reviews_df["date"] >= YEAR_RANGE[0]) & (reviews_df["date"] <= YEAR_RANGE[1])]

business_df["raw_stars"] = ranged_reviews_df.groupby("business_id").mean()["stars"]
business_df["stars"] = business_df["raw_stars"].apply(round_star)

business_df[["stars", "raw_stars"]]

Unnamed: 0_level_0,stars,raw_stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6iYb2HFDywm3zjuRg0shjw,4.0,4.227273
tCbdrRPZA0oiIYSmHG3J0w,4.5,4.428571
D4JtQNTI4X3KcbzacDJsMw,3.5,3.500000
ufCxltuh56FF4-ZFZ6cVhg,4.5,4.666667
dmbbf3AqeG61_QHRZi1M1w,4.0,4.000000
...,...,...
yQL8SrSETbbCI1U5esVJQw,4.5,4.703947
r5Uag1JqYjr2nbxQCVqm8A,4.5,4.300000
Q78fYV6B6P6GmX07YVgi4g,3.0,3.060606
uXdQkuEtvLAzfc3MsO-sTQ,4.0,3.813953


In [10]:
weights = ranged_reviews_df.groupby("stars").mean()["useful"].values

weights[:1] = -weights[:1]

ranged_reviews_df.loc[ranged_reviews_df.index, "weighted_stars"] = ranged_reviews_df["stars"].apply(lambda x: x * weights[x - 1])


business_df["weighted_stars"] = ranged_reviews_df.groupby("business_id").mean()["weighted_stars"]
business_df[["stars", "raw_stars", "weighted_stars"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0_level_0,stars,raw_stars,weighted_stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6iYb2HFDywm3zjuRg0shjw,4.0,4.227273,3.338557
tCbdrRPZA0oiIYSmHG3J0w,4.5,4.428571,3.755950
D4JtQNTI4X3KcbzacDJsMw,3.5,3.500000,3.078625
ufCxltuh56FF4-ZFZ6cVhg,4.5,4.666667,3.836140
dmbbf3AqeG61_QHRZi1M1w,4.0,4.000000,4.787566
...,...,...,...
yQL8SrSETbbCI1U5esVJQw,4.5,4.703947,3.505930
r5Uag1JqYjr2nbxQCVqm8A,4.5,4.300000,3.543347
Q78fYV6B6P6GmX07YVgi4g,3.0,3.060606,2.164611
uXdQkuEtvLAzfc3MsO-sTQ,4.0,3.813953,2.821628


## Process tips

In [11]:
tips_data = tips_df.groupby(by='business_id').agg(
    tip_count=("date",
               lambda x: x[(x >= YEAR_RANGE[0]) & (x <= YEAR_RANGE[1])].count()))


In [12]:
business_df = pd.merge(business_df, tips_data, on='business_id', how="left")
len(business_df)

30094

## Put all the dates and counts together

In [13]:
business_df["visit_count"] = business_df[["checkin_count", "review_count", "tip_count"]].sum(axis=1)

In [14]:
business_df.drop(columns=["city", "state", "latitude", "longitude", "is_open", "attributes", "categories", "hours"], inplace=True)

In [15]:
business_df.head()

Unnamed: 0_level_0,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,visit_count
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6iYb2HFDywm3zjuRg0shjw,78,22,4.227273,4.0,3.338557,3,103
tCbdrRPZA0oiIYSmHG3J0w,163,7,4.428571,4.5,3.75595,1,171
D4JtQNTI4X3KcbzacDJsMw,22,18,3.5,3.5,3.078625,1,41
ufCxltuh56FF4-ZFZ6cVhg,26,12,4.666667,4.5,3.83614,0,38
dmbbf3AqeG61_QHRZi1M1w,8,1,4.0,4.0,4.787566,0,9


In [16]:
df_2017_2019 = pd.merge(df_2017, business_df, on="business_id", how="left")

In [17]:
df_2017_2019[["visit_count_x", "visit_count_y"]]

Unnamed: 0_level_0,visit_count_x,visit_count_y
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6iYb2HFDywm3zjuRg0shjw,132,103
tCbdrRPZA0oiIYSmHG3J0w,209,171
D4JtQNTI4X3KcbzacDJsMw,73,41
ufCxltuh56FF4-ZFZ6cVhg,85,38
dmbbf3AqeG61_QHRZi1M1w,8,9
...,...,...
yQL8SrSETbbCI1U5esVJQw,604,270
r5Uag1JqYjr2nbxQCVqm8A,781,210
Q78fYV6B6P6GmX07YVgi4g,217,86
uXdQkuEtvLAzfc3MsO-sTQ,168,102


In [18]:
df_2017_2019.to_csv("datasets/2017-2018-2019_restaurants.csv")