# Exploring Metrics for Business Popularity

In [1]:
import json
import pandas as pd
import numpy as np
import math
import datetime

import matplotlib.pyplot as plt

import collections

from sklearn.preprocessing import MinMaxScaler

In [2]:
business_df = pd.read_json('./yelp/yelp_academic_dataset_business.json', lines=True)
business_df.drop(columns=['address', 'postal_code', 'name'], inplace=True)
# business_df.dropna(inplace=True)  # the nans are for attributes and opening hours: may not be integral part
business_df.set_index("business_id", inplace=True)
print(len(business_df))

checkins_df = pd.read_json('./yelp/yelp_academic_dataset_checkin.json', lines=True)
print(len(checkins_df))

reviews_df = pd.read_json('yelp/yelp_academic_dataset_review.json', lines=True)
reviews_df.drop(columns=["text", "user_id", "review_id"], inplace=True)
reviews_df["date"] = pd.to_datetime(reviews_df["date"])

tips_df = pd.read_json('yelp/yelp_academic_dataset_tip.json', lines=True)

160585
138876


In [3]:
def calculate_checkin_count(row):
    dates = row.split(",")
    return len(dates)

def start_date(row):
    dates = row.split(",")
    return dates[0]

def last_date(row):
    dates = row.split(",")
    return dates[-1]

checkins_df.set_index("business_id", inplace=True)
checkins_df['checkin_count'] = checkins_df["date"].apply(calculate_checkin_count)
checkins_df['first_checkin'] = pd.to_datetime(checkins_df["date"].apply(start_date))
checkins_df['last_checkin'] = pd.to_datetime(checkins_df["date"].apply(last_date))
checkins_df.head()

Unnamed: 0_level_0,date,checkin_count,first_checkin,last_checkin
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
--0r8K_AQ4FZfLsX3ZYRDA,2017-09-03 17:13:59,1,2017-09-03 17:13:59,2017-09-03 17:13:59
--0zrn43LEaB4jUWTQH_Bg,"2010-10-08 22:21:20, 2010-11-01 21:29:14, 2010...",9,2010-10-08 22:21:20,2011-08-29 19:01:31
--164t1nclzzmca7eDiJMw,"2010-02-26 02:06:53, 2010-02-27 08:00:09, 2010...",311,2010-02-26 02:06:53,2013-11-25 02:51:33
--2aF9NhXnNVpDV0KS3xBQ,"2014-11-03 16:35:35, 2015-01-30 18:16:03, 2015...",8,2014-11-03 16:35:35,2020-12-29 16:22:00
--2mEJ63SC_8_08_jGgVIg,"2010-12-15 17:10:46, 2013-12-28 00:27:54, 2015...",4,2010-12-15 17:10:46,2016-06-11 19:56:11


In [4]:
# merge checkin
business_df = pd.merge(business_df, checkins_df, on='business_id')

len(business_df)

138876

## Process Reviews

In [5]:
def round_star(raw_star):

    i = raw_star // 1
    f = raw_star % 1
    if f >= 0.75:
        ans = i + 1
    elif f >=0.25:
        ans = i + 0.5
    else:
        ans = i

    return ans

business_df["raw_stars"] = reviews_df.groupby("business_id").mean()["stars"]
business_df["rounded_stars"] = business_df["raw_stars"].apply(round_star)

business_df[["stars", "rounded_stars", "raw_stars"]]

Unnamed: 0_level_0,stars,rounded_stars,raw_stars
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6iYb2HFDywm3zjuRg0shjw,4.0,4.0,4.000000
tCbdrRPZA0oiIYSmHG3J0w,4.0,4.0,4.193798
bvN78flM8NLprQ1a1y5dRg,4.5,4.5,4.615385
oaepsyvc0J17qwi8cfrOWg,3.0,2.5,2.666667
PE9uqAjdw0E4-8mjGl3wVA,4.0,4.0,4.214286
...,...,...,...
D2mHoIDXx9N8mS1pGoKV9Q,4.0,4.0,4.000000
bQX-kwVTyZgcdZGEPzce6Q,4.5,4.5,4.368421
wvFZ06nmPmQ2-IVoPqVYLA,5.0,5.0,4.875000
GB75wPibj3IjNauaoCxyGA,4.0,4.0,4.100000


In [6]:
reviews_data = reviews_df.groupby(by='business_id').agg(raw_review_count=("date", lambda x: x.count()),
                                                        first_review=("date", lambda x: x.min()),
                                                        last_review=("date", lambda x: x.max()))

business_df = pd.merge(business_df, reviews_data, on='business_id')

## Process tips

In [7]:
tips_data = tips_df.groupby(by='business_id').agg(tip_count=("date", lambda x: x.count()),
                                                  first_tip=("date", lambda x: x.min()),
                                                  last_tip=("date", lambda x: x.max()))

business_df = pd.merge(business_df, tips_data, on='business_id')
len(business_df)

106660

## Put all the dates and counts together

In [8]:
business_df["first_date"] = business_df[["first_checkin", "first_review", "first_tip"]].min(axis=1)
business_df["last_date"] = business_df[["last_checkin", "last_review", "last_tip"]].max(axis=1)

business_df["visit_count"] = business_df[["checkin_count", "raw_review_count", "tip_count"]].sum(axis=1)

## Restaurants

In [9]:
restaurants_df = business_df.loc[business_df["categories"].str.contains("Restaurants") | business_df["categories"].str.contains("Food")]
print(f"There are {len(restaurants_df)} restaurants in the dataset")

restaurants_df.head(3)

There are 55935 restaurants in the dataset


Unnamed: 0_level_0,city,state,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,...,rounded_stars,raw_review_count,first_review,last_review,tip_count,first_tip,last_tip,first_date,last_date,visit_count
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",...,4.0,86,2017-09-12 03:32:30,2021-01-22 05:20:38,7,2017-09-09 04:42:34,2019-09-17 04:30:53,2017-09-09 04:42:34,2021-01-22 05:20:38,277
tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",...,4.0,129,2010-03-09 16:02:04,2020-08-06 09:00:59,47,2011-05-28 02:06:25,2019-06-25 18:21:12,2010-03-09 16:02:04,2021-01-21 17:55:35,1356
D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",...,3.5,175,2010-09-26 04:03:35,2020-12-24 21:24:42,32,2010-12-02 21:10:51,2019-09-01 03:01:29,2010-09-26 04:03:35,2021-01-23 01:43:50,495


In [10]:
restaurants_df.drop(columns=["first_checkin", "first_review", "first_tip", "last_checkin", "last_review", "last_tip", "date"], inplace=True)

restaurants_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0_level_0,city,state,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,checkin_count,raw_stars,rounded_stars,raw_review_count,tip_count,first_date,last_date,visit_count
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",184,4.0,4.0,86,7,2017-09-09 04:42:34,2021-01-22 05:20:38,277
tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",1180,4.193798,4.0,129,47,2010-03-09 16:02:04,2021-01-21 17:55:35,1356
D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",288,3.685714,3.5,175,32,2010-09-26 04:03:35,2021-01-23 01:43:50,495


In [11]:
restaurants_df.to_csv("datasets/new_restaurants.csv")