# Popularity Metric Analysis - #2

In [1]:
import json
import pandas as pd
import numpy as np
import math
import datetime

## Reading the Data

In [2]:
restaurants_df = pd.read_csv("./restaurants.csv")

In [3]:
restaurants_df.head()

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates
0,6iYb2HFDywm3zjuRg0shjw,1,86,4.0,"2017-09-10 04:48:12, 2017-09-10 04:49:28, 2017...",184,7.0,"2019-06-07 22:24:44,2019-03-06 22:53:59,2017-0..."
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0..."
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1..."
3,HPA_qyMEddpAEtFof02ixg,1,39,4.0,"2010-07-28 21:12:50, 2010-08-01 23:58:02, 2010...",36,4.0,"2014-03-15 17:16:33,2013-04-27 23:57:33,2014-0..."
4,ufCxltuh56FF4-ZFZ6cVhg,1,135,4.5,"2012-08-29 22:10:36, 2012-09-11 18:11:11, 2012...",246,19.0,"2014-06-13 16:58:37,2017-03-15 14:53:27,2015-0..."


In [4]:
restaurants_df["tips_dates"] = restaurants_df["tips_dates"].fillna("")

In [5]:
restaurants_df["tips_dates"].isna().sum()

0

## Analyzing Data

### Looking at Restaurants

In [6]:
avg_review_count = round(restaurants_df["review_count"].mean())
avg_checkins_count = round(restaurants_df["checkins_count"].mean())
avg_tips_count = round(restaurants_df["tips_count"].mean())
print(f"Restaurants have on {avg_review_count} reviews, {avg_checkins_count} checkins and {avg_tips_count} tips on average")

Restaurants have on 123 reviews, 260 checkins and 18 tips on average


In [7]:
v = len(restaurants_df[restaurants_df["checkins_count"] < 5])
print(f"There are {v} restaurants with 5 checkins or less")

There are 2149 restaurants with 5 checkins or less


In [8]:
average_star_rating = round(restaurants_df["stars"].mean(), 2)
print(f"Restaurants star rating is {average_star_rating}/5 average")

Restaurants star rating is 3.57/5 average


In [9]:
restaurants_df['stars'].value_counts()

4.0    11714
3.5    10651
3.0     6520
4.5     6233
2.5     3275
2.0     1808
5.0     1091
1.5      691
1.0      106
Name: stars, dtype: int64

In [10]:
open_restaurants_df = restaurants_df[restaurants_df["is_open"] == 1]
closed_restaurants_df = restaurants_df[restaurants_df["is_open"] == 0]

In [11]:
open_avg = round(open_restaurants_df["stars"].mean(), 2)
open_sd = round(open_restaurants_df["stars"].std(), 2)
print(f"Open restaurants star rating is {open_avg}/5 average with sd {open_sd}")

closed_avg = round(closed_restaurants_df["stars"].mean(), 2)
closed_sd = round(closed_restaurants_df["stars"].std(), 2)
print(f"Closed restaurants star rating is {closed_avg}/5 average with sd {closed_sd}")

# From the results we can conclude that the star rating is not a good indicator of whether
# a restaurant will close as the mean and sd are the same

Open restaurants star rating is 3.57/5 average with sd 0.78
Closed restaurants star rating is 3.55/5 average with sd 0.68


In [12]:
restaurants_df[['checkins_count', 'tips_count', 'review_count']].corr()

Unnamed: 0,checkins_count,tips_count,review_count
checkins_count,1.0,0.894577,0.844195
tips_count,0.894577,1.0,0.888469
review_count,0.844195,0.888469,1.0


### Setting First and Last Checkin Dates

In [13]:
def get_first_date(row):
    dates = row[4].split(",")
    first_date = datetime.datetime.strptime(dates[0].strip(), "%Y-%m-%d %H:%M:%S")
    first_date_str = f"{first_date: %d-%m-%Y}"
    return first_date_str

def get_first_date_year(row):
    dates = row[4].split(",")
    first_date = datetime.datetime.strptime(dates[0].strip(), "%Y-%m-%d %H:%M:%S")
    return first_date.year

In [14]:
restaurants_df["start_date"] = restaurants_df.apply(get_first_date, axis=1)

In [15]:
restaurants_df["start_date_year"] = restaurants_df.apply(get_first_date_year, axis=1)

In [16]:
def get_last_date(row):
    dates = row[4].split(",")
    last_date = datetime.datetime.strptime(dates[len(dates)-1].strip(), "%Y-%m-%d %H:%M:%S")
    last_date_str = f"{last_date: %d-%m-%Y}"
    return last_date_str

def get_last_date_year(row):
    dates = row[4].split(",")
    last_date = datetime.datetime.strptime(dates[len(dates)-1].strip(), "%Y-%m-%d %H:%M:%S")
    return last_date.year

In [17]:
restaurants_df["end_date"] = restaurants_df.apply(get_last_date, axis=1)

In [18]:
restaurants_df["end_date_year"] = restaurants_df.apply(get_last_date_year, axis=1)

In [19]:
restaurants_df.head(5)

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates,start_date,start_date_year,end_date,end_date_year
0,6iYb2HFDywm3zjuRg0shjw,1,86,4.0,"2017-09-10 04:48:12, 2017-09-10 04:49:28, 2017...",184,7.0,"2019-06-07 22:24:44,2019-03-06 22:53:59,2017-0...",10-09-2017,2017,13-01-2021,2021
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0...",22-04-2010,2010,21-01-2021,2021
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1...",06-11-2010,2010,23-01-2021,2021
3,HPA_qyMEddpAEtFof02ixg,1,39,4.0,"2010-07-28 21:12:50, 2010-08-01 23:58:02, 2010...",36,4.0,"2014-03-15 17:16:33,2013-04-27 23:57:33,2014-0...",28-07-2010,2010,23-08-2019,2019
4,ufCxltuh56FF4-ZFZ6cVhg,1,135,4.5,"2012-08-29 22:10:36, 2012-09-11 18:11:11, 2012...",246,19.0,"2014-06-13 16:58:37,2017-03-15 14:53:27,2015-0...",29-08-2012,2012,09-10-2020,2020


### Getting All Restaurants Open in 2016

In [20]:
restaurants_2016_df = restaurants_df[restaurants_df["start_date_year"] < 2016]
restaurants_2016_df = restaurants_2016_df[restaurants_2016_df["end_date_year"] > 2016]

In [21]:
later_closed = restaurants_2016_df[restaurants_2016_df["is_open"] == 0]

print(f"There were {len(restaurants_2016_df)} restaurants open in 2016 of which {len(later_closed)} have since closed down")

There were 24161 restaurants open in 2016 of which 4531 have since closed down


### Calculating Frequency using Tips & Checkins

In [22]:
def get_dates_in_2016(dates):
    dates_in_2016 = []
    if len(dates) == 1 and dates[0] == '':
        return []
    for date_str in dates:
        date = datetime.datetime.strptime(date_str.strip(), "%Y-%m-%d %H:%M:%S")
        if date.year == 2016:
            dates_in_2016.append(date)
        
    return dates_in_2016
        

def calculate_frequency_2016(row):
    checkins_dates = row[4].split(",")
    tips_dates = row[7].split(",")
    
    checkins_in_2016 = get_dates_in_2016(checkins_dates)
    tips_in_2016 = get_dates_in_2016(tips_dates)
    
    total = len(checkins_in_2016) + len(tips_in_2016)
    return total

In [23]:
restaurants_2016_df["total"] = restaurants_2016_df.apply(calculate_frequency_2016, axis=1)

In [24]:
restaurants_2016_df.head(3)

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates,start_date,start_date_year,end_date,end_date_year,total
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0...",22-04-2010,2010,21-01-2021,2021,144
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1...",06-11-2010,2010,23-01-2021,2021,24
3,HPA_qyMEddpAEtFof02ixg,1,39,4.0,"2010-07-28 21:12:50, 2010-08-01 23:58:02, 2010...",36,4.0,"2014-03-15 17:16:33,2013-04-27 23:57:33,2014-0...",28-07-2010,2010,23-08-2019,2019,2


In [25]:
q25 = restaurants_2016_df['total'].quantile(0.25)
median = restaurants_2016_df['total'].quantile(0.5)
q75 = restaurants_2016_df['total'].quantile(0.75)
restaurants_2016_df["normalized_total"] = round(((restaurants_2016_df["total"]-median)/(q75-q25) + 1) * 0.5, 3)

### Analyzing the Data from 2016

#### General Information

In [26]:
restaurants_2016_df.head(2)

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates,start_date,start_date_year,end_date,end_date_year,total,normalized_total
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0...",22-04-2010,2010,21-01-2021,2021,144,1.706
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1...",06-11-2010,2010,23-01-2021,2021,24,0.529


In [27]:
mean = round(restaurants_2016_df["total"].mean(), 3)
std = round(restaurants_2016_df["total"].std(), 3)
print(f"Mean frequency in 2016 is {mean} with std {std}")
print("Max total is " + str(restaurants_2016_df['total'].max()))
print("Min total is " + str(restaurants_2016_df['total'].min()))

Mean frequency in 2016 is 52.716 with std 107.656
Max total is 3060
Min total is 0


In [28]:
restaurants_2016_df['normalized_total'].max()

30.294

In [29]:
len(restaurants_2016_df[restaurants_2016_df['total'] > 1000])

40

In [30]:
print("25% quantile total is " + str(restaurants_2016_df['normalized_total'].quantile(0.25)))
print("33% quantile total is " + str(restaurants_2016_df['normalized_total'].quantile(0.33)))
print("50% quantile total is " + str(restaurants_2016_df['normalized_total'].quantile(0.5)))
print("66% quantile total is " + str(restaurants_2016_df['normalized_total'].quantile(0.66)))
print("75% quantile total is " + str(restaurants_2016_df['normalized_total'].quantile(0.75)))

25% quantile total is 0.363
33% quantile total is 0.392
50% quantile total is 0.5
66% quantile total is 0.686
75% quantile total is 0.863


#### Correlations

In [31]:
restaurants_2016_df[['normalized_total', 'stars']].corr()

Unnamed: 0,normalized_total,stars
normalized_total,1.0,0.222748
stars,0.222748,1.0


In [32]:
restaurants_2016_df[['normalized_total', 'review_count']].corr()

Unnamed: 0,normalized_total,review_count
normalized_total,1.0,0.782728
review_count,0.782728,1.0


In [33]:
restaurants_2016_df[['normalized_total', 'is_open']].corr()

Unnamed: 0,normalized_total,is_open
normalized_total,1.0,0.023747
is_open,0.023747,1.0


#### Looking at the restaurants who later closed down

In [34]:
open_restaurants_in_2017_df = restaurants_2016_df[restaurants_2016_df["is_open"] == 1]

In [35]:
open_restaurants_in_2017_df["normalized_total"].mean()

0.8230030565461077

In [36]:
closed_restaurants_in_2017_df = restaurants_2016_df[restaurants_2016_df["is_open"] == 0]

In [37]:
len(closed_restaurants_in_2017_df)

4531

In [38]:
closed_restaurants_in_2017_df["normalized_total"].mean()

0.7587951887000678

In [39]:
# Only consider those who closed in 2017 (i.e. last activity is in 2017) because would reflect on the popularity
# of the restaurant in the previous years (2016)
closed_restaurants_in_2017_df = closed_restaurants_in_2017_df[closed_restaurants_in_2017_df["end_date_year"] == 2017]

In [40]:
len(closed_restaurants_in_2017_df)

1361

In [41]:
closed_restaurants_in_2017_df["normalized_total"].mean()

0.6054011756061711

#### Looking at % of Restaurants who closed down who were in bottom of popularity range

In [42]:
bottom_df = restaurants_2016_df.loc[restaurants_2016_df["normalized_total"] <= 0.392]

In [43]:
len(bottom_df)/len(restaurants_2016_df)

0.338810479698688

In [44]:
middle_df = restaurants_2016_df.loc[restaurants_2016_df["normalized_total"] > 0.392]
middle_df = middle_df.loc[middle_df["normalized_total"] <= 0.686]

In [45]:
len(middle_df)/len(restaurants_2016_df)

0.32593849592318197

In [46]:
top_df = restaurants_2016_df.loc[restaurants_2016_df["normalized_total"] > 0.686]

In [47]:
len(top_df)/len(restaurants_2016_df)

0.33525102437813004

In [48]:
(len(bottom_df) + len(middle_df) + len(top_df))/len(restaurants_2016_df)

1.0

In [49]:
total_closed = len(closed_restaurants_in_2017_df)
print(total_closed)

1361


In [50]:
bottom_closed_df = bottom_df[(bottom_df["is_open"] == 0) & (bottom_df["end_date_year"] == 2017)]
middle_closed_df = middle_df[(middle_df["is_open"] == 0) & (middle_df["end_date_year"] == 2017)]
top_closed_df = top_df[(top_df["is_open"] == 0) & (top_df["end_date_year"] == 2017)]

In [51]:
bottom_pct = round(len(bottom_closed_df) / total_closed, 2)
middle_pct = round(len(middle_closed_df) / total_closed, 2)
top_pct = round(len(top_closed_df) / total_closed, 2)

print(f"{bottom_pct}% of closed restaurants are in the bottom 33% quartile")
print(f"{middle_pct}% of closed restaurants are in the middle 33% quartile")
print(f"{top_pct}% of closed restaurants are in the top 33% quartile")

# Results show some agreement between frequency of visits and whether a business will close, as 40% of closed business in
# 2017 were among the least popular businesses by visit

0.4% of closed restaurants are in the bottom 33% quartile
0.35% of closed restaurants are in the middle 33% quartile
0.24% of closed restaurants are in the top 33% quartile


In [52]:
first_df = restaurants_2016_df.loc[restaurants_2016_df["normalized_total"] <= 0.5]
second_df = restaurants_2016_df.loc[restaurants_2016_df["normalized_total"] > 0.5]

In [53]:
first_closed_df = first_df[(first_df["is_open"] == 0) & (first_df["end_date_year"] == 2017)]
second_closed_df = second_df[(second_df["is_open"] == 0) & (second_df["end_date_year"] == 2017)]

In [54]:
print(f"{round(len(first_closed_df) / total_closed, 2)}% of closed restaurants are in the bottom 50% quartile")
print(f"{round(len(second_closed_df) / total_closed, 2)}% of closed restaurants are in the top 50% quartile")

0.59% of closed restaurants are in the bottom 50% quartile
0.41% of closed restaurants are in the top 50% quartile


### Calculating Popularity Metric

Give equal weight to the visit frequency and star rating

In [55]:
restaurants_2016_df["metric"] = 0.5 * restaurants_2016_df["normalized_total"] + 0.5 * (restaurants_2016_df["stars"]/5.0)

In [56]:
restaurants_2016_df.head(2)

Unnamed: 0,business_id,is_open,review_count,stars,checkins_dates,checkins_count,tips_count,tips_dates,start_date,start_date_year,end_date,end_date_year,total,normalized_total,metric
1,tCbdrRPZA0oiIYSmHG3J0w,1,126,4.0,"2010-04-22 05:31:33, 2010-05-09 18:24:50, 2010...",1180,47.0,"2013-12-18 05:57:05,2013-04-08 01:55:49,2013-0...",22-04-2010,2010,21-01-2021,2021,144,1.706,1.253
2,D4JtQNTI4X3KcbzacDJsMw,1,169,3.5,"2010-11-06 02:53:03, 2010-11-29 02:16:55, 2010...",288,32.0,"2010-12-02 21:10:51,2018-01-18 21:40:39,2011-1...",06-11-2010,2010,23-01-2021,2021,24,0.529,0.6145


In [57]:
print("Mean is " + str(round(restaurants_2016_df['metric'].mean(), 2)))
print("Max is " + str(round(restaurants_2016_df['metric'].max(), 2)))
print("Min is " + str(round(restaurants_2016_df['metric'].min(), 2)))

Mean is 0.75
Max is 15.55
Min is 0.25


#### Analyzing Metric

In [58]:
print("25% quantile total is " + str(round(restaurants_2016_df['metric'].quantile(0.25), 2)))
print("33% quantile total is " + str(round(restaurants_2016_df['metric'].quantile(0.33), 2)))
print("50% quantile total is " + str(round(restaurants_2016_df['metric'].quantile(0.5), 2)))
print("66% quantile total is " + str(round(restaurants_2016_df['metric'].quantile(0.66), 2)))
print("75% quantile total is " + str(round(restaurants_2016_df['metric'].quantile(0.75), 2)))

25% quantile total is 0.51
33% quantile total is 0.55
50% quantile total is 0.62
66% quantile total is 0.71
75% quantile total is 0.8


In [59]:
open_restaurants_in_2017_df = restaurants_2016_df[restaurants_2016_df["is_open"] == 1]

In [60]:
open_restaurants_in_2017_df["metric"].mean()

0.7556762608252703

In [61]:
closed_restaurants_in_2017_df = restaurants_2016_df[restaurants_2016_df["is_open"] == 0]
closed_restaurants_in_2017_df["metric"].mean()

0.7259436106819704

In [62]:
# Only consider those who closed in 2017 (i.e. last activity is in 2017) because would reflect on the popularity
# of the restaurant in the previous years (2016)
closed_restaurants_in_2017_df = closed_restaurants_in_2017_df[closed_restaurants_in_2017_df["end_date_year"] == 2017]

In [63]:
closed_restaurants_in_2017_df["metric"].mean()

0.6497983100661286

In [73]:
bottom_df = restaurants_2016_df.loc[restaurants_2016_df["metric"] <= 0.55]
middle_df = restaurants_2016_df.loc[restaurants_2016_df["metric"] > 0.55]
middle_df = middle_df.loc[middle_df["metric"] <= 0.71]
top_df = restaurants_2016_df.loc[restaurants_2016_df["metric"] > 0.71]

In [74]:
total_closed = len(closed_restaurants_in_2017_df)
print(total_closed)

1361


In [75]:
bottom_closed_df = bottom_df[(bottom_df["is_open"] == 0) & (bottom_df["end_date_year"] == 2017)]
middle_closed_df = middle_df[(middle_df["is_open"] == 0) & (middle_df["end_date_year"] == 2017)]
top_closed_df = top_df[(top_df["is_open"] == 0) & (top_df["end_date_year"] == 2017)]

In [76]:
bottom_pct = round(len(bottom_closed_df) / total_closed, 2)
middle_pct = round(len(middle_closed_df) / total_closed, 2)
top_pct = round(len(top_closed_df) / total_closed, 2)

print(f"{bottom_pct}% of closed restaurants are in the bottom 33% quartile")
print(f"{middle_pct}% of closed restaurants are in the middle 33% quartile")
print(f"{top_pct}% of closed restaurants are in the top 33% quartile")

0.37% of closed restaurants are in the bottom 33% quartile
0.37% of closed restaurants are in the middle 33% quartile
0.26% of closed restaurants are in the top 33% quartile


In [77]:
first_df = restaurants_2016_df.loc[restaurants_2016_df["metric"] <= 0.62]
second_df = restaurants_2016_df.loc[restaurants_2016_df["metric"] > 0.62]

In [78]:
first_closed_df = first_df[(first_df["is_open"] == 0) & (first_df["end_date_year"] == 2017)]
second_closed_df = second_df[(second_df["is_open"] == 0) & (second_df["end_date_year"] == 2017)]

In [79]:
print(f"{round(len(first_closed_df) / total_closed, 2)}% of closed restaurants are in the bottom 50% quartile")
print(f"{round(len(second_closed_df) / total_closed, 2)}% of closed restaurants are in the top 50% quartile")

0.59% of closed restaurants are in the bottom 50% quartile
0.41% of closed restaurants are in the top 50% quartile
