In [101]:
import pandas as pd

In [102]:
df_business = pd.read_json('../yelp_academic_dataset_business.json', lines=True)

# Filter for restuarants that are open and only keep necessary columns
df_business_restaurants_sub = df_business[df_business['categories'].apply(lambda x: 'Restaurants' in x) 
                                          & df_business['open'].apply(lambda x: x == True)].reset_index(drop=True)[['business_id', 'stars', 'hours', 'city', 'attributes']]

# round stars and keep the results in column "stars_round"
df_business_restaurants_sub['stars_round'] = df_business_restaurants_sub['stars'].apply(round)

df_business_restaurants_sub.head()

Unnamed: 0,business_id,stars,hours,city,attributes,stars_round
0,5UmKMjUEUNdYWqANhGckJw,3.5,"{'Thursday': {'close': '21:00', 'open': '11:00...",Dravosburg,"{'Take-out': True, 'Delivery': False, 'Good fo...",4
1,mVHrayjG3uZ_RLHkLj-AMg,4.5,"{'Wednesday': {'close': '19:00', 'open': '10:0...",Braddock,"{'Delivery': False, 'Take-out': True, 'Good Fo...",4
2,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Saturday': {'close': '02:00', 'open': '12:00...",Carnegie,"{'Delivery': False, 'Take-out': True, 'Good Fo...",4
3,wJr6kSA5dchdgOdwH6dZ2w,3.5,"{'Saturday': {'close': '02:00', 'open': '08:00...",Carnegie,"{'Take-out': True, 'Delivery': False, 'Wheelch...",4
4,fNGIbpazjTRdXgwRY_NIXA,4.0,"{'Saturday': {'close': '23:00', 'open': '11:00...",Carnegie,"{'Delivery': False, 'Take-out': True, 'Good Fo...",4


In [103]:
#80% training data
training_restaurants_df = df_business_restaurants_sub.sample(frac=0.8)

#group by stars
group_by_stars = training_restaurants_df.groupby('stars_round')

business_of_stars = {}
for star in group_by_stars.groups:
    group = group_by_stars.get_group(star)
    business_of_stars[star] = group.assign(working_type=lambda x: x['hours'])

In [104]:
#Prior for each group
prior_of_stars = {}

for star in business_of_stars:
    prior_of_stars[star] = len(business_of_stars[star]) * 1.0 / len(training_restaurants_df)
    
prior_of_stars

{1: 0.004596273291925466,
 2: 0.1639751552795031,
 3: 0.16291925465838508,
 4: 0.6480124223602485,
 5: 0.020496894409937887}

In [105]:
from datetime import time

WORKING_TYPES = {
    "WEEKEND_TYPE": "weekend",
    "BREAKFAST_TYPE": "breakfast",
    "LUNCH_TYPE": "lunch",
    "AFTER_LUNCH_TYPE": "after-lunch",
    "DINNER_TYPE": "dinner",
    "NIGHT_TYPE": "night",
}

breakfast = time(8)
lunch = time(12)
after_lunch = time(15)
dinner = time(18)
night = time(0)

def in_between(start, end, check):
    if start == end: # 24 hours
        return True
    if start < end:
        return start <= check < end
    else: # over midnight e.g., 23:30-04:15
        return start <= check or check < end

TYPE_THRESHOLD = 1
def spec_hours_to_type(s):
    types = []
    
    breakfast_count = 0
    lunch_count = 0
    after_lunch_count = 0
    dinner_count = 0
    night_count = 0

    for day in s:
        
        clo = s[day]['close']
        op = s[day]['open']

        h, m = clo.split(':')
        clo_t = time(int(h), int(m))

        h, m = op.split(':')
        op_t = time(int(h), int(m))
        
        if in_between(op_t, clo_t, breakfast):
            breakfast_count += 1
        
        if in_between(op_t, clo_t, lunch):
            lunch_count += 1
        
        if in_between(op_t, clo_t, after_lunch):
            after_lunch_count += 1
        
        if in_between(op_t, clo_t, dinner):
            dinner_count += 1
        
        if in_between(op_t, clo_t, night):
            night_count += 1
        
        if (day in ['Saturday', 'Sunday']) and (WORKING_TYPES["WEEKEND_TYPE"] not in types):
            types.append(WORKING_TYPES["WEEKEND_TYPE"])
            
    if breakfast_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["BREAKFAST_TYPE"])
        
    if lunch_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["LUNCH_TYPE"])
    
    if after_lunch_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["AFTER_LUNCH_TYPE"])
    
    if dinner_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["DINNER_TYPE"])
        
    if night_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["NIGHT_TYPE"])
    
    return join_types(types)

def hours_to_type(s):
    if isinstance(s, str):
        return s
    
    if s:
        return spec_hours_to_type(s)
    else:
        return join_types(WORKING_TYPES.values())

def join_types(ts):
    # reorder
    ordered_types = []
    for t in WORKING_TYPES.values():
        if t in ts:
            ordered_types.append(t)
    return '_'.join(ordered_types)


DEFAULT_TYPE = 'default'
def extract_value_from_attrs(attrs, k):
    if k in attrs:
        return attrs[k]
    else:
        return DEFAULT_TYPE
    
def filter_from_attr_val(attr, k, v):
    return k in attr and attr[k] == v

def filter_no_attr(attr, k):
    return k not in attr
    
tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Accepts Credit Cards'))
accepts_credit_cards_type_set = tdf.unique()

tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Alcohol'))
alcohol_type_set = tdf.unique()

tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Caters'))
caters_type_set = tdf.unique()

tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Noise Level'))
noise_level_type_set = tdf.unique()

tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Price Range'))
price_range_type_set = tdf.unique()

tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, 'Take-out'))
take_out_type_set = tdf.unique()

working_type_set = set()
for star in business_of_stars:
    business_of_star_df = business_of_stars[star]
    business_of_star_df['working_type'] = business_of_star_df['working_type'].apply(hours_to_type)
    working_type_set |= set(business_of_star_df['working_type'].unique())

num_working_types = len(working_type_set)
num_cities = len(training_restaurants_df['city'].unique())

types_freq_of_stars = {}
city_freq_of_stars = {}
accepts_credit_cards_of_stars = {}
alcohol_of_stars = {}
caters_of_stars = {}
noise_level_of_stars = {}
price_range_of_stars = {}
take_out_of_stars = {}

#calculate the frequencies
for star in business_of_stars:
    business_of_star_df = business_of_stars[star]
    num_business = len(business_of_star_df)
    
    
    # count frequency of different type
    types_freq = {}
    
    working_type_of_business = business_of_star_df.groupby('working_type')
    for wt in working_type_of_business.groups:
        # we use the add-one or Laplace smoothing
        types_freq[wt] = (len(working_type_of_business.get_group(wt)) + 1.0) / (num_business + num_working_types)
    
    types_freq[DEFAULT_TYPE] = 1.0 / (num_business + num_working_types)
    types_freq_of_stars[star] = types_freq
    
    
    # count frequency of different city
    city_freq = {}
    
    # Now group them by city
    city_of_business = business_of_star_df.groupby('city')
    for city in city_of_business.groups:
        # we use the add-one or Laplace smoothing
        city_freq[city] = (len(city_of_business.get_group(city)) + 1.0) / (num_business + num_cities)
    
    # this value is for cities not in the a specify "group of star",
    # e.g. city "glendale" is not in group of star 1
    city_freq[DEFAULT_TYPE] = 1.0 / (num_business + num_cities)
    city_freq_of_stars[star] = city_freq

    
    # count frequency of accepts credit cards
    accepts_credit_cards_freq = {}
    
    for t in accepts_credit_cards_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Accepts Credit Cards', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Accepts Credit Cards'))])
        
        accepts_credit_cards_freq[t] = (num + 1.0) / (num_business + len(accepts_credit_cards_type_set))
    
    accepts_credit_cards_of_stars[star] = accepts_credit_cards_freq

    
    # count frequency of alcohol
    alcohol_freq = {}
    
    for t in alcohol_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Alcohol', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Alcohol'))])
        
        alcohol_freq[t] = (num + 1.0) / (num_business + len(alcohol_type_set))
    
    alcohol_of_stars[star] = alcohol_freq
    
    
    # count frequency of caters
    caters_freq = {}
    
    for t in caters_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Caters', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Caters'))])
        
        caters_freq[t] = (num + 1.0) / (num_business + len(caters_type_set))
    
    caters_of_stars[star] = caters_freq
    
    
    # count frequency of noise level
    noise_level_freq = {}
    
    for t in noise_level_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Noise Level', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Noise Level'))])
        
        noise_level_freq[t] = (num + 1.0) / (num_business + len(noise_level_type_set))
    
    noise_level_of_stars[star] = noise_level_freq
    
    
    # count frequency of price range
    price_range_freq = {}
    
    for t in price_range_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Price Range', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Price Range'))])
        
        price_range_freq[t] = (num + 1.0) / (num_business + len(price_range_type_set))
    
    price_range_of_stars[star] = price_range_freq
    
    
    # count frequency of take-out
    take_out_freq = {}
    
    for t in take_out_type_set:
        if t != DEFAULT_TYPE:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, 'Take-out', t))])
        else:
            num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, 'Take-out'))])
        
        take_out_freq[t] = (num + 1.0) / (num_business + len(take_out_type_set))
    
    take_out_of_stars[star] = take_out_freq

Verification our test set

In [106]:
test_restaurants_df = df_business_restaurants_sub[~df_business_restaurants_sub.isin(training_restaurants_df)].dropna()

test_restaurants_df.head()

Unnamed: 0,business_id,stars,hours,city,attributes,stars_round
6,zaXDakTd3RXyOa7sMrUE1g,4.0,"{'Saturday': {'close': '12:00', 'open': '07:00...",Carnegie,"{'Take-out': True, 'Delivery': False, 'Good fo...",4.0
7,6o3RK6rTcN3nw-j-r2nQmA,2.5,"{'Saturday': {'close': '23:00', 'open': '11:00...",Carnegie,{},2.0
9,SQ0j7bgSTazkVQlF5AnqyQ,2.5,{},Carnegie,"{'Take-out': True, 'Delivery': True, 'Waiter S...",2.0
15,6ilJq_05xRgek_8qUp36-g,2.0,"{'Saturday': {'close': '00:00', 'open': '00:00...",Munhall,"{'Take-out': True, 'Delivery': False, 'Alcohol...",2.0
17,sbW8qHJgzEIH42B0S-3New,2.5,"{'Saturday': {'close': '02:00', 'open': '11:30...",Homestead,"{'Delivery': False, 'Take-out': True, 'Good Fo...",2.0


In [107]:
import numpy as np

def calc_probs(hours, city, attrs):
    probs_of_stars = {}
    
    working_type = hours_to_type(hours)
    
    for star in prior_of_stars:
        prob = np.log(prior_of_stars[star])
        prob += np.log(types_freq_of_stars[star].get(working_type, types_freq_of_stars[star]['default']))
        prob += np.log(city_freq_of_stars[star].get(city, city_freq_of_stars[star]['default']))
        
        prob += np.log(accepts_credit_cards_of_stars[star].get(extract_value_from_attrs(attrs, 'Accepts Credit Cards'), accepts_credit_cards_of_stars[star][DEFAULT_TYPE]))
        prob += np.log(alcohol_of_stars[star].get(extract_value_from_attrs(attrs, 'Alcohol'), alcohol_of_stars[star][DEFAULT_TYPE]))
        prob += np.log(caters_of_stars[star].get(extract_value_from_attrs(attrs, 'Caters'), caters_of_stars[star][DEFAULT_TYPE]))
        prob += np.log(noise_level_of_stars[star].get(extract_value_from_attrs(attrs, 'Noise Level'), noise_level_of_stars[star][DEFAULT_TYPE]))
        prob += np.log(price_range_of_stars[star].get(extract_value_from_attrs(attrs, 'Price Range'), price_range_of_stars[star][DEFAULT_TYPE]))
        prob += np.log(take_out_of_stars[star].get(extract_value_from_attrs(attrs, 'Take-out'), take_out_of_stars[star][DEFAULT_TYPE]))
    
        probs_of_stars[star] = prob
    return probs_of_stars
    
test_restaurants_df['stars_probs'] = test_restaurants_df.apply(lambda r: calc_probs(r['hours'], r['city'], r['attributes']), axis=1)
test_restaurants_df.head()

Unnamed: 0,business_id,stars,hours,city,attributes,stars_round,stars_probs
6,zaXDakTd3RXyOa7sMrUE1g,4.0,"{'Saturday': {'close': '12:00', 'open': '07:00...",Carnegie,"{'Take-out': True, 'Delivery': False, 'Good fo...",4.0,"{1: -20.9718281955, 2: -18.8736312069, 3: -17...."
7,6o3RK6rTcN3nw-j-r2nQmA,2.5,"{'Saturday': {'close': '23:00', 'open': '11:00...",Carnegie,{},2.0,"{1: -18.322809713, 2: -21.2673537462, 3: -22.1..."
9,SQ0j7bgSTazkVQlF5AnqyQ,2.5,{},Carnegie,"{'Take-out': True, 'Delivery': True, 'Waiter S...",2.0,"{1: -17.5478161361, 2: -14.6467168362, 3: -14...."
15,6ilJq_05xRgek_8qUp36-g,2.0,"{'Saturday': {'close': '00:00', 'open': '00:00...",Munhall,"{'Take-out': True, 'Delivery': False, 'Alcohol...",2.0,"{1: -17.7760747881, 2: -12.9727613628, 3: -13...."
17,sbW8qHJgzEIH42B0S-3New,2.5,"{'Saturday': {'close': '02:00', 'open': '11:30...",Homestead,"{'Delivery': False, 'Take-out': True, 'Good Fo...",2.0,"{1: -27.8048173911, 2: -19.1939256861, 3: -17...."


Calculte the accuracy

In [108]:
import operator

def predict(stars, probs):
    sorted_probs = sorted(probs.items(), key=operator.itemgetter(1))
    return sorted_probs[-1][0]
    
def correctness(stars, estimated_stars):
    return stars == estimated_stars
    
def distance(stars, estimated_stars):
    return abs(stars - estimated_stars)

test_restaurants_df['estimated_stars'] = test_restaurants_df.apply(lambda r: predict(r['stars'], r['stars_probs']), axis=1)
test_restaurants_df['correctness'] = test_restaurants_df.apply(lambda r: correctness(r['stars'], r['estimated_stars']), axis=1)
test_restaurants_df['distance'] = test_restaurants_df.apply(lambda r: distance(r['stars'], r['estimated_stars']), axis=1)


corrects = len(test_restaurants_df[test_restaurants_df['correctness'] == True])
print('accuracy is ' + str(corrects * 1.0 / len(test_restaurants_df)))

print('average distance is ' + str(test_restaurants_df['distance'].mean()))

accuracy is 0.2720496894409938
average distance is 0.693043478261
