In [537]:
import pandas as pd
import numpy as np

In [538]:
def get_relevant_businesses(review_count):
    """Load the Yelp business file and keep open, sufficiently reviewed restaurants.

    Parameters
    ----------
    review_count : number
        Minimum value of the 'review_count' column for a business to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'categories' contain 'Restaurants', whose 'open' flag equals
        True and whose 'review_count' is at least `review_count`, re-indexed
        from 0 and reduced to business_id, stars, hours, city and attributes.
    """
    businesses = pd.read_json('../yelp_academic_dataset_business.json', lines=True)

    # One boolean mask per filter criterion, combined below.
    is_restaurant = businesses['categories'].apply(lambda cats: 'Restaurants' in cats)
    is_open = businesses['open'].apply(lambda flag: flag == True)
    has_enough_reviews = businesses['review_count'].apply(lambda count: count >= review_count)

    kept_columns = ['business_id', 'stars', 'hours', 'city', 'attributes']
    selected = businesses[is_restaurant & is_open & has_enough_reviews]
    return selected.reset_index(drop=True)[kept_columns]

In [539]:
# Keep only open restaurants with at least 20 reviews, then show how many
# remain and preview the first rows.
df_business_restaurants = get_relevant_businesses(20)
print (len(df_business_restaurants))
df_business_restaurants.head()

11207


Unnamed: 0,business_id,stars,hours,city,attributes
0,mVHrayjG3uZ_RLHkLj-AMg,4.5,"{'Saturday': {'open': '10:00', 'close': '16:00...",Braddock,"{'Takes Reservations': False, 'Noise Level': '..."
1,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Monday': {'open': '11:00', 'close': '02:00'}...",Carnegie,"{'Takes Reservations': False, 'Noise Level': '..."
2,b9WZJp5L1RZr4F1nxclOoQ,4.5,"{'Monday': {'open': '06:00', 'close': '14:30'}...",Carnegie,"{'Takes Reservations': False, 'BYOB': False, '..."
3,P1fJb2WQ1mXoiudj8UE44w,3.5,"{'Monday': {'open': '11:00', 'close': '22:00'}...",Carnegie,"{'Takes Reservations': True, 'Noise Level': 'a..."
4,6ilJq_05xRgek_8qUp36-g,2.0,"{'Monday': {'open': '00:00', 'close': '00:00'}...",Munhall,"{'Takes Reservations': False, 'Noise Level': '..."


In [540]:
def normalize_values(df_column):
    """Mean-centre a numeric column and scale it by its value range.

    Each value x becomes ``(x - mean) / (max - min)``.

    Parameters
    ----------
    df_column : pandas.Series
        Numeric column to normalise.

    Returns
    -------
    pandas.Series
        The normalised column, aligned with the input index.  When every value
        is identical the range is zero; the centred values (all zeros) are
        returned as-is instead of dividing by zero and producing NaN.
    """
    centred = df_column - df_column.mean()
    value_range = df_column.max() - df_column.min()
    if value_range == 0:
        # Constant column: nothing to scale, avoid 0/0 -> NaN.
        return centred
    return centred / value_range

In [541]:
def discretize_values(df_column, number_of_bins):
    """Replace each value of a numeric column by the index of its bin.

    The bin edges are `number_of_bins` evenly spaced points from the column
    minimum to the column maximum (both included).  Indices come from
    ``np.digitize``: the minimum maps to 1 and the maximum maps to
    `number_of_bins`.

    Parameters
    ----------
    df_column : pandas.Series
        Numeric column to discretise.
    number_of_bins : int
        Number of evenly spaced edges handed to ``np.linspace``.

    Returns
    -------
    pandas.Series
        Integer bin indices with the same index and name as `df_column`.

    Notes
    -----
    NOTE(review): np.digitize appears to place NaN after the last edge, so
    missing values end up in bin `number_of_bins` - confirm this is intended
    by the callers that discretise after a left merge.
    """
    bins = np.linspace(df_column.min(), df_column.max(), number_of_bins)
    # One vectorised call instead of a Python-level np.digitize per element.
    bin_indices = np.digitize(df_column.values, bins)
    return pd.Series(bin_indices, index=df_column.index, name=df_column.name)

In [542]:
def get_merged_sentiment_details():
    """Join per-review sentiment scores with per-user usefulness information.

    Reads the review-sentiment file and the user file, derives a 'useful'
    feature from each user's 'votes' dict (normalised, then discretised with
    10 bin edges), normalises the review sentiment, and left-joins the user
    columns onto the reviews by 'user_id'.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns business_id, sentiment_value, user_id,
        useful and review_count.  Reviews whose user is absent from the user
        file keep NaN in the user-derived columns (left join).
    """
    df_sentiment = pd.read_json('../out/yelp_academic_dataset_review_sentiment.json', lines=True)[['business_id','sentiment_value','user_id']]
    # Only the columns that are actually used below are kept from the user file.
    df_user = pd.read_json('../yelp_academic_dataset_user.json', lines=True)[['user_id','votes','review_count']]

    # Pull the 'useful' vote count out of the votes dict, then normalise and bin it.
    df_user['useful'] = df_user['votes'].apply(lambda votes: votes.get('useful'))
    df_user['useful'] = normalize_values(df_user['useful'])
    df_user['useful'] = discretize_values(df_user['useful'], 10)

    # Normalise the review sentiment scores.
    df_sentiment['sentiment_value'] = normalize_values(df_sentiment['sentiment_value'])

    # Single join key; listing 'user_id' twice was redundant.
    df_sentiment_user = pd.merge(df_sentiment, df_user, how='left', on='user_id')
    return df_sentiment_user[['business_id','sentiment_value','user_id','useful','review_count']]

In [543]:
# Build the review-level sentiment/user table, then show its row count and
# preview the first rows.
df_sentiment_user = get_merged_sentiment_details()
print (len(df_sentiment_user))
df_sentiment_user.head()

2685066


Unnamed: 0,business_id,sentiment_value,user_id,useful,review_count
0,5UmKMjUEUNdYWqANhGckJw,0.079333,PUFPaY9KxDAcGqfsorJp3Q,1,60
1,5UmKMjUEUNdYWqANhGckJw,0.199333,Iu6AxdBYGR4A0wspR9BYHA,1,7
2,5UmKMjUEUNdYWqANhGckJw,0.066,auESFwWvW42h6alXgFxAXQ,1,18
3,5UmKMjUEUNdYWqANhGckJw,-0.050667,qiczib2fO_1VBG8IoCGvVg,1,52
4,5UmKMjUEUNdYWqANhGckJw,-0.107333,qEE5EvV-f-s7yHC0Z4ydJQ,1,26


In [544]:
def reject_outliers(data, m=2):
    """Return the mean of `data` after discarding values far from the mean.

    A value is kept when its absolute distance from the mean is strictly less
    than `m` standard deviations (population standard deviation).  NaN entries
    are ignored throughout.

    Parameters
    ----------
    data : array-like of numbers
        Values to average.
    m : number, optional
        Width of the acceptance band in standard deviations (default 2).

    Returns
    -------
    float
        Mean of the retained values; the common value when all non-NaN values
        are identical; 0.0 when there is nothing to average (empty or all-NaN
        input, or no value inside the band).
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    if values.size == 0:
        return 0.0

    centre = values.mean()
    spread = values.std()
    if spread == 0:
        # All values are identical: none of them is an outlier.  The strict
        # '<' comparison below would otherwise reject every value.
        return centre

    kept = values[np.abs(values - centre) < m * spread]
    if kept.size > 0:
        return kept.mean()
    return 0.0

In [545]:
def get_merged_business_sentiment():
    """Attach per-business average review sentiment to the restaurant table.

    Uses the module-level frames `df_sentiment_user` and
    `df_business_restaurants`.  Review sentiment and user usefulness are
    averaged per business, left-joined onto the restaurants, and discretised
    (100 edges for sentiment, 10 for usefulness).  'weighted_sentiment' is the
    product of the two discretised columns, discretised again with 1000 edges.

    Returns
    -------
    pandas.DataFrame
        The restaurant rows plus sentiment_value, useful and
        weighted_sentiment columns.
    """
    df_sentiment_business = df_sentiment_user.groupby('business_id', as_index=False)[['sentiment_value','useful']].mean()

    # Single join key; listing 'business_id' twice was redundant.
    df_bus_review_merged = pd.merge(df_business_restaurants, df_sentiment_business,
                                    how='left', on='business_id')

    df_bus_review_merged['sentiment_value'] = discretize_values(df_bus_review_merged['sentiment_value'], 100)
    df_bus_review_merged['useful'] = discretize_values(df_bus_review_merged['useful'], 10)
    # Weight the sentiment bin by the usefulness bin, then re-bin the product.
    df_bus_review_merged['weighted_sentiment'] = discretize_values(
        df_bus_review_merged['sentiment_value'] * df_bus_review_merged['useful'], 1000)
    return df_bus_review_merged

In [546]:
# Restaurants enriched with discretised review sentiment; preview the first rows.
df_bus_review_merged = get_merged_business_sentiment()
df_bus_review_merged.head()

Unnamed: 0,business_id,stars,hours,city,attributes,sentiment_value,useful,weighted_sentiment
0,mVHrayjG3uZ_RLHkLj-AMg,4.5,"{'Saturday': {'open': '10:00', 'close': '16:00...",Braddock,"{'Takes Reservations': False, 'Noise Level': '...",60,1,105
1,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Monday': {'open': '11:00', 'close': '02:00'}...",Carnegie,"{'Takes Reservations': False, 'Noise Level': '...",73,1,128
2,b9WZJp5L1RZr4F1nxclOoQ,4.5,"{'Monday': {'open': '06:00', 'close': '14:30'}...",Carnegie,"{'Takes Reservations': False, 'BYOB': False, '...",59,1,103
3,P1fJb2WQ1mXoiudj8UE44w,3.5,"{'Monday': {'open': '11:00', 'close': '22:00'}...",Carnegie,"{'Takes Reservations': True, 'Noise Level': 'a...",54,1,94
4,6ilJq_05xRgek_8qUp36-g,2.0,"{'Monday': {'open': '00:00', 'close': '00:00'}...",Munhall,"{'Takes Reservations': False, 'Noise Level': '...",38,1,66


In [547]:
def get_merged_tip_sentiments():
    """Attach per-business average tip sentiment to the restaurant table.

    Reads the tip-sentiment file, normalises the scores, averages them per
    business, discretises the averages with 100 edges and left-joins the
    result onto the module-level `df_bus_review_merged`.

    Returns
    -------
    pandas.DataFrame
        `df_bus_review_merged` plus a 'tip_sentiment_value' column.  Because
        the discretisation happens before the left join, restaurants without
        any tip keep NaN in that column.
    """
    df_sentiment = pd.read_json('../out/yelp_academic_dataset_tip_sentiment.json', lines=True)[['business_id','sentiment_value','user_id']]
    df_sentiment['tip_sentiment_value'] = normalize_values(df_sentiment['sentiment_value'])

    df_sentiment_business = df_sentiment.groupby('business_id', as_index=False)[['tip_sentiment_value']].mean()
    df_sentiment_business['tip_sentiment_value'] = discretize_values(df_sentiment_business['tip_sentiment_value'], 100)

    # Single join key; listing 'business_id' twice was redundant.
    df_bus_review_tip_merged = pd.merge(df_bus_review_merged, df_sentiment_business,
                                        how='left', on='business_id')
    return df_bus_review_tip_merged

In [548]:
# Add the discretised tip sentiment per restaurant; preview the first rows.
df_bus_review_tip_merged = get_merged_tip_sentiments()
df_bus_review_tip_merged.head()

Unnamed: 0,business_id,stars,hours,city,attributes,sentiment_value,useful,weighted_sentiment,tip_sentiment_value
0,mVHrayjG3uZ_RLHkLj-AMg,4.5,"{'Saturday': {'open': '10:00', 'close': '16:00...",Braddock,"{'Takes Reservations': False, 'Noise Level': '...",60,1,105,30.0
1,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Monday': {'open': '11:00', 'close': '02:00'}...",Carnegie,"{'Takes Reservations': False, 'Noise Level': '...",73,1,128,62.0
2,b9WZJp5L1RZr4F1nxclOoQ,4.5,"{'Monday': {'open': '06:00', 'close': '14:30'}...",Carnegie,"{'Takes Reservations': False, 'BYOB': False, '...",59,1,103,50.0
3,P1fJb2WQ1mXoiudj8UE44w,3.5,"{'Monday': {'open': '11:00', 'close': '22:00'}...",Carnegie,"{'Takes Reservations': True, 'Noise Level': 'a...",54,1,94,45.0
4,6ilJq_05xRgek_8qUp36-g,2.0,"{'Monday': {'open': '00:00', 'close': '00:00'}...",Munhall,"{'Takes Reservations': False, 'Noise Level': '...",38,1,66,43.0


In [549]:
def get_merged_checkin_count():
    """Attach a discretised check-in count to the restaurant table.

    Reads the per-business check-in counts, left-joins them onto the
    module-level `df_bus_review_tip_merged`, then normalises the merged
    'checkin_count' column and discretises it with 10 edges.

    Returns
    -------
    pandas.DataFrame
        `df_bus_review_tip_merged` plus the binned 'checkin_count' column.
    """
    df_checkin = pd.read_json('../out/business_with_checkin_count.json')[['business_id','checkin_count']]

    # Single join key; listing 'business_id' twice was redundant.
    df_bus_review_tip_checkin_merged = pd.merge(df_bus_review_tip_merged, df_checkin,
                                                how='left', on='business_id')

    normalized_checkins = normalize_values(df_bus_review_tip_checkin_merged['checkin_count'])
    df_bus_review_tip_checkin_merged['checkin_count'] = discretize_values(normalized_checkins, 10)
    return df_bus_review_tip_checkin_merged

In [550]:
# Final feature table: restaurants with review sentiment, tip sentiment and
# binned check-in counts.
df_bus_review_tip__checkin = get_merged_checkin_count()

In [551]:
# Sample 80% of the restaurants (fixed seed) as the training set.
training_restaurants_df = df_bus_review_tip__checkin.sample(frac=0.8, random_state = 42)

# Bucket the training rows by star rating.
group_by_stars = training_restaurants_df.groupby('stars')

# Per star rating: that bucket's rows plus a 'working_type' column that
# starts out as a copy of the raw 'hours' column.
business_of_stars = {}
for star, group in group_by_stars:
    business_of_stars[star] = group.assign(working_type=group['hours'])

In [552]:
# Prior of each star rating: its share of the training restaurants.
total_training = len(training_restaurants_df)
prior_of_stars = {}

for star, star_df in business_of_stars.items():
    prior_of_stars[star] = len(star_df) * 1.0 / total_training

prior_of_stars

{1.0: 0.00011153245594467991,
 1.5: 0.005242025429399956,
 2.0: 0.019183582422484944,
 2.5: 0.06468882444791434,
 3.0: 0.1581530225295561,
 3.5: 0.2983493196520187,
 4.0: 0.3246709792549632,
 4.5: 0.12157037697970109,
 5.0: 0.008030336828016953}

In [553]:
from datetime import time

# Labels for the opening-hour patterns a restaurant can exhibit.  join_types
# iterates over these values to put the labels of one restaurant into a fixed
# order before joining them with '_'.
WORKING_TYPES = {
    "WEEKEND_TYPE": "weekend",
    "BREAKFAST_TYPE": "breakfast",
    "LUNCH_TYPE": "lunch",
    "AFTER_LUNCH_TYPE": "after-lunch",
    "DINNER_TYPE": "dinner",
    "NIGHT_TYPE": "night",
}

# Reference clock times; spec_hours_to_type checks whether each one falls
# inside a day's opening interval.
breakfast = time(8)  # 08:00
lunch = time(12)  # 12:00
after_lunch = time(15)  # 15:00
dinner = time(18)  # 18:00
night = time(0)  # 00:00 (midnight)

def in_between(start, end, check):
    """Tell whether `check` lies in the half-open interval [start, end).

    Equal `start` and `end` are treated as "open 24 hours" and always match.
    When `start` is later than `end` the interval wraps past midnight
    (e.g. 23:30-04:15).
    """
    if start == end:
        # Identical bounds mean round-the-clock opening.
        return True
    if start < end:
        # Ordinary same-day interval.
        return start <= check and check < end
    # Interval wraps around midnight.
    return start <= check or check < end

# Minimum number of days a reference time must be covered for its label to apply.
TYPE_THRESHOLD = 1
def spec_hours_to_type(s):
    """Turn a per-day opening-hours dict into a '_'-joined working-type label.

    `s` maps a day name to a dict with 'open' and 'close' strings in 'HH:MM'
    form.  For every day, each reference time (breakfast, lunch, after-lunch,
    dinner, night) is tested against the opening interval; a label is emitted
    when its reference time is covered on at least TYPE_THRESHOLD days.  The
    weekend label is emitted when 'Saturday' or 'Sunday' is among the keys.
    """
    # (WORKING_TYPES key, reference time) pairs, in reporting order.
    probes = (
        ("BREAKFAST_TYPE", breakfast),
        ("LUNCH_TYPE", lunch),
        ("AFTER_LUNCH_TYPE", after_lunch),
        ("DINNER_TYPE", dinner),
        ("NIGHT_TYPE", night),
    )
    covered_days = {type_key: 0 for type_key, _ in probes}
    types = []

    for day in s:
        closing_text = s[day]['close']
        opening_text = s[day]['open']

        close_hour, close_minute = closing_text.split(':')
        closing = time(int(close_hour), int(close_minute))

        open_hour, open_minute = opening_text.split(':')
        opening = time(int(open_hour), int(open_minute))

        # Count the days on which each reference time is inside the interval.
        for type_key, moment in probes:
            if in_between(opening, closing, moment):
                covered_days[type_key] += 1

        # Any listed weekend day marks the restaurant as weekend-open (once).
        if (day in ['Saturday', 'Sunday']) and (WORKING_TYPES["WEEKEND_TYPE"] not in types):
            types.append(WORKING_TYPES["WEEKEND_TYPE"])

    for type_key, _ in probes:
        if covered_days[type_key] >= TYPE_THRESHOLD:
            types.append(WORKING_TYPES[type_key])

    return join_types(types)

def hours_to_type(s):
    """Map an 'hours' cell to its working-type label.

    A string is taken to be an already computed label and returned unchanged.
    A non-empty value is classified by spec_hours_to_type; an empty one is
    given every label in WORKING_TYPES.
    """
    if isinstance(s, str):
        return s
    return spec_hours_to_type(s) if s else join_types(WORKING_TYPES.values())

def join_types(ts):
    """Join the labels in `ts` with '_' in the order WORKING_TYPES lists them.

    Labels that do not appear among the WORKING_TYPES values are dropped.
    """
    ordered_types = [label for label in WORKING_TYPES.values() if label in ts]
    return '_'.join(ordered_types)


# Placeholder used wherever an attribute or feature value is missing/unseen.
DEFAULT_TYPE = 'default'
def extract_value_from_attrs(attrs, k):
    """Return attrs[k], or DEFAULT_TYPE when `k` is not a key of `attrs`."""
    return attrs[k] if k in attrs else DEFAULT_TYPE
    
def filter_from_attr_val(attr, k, v):
    """True when `attr` has key `k` and the stored value equals `v`."""
    if k not in attr:
        return False
    return attr[k] == v

def filter_no_attr(attr, k):
    """True when `attr` does not contain key `k`."""
    has_key = k in attr
    return not has_key
    
def _attribute_value_set(attribute_name):
    """Distinct values of one attribute across the training restaurants.

    Restaurants whose attribute dict lacks `attribute_name` contribute
    DEFAULT_TYPE (via extract_value_from_attrs), so the returned array
    contains DEFAULT_TYPE whenever at least one restaurant omits the key.
    """
    values = training_restaurants_df['attributes'].apply(
        lambda attrs: extract_value_from_attrs(attrs, attribute_name))
    return values.unique()


# Observed value sets per attribute; their sizes feed the Laplace-smoothing
# denominators when the per-star frequencies are computed.
accepts_credit_cards_type_set = _attribute_value_set('Accepts Credit Cards')
alcohol_type_set = _attribute_value_set('Alcohol')
caters_type_set = _attribute_value_set('Caters')
noise_level_type_set = _attribute_value_set('Noise Level')
price_range_type_set = _attribute_value_set('Price Range')
take_out_type_set = _attribute_value_set('Take-out')

# Convert each star bucket's raw hours into working-type labels (in place on
# the bucket frame) and collect every label seen in the training data.
working_type_set = set()
for star, business_of_star_df in business_of_stars.items():
    business_of_star_df['working_type'] = business_of_star_df['working_type'].apply(hours_to_type)
    working_type_set.update(business_of_star_df['working_type'].unique())

# Number of distinct values per feature; used as the Laplace-smoothing term
# in the frequency denominators.
num_working_types = len(working_type_set)
num_cities = len(training_restaurants_df['city'].unique())
num_sentiments = len(training_restaurants_df['sentiment_value'].unique())
num_weighted_sentiments = len(training_restaurants_df['weighted_sentiment'].unique())
num_tip_sentiments = len(training_restaurants_df['tip_sentiment_value'].unique())
num_checkins = len(training_restaurants_df['checkin_count'].unique())

def _grouped_value_freq(star_df, column, num_distinct_values):
    """Add-one (Laplace) smoothed frequency of each value of `column` in `star_df`.

    Returns a dict mapping every value present in `star_df[column]` to
    ``(count + 1) / (len(star_df) + num_distinct_values)``.  The extra
    DEFAULT_TYPE entry holds the mass given to a value that does not occur in
    this star bucket (count 0).
    """
    denominator = len(star_df) + num_distinct_values
    grouped = star_df.groupby(column)
    freq = {}
    for value in grouped.groups:
        freq[value] = (len(grouped.get_group(value)) + 1.0) / denominator
    freq[DEFAULT_TYPE] = 1.0 / denominator
    return freq


def _attribute_value_freq(star_df, attribute_name, type_set):
    """Add-one smoothed frequency of each value in `type_set` for one attribute.

    For DEFAULT_TYPE the count is the number of restaurants whose attribute
    dict lacks `attribute_name`; for any other value it is the number of
    restaurants whose attribute equals that value.
    """
    denominator = len(star_df) + len(type_set)
    freq = {}
    for t in type_set:
        if t != DEFAULT_TYPE:
            mask = star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, attribute_name, t))
        else:
            mask = star_df['attributes'].apply(lambda attr: filter_no_attr(attr, attribute_name))
        freq[t] = (len(star_df[mask]) + 1.0) / denominator
    return freq


# Per-star conditional frequency tables, one dict per feature.
types_freq_of_stars = {}
city_freq_of_stars = {}
accepts_credit_cards_of_stars = {}
alcohol_of_stars = {}
caters_of_stars = {}
noise_level_of_stars = {}
price_range_of_stars = {}
take_out_of_stars = {}
sentiment_of_stars = {}
sentiment_of_weighted_stars = {}
tip_sentiment_of_stars = {}
checkin_count_of_stars = {}

# Calculate the frequencies for every star bucket.
for star in business_of_stars:
    business_of_star_df = business_of_stars[star]

    # Features taken directly from a column of the bucket frame.  The
    # DEFAULT_TYPE entry covers values absent from this bucket, e.g. a city
    # that has no restaurant with this star rating.
    types_freq_of_stars[star] = _grouped_value_freq(business_of_star_df, 'working_type', num_working_types)
    city_freq_of_stars[star] = _grouped_value_freq(business_of_star_df, 'city', num_cities)

    # Features stored inside the 'attributes' dict of each restaurant.
    accepts_credit_cards_of_stars[star] = _attribute_value_freq(
        business_of_star_df, 'Accepts Credit Cards', accepts_credit_cards_type_set)
    alcohol_of_stars[star] = _attribute_value_freq(business_of_star_df, 'Alcohol', alcohol_type_set)
    caters_of_stars[star] = _attribute_value_freq(business_of_star_df, 'Caters', caters_type_set)
    noise_level_of_stars[star] = _attribute_value_freq(business_of_star_df, 'Noise Level', noise_level_type_set)
    price_range_of_stars[star] = _attribute_value_freq(business_of_star_df, 'Price Range', price_range_type_set)
    take_out_of_stars[star] = _attribute_value_freq(business_of_star_df, 'Take-out', take_out_type_set)

    # Discretised sentiment and check-in features.
    sentiment_of_weighted_stars[star] = _grouped_value_freq(
        business_of_star_df, 'weighted_sentiment', num_weighted_sentiments)
    sentiment_of_stars[star] = _grouped_value_freq(business_of_star_df, 'sentiment_value', num_sentiments)
    tip_sentiment_of_stars[star] = _grouped_value_freq(
        business_of_star_df, 'tip_sentiment_value', num_tip_sentiments)
    checkin_count_of_stars[star] = _grouped_value_freq(business_of_star_df, 'checkin_count', num_checkins)
    

Verification on our test set

In [554]:
# Test set: every restaurant that was not sampled into the training set.
# Rows are excluded by index label.  The previous element-wise isin() mask only
# worked by blanking training cells to NaN and letting dropna() remove them,
# which also turned the integer feature columns into floats.
# dropna() is kept so rows with a missing feature value are still discarded.
test_restaurants_df = df_bus_review_tip__checkin.drop(training_restaurants_df.index).dropna()

test_restaurants_df.head()

Unnamed: 0,business_id,stars,hours,city,attributes,sentiment_value,useful,weighted_sentiment,tip_sentiment_value,checkin_count
1,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Monday': {'open': '11:00', 'close': '02:00'}...",Carnegie,"{'Takes Reservations': False, 'Noise Level': '...",73.0,1.0,128.0,62.0,1.0
4,6ilJq_05xRgek_8qUp36-g,2.0,"{'Monday': {'open': '00:00', 'close': '00:00'}...",Munhall,"{'Takes Reservations': False, 'Noise Level': '...",38.0,1.0,66.0,43.0,1.0
5,1qCuOcks5HRv67OHovAVpg,3.5,{},Homestead,"{'Takes Reservations': False, 'Noise Level': '...",60.0,1.0,105.0,40.0,1.0
9,tU-YtoW339PXJmUGSix_AQ,2.5,"{'Monday': {'open': '11:00', 'close': '23:00'}...",Homestead,"{'Takes Reservations': True, 'Noise Level': 'a...",43.0,1.0,75.0,40.0,1.0
11,JDDeaNfb0JXD1NbznSIC9g,3.0,"{'Monday': {'open': '11:00', 'close': '22:00'}...",Homestead,"{'Takes Reservations': True, 'Noise Level': 'a...",47.0,1.0,82.0,42.0,1.0


In [555]:
import numpy as np

def calc_probs(hours, city, attrs, sentiment_value, weighted, tip_sentiment, checkin_count):
    """Score every star rating for one restaurant as a sum of log frequencies.

    For each star rating in `prior_of_stars` the score is the log prior plus
    the log of the smoothed per-star frequency of each feature value: working
    type, city, six attributes, review sentiment, tip sentiment and check-in
    bin.  A value missing from a frequency table falls back to that table's
    DEFAULT_TYPE entry.

    Parameters
    ----------
    hours : dict or str
        Raw opening hours (or an already computed working-type label).
    city : str
        City of the restaurant.
    attrs : dict
        Attribute dict of the restaurant.
    sentiment_value
        Discretised review sentiment; looked up in the weighted table when
        `weighted` is truthy, otherwise in the unweighted one.
    weighted : bool
        Selects which sentiment table `sentiment_value` refers to.
    tip_sentiment
        Discretised tip sentiment.
    checkin_count
        Discretised check-in count.

    Returns
    -------
    dict
        Maps each star rating to its log score.
    """
    def smoothed(freq, value):
        # Unseen values share the table's DEFAULT_TYPE (count-0) mass.
        return freq.get(value, freq[DEFAULT_TYPE])

    # (attribute key, per-star frequency tables) in the order they are summed.
    attribute_tables = (
        ('Accepts Credit Cards', accepts_credit_cards_of_stars),
        ('Alcohol', alcohol_of_stars),
        ('Caters', caters_of_stars),
        ('Noise Level', noise_level_of_stars),
        ('Price Range', price_range_of_stars),
        ('Take-out', take_out_of_stars),
    )
    sentiment_tables = sentiment_of_weighted_stars if weighted else sentiment_of_stars
    working_type = hours_to_type(hours)

    probs_of_stars = {}
    for star in prior_of_stars:
        prob = np.log(prior_of_stars[star])
        prob += np.log(smoothed(types_freq_of_stars[star], working_type))
        prob += np.log(smoothed(city_freq_of_stars[star], city))

        for attribute_name, tables in attribute_tables:
            prob += np.log(smoothed(tables[star], extract_value_from_attrs(attrs, attribute_name)))

        prob += np.log(smoothed(sentiment_tables[star], sentiment_value))
        prob += np.log(smoothed(tip_sentiment_of_stars[star], tip_sentiment))
        prob += np.log(smoothed(checkin_count_of_stars[star], checkin_count))
        probs_of_stars[star] = prob
    return probs_of_stars
    
# Score every test restaurant twice: once with the usefulness-weighted review
# sentiment and once with the plain review sentiment.  Each cell of the new
# columns holds the dict returned by calc_probs.
test_restaurants_df['stars_probs_weighted'] = test_restaurants_df.apply(lambda r: calc_probs(r['hours'], r['city'], r['attributes'],r['weighted_sentiment'],True,r['tip_sentiment_value'],r['checkin_count']), axis=1)
test_restaurants_df['stars_probs'] = test_restaurants_df.apply(lambda r: calc_probs(r['hours'], r['city'], r['attributes'],r['sentiment_value'],False,r['tip_sentiment_value'],r['checkin_count']), axis=1)
test_restaurants_df.head()

Unnamed: 0,business_id,stars,hours,city,attributes,sentiment_value,useful,weighted_sentiment,tip_sentiment_value,checkin_count,stars_probs_weighted,stars_probs
1,KayYbHCt-RkbGcPdGOThNg,4.0,"{'Monday': {'open': '11:00', 'close': '02:00'}...",Carnegie,"{'Takes Reservations': False, 'Noise Level': '...",73.0,1.0,128.0,62.0,1.0,"{3.5: -31.8927375355, 1.0: -34.8994337567, 2.0...","{3.5: -31.8556830647, 1.0: -34.0735270863, 2.0..."
4,6ilJq_05xRgek_8qUp36-g,2.0,"{'Monday': {'open': '00:00', 'close': '00:00'}...",Munhall,"{'Takes Reservations': False, 'Noise Level': '...",38.0,1.0,66.0,43.0,1.0,"{3.5: -24.0633123104, 1.0: -33.5131393956, 2.0...","{3.5: -24.0262578397, 1.0: -32.6872327252, 2.0..."
5,1qCuOcks5HRv67OHovAVpg,3.5,{},Homestead,"{'Takes Reservations': False, 'Noise Level': '...",60.0,1.0,105.0,40.0,1.0,"{3.5: -22.3965744314, 1.0: -36.2857281179, 2.0...","{3.5: -22.3118919117, 1.0: -35.4598214474, 2.0..."
9,tU-YtoW339PXJmUGSix_AQ,2.5,"{'Monday': {'open': '11:00', 'close': '23:00'}...",Homestead,"{'Takes Reservations': True, 'Noise Level': 'a...",43.0,1.0,75.0,40.0,1.0,"{3.5: -20.0729789643, 1.0: -35.5925809373, 2.0...","{3.5: -19.8961625512, 1.0: -34.7666742669, 2.0..."
11,JDDeaNfb0JXD1NbznSIC9g,3.0,"{'Monday': {'open': '11:00', 'close': '22:00'}...",Homestead,"{'Takes Reservations': True, 'Noise Level': 'a...",47.0,1.0,82.0,42.0,1.0,"{3.5: -17.5933909983, 1.0: -34.8994337567, 2.0...","{3.5: -17.4729549186, 1.0: -34.0735270863, 2.0..."


Calculate the accuracy

In [556]:
import operator

def predict(probs):
    """Return the key of `probs` that has the highest value.

    Among keys tied for the highest value, the one inserted last wins.
    """
    ranked_keys = sorted(probs, key=probs.get)
    return ranked_keys[-1]
    
def correctness(stars, estimated_stars):
    """True when the estimated rating equals the actual rating."""
    is_exact_match = stars == estimated_stars
    return is_exact_match
    
def distance(stars, estimated_stars):
    """Absolute difference between the actual and the estimated rating."""
    gap = stars - estimated_stars
    return abs(gap)

# Pick the best-scoring star rating from each score dict.
for score_column, estimate_column in (
    ('stars_probs_weighted', 'estimated_weighted_stars'),
    ('stars_probs', 'estimated_stars'),
):
    test_restaurants_df[estimate_column] = test_restaurants_df.apply(lambda r: predict(r[score_column]), axis=1)

# Report exact-match accuracy and mean absolute star distance, first for the
# plain-sentiment model and then for the weighted one.  The 'correctness' and
# 'distance' columns are overwritten on each pass, so they end up describing
# the weighted model.
for estimate_column, accuracy_label, distance_label in (
    ('estimated_stars', 'accuracy is ', 'average distance is '),
    ('estimated_weighted_stars', 'accuracy with weighted reviews ', 'average distance with weighted reviews is '),
):
    test_restaurants_df['correctness'] = test_restaurants_df.apply(lambda r: correctness(r['stars'], r[estimate_column]), axis=1)
    corrects = len(test_restaurants_df[test_restaurants_df['correctness'] == True])
    test_restaurants_df['distance'] = test_restaurants_df.apply(lambda r: distance(r['stars'], r[estimate_column]), axis=1)
    print(accuracy_label + str(corrects * 1.0 / len(test_restaurants_df)))
    print(distance_label + str(test_restaurants_df['distance'].mean()))


accuracy is 0.5472209670131044
average distance is 0.2528242205151378
accuracy with weighted reviews 0.5494803434252147
average distance with weighted reviews is 0.25869859918662447
