In [1]:
from datetime import time
import pandas as pd
import numpy as np
import json as json

In [2]:

def write_df_to_json_file(df, filename):
    df.reset_index().to_json(filename,orient='records')

def write_preprocessed_data(df_sentiment, df_user , df_business_restaurants):
    
    df_sentiment_user = pd.merge(df_sentiment,df_user, how='left',  on=['user_id', 'user_id'])[['business_id','sentiment_value','user_id','useful','review_count']]
    df_sentiment_business = df_sentiment_user.groupby('business_id',as_index=False)[['sentiment_value','useful']].mean()
    df_bus_review_merged = pd.merge(df_business_restaurants,df_sentiment_business, how='left', 
                                     on=['business_id', 'business_id'])
    
    df_sentiment = pd.read_json('../out/yelp_academic_dataset_tip_sentiment.json', lines=True)[['business_id','sentiment_value','user_id']]
    df_tip_sentiment_business = df_sentiment.groupby('business_id',as_index=False)[['sentiment_value']].mean()
    df_bus_review_tip_merged = pd.merge(df_bus_review_merged,df_tip_sentiment_business, how='left', 
                                         on=['business_id', 'business_id'])
    
    df_checkin = pd.read_json('../out/business_with_checkin_count.json')[['business_id','checkin_count']]
    df_bus_review_tip_checkin_merged = pd.merge(df_bus_review_tip_merged,df_checkin, how='left', 
                                             on=['business_id', 'business_id'])
    
    write_df_to_json_file(df_bus_review_tip_checkin_merged,"../out/preprocessed_business_data.json")
    

In [3]:
def get_relevant_businesses(review_count):
    df_business = pd.read_json('../yelp_academic_dataset_business.json', lines=True)
    # Filter for restuarants that are open and only keep necessary columns
    df_business_filtered = df_business[df_business['categories'].apply(lambda x: 'Restaurants' in x)
                                          & df_business['open'].apply(lambda x: x == True)
                                             & df_business['review_count'].apply(lambda x: x >= review_count)].reset_index(drop=True)[['business_id', 'stars', 'review_count','hours', 'city', 'attributes']]
    return df_business_filtered

In [4]:
df_business_restaurants = get_relevant_businesses(20)
print (len(df_business_restaurants))
df_business_restaurants.head()

11207


Unnamed: 0,business_id,stars,review_count,hours,city,attributes
0,mVHrayjG3uZ_RLHkLj-AMg,4.5,26,"{'Tuesday': {'open': '10:00', 'close': '19:00'...",Braddock,"{'Takes Reservations': False, 'Parking': {'val..."
1,KayYbHCt-RkbGcPdGOThNg,4.0,23,"{'Tuesday': {'open': '11:00', 'close': '02:00'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val..."
2,b9WZJp5L1RZr4F1nxclOoQ,4.5,69,"{'Tuesday': {'open': '06:00', 'close': '14:30'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val..."
3,P1fJb2WQ1mXoiudj8UE44w,3.5,81,"{'Tuesday': {'open': '11:00', 'close': '22:00'...",Carnegie,"{'Takes Reservations': True, 'Waiter Service':..."
4,6ilJq_05xRgek_8qUp36-g,2.0,60,"{'Tuesday': {'open': '00:00', 'close': '00:00'...",Munhall,"{'Takes Reservations': False, 'Waiter Service'..."


In [5]:
def normalize_values(df_column):
    mean_value = df_column.mean()
    max_value = df_column.max()
    min_value = df_column.min()
    return (df_column - mean_value)/(max_value - min_value)

In [6]:
def discretize_values(df_column, number_of_bins):
    max_value = df_column.max()
    min_value = df_column.min()
    bins = np.linspace(min_value, max_value, number_of_bins)
    return df_column.apply(lambda x: np.digitize(x, bins))

In [7]:
def get_merged_sentiment_details():
    df_sentiment = pd.read_json('../out/yelp_academic_dataset_review_sentiment.json', lines=True)[['business_id','sentiment_value','user_id']]
    df_user = pd.read_json('../yelp_academic_dataset_user.json', lines=True)[['user_id','yelping_since','elite','votes','review_count']]
    #extract useful votes to a column
    df_user['useful'] =  df_user['votes'].apply(lambda x: x.get('useful'))
    
    #write_preprocessed_data(df_sentiment,df_user, df_business_restaurants)
    
    #normalize the values
    df_user['useful'] = normalize_values(df_user['useful'])
    df_user['useful'] = discretize_values(df_user['useful'],10)
    #normalize the sentiments
    df_sentiment['sentiment_value'] = normalize_values(df_sentiment['sentiment_value'])
    df_sentiment_user = pd.merge(df_sentiment,df_user, how='left',  on=['user_id', 'user_id'])[['business_id','sentiment_value','user_id','useful','review_count']]
    return df_sentiment_user

In [8]:
df_sentiment_user = get_merged_sentiment_details()
print (len(df_sentiment_user))
df_sentiment_user.head()

2685066


Unnamed: 0,business_id,sentiment_value,user_id,useful,review_count
0,5UmKMjUEUNdYWqANhGckJw,0.079333,PUFPaY9KxDAcGqfsorJp3Q,1,60
1,5UmKMjUEUNdYWqANhGckJw,0.199333,Iu6AxdBYGR4A0wspR9BYHA,1,7
2,5UmKMjUEUNdYWqANhGckJw,0.066,auESFwWvW42h6alXgFxAXQ,1,18
3,5UmKMjUEUNdYWqANhGckJw,-0.050667,qiczib2fO_1VBG8IoCGvVg,1,52
4,5UmKMjUEUNdYWqANhGckJw,-0.107333,qEE5EvV-f-s7yHC0Z4ydJQ,1,26


In [9]:
def reject_outliers(data, m=2):
    original_mean = np.mean(data)
    original_std = np.nanstd(data)
    newdata = data[abs(data - original_mean) < m * original_std]
    if(len(newdata) > 0 ):
        return np.nanmean(newdata)
    else :
        return 0.0

In [10]:
def get_merged_business_sentiment():
    df_sentiment_business = df_sentiment_user.groupby('business_id',as_index=False)[['sentiment_value','useful']].mean()
    df_bus_review_merged = pd.merge(df_business_restaurants,df_sentiment_business, how='left', 
                                     on=['business_id', 'business_id'])
    df_bus_review_merged['sentiment_value'] = discretize_values(df_bus_review_merged['sentiment_value'],100)
    df_bus_review_merged['useful'] = discretize_values(df_bus_review_merged['useful'],10)
    df_bus_review_merged['weighted_sentiment'] = discretize_values(df_bus_review_merged['sentiment_value'] * df_bus_review_merged['useful'], 1000)
    return df_bus_review_merged

In [11]:
df_bus_review_merged = get_merged_business_sentiment()
df_bus_review_merged.head()

Unnamed: 0,business_id,stars,review_count,hours,city,attributes,sentiment_value,useful,weighted_sentiment
0,mVHrayjG3uZ_RLHkLj-AMg,4.5,26,"{'Tuesday': {'open': '10:00', 'close': '19:00'...",Braddock,"{'Takes Reservations': False, 'Parking': {'val...",60,1,105
1,KayYbHCt-RkbGcPdGOThNg,4.0,23,"{'Tuesday': {'open': '11:00', 'close': '02:00'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val...",73,1,128
2,b9WZJp5L1RZr4F1nxclOoQ,4.5,69,"{'Tuesday': {'open': '06:00', 'close': '14:30'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val...",59,1,103
3,P1fJb2WQ1mXoiudj8UE44w,3.5,81,"{'Tuesday': {'open': '11:00', 'close': '22:00'...",Carnegie,"{'Takes Reservations': True, 'Waiter Service':...",54,1,94
4,6ilJq_05xRgek_8qUp36-g,2.0,60,"{'Tuesday': {'open': '00:00', 'close': '00:00'...",Munhall,"{'Takes Reservations': False, 'Waiter Service'...",38,1,66


In [12]:
def get_merged_tip_sentiments():
    df_sentiment = pd.read_json('../out/yelp_academic_dataset_tip_sentiment.json', lines=True)[['business_id','sentiment_value','user_id']]
    df_sentiment['tip_sentiment_value'] = normalize_values(df_sentiment['sentiment_value'])
    df_sentiment_business = df_sentiment.groupby('business_id',as_index=False)[['tip_sentiment_value']].mean()
    df_sentiment_business['tip_sentiment_value'] = discretize_values(df_sentiment_business['tip_sentiment_value'],100)
    df_bus_review_tip_merged = pd.merge(df_bus_review_merged,df_sentiment_business, how='left', 
                                         on=['business_id', 'business_id'])
    df_bus_review_tip_merged['stars'] = round(df_bus_review_tip_merged['stars'])
    return df_bus_review_tip_merged

In [13]:
df_bus_review_tip_merged = get_merged_tip_sentiments()
df_bus_review_tip_merged.head()

Unnamed: 0,business_id,stars,review_count,hours,city,attributes,sentiment_value,useful,weighted_sentiment,tip_sentiment_value
0,mVHrayjG3uZ_RLHkLj-AMg,4.0,26,"{'Tuesday': {'open': '10:00', 'close': '19:00'...",Braddock,"{'Takes Reservations': False, 'Parking': {'val...",60,1,105,30.0
1,KayYbHCt-RkbGcPdGOThNg,4.0,23,"{'Tuesday': {'open': '11:00', 'close': '02:00'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val...",73,1,128,62.0
2,b9WZJp5L1RZr4F1nxclOoQ,4.0,69,"{'Tuesday': {'open': '06:00', 'close': '14:30'...",Carnegie,"{'Takes Reservations': False, 'Parking': {'val...",59,1,103,50.0
3,P1fJb2WQ1mXoiudj8UE44w,4.0,81,"{'Tuesday': {'open': '11:00', 'close': '22:00'...",Carnegie,"{'Takes Reservations': True, 'Waiter Service':...",54,1,94,45.0
4,6ilJq_05xRgek_8qUp36-g,2.0,60,"{'Tuesday': {'open': '00:00', 'close': '00:00'...",Munhall,"{'Takes Reservations': False, 'Waiter Service'...",38,1,66,43.0


In [14]:
def get_merged_checkin_count():
    df_checkin = pd.read_json('../out/business_with_checkin_count.json')[['business_id','checkin_count']]
    df_bus_review_tip_checkin_merged = pd.merge(df_bus_review_tip_merged,df_checkin, how='left', 
                                             on=['business_id', 'business_id'])
    df_bus_review_tip_checkin_merged['checkin_count'] = discretize_values(normalize_values(df_bus_review_tip_checkin_merged['checkin_count']),10)
    return df_bus_review_tip_checkin_merged

In [15]:
df_bus_review_tip__checkin = get_merged_checkin_count()

In [16]:
def get_business_bystars(training_restaurants_df, stars_column):
    group_by_stars = training_restaurants_df.groupby(stars_column)
    business_of_stars = {}
    for star in group_by_stars.groups:
        group = group_by_stars.get_group(star)
        business_of_stars[star] = group.assign(working_type=lambda x: x['hours'])
    return business_of_stars

In [17]:
from sklearn.preprocessing import normalize
def get_priors(business_of_stars,training_restaurants_df):
    prior_of_stars = {}
    for star in business_of_stars:
        prior_of_stars[star] = len(business_of_stars[star]) * 1.0 / len(training_restaurants_df)
    #print (prior_of_stars)
    x = list(prior_of_stars.values())
    normalizing_fact = 1 / np.linalg.norm(x)
    for k in prior_of_stars:
        prior_of_stars[k] = prior_of_stars[k] * normalizing_fact
    return prior_of_stars

In [18]:
def get_uniqueattributevalues(atrribute_names, training_restaurants_df, unique_dimension_values):
    for attribute_name in atrribute_names:
        tdf = training_restaurants_df['attributes'].apply(lambda a: extract_value_from_attrs(a, attribute_name))
        unique_dimension_values[attribute_name] = tdf.unique()

In [19]:
def get_working_type(business_of_stars):
    working_type_set = set()
    for star in business_of_stars:
        business_of_star_df = business_of_stars[star]
        business_of_star_df['working_type'] = business_of_star_df['working_type'].apply(hours_to_type)
        working_type_set |= set(business_of_star_df['working_type'].unique())
    return working_type_set

In [20]:
def get_unique_columnvalues(training_restaurants_df, column_names,unique_dimension_values):
    for column_name in column_names:
        unique_dimension_values[column_name] = training_restaurants_df['city'].unique()

In [21]:
from datetime import time

WORKING_TYPES = {
    "WEEKEND_TYPE": "weekend",
    "BREAKFAST_TYPE": "breakfast",
    "LUNCH_TYPE": "lunch",
    "AFTER_LUNCH_TYPE": "after-lunch",
    "DINNER_TYPE": "dinner",
    "NIGHT_TYPE": "night",
}

breakfast = time(8)
lunch = time(12)
after_lunch = time(15)
dinner = time(18)
night = time(0)

def in_between(start, end, check):
    if start == end: # 24 hours
        return True
    if start < end:
        return start <= check < end
    else: # over midnight e.g., 23:30-04:15
        return start <= check or check < end

TYPE_THRESHOLD = 1
def spec_hours_to_type(s):
    types = []
    
    breakfast_count = 0
    lunch_count = 0
    after_lunch_count = 0
    dinner_count = 0
    night_count = 0

    for day in s:
        
        clo = s[day]['close']
        op = s[day]['open']

        h, m = clo.split(':')
        clo_t = time(int(h), int(m))

        h, m = op.split(':')
        op_t = time(int(h), int(m))
        
        if in_between(op_t, clo_t, breakfast):
            breakfast_count += 1
        
        if in_between(op_t, clo_t, lunch):
            lunch_count += 1
        
        if in_between(op_t, clo_t, after_lunch):
            after_lunch_count += 1
        
        if in_between(op_t, clo_t, dinner):
            dinner_count += 1
        
        if in_between(op_t, clo_t, night):
            night_count += 1
        
        if (day in ['Saturday', 'Sunday']) and (WORKING_TYPES["WEEKEND_TYPE"] not in types):
            types.append(WORKING_TYPES["WEEKEND_TYPE"])
            
    if breakfast_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["BREAKFAST_TYPE"])
        
    if lunch_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["LUNCH_TYPE"])
    
    if after_lunch_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["AFTER_LUNCH_TYPE"])
    
    if dinner_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["DINNER_TYPE"])
        
    if night_count >= TYPE_THRESHOLD:
        types.append(WORKING_TYPES["NIGHT_TYPE"])
    
    return join_types(types)

def hours_to_type(s):
    if isinstance(s, str):
        return s
    
    if s:
        return spec_hours_to_type(s)
    else:
        return join_types(WORKING_TYPES.values())

def join_types(ts):
    # reorder
    ordered_types = []
    for t in WORKING_TYPES.values():
        if t in ts:
            ordered_types.append(t)
    return '_'.join(ordered_types)


DEFAULT_TYPE = 'default'
def extract_value_from_attrs(attrs, k):
    if k in attrs:
        return attrs[k]
    else:
        return DEFAULT_TYPE
    
def filter_from_attr_val(attr, k, v):
    return k in attr and attr[k] == v

def filter_no_attr(attr, k):
    return k not in attr

In [22]:
def calculate_frequencies(attributes, dimensions, unique_dimension_values, business_of_stars):
    
    dimension_freq_map = {}
    for dimension in (attributes + dimensions):
        dimension_star_map = {}
        dimension_freq_map[dimension] = dimension_star_map
    
    #calculate the frequencies
    for star in business_of_stars:

        business_of_star_df = business_of_stars[star]
        num_business = len(business_of_star_df)

        for dimension in dimensions:
            dim_star_map = dimension_freq_map[dimension]
            dim_freq = {}
            dim_of_business = business_of_star_df.groupby(dimension)
            num_unique_dimensions = len(unique_dimension_values[dimension])
            for grp in dim_of_business.groups:
                # we use the add-one or Laplace smoothing
                dim_freq[grp] = (len(dim_of_business.get_group(grp)) + 1.0) / (num_business + num_unique_dimensions)
            dim_freq[DEFAULT_TYPE] = 1.0 / (num_business + num_unique_dimensions)
            dim_star_map[star] = dim_freq

        for attribute in attributes:
            attr_star_map = dimension_freq_map[attribute]
            attribute_freq = {}
            attr_set = unique_dimension_values[attribute]
            for t in attr_set:
                if t != DEFAULT_TYPE:
                    num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_from_attr_val(attr, attribute, t))])
                else:
                    num = len(business_of_star_df[business_of_star_df['attributes'].apply(lambda attr: filter_no_attr(attr, attribute))])
                attribute_freq[t] = (num + 1.0) / (num_business + len(attr_set))
            if DEFAULT_TYPE not in  attribute_freq:
                attribute_freq[DEFAULT_TYPE] = 1.0 / (num_business + len(attr_set))
            attr_star_map[star] = attribute_freq
            
    return dimension_freq_map


Verification our test set

In [23]:
import numpy as np
import operator

def predict(probs):
    sorted_probs = sorted(probs.items(), key=operator.itemgetter(1))
    return sorted_probs[-1][0]
    
def correctness(stars, estimated_stars):
    return stars == estimated_stars
    
def distance(stars, estimated_stars):
    return abs(stars - estimated_stars)

def calc_probs(row_value, dim_freq_map, selected_columns, prior_of_stars):#hours, city, attrs, sentiment_value, weighted, tip_sentiment, checkin_count):
    #print (row_value)
    probs_of_stars = {}
    
    working_type = hours_to_type(row_value['hours'])
    #print (working_type)
    for star in prior_of_stars:
        prob = np.log(prior_of_stars[star])
        types_freq_of_stars = dim_freq_map[working_type_column]
        #print (types_freq_of_stars)
        prob += np.log(types_freq_of_stars[star].get(working_type, types_freq_of_stars[star]['default']))
        
        for dimension in selected_columns:
            dim_freq_star_map = dim_freq_map[dimension]
            prob += np.log(dim_freq_star_map[star].get(row_value[dimension], dim_freq_star_map[star]['default']))
        
        attrs = row_value['attributes']
        for attribute in atrribute_names:  
            dim_freq_star_map = dim_freq_map[attribute]
            attrcol = extract_value_from_attrs(attrs, attribute)
            #print (attribute, attrcol)
            #print (dim_freq_star_map[star][DEFAULT_TYPE], "\n")
            prob += np.log(dim_freq_star_map[star].get(attrcol, dim_freq_star_map[star][DEFAULT_TYPE]))
        probs_of_stars[star] = prob
    return probs_of_stars

In [24]:
atrribute_names = ['Accepts Credit Cards','Alcohol','Caters','Noise Level','Price Range','Take-out']
column_names = ['city','weighted_sentiment','sentiment_value','tip_sentiment_value','checkin_count']
working_type_column = 'working_type'

def trainNB(training_restaurants_df):
    #group by stars 
    business_of_stars = get_business_bystars(training_restaurants_df,'stars')
    #get priors
    prior_of_stars = get_priors(business_of_stars,training_restaurants_df )
    #get unique values for attributes
    unique_dimension_values = {}
    get_uniqueattributevalues(atrribute_names, training_restaurants_df, unique_dimension_values)
     # unique values for columns
    get_unique_columnvalues(training_restaurants_df, column_names,  unique_dimension_values)
    unique_dimension_values[working_type_column] = get_working_type(business_of_stars)
    
    dimension_frequency_map = calculate_frequencies(atrribute_names,column_names+[working_type_column], unique_dimension_values, business_of_stars)
    return dimension_frequency_map, prior_of_stars

In [25]:
def testNB(test_restaurants_df, dim_freq_map, selected_columns, prior_of_stars):
    result = pd.DataFrame()
    result['stars'] = test_restaurants_df['stars']
    result['stars_probs'] = test_restaurants_df.apply(lambda r: calc_probs(r, dim_freq_map, selected_columns, prior_of_stars), axis=1)
    result['estimated_stars'] = result.apply(lambda r: predict(r['stars_probs']), axis=1)
    #write_df_to_json_file(test_restaurants_df[['stars','estimated_weighted_stars','estimated_stars']],"../out/results.json")
    result['correctness'] = result.apply(lambda r: correctness(r['stars'], r['estimated_stars']), axis=1)
    corrects = len(result[result['correctness'] == True])
    result['distance'] = result.apply(lambda r: distance(r['stars'], r['estimated_stars']), axis=1)
    result['diff'] = result.apply(lambda r: r['stars'] - r['estimated_stars'], axis=1)
    result_t = result[result['diff'].apply(lambda x: abs(x) >= 0.5)]
    accuracy =  corrects * 1.0 / len(result)
    avg_dist = result['distance'].mean()
    off_by_morethan_halfstar = len(result_t)
    return accuracy,avg_dist,off_by_morethan_halfstar

In [26]:
#80% training data
def test_trainsplit(df,fraction = .8):
    training_restaurants_df = df.sample(frac=fraction, random_state = 42)
    test_restaurants_df = df[~df.isin(training_restaurants_df)].dropna()
    return training_restaurants_df, test_restaurants_df


Calculte the accuracy

In [27]:
training, test = test_trainsplit(df_bus_review_tip__checkin)
dim_freq_map,prior_of_stars = trainNB(training)
selected_columns = ['city','sentiment_value','tip_sentiment_value','checkin_count']
accuracy,dist,offcount = testNB (test,dim_freq_map, selected_columns,prior_of_stars)
print ("accuracy,dist,offcount :",accuracy,dist,offcount)
selected_columns = ['city','weighted_sentiment','tip_sentiment_value','checkin_count']
accuracy,dist,offcount = testNB (test,dim_freq_map, selected_columns,prior_of_stars)
print ("weighted -- accuracy,dist,offcount :",accuracy,dist,offcount)

accuracy,dist,offcount : 0.8328061455038409 0.1789426118391324 370
weighted -- accuracy,dist,offcount : 0.8269317668323543 0.18798011748757343 383


In [28]:
def get_kfolds(df, folds = 10):
    df_list = []
    newdf = df
    for i in range(0,9):
        df_part = newdf.sample(frac= (1.0/(folds - i)), random_state = 42)
        df_list.append(df_part)
        newdf = newdf[~newdf.isin(df_part)].dropna()
    df_list.append(newdf) 
    return df_list

In [29]:
def k_fold_crossvalidation():
    folds = get_kfolds(df_bus_review_tip__checkin)
    accuracy_list = []
    dist_list = []
    offcount_list = []
    for i in range(0, 10):
        test = folds[i]
        train_list = folds[:i] + folds[(i+1):]
        train = pd.concat(train_list)
        dim_freq_map,prior_of_stars = trainNB(train)
        selected_columns = ['city','sentiment_value','tip_sentiment_value','checkin_count']
        accuracy,dist,offcount = testNB (test,dim_freq_map, selected_columns,prior_of_stars)
        accuracy_list.append(accuracy)
        dist_list.append(dist)
        offcount_list.append(offcount)
    return np.mean(np.asarray(accuracy_list)), np.mean(np.asarray(dist_list)), np.mean(np.asarray(offcount_list))

In [30]:
print (k_fold_crossvalidation())


(0.82298725120770988, 0.190752532346047, 195.80000000000001)
