In [1]:
import pandas as pd
import numpy as np
import math

DATASET_DIR = '../../dataset/academic'

# Min-max scale the values into the range [1, 2].
def normalize(series):
    """Linearly rescale ``series`` so its minimum maps to 1 and its maximum to 2.

    Parameters
    ----------
    series : pandas.Series (or any array-like exposing ``min``/``max`` and
        element-wise arithmetic)
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``series`` with float values in [1, 2].
    When every value is identical the spread is zero; instead of dividing
    0 by 0 (which would turn the whole column into NaN and poison every
    product computed from it), all entries are mapped to the lower bound 1.0.
    Missing values stay missing.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant input: `series * 0.0` keeps the index, the float dtype and
        # any NaN entries, while every real value becomes the lower bound.
        return series * 0.0 + 1.0
    return (series - lowest) / span + 1

In [2]:
# Load the sentiment-annotated reviews (one JSON object per line) and drop
# every row whose sentiment_value is 3, which marks an invalid sentiment.
df_review = (
    pd.read_json('../out/yelp_academic_dataset_review_sentiment.json', lines=True)
    .loc[lambda reviews: reviews['sentiment_value'] != 3]
)

In [3]:
# Weight each review's sentiment by two factors, each rescaled to [1, 2] by
# normalize(): its star rating (integers within [1, 5]) and its number of
# "useful" votes.
df_review = df_review.assign(
    normalized_sentiment_value=(
        df_review['sentiment_value']
        * normalize(df_review['stars'])
        * normalize(df_review['votes'].apply(lambda votes: votes['useful']))
    )
)
# The combined weight lies in [1, 4]. The original note gives the resulting
# range as [-8, 8], which holds if sentiment_value lies in [-2, 2]
# (NOTE(review): confirm against the sentiment pipeline).

In [4]:
# Load the sentiment-annotated tips (one JSON object per line) and drop
# every tip whose sentiment_value is 3, which marks an invalid sentiment.
df_tip = (
    pd.read_json('../out/yelp_academic_dataset_tip_sentiment.json', lines=True)
    .loc[lambda tips: tips['sentiment_value'] != 3]
)

In [5]:
# Weight each tip's sentiment by its number of likes, rescaled to [1, 2]
# by normalize().
df_tip = df_tip.assign(
    normalized_sentiment_value=df_tip['sentiment_value'] * normalize(df_tip['likes'])
)

In [6]:
# Number of distinct businesses that received at least one tip
# (dropna=False so a missing id would be counted, as unique() does) ...
print(df_tip['business_id'].nunique(dropna=False))
# ... followed by the total number of tips as the cell's displayed value.
df_tip.shape[0]

57477


648435

In [None]:
# Load the raw business records (one JSON object per line).
df_business = pd.read_json(DATASET_DIR + '/yelp_academic_dataset_business.json', lines=True)

# Keep only businesses that are tagged as restaurants, are flagged open,
# and have at least 20 reviews.
is_restaurant = df_business['categories'].apply(lambda categories: 'Restaurants' in categories)
business_filters = is_restaurant & df_business['open'] & (df_business['review_count'] >= 20)

# Select the matching rows and the columns used downstream, then renumber
# the rows from zero.
df_business_restaurants = (
    df_business
    .loc[business_filters, ['business_id', 'stars', 'hours', 'city', 'attributes']]
    .reset_index(drop=True)
)

In [None]:
# Add a 'review_rating' column to the business dataframe.

# Group the reviews once by business so that per-business lookups can reuse
# the same grouping.
review_group_by_business_id = df_review.groupby(by='business_id')

def get_review_score_of_business(bid):
    """Return the review score of one business.

    The score is the mean of ``normalized_sentiment_value`` over all reviews
    grouped under ``bid`` in ``review_group_by_business_id``.

    Parameters
    ----------
    bid : the business id to look up.

    Returns
    -------
    float
        The mean normalized sentiment, or NaN when the business has no
        reviews in the grouping.
    """
    try:
        group = review_group_by_business_id.get_group(bid)
    except KeyError:
        # get_group raises KeyError for an id that has no reviews; report a
        # missing score instead of aborting the whole column computation.
        return np.nan
    # mean() of an empty selection is already NaN, so no extra emptiness
    # check is needed here.
    return group['normalized_sentiment_value'].mean()

# Look up the review score of every restaurant and store it in a new
# 'review_rating' column.
df_business_restaurants = df_business_restaurants.assign(
    review_rating=df_business_restaurants['business_id'].apply(get_review_score_of_business)
)

# We found that all businesses have at least one review. To re-check, count
# the missing scores (comparing with `== nan` never matches, so use isna()):
# print(df_business_restaurants['review_rating'].isna().sum())

In [None]:
# Return the rating which is closest to the score.
# The range of the rating is [1, 5] in half-rating steps when `bins` has nine
# entries.
def score_to_rating(score, bins):
    """Map a raw score to the half-step rating of its nearest bin.

    Parameters
    ----------
    score : float
        The raw score to convert.
    bins : array-like of float
        Reference scores in ascending order; entry ``i`` corresponds to the
        rating ``1 + 0.5 * i`` (so nine entries cover 1.0, 1.5, ..., 5.0).

    Returns
    -------
    float
        The rating of the bin closest to ``score``; NaN when ``score`` is NaN.
    """
    if np.isnan(score):
        # Every distance would be NaN and argmin would silently pick the
        # first bin; keep a missing score missing instead.
        return np.nan
    # Zero-based position of the nearest bin. The previous `+ 1` shifted every
    # rating up by half a star, giving 1.5..5.5 rather than 1..5.
    idx = np.argmin(np.abs(np.asarray(bins, dtype=float) - score))
    return idx * 0.5 + 1

# Nine evenly spaced reference scores spanning the observed range of
# review scores, from the smallest to the largest.
bins_review_score = np.linspace(
    start=df_business_restaurants['review_rating'].min(),
    stop=df_business_restaurants['review_rating'].max(),
    num=9,
)

# Replace each raw review score, in place, with the rating that
# score_to_rating assigns to it for these bins.
df_business_restaurants['review_rating'] = (
    df_business_restaurants['review_rating'].apply(score_to_rating, args=(bins_review_score,))
)