In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Load the per-review table and the Yelp business listing (JSON Lines: one
# record per line), then keep only the businesses whose city is Philadelphia.
df = pd.read_csv('../../philadelphia_restaurant_reviews.csv')
all_businesses = pd.read_json('../../yelp_academic_dataset_business.json', lines=True)
in_philadelphia = all_businesses['city'].eq('Philadelphia')
df_businesses = all_businesses[in_philadelphia]


In [None]:
# Keep only businesses whose category string mentions "Restaurants";
# rows with a missing category string count as non-matches (na=False).
is_restaurant = df_businesses['categories'].str.contains('Restaurants', na=False)
df_businesses = df_businesses.loc[is_restaurant]

In [None]:
# Bar-style histogram: how many reviews were given at each star rating.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['stars'],
    bins=5,
    color='blue',
    alpha=0.5,
    label='Star Rating',
    discrete=True,
    ax=ax,
)
ax.set_xticks([1, 2, 3, 4, 5])  # one tick under the middle of every bar
ax.set_xlabel('Star Rating')
ax.set_ylabel('Number of Reviews')
ax.set_title('Distribution of Star Ratings')
plt.show()



In [None]:
# Histogram of the business-level star ratings.
#
# Bin edges sit a quarter star either side of every half-star step, so each
# bar is centered on a rating value (e.g. 1.0, 1.5, ..., 5.0). The stop value
# is pushed past the maximum so that the last edge (max + bin_width / 2) is
# still produced by np.arange, whose stop is exclusive.
bin_width = 0.5
min_star = df_businesses['stars'].min()
max_star = df_businesses['stars'].max()
bin_edges = np.arange(min_star - bin_width / 2, max_star + bin_width, bin_width)

plt.figure(figsize=(10, 6))
sns.histplot(df_businesses['stars'], bins=bin_edges, color='red', alpha=0.5)

# Set xticks at each star rating level
plt.xticks(np.arange(1.0, 5.1, 0.5))  # Ticks from 1.0 to 5.0 by 0.5 steps
plt.xlabel('Star Rating')
# Spelling fixed (was 'Buisinesses'); now matches the per-category plots.
plt.ylabel('Number of Businesses')
plt.title('Distribution of Star Ratings')
plt.show()

In [None]:
# Plot one star-rating histogram per category that is attached to more than
# 100 businesses (the threshold counts businesses, not reviews).

# Parse the comma-separated category strings once: one row per
# (business, category) pair, still labelled with the business's row index.
business_categories = (
    df_businesses['categories']
    .str.split(',')
    .explode()
    .str.strip()
)
category_counts = business_categories.value_counts()
categories_to_plot = category_counts[category_counts > 100].index

# Create histograms for each category
for category in categories_to_plot:
    # Compare against the parsed category names instead of using
    # str.contains(category): contains() interprets the name as a regular
    # expression (so "American (New)" is read as a capture group and does not
    # match its own literal text) and also matches substrings (so "Bars"
    # would pull in "Sushi Bars", "Wine Bars", ...).
    matching_rows = business_categories.index[business_categories == category].unique()
    category_stars = df_businesses.loc[matching_rows, 'stars']

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(category_stars, bins=bin_edges, color='orange', alpha=0.5, ax=ax)
    ax.set_xticks(np.arange(1.0, 5.1, 0.5))  # Ticks from 1.0 to 5.0 by 0.5 steps
    ax.set_xlabel('Star Rating')
    ax.set_ylabel('Number of Businesses')
    ax.set_title(f'Distribution of Star Ratings for {category}')
    plt.show()
    # Release the figure so a long category list does not accumulate open figures.
    plt.close(fig)


In [9]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, FactorRange

# Render Bokeh figures inline in the notebook.
output_notebook()

# Ten-colour qualitative palette used for the bar glyphs further down.
HighContrast10 = [
    # blue, orange, green, red, purple
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
    # brown, pink, gray, olive, teal
    "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
]

# Parse the review timestamps once, then derive the hour of day (0-23) from them.
review_timestamps = pd.to_datetime(df['date'])
df['date'] = review_timestamps
df['hour'] = review_timestamps.dt.hour

# Bucket an hour of the day into its 3-hour window
def map_three_hour_block(hour):
    """Return the label of the 3-hour window that contains ``hour``.

    The label is ``"SS-EE"``: the zero-padded first and last hour of the
    window, e.g. ``4 -> "03-05"`` and ``23 -> "21-23"``.
    """
    block_start = hour - hour % 3
    return "{:02d}-{:02d}".format(block_start, block_start + 2)

# Label every review with the 3-hour window its hour falls into.
df['three_hour_block'] = df['hour'].apply(map_three_hour_block)

# Review counts as a table: one row per star rating, one column per time block.
pair_counts = df.groupby(['stars', 'three_hour_block']).size()
grouped = pair_counts.unstack(fill_value=0)

# Order the columns chronologically (the zero-padded labels sort as text).
time_blocks = sorted(grouped.columns)
grouped = grouped[time_blocks]

# Add an all-zero row for any rating from 1 to 5 that has no reviews at all.
absent_ratings = [rating for rating in range(1, 6) if rating not in grouped.index]
for rating in absent_ratings:
    grouped.loc[rating] = [0] * len(time_blocks)

# The categorical x-axis needs string factors, in ascending rating order.
grouped = grouped.sort_index()
grouped.index = grouped.index.map(str)
stars = grouped.index.tolist()

# Move the ratings out of the index into a regular 'stars' column so the
# glyphs can refer to it by name.
grouped = grouped.reset_index()
source = ColumnDataSource(grouped)

# Grouped bar chart: for every star rating, one bar per 3-hour time block.
# Imported here so this cell carries everything the dodge-based layout needs.
from bokeh.transform import dodge

p = figure(x_range=FactorRange(*stars),
           height=500, width=900,
           title="Review Counts per Star Rating across 3-Hour Blocks",
           toolbar_location="above", tools="pan,wheel_zoom,reset,save")

colors = HighContrast10

# Every star category is one unit wide on the categorical axis. The bars of
# one category share the central `group_width` of that unit, each in its own
# slot, so they sit side by side instead of being drawn on top of each other
# at the same x position.
group_width = 0.8
slot_width = group_width / max(len(time_blocks), 1)

for i, block in enumerate(time_blocks):
    # Offset of this block's slot centre, measured from the category centre.
    offset = -group_width / 2 + slot_width * (i + 0.5)
    p.vbar(x=dodge('stars', offset, range=p.x_range),
           top=block,
           source=source,
           width=slot_width * 0.9,  # small gap between neighbouring bars
           color=colors[i % len(colors)],  # wrap around if blocks outnumber colours
           legend_label=block,
           muted_alpha=0.1,
           muted=False,
           alpha=0.7)

p.xaxis.axis_label = "Star Rating"
p.yaxis.axis_label = "Number of Reviews"
p.xaxis.major_label_orientation = 1.0
p.x_range.range_padding = 0
p.y_range.start = 0
p.title.text_font_size = "14pt"
p.legend.location = "top_left"
# Clicking a legend entry fades that time block (to muted_alpha) rather than hiding it.
p.legend.click_policy = "mute"
# Place the legend in the panel to the left of the plot area.
p.add_layout(p.legend[0], 'left')

show(p)
