# Data Analysis / Insight Generation

This notebook is a sample of how our sentiment analysis data can be used for downstream tasks to analyse restaurant performance from the perspective of the customers.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the pipeline's final restaurant dataset, relative to the notebook.
RESTAURANTS_CSV = 'data/pipeline/restaurants_final.csv'

# Read the whole file into the DataFrame every later cell works from.
df = pd.read_csv(RESTAURANTS_CSV)

In [None]:
# Sanity-check the load: print (rows, columns), then display the first rows.
print(df.shape)
df.head()

In [None]:
# List every column name so the rating and category columns used below can be checked.
print(df.columns)

## Relative Importance of Aspects

Hypothesis: Different aspects are mentioned at different frequencies and have different distributions of sentiment scores, which may indicate the relative importance of each aspect identified.

In [None]:
# Aspect names used throughout this notebook, in display order.
ASPECTS = [
    'Overall',
    'Food',
    'Portion',
    'Price',
    'Time',
    'Service',
    'Ambience',
]

# Matching DataFrame columns: 'review_rating_' followed by the lower-cased aspect name.
rating_columns = ['review_rating_' + aspect.lower() for aspect in ASPECTS]

In [None]:
# Total number of rows, as a baseline for the per-aspect counts in the next cell.
len(df)

In [None]:
# Number of rows that actually carry a score for each aspect:
# count the missing values per rating column (fewest missing first),
# then subtract from the total row count.
missing_per_aspect = df[rating_columns].isna().sum().sort_values()
len(df) - missing_per_aspect

## Sentiment Score by Cuisine

Hypothesis: Different restaurant categories (cuisines) have different distributions of sentiment scores. This may give us some insight into consumer tastes and preferences.

In [None]:
# Restaurant categories analysed below. Each name is also a column of `df`
# whose value is compared against 1 to select the restaurants in that category.
CATEGORIES = [
    'Italian',
    'Malay',
    'Japanese',
    'Chinese',
    'Western',
    'Korean',
    'Thai',
    'Vietnamese',
    'Mexican',
    'Indian',
    'Local Delights',
    'Desserts',
    'Healthy',
    'Cafes & Coffee',
    'Halal',
    'Beverages',
    'Others',
]

In [None]:
# Build one summary row per category: how many restaurants it contains and the
# mean sentiment score of each aspect across those restaurants.
COLUMNS = ['Category', 'Count'] + [f'{x}_Average' for x in ASPECTS]

# Collect plain dict records and build the DataFrame once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.)
category_records = []

for category in CATEGORIES:
    # filter rows in category (category columns are compared against 1)
    df_category = df.loc[df[category] == 1]

    new_row = {'Category': category, 'Count': len(df_category)}

    # get mean aspect score
    for aspect in ASPECTS:
        aspect_col = f'review_rating_{aspect.lower()}'
        # Series.mean() skips missing values and yields NaN (rather than a
        # ZeroDivisionError) when the category has no score for this aspect.
        new_row[f'{aspect}_Average'] = df_category[aspect_col].mean()

    category_records.append(new_row)

# `columns=COLUMNS` fixes the column order and keeps the frame well-formed
# even if CATEGORIES is empty.
category_sentiment = pd.DataFrame(category_records, columns=COLUMNS)

In [None]:
# Rank the categories from highest to lowest average overall sentiment.
category_sentiment.sort_values('Overall_Average', ascending=False)

## Sentiment Score across Chains

Hypothesis: Different outlets from the same restaurant chain have similar sentiment scores, indicating a standard of quality across all outlets.

In [None]:
# Starbucks outlets: every row whose restaurant_code contains 'starbucks',
# re-indexed from 0 so the subset stands on its own.
is_starbucks = df['restaurant_code'].str.contains('starbucks')
starbucks_df = df[is_starbucks].reset_index(drop=True)

# Number of outlets found.
print(starbucks_df.shape[0])

In [None]:
# Distribution of each aspect's sentiment score across the Starbucks outlets.
rating_df = starbucks_df[rating_columns].melt() # reshape to long form: 'variable' / 'value'

# Apply the theme before the figure is created so it styles the whole figure,
# not only the axes added afterwards.
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(12, 6))

# Pin the box order to rating_columns so the ASPECTS labels set below are
# guaranteed to line up with the data they describe.
sns.boxplot(x='variable', y='value', data=rating_df, order=rating_columns, ax=ax)
ax.set_title('Boxplot of Sentiment Scores Across Starbucks Outlets', size=16)

# Fix the tick positions explicitly before replacing their labels; relabelling
# ticks without fixed positions can mislabel boxes or trigger a warning.
ax.set_xticks(range(len(ASPECTS)))
ax.set_xticklabels(ASPECTS)

# Replace melt()'s generic 'variable' / 'value' axis names with readable ones.
ax.set_xlabel('Aspect')
ax.set_ylabel('Sentiment score')
plt.show()

In [None]:
# Summary statistics of the Starbucks subset, kept in `table` and displayed.
table = starbucks_df.describe()
table

In [None]:
# Find the outlets at the top end of review_rating_overall.
# NOTE(review): the 4.6 cutoff is hard-coded; presumably it was read off the
# describe() table above -- confirm before reusing it on other data.
HIGH_OVERALL_THRESHOLD = 4.6

scores_high = starbucks_df['review_rating_overall'] >= HIGH_OVERALL_THRESHOLD

# Keep only name + overall score for the matching outlets, best first.
(
    starbucks_df
    .loc[scores_high, ['restaurant_name', 'review_rating_overall']]
    .sort_values(by='review_rating_overall', ascending=False)
)

In [None]:
# Outlets whose name contains 'reserve' (case-insensitive), with their
# name and every aspect score.
reserve_mask = starbucks_df['restaurant_name'].str.lower().str.contains('reserve')
reserve = starbucks_df.loc[reserve_mask][['restaurant_name', *rating_columns]]

# Swap the raw column names for short display headers, one per aspect.
columns = ['Name', *ASPECTS]
reserve.columns = columns
reserve