This notebook prepares important network-related statistics for reference in the Final Report.

In [8]:
# required imports
import os
import glob
import json
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# degree distributed power law fitting
import powerlaw

# community detection via louvain

import community as community_louvain



In [9]:
# Global matplotlib look: apply the built-in "ggplot" style sheet to the figures drawn below.

plt.style.use("ggplot")

In [None]:
# Research Question 1 - Correlation Between Yelp Ratings and Social Media
#
# Goal: compare Yelp ratings (from `merged_restaurants_final.csv`) with Instagram
# overall sentiment scores (from `IG_posts_with_sentiment.json`). This cell joins the
# two sources on a normalized restaurant name, summarizes the Instagram-minus-Yelp
# difference, and saves a histogram and a Bland-Altman plot of that difference.
# Instagram overall scores tend to be lower relative to Yelp ratings.
# NOTE(review): the research question mentions scatter plots and correlation
# coefficients; neither is produced by the code visible in this cell - confirm
# whether they live in another cell.

# Yelp side: restaurant table, plus the non-missing ratings as floats.
merged_restaurants = pd.read_csv("../outputs/merged_restaurants_final.csv")
yelp_ratings = merged_restaurants["rating"].dropna().astype(float)

# Instagram side: parse the JSON while the file is open, build the frame afterwards.
with open("../outputs/IG_posts_with_sentiment.json", "r", encoding="utf-8") as posts_file:
    raw_posts = json.load(posts_file)
ig_posts = pd.DataFrame(raw_posts)
    

# Normalize restaurant names (trim + lowercase) so both sources share a join key.
merged_restaurants["name_norm"] = merged_restaurants["name"].str.strip().str.lower()
ig_posts["restaurantName_norm"] = ig_posts["restaurantName"].str.strip().str.lower()


def _overall_score(sentiment):
    """Return sentiment["overall_score"] as a float, or NaN when it is missing, null or non-numeric."""
    try:
        return float(sentiment.get("overall_score", np.nan))
    except (TypeError, ValueError):
        # A JSON null arrives as None and float(None) raises TypeError; a non-numeric
        # string raises ValueError. Both are treated like a missing score.
        return np.nan


# Extract Instagram overall scores from the posts that carry a sentiment_analysis payload.
ig_posts = ig_posts[ig_posts["sentiment_analysis"].notnull()].copy()
ig_posts["insta_score"] = ig_posts["sentiment_analysis"].apply(_overall_score).astype(float)

# pandas.merge matches null join keys against each other, and blank names would pair up
# the same way, so rows without a usable name are excluded from the join on both sides.
has_yelp_name = merged_restaurants["name_norm"].notna() & merged_restaurants["name_norm"].ne("")
has_ig_name = ig_posts["restaurantName_norm"].notna() & ig_posts["restaurantName_norm"].ne("")

# Merge based on the normalized restaurant name (one output row per matching pair).
# NOTE(review): if a name occurs more than once on the Yelp side, every matching post is
# repeated once per occurrence - confirm names are unique in merged_restaurants_final.csv.
merged_df = pd.merge(
    merged_restaurants[has_yelp_name],
    ig_posts[has_ig_name],
    left_on="name_norm",
    right_on="restaurantName_norm",
    how="inner",
)

# Remove rows where either the Yelp rating or the Instagram score is missing.
merged_df = merged_df.dropna(subset=["rating", "insta_score"])

# Signed difference: Instagram overall score minus Yelp rating
# (negative values mean the Instagram score is the lower of the two).
merged_df["difference"] = merged_df["insta_score"] - merged_df["rating"]

# Summary statistics of the Instagram-minus-Yelp difference.
difference = merged_df["difference"]
mean_diff = difference.mean()
std_diff = difference.std()

# Share of rows whose Instagram score is strictly above the Yelp rating.
prop_insta_higher = merged_df["insta_score"].gt(merged_df["rating"]).mean()

print(f"Mean difference (Instagram - Yelp): {mean_diff:.2f}")
print(f"Standard deviation of the difference: {std_diff:.2f}")
print(f"Proportion where Instagram score is higher than Yelp rating: {prop_insta_higher*100:.1f}%")

# Plot a histogram (with KDE overlay) of the Instagram-minus-Yelp differences and save it.
histogram_path = "../images/difference_histogram_instagram_minus_yelp.png"
# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(histogram_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(merged_df["difference"], bins=20, kde=True, color="skyblue", edgecolor="black", ax=ax)
# Reference line at zero: the point where both sources agree.
ax.axvline(0, color="red", linestyle="--", label="No difference")
ax.set_xlabel("Difference (Instagram overall score - Yelp rating)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Differences: Instagram vs Yelp Ratings")
ax.legend()
fig.savefig(histogram_path, bbox_inches="tight")
# Close the figure so it is written to disk only and not kept open in the kernel.
plt.close(fig)
print("Saved histogram as difference_histogram_instagram_minus_yelp.png.")

# Generate a Bland-Altman plot: per-row mean of the two scores on x, their difference on y.
merged_df["mean_rating"] = (merged_df["rating"] + merged_df["insta_score"]) / 2

bland_altman_path = "../images/bland_altman_instagram_minus_yelp.png"
# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(bland_altman_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(merged_df["mean_rating"], merged_df["difference"], alpha=0.6)
# Horizontal guides: the mean difference and the band mean +/- 1.96 standard deviations.
ax.axhline(mean_diff, color="gray", linestyle="--", label=f"Mean diff: {mean_diff:.2f}")
ax.axhline(mean_diff + 1.96*std_diff, color="red", linestyle="--", label="Upper 95% limit")
ax.axhline(mean_diff - 1.96*std_diff, color="red", linestyle="--", label="Lower 95% limit")
ax.set_xlabel("Mean of Yelp rating and Instagram overall score")
ax.set_ylabel("Difference (Instagram overall score - Yelp rating)")
ax.set_title("Bland-Altman Plot: Instagram vs Yelp Ratings")
ax.legend()
fig.savefig(bland_altman_path, bbox_inches="tight")
# Close the figure so it is written to disk only and not kept open in the kernel.
plt.close(fig)


Mean difference (Instagram - Yelp): -0.25
Standard deviation of the difference: 0.84
Proportion where Instagram score is higher than Yelp rating: 25.9%
Saved histogram as difference_histogram_instagram_minus_yelp.png.
