In [1]:
import pandas as pd 
import json

# Load the file paths for the Yelp business and review datasets
business_file = "data/yelp_academic_dataset_business.json"
review_file = "data/yelp_academic_dataset_review.json"

# Load the business dataset
print("📍 Filtering businesses...")
business_df = pd.read_json(business_file, lines=True)

# Define keywords to identify businesses related to restaurants or grocery
category_keywords = [
    "restaurant", "restaurants",
    "grocery", "grocers", "grocery store", "supermarket", "markets"
]
category_pattern = "|".join(category_keywords)

# Filter businesses that are located in Philadelphia, PA, are currently open, 
# and match one of the specified categories
filtered_business_df = business_df[
    (business_df["city"].str.lower() == "philadelphia") &
    (business_df["state"] == "PA") &
    (business_df["is_open"] == 1) &
    (business_df["categories"].fillna("").str.lower().str.contains(category_pattern, regex=True))
]

# Set the business_id as the index for easier lookup
filtered_business_df.set_index("business_id", inplace=True)
business_ids = set(filtered_business_df.index)
print(f"✅ Found {len(business_ids)} matching businesses in Philadelphia, PA.")

# Open the review file and read it line by line, only keeping reviews
# associated with the filtered businesses
print("🔁 Collecting relevant reviews...")
merged_records = []

with open(review_file, "r", encoding="utf-8") as f:
    for line in f:
        review = json.loads(line)

        # Skip reviews not related to the filtered businesses
        if review["business_id"] not in business_ids:
            continue

        # Merge review data with business metadata (excluding business stars field)
        business = filtered_business_df.loc[review["business_id"]].to_dict()
        merged = {
            **review,
            **{k: v for k, v in business.items() if k != "stars"},
            "review_stars": review["stars"],
            "business_stars": business.get("stars")
        }
        merged_records.append(merged)

print(f"✅ Collected {len(merged_records)} relevant reviews.")

# Create a DataFrame from the merged review-business records
final_df = pd.DataFrame(merged_records)
print("✅ Stored merged dataset in `final_df`.")

# Drop columns that are not necessary for analysis
columns_to_drop = [
    "user_id", "review_id", "useful", "funny", "cool",
    "address", "city", "state", "is_open", "hours", "stars", "attributes"
]
final_df = final_df.drop(columns=columns_to_drop)
print("🧹 Dropped unnecessary columns.")

# Create a lowercase version of the categories for easier string matching
final_df['categories_lower'] = final_df['categories'].str.lower()

# Define a function to classify businesses into simplified types,
# prioritizing "grocery" over "restaurant" if both keywords appear
def classify_category_priority_grocery(cat):
    is_grocery = any(x in cat for x in ["grocery", "grocer", "supermarket", "markets"])
    is_restaurant = "restaurant" in cat
    if is_grocery:
        return "grocery"
    elif is_restaurant:
        return "restaurant"
    else:
        return "neither"

# Apply the classification function to each record
final_df['category_type'] = final_df['categories_lower'].apply(classify_category_priority_grocery)

# Calculate the 25th percentile (Q1) of review_count for each category type
thresholds = final_df.groupby('category_type')['review_count'].quantile(0.25).to_dict()

# Filter the records, keeping only those businesses whose review_count 
# is greater than or equal to the Q1 threshold for their category type
final_df_filtered = final_df[
    final_df.apply(lambda row: row['review_count'] >= thresholds.get(row['category_type'], 0), axis=1)
]

print(f"✅ Statistically filtered dataset has {len(final_df_filtered)} records.")



📍 Filtering businesses...
✅ Found 3707 matching businesses in Philadelphia, PA.
🔁 Collecting relevant reviews...
✅ Collected 517343 relevant reviews.
✅ Stored merged dataset in `final_df`.
🧹 Dropped unnecessary columns.
✅ Statistically filtered dataset has 388427 records.


# 📄 Food Safety Concern Detection from Yelp Reviews

This script identifies potential **food safety issues** mentioned in Yelp reviews by using **semantic similarity** between review sentences and a curated set of **food safety-related phrases**.

## Purpose

The goal is to **automatically flag reviews** that describe serious food safety concerns like food poisoning, spoiled food, and hospital visits, using natural language processing and BERT embeddings.

## Overview of What We Did

- **Loaded NLP models**  
  - Used **spaCy** for sentence parsing and POS tagging.
  - Used **SentenceTransformer (MiniLM)** for creating dense sentence embeddings.

- **Defined key food safety phrases**  
  - Created a list of common expressions related to food safety incidents.
  - Computed normalized embeddings for these phrases.

- **Prepared the review dataset**  
  - Removed reviews without text.
  - Focused only on **negative or neutral reviews** (stars ≤ 3).

- **Extracted candidate sentences**  
  - Parsed reviews into individual sentences.
  - Filtered sentences that:
    - Mentioned food-related terms (like "burger", "meal", "pizza")
    - Did **not** contain negations ("not good", "didn't get sick", etc.)

- **Calculated semantic similarity**  
  - Computed similarity between extracted sentences and food safety phrases.
  - Recorded the best matching phrase and score for each sentence.

- **Flagged potential food safety incidents**  
  - Flagged reviews where the highest similarity score exceeded **0.65**.
  - Recorded the matched sentence and matched phrase.

- **Displayed sample results**  
  - Showed the top 5 flagged reviews for quick inspection.

- **Tested different thresholds**  
  - Checked how the number of flagged reviews changes at different similarity thresholds (0.5, 0.6, 0.7).

## Final Output

- A filtered dataset (`final_df_filtered`) containing a `food_safety_flag` for each review.
- Highlighted sentences that strongly resemble known food safety concerns.
- Ready for further manual review or automated reporting.



In [None]:
import spacy
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy model for NLP tasks and the SentenceTransformer model for semantic embeddings
nlp = spacy.load("en_core_web_sm")
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Define a list of food safety-related phrases and compute their embeddings using BERT
safety_phrases = [
    "food poisoning", "vomiting", "undercooked meat",
    "bug in my food", "dirty kitchen", "unsanitary conditions",
    "felt sick after eating", "hair in food", "spoiled food",
    "severe stomach pain", "threw up", "diarrhea",
    "needed medical attention", "hospital visit after eating",
    "rushed to urgent care", "child got sick from food"
]
safety_embeddings = bert_model.encode(safety_phrases, normalize_embeddings=True)

# Define a set of food-related nouns to focus on relevant sentences
food_terms = {
    "food", "meal", "chicken", "beef", "fish", "pork", "steak", "seafood",
    "burger", "sandwich", "dish", "plate", "lunch", "dinner", "breakfast",
    "soup", "salad", "fries", "appetizer", "entrée", "noodles", "rice", "pizza"
}

# Remove rows with missing text and only keep reviews rated 3 stars or lower (potentially negative experiences)
final_df_filtered = final_df_filtered.dropna(subset=["text"]).copy()
final_df_filtered = final_df_filtered[final_df_filtered["review_stars"] <= 3].copy()

# Process the text of each review and extract individual sentences mentioning food-related terms without negations
sentences = []
sentence_to_row = []

for idx, text in final_df_filtered["text"].items():
    doc = nlp(text)
    for sent in doc.sents:
        sentence = sent.text.strip()
        if not sentence:
            continue
        if any(tok.dep_ == "neg" for tok in sent):
            continue
        if not any(tok.lemma_ in food_terms for tok in sent if tok.pos_ == "NOUN"):
            continue
        sentences.append(sentence)
        sentence_to_row.append(idx)

# Compute BERT embeddings for all extracted sentences and measure their similarity to the food safety phrases
review_scores = {}
if sentences:
    sentence_embeddings = bert_model.encode(
        sentences, batch_size=32, normalize_embeddings=True, show_progress_bar=True
    )
    sim_matrix = cosine_similarity(sentence_embeddings, safety_embeddings)
    max_sim = np.max(sim_matrix, axis=1)
    best_match_idx = np.argmax(sim_matrix, axis=1)

    # For each review, track the best matching sentence and its similarity score
    for i, row_idx in enumerate(sentence_to_row):
        sim = max_sim[i]
        sentence = sentences[i]
        matched_phrase = safety_phrases[best_match_idx[i]]
        if row_idx not in review_scores or review_scores[row_idx]["score"] < sim:
            review_scores[row_idx] = {
                "sentence": sentence,
                "score": sim,
                "matched_phrase": matched_phrase
            }

# Add new columns to the dataset for food safety flagging and matching results
final_df_filtered["food_safety_flag"] = False
final_df_filtered["matched_sentence"] = ""
final_df_filtered["similarity_score"] = 0.0
final_df_filtered["matched_phrase"] = ""

# Assign food safety flags to reviews based on a similarity threshold
for idx, result in review_scores.items():
    if result["score"] > 0.65:
        final_df_filtered.at[idx, "food_safety_flag"] = True
        final_df_filtered.at[idx, "matched_sentence"] = result["sentence"]
        final_df_filtered.at[idx, "similarity_score"] = result["score"]
        final_df_filtered.at[idx, "matched_phrase"] = result["matched_phrase"]

# Display a few examples of flagged reviews with the highest similarity scores
pd.set_option("display.max_colwidth", None)
print("\n🎯 Sample Flagged Reviews:\n")
print(final_df_filtered[final_df_filtered["food_safety_flag"] == True][
    ["name", "review_stars", "matched_sentence", "matched_phrase", "similarity_score"]
].sort_values("similarity_score", ascending=False).head(5))

# Define a function to test how many reviews would be flagged at different similarity thresholds
def test_thresholds(thresholds=[0.5, 0.6, 0.7]):
    print("\n📊 Threshold Testing Summary:")
    for thresh in thresholds:
        count = sum(score["score"] > thresh for score in review_scores.values())
        print(f"  🔹 Threshold > {thresh:.1f}: {count} flagged reviews")

# Run the threshold testing function
test_thresholds()





# 📄 Updating Food Safety Flag with a Stricter Threshold

## Purpose

After reviewing the initial flagged results, we re-assign the `food_safety_flag` using a **higher similarity threshold** (0.67) to make the detection more conservative and precise.

## What We Did

- **Updated the food safety flag**  
  Reviews with a `similarity_score` greater than 0.67 are now flagged as containing potential food safety concerns.

- **Sampled flagged reviews**  
  Selected a random sample of 10 flagged reviews to manually inspect and verify the effectiveness of the stricter threshold.

## Final Output

- A refined set of flagged reviews focused on stronger matches to food safety incidents.
- Enables manual validation of the stricter filtering results.


In [None]:
# Update the food_safety_flag based on a stricter similarity threshold (0.67 instead of 0.65)
final_df_filtered["food_safety_flag"] = final_df_filtered["similarity_score"] > 0.67

# Retrieve and display a random sample of 10 reviews that are newly flagged under the updated threshold
flagged_06 = final_df_filtered[final_df_filtered["food_safety_flag"] == True]
flagged_06[["name", "business_stars", "review_stars", "text"]].sample(10, random_state=42)

Unnamed: 0,name,business_stars,review_stars,text
56946,Oregon Diner,3.5,1.0,"Do not go here. My last visit I found a hair in my food. They remade my order, and that also came with hair in the food. I dumbly decided after a long boycott to stop in because I was near by and starving. Dumb choice. They brought me apple sauce that tasted like pickles. I ordered a simple meal and then they told me no grits. So I switched my order to French toast with eggs and bacon. They added some hashbrowns and toast I immediately ask the waitress wait what about my french toast. She was like oh the hasbrowns they just gave you. I guessed same for the toast WRONG. My bill comes and they've charged me for two meals. Smh. I should have listened to my instinct and got Popeyes NEVER AGAIN"
15276,Ralphs Italian Restaurant,3.5,1.0,"Food was mediocre and the waiter was rude-- attention was not lacking, but our food got thrown down at us and he seemed annoyed. I got the linguini with clams and there were 5 tiny clams and it was mediocre. Not an awful experience, but food was not worth it and I wouldn't go back.\n\nUpdate: I got food poisoning from my meal last night. Don't get the linguini with clams."
207520,Banana Leaf,3.5,2.0,"I've been here two times too many. Maybe three. But this place is so forgettable that I seriously forget.\n\nThe food is mediocre at best, poisoning at worst. Yes, that's right, poison, as in food poisoning. My friend had the spicy tuna roll and hours later, she was rolling herself to the toilet every half hour. TMI?! Sorry, but you'll be thanking me when you choose not to go here."
258698,Four Seasons Diner,3.5,2.0,"I wish I could stick with my old review, but over the past year things have just... changed. The staff always seem pissed off or exhausted (hard job I know). The food quality has decreased and last time I found a hair in my food. When I found the hair I had already eaten most of the meal, so I just informed the waitress, paid, and left. The fish and chips is good after you squeeze out all the oil off the fish because it swims in the abundance of oil as if it's still in the ocean. My friend had a ruben that literally made her sick."
236260,Distrito,3.5,3.0,"Let me start with the good:\nReally excellent lunch and happy hour menus. Great for groups. Excellent for large parties (I've done events for 100+ here in the past and they have been perfectly and thoughtfully executed).\n\nNow onto the bad:\nThe restaurant is filthy. Cleanliness is not at all a concern for the owners/managers. I have gotten food poisoning twice from the restaurant years apart (once in 2012 and once in 2014) and each time, I was sick for about 10 days, missing quite a bit of work. I've heard from others that this is the norm for Distrito. When it happened in 2012, I didn't know whether it was from a lunch I'd had at Distrito, or something else. After it happened again last year, I have not been back. If you have a weak stomach or any kind of digestive or autoimmune issue, I would avoid eating here."
372490,Little Italy Pizza,3.5,1.0,"The last 48 hours have been hell. The order being wrong was probably an omen. I got food poisoning! I couldn't keep any food down, or liquid, and lost 14lbs with a consistent fever of 100. Maybe this is one in a million, maybe not. But chances are if it happened once it will happen again. Never going back."
506740,Spice 28,4.0,1.0,"My friends got food poisoning. It's a great for bar scene but food wise, that only that waa good was the pad thai."
191335,Dim Sum & Noodle,4.0,3.0,"Let me first say the service here is wonderful, and everyone is extremely pleasant an kind. However, I have been here a total of 3 times. And 2 of of those there's been something in our food! The first time we found 3 hairs in our food. I didn't write a review because I understand, things sometimes happen. However, we went today and there was a piece of plastic in our food! Again, stuff happens. But I've completely lost my appetite to even go back here unfortunately. I hope they can start being more careful with their food prep area."
25473,Rittenhouse Grill,3.5,1.0,"My first time there and the place is really nice . The weight staff is mostly attentive a but more stuck up than anything else . \n\nNow for the food : Me and my boyfriend had lobster bisque and crab cocktail as appetizer for main course he had fish and me the lamb chops with asparagus.\nAccompanies by 2 drinks each 3 alcoholic and one sparking water . \nThe lamb chops came out rubbery and overdone barely cut through em , the asparagus was bland and had no taste at all they could have threw a few sprinkles of salt and pepper on there for taste . \nI didn't want to make a fuss and ruin the special evening so I didn't speak up . \nAll that came to 250$ that's highway robbery if you ask me ! \nTo add insult few hours after were home I'm violently throwing up the food we just had.\nMy puking continued all next day accompanied with a lovely head ache .\nSay what you want but to me this looks like food poisoning . \nThere's no way In f@@@@ hell should anyone be sick after spending 250$ on dinner . \nIn closing in my opinion in would not recommend this place actually I would advise you stay clear , unless that is you have money to burn and want to get sick from food .\n\nStay away !!!!"
324477,Kabobeesh,3.5,2.0,"I've been here twice with friends who are obsessed with it and I genuinely wanted to like this place. It's very diner-y and not aesthetically pleasing indoors but it has outdoor benches outside which is nice. Service is fine. There is parking. My rating comes from the food itself... it's just not great. I've gotten the chicken tandoori platter and a paratha wrap. The sides for the chicken tandoori were very subpar. Iceberg lettuce mix, chickpeas, and a few other options, none of which looked or tasted good. The chicken itself was okay at best. Too much rice, and it was oily. The second time I came here I got sick after eating the food. 2/5"


In [None]:
final_df_filtered["food_safety_flag"].value_counts()

food_safety_flag
False    108335
True        240
Name: count, dtype: int64

In [None]:
# Save the cleaned merged dataset with a clear name
output_path = "data/yelp_philadelphia.csv"
final_df_filtered.to_csv(output_path, index=False)

print(f"✅ Saved final_df to '{output_path}'")

✅ Saved final_df to 'data/yelp_philadelphia.csv'
