# 01_business_filtering.ipynb
## Yelp Data Filtering

This notebook extracts and filters Yelp business data to obtain the dining-related businesses located in the state with the largest number of such businesses, which for this dataset is Pennsylvania (PA).
The workflow consists of three main steps:

1. **Extract Unique Categories**: Parse Yelp business data to collect all unique business categories.
2. **Identify Dining Categories with MiniLM**: Use MiniLM embeddings and cosine similarity to identify dining-related categories based on semantic similarity to seed keywords (e.g., "restaurant", "food").
3. **Filter Top-State Dining Businesses**: Select every business that has at least one dining-related category, count those businesses per state, and keep only the ones located in the state(s) with the highest count.

Final Outputs:
- `output_categories/unique_categories.json`
- `output_categories/dining_semantic_categories.json`
- `output_businesses/pa_dining_businesses.json`

1. **Extract Unique Categories**

In [1]:
import json
import os

# Root for all data paths: the parent of the current working directory
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Source: the Yelp business dataset
business_data_file = os.path.join(
    BASE_DIR, "yelp_data", "yelp_academic_dataset_business.json"
)
# Destination: the list of unique categories
unique_categories_file = os.path.join(
    BASE_DIR, "output_categories", "unique_categories.json"
)

# Upper bound on the number of records scanned
MAX_SCAN_RECORDS = 500_000  # stop after the first 500,000 records

def extract_unique_categories(file_path, max_records):
    """Collect the distinct category names found in a JSON-lines business file.

    Every non-blank line of ``file_path`` is parsed as one JSON object. When
    the object has a truthy ``"categories"`` value, that comma-separated
    string is split, each piece is stripped of surrounding whitespace, and
    the non-empty pieces are added to the result.

    Args:
        file_path: Path to a UTF-8 text file with one JSON object per line.
        max_records: Maximum number of records to parse before stopping.
            ``None`` disables the limit and scans the whole file.

    Returns:
        set: The unique, non-empty category names that were seen.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    unique_categories = set()
    record_count = 0

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if max_records is not None and record_count >= max_records:
                break
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at EOF) is not a
                # record; json.loads("") would raise, so skip it uncounted.
                continue
            business = json.loads(line)
            record_count += 1
            if "categories" in business and business["categories"]:
                pieces = (cat.strip() for cat in business["categories"].split(","))
                # Stray commas ("A, , B") yield empty pieces; they are not
                # real categories, so keep only the non-empty names.
                unique_categories.update(name for name in pieces if name)

    return unique_categories

# Extract unique categories from the business dataset
unique_categories = extract_unique_categories(business_data_file, MAX_SCAN_RECORDS)

# Create the output directory if needed so the cell also runs on a fresh checkout
os.makedirs(os.path.dirname(unique_categories_file), exist_ok=True)

# Save the unique categories to a JSON file (sorted so the file is stable between runs)
with open(unique_categories_file, 'w', encoding='utf-8') as outfile:
    json.dump(sorted(unique_categories), outfile, indent=2, ensure_ascii=False)

# MAX_SCAN_RECORDS is only an upper bound: the dataset may contain fewer
# records, so it must not be reported as the number actually processed.
print(f"Scanned up to {MAX_SCAN_RECORDS} records.")
print(f"Extracted {len(unique_categories)} unique categories.")
print(f"Unique categories saved to {unique_categories_file}")

Processed 500000 records.
Extracted 1311 unique categories.
Unique categories saved to d:\Programming\LLM_RS\output_categories\unique_categories.json


In [2]:
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Input file for unique categories
unique_categories_file = os.path.join(BASE_DIR, "output_categories", "unique_categories.json")
# Output file for dining-related categories
dining_categories_file = os.path.join(BASE_DIR, "output_categories", "dining_semantic_categories.json")

# Load unique categories
with open(unique_categories_file, 'r', encoding='utf-8') as file:
    categories = json.load(file)

# Define dining-related seed keywords
dining_keywords = ["restaurant", "food", "cuisine", "bar", "dining"]

# Generate embeddings for categories and keywords
model = SentenceTransformer('all-MiniLM-L6-v2')
category_embeddings = model.encode(categories, convert_to_tensor=True)
keyword_embeddings = model.encode(dining_keywords, convert_to_tensor=True)

# Cosine similarity matrix: one row per keyword, one column per category
similarities = util.cos_sim(keyword_embeddings, category_embeddings)

# A category is dining-related when it exceeds the threshold for at least one
# keyword. Reducing over the keyword axis first means a category that matches
# several keywords is selected once, so the count printed below agrees with
# the number of entries written to the file.
threshold = 0.6
matches_any_keyword = (similarities > threshold).any(dim=0)
dining_related_indices = matches_any_keyword.nonzero(as_tuple=True)[0].tolist()
dining_related = sorted({categories[idx] for idx in dining_related_indices})

# Save results
with open(dining_categories_file, 'w', encoding='utf-8') as outfile:
    json.dump(dining_related, outfile, indent=2, ensure_ascii=False)

print(f"Extracted {len(dining_related)} dining-related categories.")
print(f"Dining-related categories saved to {dining_categories_file}")

Extracted 53 dining-related categories.
Dining-related categories saved to d:\Programming\LLM_RS\output_categories\dining_semantic_categories.json


2. **Filter All Dining Businesses**

In [3]:
import json
import os

# Root for all data paths: the parent of the current working directory
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Inputs (raw businesses, selected dining categories) and the output file
business_data_file = os.path.join(
    BASE_DIR, "yelp_data", "yelp_academic_dataset_business.json"
)
dining_categories_file = os.path.join(
    BASE_DIR, "output_categories", "dining_semantic_categories.json"
)
dining_businesses_file = os.path.join(
    BASE_DIR, "output_businesses", "dining_related_businesses.json"
)

def filter_dining_businesses(business_file, categories_file, output_file):
    """Write every business that has at least one dining category to a JSON file.

    ``categories_file`` is read as a JSON array of category names.
    ``business_file`` is read as JSON lines (one object per non-blank line);
    a business is kept when its comma-separated ``"categories"`` string shares
    at least one whitespace-stripped name with that array. The kept businesses
    are written to ``output_file`` as a single JSON array, in input order, and
    a short summary is printed.

    Args:
        business_file: Path to the UTF-8 JSON-lines business dataset.
        categories_file: Path to the JSON array of dining category names.
        output_file: Path of the JSON file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        OSError: If an input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If an input is not valid JSON.
    """
    with open(categories_file, 'r', encoding='utf-8') as f:
        dining_categories = set(json.load(f))

    dining_businesses = []
    total_records = 0

    with open(business_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not records; json.loads("") would raise.
                continue
            total_records += 1
            business = json.loads(line)
            if "categories" in business and business["categories"]:
                business_cats = {cat.strip() for cat in business["categories"].split(",")}
                if not dining_categories.isdisjoint(business_cats):
                    dining_businesses.append(business)

    # open(..., 'w') does not create missing directories, so do it here to
    # let the notebook run on a fresh checkout.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(dining_businesses, f, indent=2, ensure_ascii=False)

    print(f"[Step 1] Filtered {len(dining_businesses)} dining-related businesses from {total_records} records.")
    print(f"Results saved to {output_file}")

# Run the dining filter over the full business dataset
filter_dining_businesses(
    business_file=business_data_file,
    categories_file=dining_categories_file,
    output_file=dining_businesses_file,
)


[Step 1] Filtered 66852 dining-related businesses from 150346 records.
Results saved to d:\Programming\LLM_RS\output_businesses\dining_related_businesses.json


In [4]:
from collections import Counter
import json
import os

# Root for all data paths: the parent of the current working directory
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# JSON file holding the dining-related businesses to be counted
dining_businesses_file = os.path.join(
    BASE_DIR, "output_businesses", "dining_related_businesses.json"
)

def count_businesses_by_state(input_file):
    """Tally the businesses stored in a JSON file by their ``state`` field.

    Loads a JSON array of business objects from ``input_file``, prints the
    per-state totals from most to least common, and returns the tally.
    Businesses without a ``state`` key are counted under ``'Unknown'``.

    Args:
        input_file: Path to a UTF-8 JSON file containing a list of businesses.

    Returns:
        collections.Counter: Mapping of state value to number of businesses.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Accumulate one record at a time, in file order.
    state_counts = Counter()
    for record in records:
        state_counts[record.get('state', 'Unknown')] += 1

    print("\n[Step 2] State-wise distribution of dining businesses:")
    for state_code, total in state_counts.most_common():
        print(f"{state_code}: {total}")

    return state_counts

# Print the per-state distribution and keep the tally for inspection
state_counts = count_businesses_by_state(input_file=dining_businesses_file)


[Step 2] State-wise distribution of dining businesses:
PA: 15842
FL: 11274
TN: 5473
MO: 5287
IN: 5274
LA: 4851
NJ: 4177
AZ: 3460
AB: 3056
NV: 2398
ID: 1748
CA: 1650
IL: 1196
DE: 1161
NC: 1
CO: 1
HI: 1
MT: 1
XMS: 1


In [5]:
import json
import os

# Root for all data paths: the parent of the current working directory
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Input: all dining-related businesses; output: the PA subset
dining_businesses_file = os.path.join(
    BASE_DIR, "output_businesses", "dining_related_businesses.json"
)
pa_businesses_file = os.path.join(
    BASE_DIR, "output_businesses", "pa_dining_businesses.json"
)

def filter_pa_dining_businesses(input_file, output_file, state='PA'):
    """Write the dining businesses located in one state to a JSON file.

    Loads a JSON array of business objects from ``input_file``, keeps those
    whose ``state`` field equals ``state`` (input order is preserved), writes
    them to ``output_file`` as a JSON array, and prints a short summary.

    Args:
        input_file: Path to a UTF-8 JSON file containing a list of businesses.
        output_file: Path of the JSON file to write. Its parent directory is
            created when it does not exist yet.
        state: Value of the ``state`` field to keep. Defaults to ``'PA'``, so
            existing two-argument calls behave as before.

    Raises:
        OSError: If the input cannot be opened or the output cannot be written.
        json.JSONDecodeError: If ``input_file`` is not valid JSON.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        businesses = json.load(f)

    state_businesses = [b for b in businesses if b.get('state') == state]

    # open(..., 'w') does not create missing directories, so do it here.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(state_businesses, f, indent=2, ensure_ascii=False)

    print(f"\n[Step 3] Filtered {len(state_businesses)} {state} dining-related businesses.")
    print(f"Results saved to {output_file}")

# Extract the PA subset from the dining-related businesses
filter_pa_dining_businesses(
    input_file=dining_businesses_file,
    output_file=pa_businesses_file,
)


[Step 3] Filtered 15842 PA dining-related businesses.
Results saved to d:\Programming\LLM_RS\output_businesses\pa_dining_businesses.json
