# 02_preprocessing.ipynb
## PA Data Preprocessing

This notebook filters and cleans Yelp **PA dining businesses** by removing **irrelevant, low-quality, or inactive businesses**, and then extracts the **PA-specific reviews and users** associated with the businesses that remain.

### Workflow:
1. **Clean Business Data**:
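   - Remove businesses with **too few reviews** (fewer than `MIN_REVIEW_COUNT`, set to 5 in the cleaning cell; this includes businesses with zero reviews).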
   - Exclude **closed businesses**.
   - Filter out businesses with **low average ratings (stars < 2.0)**.

2. **Filter Reviews & Collect User IDs**:
   - Extract reviews associated with **PA dining businesses**.
   - Identify users who have written reviews or left tips for these businesses.

3. **Filter User Data**:
   - Retrieve **basic user information** (e.g., `user_id`, `name`, `review_count`, `average_stars`) for users involved with PA businesses.
   - Discard unnecessary attributes like `friends`, `compliments`, etc.

### Final Outputs:
- `output_businesses/pa_cleaned_dining_businesses.json`: The cleaned dataset, ensuring **only active, relevant, and reviewed** dining businesses are retained.
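- `output_businesses/pa_reviews.json`: Reviews written for the cleaned PA dining businesses (`review_id`, `user_id`, `business_id`, `stars`, `text`, `date`).
- `output_businesses/pa_users.json`: Basic profiles (`user_id`, `name`, `review_count`, `average_stars`) of users who reviewed or left tips for those businesses.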


In [4]:
import json
import random
import os

# Locate the PA dining business file relative to the notebook's parent directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_businesses", "pa_dining_businesses.json")

SAMPLE_SIZE = 20  # upper bound on the number of businesses shown per group
SAMPLE_SEED = 42  # fixed seed so Restart & Run All prints the same sample every time

with open(input_path, 'r', encoding='utf-8') as f:
    businesses = json.load(f)


def _print_business_sample(header, records, rng, size=SAMPLE_SIZE):
    """Print `header`, then up to `size` randomly chosen records.

    Each record is printed as "name - address - city"; a missing address or
    city is printed as an empty string.

    Args:
        header: Line printed before the sample.
        records: List of business dicts to sample from.
        rng: `random.Random` instance used to draw the sample.
        size: Maximum number of records to print.

    Returns:
        The list of sampled records, in the order they were printed.
    """
    print(header)
    chosen = rng.sample(records, min(size, len(records)))
    for record in chosen:
        print(f"{record['name']} - {record.get('address', '')} - {record.get('city', '')}")
    return chosen


# A dedicated generator keeps the output reproducible without reseeding the
# global `random` state that other code may rely on.
sample_rng = random.Random(SAMPLE_SEED)

# A record without `is_open` is treated as open (default 1), which is the same
# default the cleaning cell applies via b.get('is_open', 1); indexing
# b['is_open'] directly would raise KeyError on such a record.
closed_businesses = [b for b in businesses if b.get('is_open', 1) == 0]
open_businesses = [b for b in businesses if b.get('is_open', 1) == 1]

sample = _print_business_sample(
    f"{SAMPLE_SIZE} open=0 Businesses", closed_businesses, sample_rng
)
open_sample = _print_business_sample(
    f"\n{SAMPLE_SIZE} open=1 Businesses", open_businesses, sample_rng
)


20 open=0 Businesses
Butter Crumbs Bake Shoppe - 239 W Butler Ave - Chalfont
PlantPure Cafe - 1115 Walnut St - Philadelphia
Crofts Tavern - 4000 Pottstown Pike - Spring City
Pantry Boy -  - Huntingdon Valley
3 West - 4803 W Chester Pike - Newtown Square
Taormina's Pizzeria & Trattoria - 1601 Valley Forge Rd - Lansdale
Kokopelli Restaurant & Tequila Bar - 1904 Chestnut St - Philadelphia
The Olive Tree Greek Grill - 379 W Uwchlan Ave - Downingtown
FireGirl Mexican Kitchen - 2814 Street Rd - Bensalem
New Bombay Grill - 12 Greenfield Ave - Ardmore
PBandU - 163 E Lancaster Ave - Wayne
My Blue Heaven - 2762 E Pacific St - Philadelphia
Ndulge - 4373 Main St - Philadelphia
Yoshi - 1600 John F Kennedy Blvd - Philadelphia
Wired Beans - 6734 Germantown Ave., Ste 36 - Philadelphia
Hoagie Works - 44 E State St, Ste 4 - Doylestown
Factory Donuts - 4367 W Swamp Rd - Doylestown
South Terminal Market - 306 South St - Philadelphia
Diane & Tom's Cafe - 40 W Maplewood Mall - Philadelphia
Dice's Deli & Cat

In [11]:
import json
import os

# Resolve the input and output files relative to the notebook's parent directory
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
input_path = os.path.join(BASE_DIR, "output_businesses", "pa_dining_businesses.json")
output_path = os.path.join(BASE_DIR, "output_businesses", "pa_cleaned_dining_businesses.json")

# Tunable filter settings
MIN_REVIEW_COUNT = 5   # businesses with fewer reviews than this are dropped
MIN_RATING = 2.0       # businesses rated below this are dropped
KEEP_CLOSED = False    # set True to retain businesses flagged is_open == 0

# Read the full list of PA dining businesses
with open(input_path, 'r', encoding='utf-8') as f:
    businesses = json.load(f)

total_businesses = len(businesses)

# Every removal set below is computed over the FULL business list, so a single
# business may appear in more than one set and the per-reason counts can overlap.

# Closed businesses (a record without `is_open` is treated as open)
removed_closed = (
    set()
    if KEEP_CLOSED
    else {biz['business_id'] for biz in businesses if biz.get('is_open', 1) == 0}
)

# Businesses rated below the minimum star rating
removed_low_ratings = {
    biz['business_id'] for biz in businesses if biz['stars'] < MIN_RATING
}

# Businesses with fewer reviews than the minimum
removed_low_reviews = {
    biz['business_id'] for biz in businesses if biz['review_count'] < MIN_REVIEW_COUNT
}

# Keep only businesses that were not flagged for any reason
excluded_ids = removed_closed | removed_low_ratings | removed_low_reviews
final_businesses = [biz for biz in businesses if biz['business_id'] not in excluded_ids]

# Write the cleaned list to disk
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(final_businesses, f, indent=2, ensure_ascii=False)

# Report how many businesses each rule flagged and how many remain
summary_lines = (
    f"Total original businesses: {total_businesses}",
    f"Removed closed businesses: {len(removed_closed)}",
    f"Removed businesses with rating < {MIN_RATING}: {len(removed_low_ratings)}",
    f"Removed businesses with review count < {MIN_REVIEW_COUNT}: {len(removed_low_reviews)}",
    f"Remaining businesses after filtering: {len(final_businesses)}",
    f"Filtered businesses saved to {output_path}",
)
for summary_line in summary_lines:
    print(summary_line)


Total original businesses: 15842
Removed closed businesses: 5404
Removed businesses with rating < 2.0: 441
Removed businesses with review count < 5: 0
Remaining businesses after filtering: 10105
Filtered businesses saved to d:\Programming\LLM_RS\output_businesses\pa_cleaned_dining_businesses.json


In [4]:
import json
import os

BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Input file paths
pa_business_file = os.path.join(BASE_DIR, "output_businesses", "pa_cleaned_dining_businesses.json")
review_file = os.path.join(BASE_DIR, "yelp_data", "yelp_academic_dataset_review.json")
user_file = os.path.join(BASE_DIR, "yelp_data", "yelp_academic_dataset_user.json")
tip_file = os.path.join(BASE_DIR, "yelp_data", "yelp_academic_dataset_tip.json")

# Output file paths
pa_review_output = os.path.join(BASE_DIR, "output_businesses", "pa_reviews.json")
pa_user_output = os.path.join(BASE_DIR, "output_businesses", "pa_users.json")

# Fields copied from each matching review, in the order they are written out
REVIEW_FIELDS = ('review_id', 'user_id', 'business_id', 'stars', 'text', 'date')


def _iter_json_lines(path):
    """Yield one parsed JSON object per non-blank line of the file at `path`.

    The file is read line by line, so it is never loaded into memory in full.
    Blank lines (for example a trailing empty line) are skipped rather than
    handed to `json.loads`, which would raise `json.JSONDecodeError`.

    Args:
        path: Path of a UTF-8 text file holding one JSON document per line.

    Yields:
        The object decoded from each non-blank line.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                yield json.loads(line)


# Load PA dining business IDs into a set for fast membership tests
with open(pa_business_file, 'r', encoding='utf-8') as f:
    pa_business_ids = {b['business_id'] for b in json.load(f)}

# Collect reviews of PA businesses and the IDs of the users who wrote them
pa_reviews = []
pa_user_ids = set()
review_count = 0  # every review record scanned, matching or not
for review in _iter_json_lines(review_file):
    review_count += 1
    if review['business_id'] in pa_business_ids:
        pa_reviews.append({field: review[field] for field in REVIEW_FIELDS})
        pa_user_ids.add(review['user_id'])

# Tips contribute user IDs only; the tips themselves are not saved
tip_count = 0
for tip in _iter_json_lines(tip_file):
    tip_count += 1
    if tip['business_id'] in pa_business_ids:
        pa_user_ids.add(tip['user_id'])

# Keep a slim profile for every user collected above
pa_users = []
user_count = 0
for user in _iter_json_lines(user_file):
    user_count += 1
    if user['user_id'] in pa_user_ids:
        pa_users.append({
            'user_id': user['user_id'],
            'name': user.get('name', 'N/A'),
            'review_count': user.get('review_count', 0),
            'average_stars': user.get('average_stars', 0)
        })

# Save filtered reviews and users
with open(pa_review_output, 'w', encoding='utf-8') as f:
    json.dump(pa_reviews, f, indent=2, ensure_ascii=False)

with open(pa_user_output, 'w', encoding='utf-8') as f:
    json.dump(pa_users, f, indent=2, ensure_ascii=False)

# Display preprocessing statistics
print(f"Total reviews scanned: {review_count}")
print(f"Filtered PA reviews: {len(pa_reviews)}")
print(f"Total tips scanned: {tip_count}")
print(f"Total users scanned: {user_count}")
print(f"Filtered PA users: {len(pa_users)}")
print(f"Reviews saved to {pa_review_output}")
print(f"Users saved to {pa_user_output}")

Total reviews scanned: 6990280
Filtered PA reviews: 910511
Total tips scanned: 908915
Total users scanned: 1987897
Filtered PA users: 288983
Reviews saved to d:\Programming\LLM_RS\output_businesses\pa_reviews.json
Users saved to d:\Programming\LLM_RS\output_businesses\pa_users.json
