# Yelp Restaurant Data Exploration

**Goal:**  
Filter Yelp business data to NYC restaurants and understand closure patterns, ratings, and business characteristics.

**Plan:**
1. Load Yelp business JSON file
2. Filter for NYC locations (5 boroughs)
3. Filter for restaurants only
4. Analyze closure rates by different factors
5. Explore ratings, price ranges, categories
6. Check data quality (missing values, coordinates)
7. Save cleaned NYC restaurant data


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np

# Plot styling: seaborn's "whitegrid" theme plus a default figure size of (10, 6)
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})


In [None]:
# Read the Yelp business dump, which stores one JSON object per line
yelp_path = "../data/raw/Yelp JSON/yelp_academic_dataset_business.json"

print("Loading Yelp business data...")
with open(yelp_path, 'r', encoding='utf-8') as f:
    # Each line is parsed into a dict; each dict becomes one DataFrame row
    businesses = [json.loads(line) for line in f]

df = pd.DataFrame(businesses)
n_rows, n_cols = df.shape
print(f"Total businesses in dataset: {n_rows:,}")
print(f"Columns: {n_cols}")
df.head()


In [None]:
# Overview of the raw frame: column names, dtypes and per-column null counts
divider = "\n" + "=" * 60

column_names = df.columns.tolist()
print("Column names:")
print(column_names)
print(divider)

print("\nData types:")
print(df.dtypes)
print(divider)

missing_per_column = df.isnull().sum()
print("\nMissing values:")
print(missing_per_column)


In [None]:
# See which cities and states the dataset actually covers
city_counts = df['city'].value_counts()
print("Top 20 cities:")
print(city_counts.head(20))
print("\n" + "="*60)

state_counts = df['state'].value_counts()
print("\nStates in dataset:")
print(state_counts)


## Filter for NYC Locations

Based on the data, we'll filter for businesses whose latitude/longitude fall inside a bounding box around NYC, rather than matching on city names.


In [None]:
# Filter for NYC: bounding box around the city, restricted to New York state.
# The box is a plain rectangle, so on its own it also covers parts of
# New Jersey (e.g. Jersey City, Newark); the state check removes those rows.
nyc_bounds = {
    'lat_min': 40.4774,
    'lat_max': 40.9176,
    'lon_min': -74.2591,
    'lon_max': -73.7004
}

# between() is inclusive on both ends and evaluates to False for missing
# values, so rows without coordinates are dropped here as well.
in_lat_range = df['latitude'].between(nyc_bounds['lat_min'], nyc_bounds['lat_max'])
in_lon_range = df['longitude'].between(nyc_bounds['lon_min'], nyc_bounds['lon_max'])
# NOTE(review): assumes two-letter state codes such as 'NY' — confirm against
# the state counts printed when exploring the raw data.
in_ny_state = df['state'] == 'NY'

df_nyc = df[in_lat_range & in_lon_range & in_ny_state].copy()

print(f"Total businesses in NYC (by coordinates): {len(df_nyc):,}")
print(f"\nCities found:")
print(df_nyc['city'].value_counts())


## Filter for Restaurants

Now filter for businesses that are restaurants based on their categories.


In [None]:
# Flag restaurants: any business whose category string contains "restaurant"
# (case-insensitive). Rows with missing categories are treated as non-restaurants.
df_nyc['has_restaurant'] = df_nyc['categories'].fillna('').str.contains('Restaurant', case=False)
df_restaurants = df_nyc[df_nyc['has_restaurant']].copy()

# len() returns plain ints, so an empty NYC subset would raise
# ZeroDivisionError here; report NaN instead of crashing the notebook.
if len(df_nyc):
    restaurant_share = len(df_restaurants) / len(df_nyc) * 100
else:
    restaurant_share = float('nan')

print(f"NYC Restaurants: {len(df_restaurants):,}")
print(f"Percentage of all NYC businesses: {restaurant_share:.1f}%")

# Sample some categories
print("\nSample restaurant categories:")
print(df_restaurants['categories'].head(10).tolist())


## Analyze Closure Rates

Check how many restaurants are open vs closed.


In [None]:
# Tally open vs. closed restaurants and derive the closure rate (percent closed)
is_open_flag = df_restaurants['is_open']
total = len(df_restaurants)
open_count = is_open_flag.eq(1).sum()
closed_count = is_open_flag.eq(0).sum()
closure_rate = (closed_count / total) * 100
open_rate = open_count / total * 100

print(f"Total NYC Restaurants: {total:,}")
print(f"Open: {open_count:,} ({open_rate:.1f}%)")
print(f"Closed: {closed_count:,} ({closure_rate:.1f}%)")
print(f"\n{'='*60}")
print(f"CLOSURE RATE: {closure_rate:.2f}%")


In [None]:
# Visualize closure rate
fig, ax = plt.subplots(figsize=(8, 5))

# value_counts() orders bars by frequency, so the bar order depends on the
# data while the colours are assigned by position. Pin the order so the
# colours always line up with the labels: Closed -> red, Open -> green.
status_order = ['Closed', 'Open']
colors = ['#e74c3c', '#2ecc71']
status_counts = (
    df_restaurants['is_open']
    .map({0: 'Closed', 1: 'Open'})
    .value_counts()
    .reindex(status_order, fill_value=0)  # keep a zero-height bar if a status is absent
)

# rot=0 keeps the x tick labels horizontal
status_counts.plot(kind='bar', ax=ax, color=colors, edgecolor='black', rot=0)
ax.set_title('Restaurant Status in NYC', fontsize=14, fontweight='bold')
ax.set_xlabel('Status', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.tight_layout()
plt.show()


## Explore Ratings and Reviews


In [None]:
# Star-rating statistics, overall and split by open/closed status
stars = df_restaurants['stars']
mean_stars = stars.mean()

print("Rating Distribution:")
print(stars.describe())
print("\n" + "="*60)
print("\nRatings by status:")
stars_by_status = df_restaurants.groupby('is_open')['stars'].agg(['mean', 'median', 'std'])
print(stars_by_status)

# Two panels side by side: histogram on the left, box plot on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: histogram of all ratings with the mean marked as a dashed line
ax1.hist(stars, bins=20, edgecolor='black', color='skyblue')
ax1.set_xlabel('Star Rating', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Distribution of Restaurant Ratings', fontsize=14, fontweight='bold')
ax1.axvline(mean_stars, color='red', linestyle='--', label=f'Mean: {mean_stars:.2f}')
ax1.legend()

# Right panel: rating spread per status. The titles are set after the
# boxplot call so they replace the ones pandas generates for by= plots.
df_restaurants.boxplot(column='stars', by='is_open', ax=ax2)
ax2.set_title('Ratings: Open (1) vs Closed (0)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Status (0=Closed, 1=Open)', fontsize=12)
ax2.set_ylabel('Star Rating', fontsize=12)
fig.suptitle('')

plt.tight_layout()
plt.show()


In [None]:
# Review-count statistics, overall and split by open/closed status
review_counts = df_restaurants['review_count']
print("Review Count Statistics:")
print(review_counts.describe())
print("\n" + "="*60)

reviews_by_status = df_restaurants.groupby('is_open')['review_count'].agg(['mean', 'median', 'sum'])
print("\nReview counts by status:")
print(reviews_by_status)


## Explore Restaurant Categories


In [None]:
# Tally every individual category label across the NYC restaurants
all_categories = df_restaurants['categories'].fillna('').str.split(', ')

category_list = []
for cats in all_categories:
    for cat in cats:
        label = cat.strip()
        if label:  # skip the empty string produced by rows without categories
            category_list.append(label)
category_counts = pd.Series(category_list).value_counts()

print(f"Total unique categories: {len(category_counts)}")
print(f"\nTop 20 restaurant categories:")
print(category_counts.head(20))

# Horizontal bar chart of the 15 most frequent labels; the y axis is
# inverted so the first (most frequent) label sits at the top.
top_15 = category_counts.head(15)
fig, ax = plt.subplots(figsize=(12, 6))
top_15.plot(kind='barh', ax=ax, color='coral', edgecolor='black')
ax.set_title('Top 15 Restaurant Categories in NYC', fontsize=14, fontweight='bold')
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
ax.invert_yaxis()
plt.tight_layout()
plt.show()


## Data Quality Checks


In [None]:
# Null counts for the columns the analysis relies on
key_cols = ['name', 'address', 'city', 'state', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories']
missing_in_key_cols = df_restaurants[key_cols].isnull().sum()
print("Missing values in key columns:")
print(missing_in_key_cols)
print("\n" + "="*60)

# Coordinate extremes; these should sit inside the NYC bounds because the
# rows were selected by coordinates in the first place
print("\nCoordinate ranges:")
for axis_label, axis_col in (("Latitude", 'latitude'), ("Longitude", 'longitude')):
    axis_values = df_restaurants[axis_col]
    print(f"{axis_label}: {axis_values.min():.4f} to {axis_values.max():.4f}")

# Duplicate detection on the ID column and on business names
print("\n" + "="*60)
duplicate_ids = df_restaurants['business_id'].duplicated().sum()
duplicate_names = df_restaurants['name'].duplicated().sum()
print(f"\nDuplicate business IDs: {duplicate_ids}")
print(f"Duplicate names (may be legitimate): {duplicate_names}")


## Summary of Findings

Key insights from the Yelp NYC restaurant data exploration.


In [None]:
# Assemble the text report line by line, then print it in one go
rule = '=' * 70
is_open = df_restaurants['is_open']
stars = df_restaurants['stars']
reviews = df_restaurants['review_count']
n_total = len(df_restaurants)
n_open = (is_open == 1).sum()
n_closed = (is_open == 0).sum()

# Header plus dataset size, ratings and review sections; the empty first
# and last entries produce the leading and trailing newline of the section.
head_lines = [
    "",
    rule,
    "YELP NYC RESTAURANT DATA - SUMMARY",
    rule,
    "",
    "DATASET SIZE:",
    f"  • Total NYC Restaurants: {n_total:,}",
    f"  • Open: {n_open:,}",
    f"  • Closed: {n_closed:,}",
    f"  • Closure Rate: {(n_closed / n_total * 100):.2f}%",
    "",
    "RATINGS:",
    f"  • Average Rating: {stars.mean():.2f} stars",
    f"  • Median Rating: {stars.median():.1f} stars",
    f"  • Open Restaurants: {stars[is_open == 1].mean():.2f} stars",
    f"  • Closed Restaurants: {stars[is_open == 0].mean():.2f} stars",
    "",
    "REVIEWS:",
    f"  • Total Reviews: {reviews.sum():,}",
    f"  • Average Reviews per Restaurant: {reviews.mean():.1f}",
    f"  • Median Reviews: {reviews.median():.0f}",
    "",
    "TOP CATEGORIES:",
    "",
]
summary = "\n".join(head_lines)

# One numbered line per top-5 category, each terminated by a newline
summary += "".join(
    f"  {rank}. {cat}: {count:,}\n"
    for rank, (cat, count) in enumerate(category_counts.head(5).items(), 1)
)

# Data-quality footer; the "  " entry is the indented blank line before the rule
tail_lines = [
    "",
    "DATA QUALITY:",
    f"  • Missing Categories: {df_restaurants['categories'].isnull().sum()}",
    "  • Missing Coordinates: 0 (filtered)",
    f"  • Duplicate Business IDs: {df_restaurants['business_id'].duplicated().sum()}",
    "  ",
    rule,
    "",
]
summary += "\n".join(tail_lines)

print(summary)


## Save Cleaned Data

Save the cleaned NYC restaurant data for future analysis.


In [None]:
# Select relevant columns for cleaned dataset
from pathlib import Path

clean_cols = [
    'business_id', 'name', 'address', 'city', 'state', 'postal_code',
    'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories'
]

df_clean = df_restaurants[clean_cols].copy()

# Save to processed data folder, creating it first so to_csv does not fail
# when the directory does not exist yet.
output_path = '../data/processed/yelp_restaurants_nyc.csv'
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)

# Report the real on-disk size of the CSV. The DataFrame's in-memory
# footprint is a different number, so it is labelled separately.
file_mb = Path(output_path).stat().st_size / 1024 / 1024
memory_mb = df_clean.memory_usage(deep=True).sum() / 1024 / 1024

print(f"✓ Saved {len(df_clean):,} NYC restaurants to: {output_path}")
print(f"✓ File size: {file_mb:.2f} MB on disk ({memory_mb:.2f} MB in memory)")
print(f"✓ Columns saved: {len(clean_cols)}")
print("\nCleaned dataset ready for next steps!")
