# Yelp Restaurant Data Exploration

**Goal:**  
Filter Yelp business data to NYC restaurants and understand closure patterns, ratings, and business characteristics.

**Plan:**
1. Load Yelp business JSON file
2. Filter for NYC locations (5 boroughs)
3. Filter for restaurants only
4. Analyze closure rates by different factors
5. Explore ratings, price ranges, categories
6. Check data quality (missing values, coordinates)
7. Save cleaned NYC restaurant data


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)


In [20]:
# Load Yelp business data (JSON lines format)
yelp_path = "../data/raw/Yelp JSON/yelp_academic_dataset_business.json"
businesses = []

print("Loading Yelp business data...")
with open(yelp_path, 'r', encoding='utf-8') as f:
    for line in f:
        businesses.append(json.loads(line))

df = pd.DataFrame(businesses)
print(f"Total businesses in dataset: {len(df):,}")
print(f"Columns: {len(df.columns)}")
df.head()


Loading Yelp business data...
Total businesses in dataset: 150,346
Columns: 14


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [21]:
# Explore the data structure
print("Column names:")
print(df.columns.tolist())
print("\n" + "="*60)
print("\nData types:")
print(df.dtypes)
print("\n" + "="*60)
print("\nMissing values:")
print(df.isnull().sum())


Column names:
['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']


Data types:
business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object


Missing values:
business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64


In [None]:
# Check what cities and states are in the dataset
print("Top 20 cities:")
print(df['city'].value_counts().head(20))
print("\n" + "="*60)
print("\nStates in dataset:")
print(df['state'].value_counts())



Top 20 cities:
city
Philadelphia        14569
Tucson               9250
Tampa                9050
Indianapolis         7540
Nashville            6971
New Orleans          6209
Reno                 5935
Edmonton             5054
Saint Louis          4827
Santa Barbara        3829
Boise                2937
Clearwater           2221
Saint Petersburg     1663
Metairie             1643
Sparks               1624
Wilmington           1446
Franklin             1414
St. Louis            1255
St. Petersburg       1185
Meridian             1043
Name: count, dtype: int64


States in dataset:
state
PA     34039
FL     26330
TN     12056
IN     11247
MO     10913
LA      9924
AZ      9912
NJ      8536
NV      7715
AB      5573
CA      5203
ID      4467
DE      2265
IL      2145
TX         4
CO         3
WA         2
HI         2
MA         2
NC         1
UT         1
MT         1
MI         1
SD         1
XMS        1
VI         1
VT         1
Name: count, dtype: int64
Empty DataFrame
Columns: [busi

## Filter for NYC Locations

Based on the data, we'll filter for businesses in New York state with NYC-related cities.


In [22]:
# Filter for NYC - using coordinate bounds as most reliable method
# NYC coordinate bounds
nyc_bounds = {
    'lat_min': 40.4774,
    'lat_max': 40.9176,
    'lon_min': -74.2591,
    'lon_max': -73.7004
}

df_nyc = df[
    (df['latitude'].between(nyc_bounds['lat_min'], nyc_bounds['lat_max'])) &
    (df['longitude'].between(nyc_bounds['lon_min'], nyc_bounds['lon_max']))
].copy()

print(f"Total businesses in NYC (by coordinates): {len(df_nyc):,}")
print(f"\nCities found:")
print(df_nyc['city'].value_counts())


Total businesses in NYC (by coordinates): 0

Cities found:
Series([], Name: count, dtype: int64)


## Filter for Restaurants

Now filter for businesses that are restaurants based on their categories.


In [23]:
# Filter for restaurants
df_nyc['has_restaurant'] = df_nyc['categories'].fillna('').str.contains('Restaurant', case=False)
df_restaurants = df_nyc[df_nyc['has_restaurant']].copy()

print(f"NYC Restaurants: {len(df_restaurants):,}")
print(f"Percentage of all NYC businesses: {len(df_restaurants)/len(df_nyc)*100:.1f}%")

# Sample some categories
print("\nSample restaurant categories:")
print(df_restaurants['categories'].head(10).tolist())


NYC Restaurants: 0


ZeroDivisionError: division by zero

## Analyze Closure Rates

Check how many restaurants are open vs closed.


In [None]:
# Check closure rate
total = len(df_restaurants)
open_count = (df_restaurants['is_open'] == 1).sum()
closed_count = (df_restaurants['is_open'] == 0).sum()
closure_rate = (closed_count / total) * 100

print(f"Total NYC Restaurants: {total:,}")
print(f"Open: {open_count:,} ({open_count/total*100:.1f}%)")
print(f"Closed: {closed_count:,} ({closure_rate:.1f}%)")
print(f"\n{'='*60}")
print(f"CLOSURE RATE: {closure_rate:.2f}%")


In [None]:
# Visualize closure rate
fig, ax = plt.subplots(figsize=(8, 5))
status_counts = df_restaurants['is_open'].map({0: 'Closed', 1: 'Open'}).value_counts()
colors = ['#e74c3c', '#2ecc71']
status_counts.plot(kind='bar', ax=ax, color=colors, edgecolor='black')
ax.set_title('Restaurant Status in NYC', fontsize=14, fontweight='bold')
ax.set_xlabel('Status', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()


## Explore Ratings and Reviews


In [None]:
# Rating statistics
print("Rating Distribution:")
print(df_restaurants['stars'].describe())
print("\n" + "="*60)
print("\nRatings by status:")
print(df_restaurants.groupby('is_open')['stars'].agg(['mean', 'median', 'std']))

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Overall rating distribution
ax1.hist(df_restaurants['stars'], bins=20, edgecolor='black', color='skyblue')
ax1.set_xlabel('Star Rating', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Distribution of Restaurant Ratings', fontsize=14, fontweight='bold')
ax1.axvline(df_restaurants['stars'].mean(), color='red', linestyle='--', label=f'Mean: {df_restaurants["stars"].mean():.2f}')
ax1.legend()

# Ratings by status
df_restaurants.boxplot(column='stars', by='is_open', ax=ax2)
ax2.set_title('Ratings: Open (1) vs Closed (0)', fontsize=14, fontweight='bold')
ax2.set_xlabel('Status (0=Closed, 1=Open)', fontsize=12)
ax2.set_ylabel('Star Rating', fontsize=12)
plt.suptitle('')

plt.tight_layout()
plt.show()


In [None]:
# Review count statistics
print("Review Count Statistics:")
print(df_restaurants['review_count'].describe())
print("\n" + "="*60)
print("\nReview counts by status:")
print(df_restaurants.groupby('is_open')['review_count'].agg(['mean', 'median', 'sum']))


## Explore Restaurant Categories


In [None]:
# Extract and count all categories
all_categories = df_restaurants['categories'].fillna('').str.split(', ')
category_list = [cat.strip() for cats in all_categories for cat in cats if cat.strip()]
category_counts = pd.Series(category_list).value_counts()

print(f"Total unique categories: {len(category_counts)}")
print(f"\nTop 20 restaurant categories:")
print(category_counts.head(20))

# Visualize top 15
fig, ax = plt.subplots(figsize=(12, 6))
category_counts.head(15).plot(kind='barh', ax=ax, color='coral', edgecolor='black')
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
ax.set_title('Top 15 Restaurant Categories in NYC', fontsize=14, fontweight='bold')
ax.invert_yaxis()
plt.tight_layout()
plt.show()


## Data Quality Checks


In [None]:
# Check for missing values in key columns
key_cols = ['name', 'address', 'city', 'state', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories']
print("Missing values in key columns:")
print(df_restaurants[key_cols].isnull().sum())
print("\n" + "="*60)

# Check coordinate validity (should all be valid since we filtered by coordinates)
print("\nCoordinate ranges:")
print(f"Latitude: {df_restaurants['latitude'].min():.4f} to {df_restaurants['latitude'].max():.4f}")
print(f"Longitude: {df_restaurants['longitude'].min():.4f} to {df_restaurants['longitude'].max():.4f}")

# Check for duplicates
print("\n" + "="*60)
print(f"\nDuplicate business IDs: {df_restaurants['business_id'].duplicated().sum()}")
print(f"Duplicate names (may be legitimate): {df_restaurants['name'].duplicated().sum()}")


## Summary of Findings

Key insights from the Yelp NYC restaurant data exploration.


In [None]:
# Generate summary statistics
summary = f"""
{'='*70}
YELP NYC RESTAURANT DATA - SUMMARY
{'='*70}

DATASET SIZE:
  • Total NYC Restaurants: {len(df_restaurants):,}
  • Open: {(df_restaurants['is_open']==1).sum():,}
  • Closed: {(df_restaurants['is_open']==0).sum():,}
  • Closure Rate: {((df_restaurants['is_open']==0).sum()/len(df_restaurants)*100):.2f}%

RATINGS:
  • Average Rating: {df_restaurants['stars'].mean():.2f} stars
  • Median Rating: {df_restaurants['stars'].median():.1f} stars
  • Open Restaurants: {df_restaurants[df_restaurants['is_open']==1]['stars'].mean():.2f} stars
  • Closed Restaurants: {df_restaurants[df_restaurants['is_open']==0]['stars'].mean():.2f} stars

REVIEWS:
  • Total Reviews: {df_restaurants['review_count'].sum():,}
  • Average Reviews per Restaurant: {df_restaurants['review_count'].mean():.1f}
  • Median Reviews: {df_restaurants['review_count'].median():.0f}

TOP CATEGORIES:
"""
for i, (cat, count) in enumerate(category_counts.head(5).items(), 1):
    summary += f"  {i}. {cat}: {count:,}\n"

summary += f"""
DATA QUALITY:
  • Missing Categories: {df_restaurants['categories'].isnull().sum()}
  • Missing Coordinates: 0 (filtered)
  • Duplicate Business IDs: {df_restaurants['business_id'].duplicated().sum()}
  
{'='*70}
"""

print(summary)


## Save Cleaned Data

Save the cleaned NYC restaurant data for future analysis.


In [None]:
# Select relevant columns for cleaned dataset
clean_cols = [
    'business_id', 'name', 'address', 'city', 'state', 'postal_code',
    'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories'
]

df_clean = df_restaurants[clean_cols].copy()

# Save to processed data folder
output_path = '../data/processed/yelp_restaurants_nyc.csv'
df_clean.to_csv(output_path, index=False)

print(f"✓ Saved {len(df_clean):,} NYC restaurants to: {output_path}")
print(f"✓ File size: {df_clean.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB (in memory)")
print(f"✓ Columns saved: {len(clean_cols)}")
print("\nCleaned dataset ready for next steps!")
