# Yelp data analysis

## Creating helper functions


In [2]:
import json
import pandas as pd

# A function to explore the first few rows of a json file

def head_json(filename, n=5):
    """Print the first ``n`` records of a newline-delimited JSON file.

    The file is read lazily, one line at a time; each line is parsed with
    ``json.loads`` and the resulting object is printed.

    Parameters
    ----------
    filename : str or path-like
        Path to a file holding one JSON document per line.
    n : int, default 5
        Maximum number of lines to parse and print. A value <= 0 prints
        nothing.

    Raises
    ------
    json.JSONDecodeError
        If one of the first ``n`` lines is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere (e.g. cp1252 on Windows) and
    # would fail or mis-decode non-ASCII text.
    with open(filename, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            if line_no >= n:
                break
            print(json.loads(line))

# Preview the first five business records (head_json's default is n=5).
head_json('../data/raw/yelp_academic_dataset_business.json')

{'business_id': 'Pns2l4eNsfO8kk83dixA6A', 'name': 'Abby Rappoport, LAC, CMQ', 'address': '1616 Chapala St, Ste 2', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93101', 'latitude': 34.4266787, 'longitude': -119.7111968, 'stars': 5.0, 'review_count': 7, 'is_open': 0, 'attributes': {'ByAppointmentOnly': 'True'}, 'categories': 'Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists', 'hours': None}
{'business_id': 'mpf3x-BjTdTEA3yCZrAYPw', 'name': 'The UPS Store', 'address': '87 Grasso Plaza Shopping Center', 'city': 'Affton', 'state': 'MO', 'postal_code': '63123', 'latitude': 38.551126, 'longitude': -90.335695, 'stars': 3.0, 'review_count': 15, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'True'}, 'categories': 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', 'Wednesday': '8:0-18:30', 'Thursday': '8:0-18:30', 'Friday': '8:0-18

## Counting the total number of rows (businesses)

In [3]:
# Stream the file line by line so counting rows never loads it into memory.
# UTF-8 is given explicitly so decoding does not depend on the platform's
# default encoding.
with open('../data/raw/yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    total = sum(1 for _ in f)
print(f"Total businesses: {total}")

Total businesses: 150346


## Checking which cities are represented in the dataset

In [5]:
from collections import Counter

def get_city_state_distribution(filename, n=150346):
    """Count businesses per "city, state" pair in a newline-delimited JSON file.

    Reads up to ``n`` lines, parses each as JSON, and tallies the label
    ``"<city>, <state>"``. A summary and the 30 most common locations are
    printed, and the full tally is returned.

    Parameters
    ----------
    filename : str or path-like
        Path to a file holding one JSON object per line.
    n : int or None, default 150346
        Maximum number of lines to read; ``None`` reads the whole file.
        The default equals the row count reported for the business file
        earlier in this notebook.

    Returns
    -------
    collections.Counter
        Mapping of ``"<city>, <state>"`` to the number of records with that
        label.

    Raises
    ------
    json.JSONDecodeError
        If a line that is read is not valid JSON.

    Notes
    -----
    Only a *missing* ``city``/``state`` key is replaced with ``'Unknown'``;
    values that are present are used verbatim, so differently spelled names
    are counted as separate locations.
    """
    location_counts = Counter()
    sampled = 0  # records actually read, which may be fewer than n

    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n is not None and i >= n:
                break
            data = json.loads(line)
            city = data.get('city', 'Unknown')
            state = data.get('state', 'Unknown')
            # Tally directly rather than building an intermediate list of labels.
            location_counts[f"{city}, {state}"] += 1
            sampled += 1

    # Report the number of records processed, not the requested cap.
    print(f"Sampled {sampled} businesses")
    print(f"Found {len(location_counts)} unique city-state combinations\n")
    print("Top 30 locations:")
    for location, count in location_counts.most_common(30):
        print(f"  {location}: {count}")

    return location_counts

# Tally every business in the file: the default n (150346) equals the total
# row count printed earlier in this notebook.
locations = get_city_state_distribution('../data/raw/yelp_academic_dataset_business.json')

Sampled 150346 businesses
Found 1467 unique city-state combinations

Top 30 locations:
  Philadelphia, PA: 14567
  Tucson, AZ: 9249
  Tampa, FL: 9048
  Indianapolis, IN: 7540
  Nashville, TN: 6968
  New Orleans, LA: 6208
  Reno, NV: 5932
  Edmonton, AB: 5054
  Saint Louis, MO: 4827
  Santa Barbara, CA: 3829
  Boise, ID: 2937
  Clearwater, FL: 2221
  Saint Petersburg, FL: 1663
  Metairie, LA: 1643
  Sparks, NV: 1623
  Wilmington, DE: 1445
  Franklin, TN: 1410
  St. Louis, MO: 1254
  St. Petersburg, FL: 1185
  Meridian, ID: 1042
  Brandon, FL: 1033
  Largo, FL: 1002
  Carmel, IN: 967
  Cherry Hill, NJ: 959
  West Chester, PA: 838
  Goleta, CA: 798
  Palm Harbor, FL: 665
  Greenwood, IN: 649
  Brentwood, TN: 644
  New Port Richey, FL: 604
