In [1]:
import requests
import json
import time
import pandas as pd
import os

In [None]:
# Set up Google Places API key
API_KEY = "API key"  

# Define NYC area coordinates to cover all boroughs
nyc_areas = [
    {"name": "Manhattan Midtown", "lat": 40.7549, "lng": -73.9840},
    {"name": "Manhattan Downtown", "lat": 40.7128, "lng": -74.0060},
    {"name": "Manhattan Upper East", "lat": 40.7735, "lng": -73.9565},
    {"name": "Manhattan Upper West", "lat": 40.7870, "lng": -73.9754},
    {"name": "Brooklyn Downtown", "lat": 40.6935, "lng": -73.9888},
    {"name": "Brooklyn Williamsburg", "lat": 40.7081, "lng": -73.9571},
    {"name": "Brooklyn Park Slope", "lat": 40.6681, "lng": -73.9806},
    {"name": "Queens Flushing", "lat": 40.7654, "lng": -73.8318},
    {"name": "Queens Astoria", "lat": 40.7644, "lng": -73.9235},
    {"name": "Queens Jamaica", "lat": 40.7020, "lng": -73.8003},
    {"name": "Bronx Fordham", "lat": 40.8607, "lng": -73.8903},
    {"name": "Bronx South", "lat": 40.8126, "lng": -73.9260},
    {"name": "Staten Island North", "lat": 40.6392, "lng": -74.1349},
    {"name": "Staten Island South", "lat": 40.5486, "lng": -74.2029}
]

# Restaurant types categorized by region/ethnicity
regional_cuisine_types = [
    "american_restaurant",
    "chinese_restaurant",
    "italian_restaurant",
    "mexican_restaurant",
    "japanese_restaurant",
    "indian_restaurant",
    "greek_restaurant",
    "french_restaurant",
    "thai_restaurant",
    "vietnamese_restaurant",
    "korean_restaurant",
    "spanish_restaurant",
    "middle_eastern_restaurant",
    "mediterranean_restaurant",
    "lebanese_restaurant",
    "turkish_restaurant",
    "brazilian_restaurant",
    "african_restaurant",
    "afghani_restaurant",
    "indonesian_restaurant"
]

# Restaurant types categorized by specific food type
specific_food_types = [
    "pizza_restaurant",
    "sushi_restaurant",
    "ramen_restaurant",
    "hamburger_restaurant",
    "seafood_restaurant",
    "barbecue_restaurant",
    "steak_house"
]

# Combine all restaurant types for searching
all_restaurant_types = regional_cuisine_types + specific_food_types + ["restaurant"]

# Helper function to get type priority
def get_type_priority(search_type):
    """Get type priority (lower number means higher priority)"""
    if search_type in specific_food_types:
        return 1  # Specific food types have highest priority
    elif search_type in regional_cuisine_types:
        return 2  # Regional cuisine types have medium priority
    else:
        return 3  # Generic restaurant type has lowest priority

# Function to deduplicate and merge restaurant types
def deduplicate_and_merge_types(df):
    """Deduplicate restaurants and create separate lists for regional and specific types"""
    # Create a dictionary to store info for each place_id
    place_info = {}
    
    for _, row in df.iterrows():
        place_id = row['place_id']
        current_type = row['search_type']
        
        if place_id not in place_info:
            # First encounter with this restaurant, initialize
            place_info[place_id] = row.to_dict()
            place_info[place_id]['all_search_types'] = {current_type}
            place_info[place_id]['regional_types'] = [current_type] if current_type in regional_cuisine_types else []
            place_info[place_id]['specific_types'] = [current_type] if current_type in specific_food_types else []
        else:
            # Already have info for this restaurant, update type information
            place_info[place_id]['all_search_types'].add(current_type)
            
            # Update regional_types
            if current_type in regional_cuisine_types and current_type not in place_info[place_id]['regional_types']:
                place_info[place_id]['regional_types'].append(current_type)
            
            # Update specific_types
            if current_type in specific_food_types and current_type not in place_info[place_id]['specific_types']:
                place_info[place_id]['specific_types'].append(current_type)
            
            # Keep the highest priority type as the main search_type
            current_priority = get_type_priority(place_info[place_id]['search_type'])
            new_priority = get_type_priority(current_type)
            
            if new_priority < current_priority:  # Lower number means higher priority
                place_info[place_id]['search_type'] = current_type
    
    # Convert dictionary back to DataFrame
    merged_rows = []
    for place_data in place_info.values():
        # Convert sets to lists
        place_data['all_search_types'] = list(place_data['all_search_types'])
        merged_rows.append(place_data)
    
    result_df = pd.DataFrame(merged_rows)
    return result_df

# Function to search for restaurants of a specific type in an area
def search_restaurants_for_type(area_name, lat, lng, restaurant_type, radius=1500):
    """Search for specific restaurant types in an area"""
    print(f"Searching for {restaurant_type} in {area_name}...")
    
    url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    
    # Base parameters for the search
    if restaurant_type == "restaurant":
        # For general restaurants, just use the type
        params = {
            "location": f"{lat},{lng}",
            "radius": radius,
            "type": "restaurant",
            "key": API_KEY
        }
    else:
        # For specific restaurant types, use both type and keyword
        params = {
            "location": f"{lat},{lng}",
            "radius": radius,
            "type": "restaurant",
            "keyword": restaurant_type.replace("_", " "),  # Convert to natural language
            "key": API_KEY
        }
    
    all_results = []
    page_token = None
    page_count = 1
    
    # Get results from all pages
    while True:
        if page_token:
            params = {"key": API_KEY, "pagetoken": page_token}
        
        response = requests.get(url, params=params)
        data = response.json()
        
        if data["status"] != "OK" and data["status"] != "ZERO_RESULTS":
            print(f"API Error: {data['status']}")
            break
        
        if "results" in data:
            print(f"Processing page {page_count}, containing {len(data['results'])} records")
            
            for place in data["results"]:
                # Basic information from the search result
                restaurant_info = {
                    "place_id": place["place_id"],
                    "name": place["name"],
                    "address": place.get("vicinity", "Unknown address"),
                    "latitude": place["geometry"]["location"]["lat"],
                    "longitude": place["geometry"]["location"]["lng"],
                    "rating": place.get("rating", 0),
                    "user_ratings_total": place.get("user_ratings_total", 0),
                    "price_level": place.get("price_level", 0),
                    "all_types": place.get("types", []),
                    "search_type": restaurant_type,
                    "area": area_name,
                    "business_status": place.get("business_status", ""),
                    "photos": [photo.get("photo_reference", "") for photo in place.get("photos", [])][:1]
                }
                all_results.append(restaurant_info)
        
        # Check for next page
        if "next_page_token" in data and data["next_page_token"]:
            page_token = data["next_page_token"]
            page_count += 1
            time.sleep(2)  # Required delay before requesting next page
        else:
            break
    
    print(f"Found {len(all_results)} places for {restaurant_type} in {area_name}")
    return all_results

# Function to get additional details for restaurants
def get_restaurant_details(place_ids):
    """Get additional details for a batch of places"""
    details_results = {}
    
    for i, place_id in enumerate(place_ids):
        if i > 0 and i % 20 == 0:
            print(f"Processed {i} place details. Pausing...")
            time.sleep(5)  # Avoid hitting rate limits
        
        url = "https://maps.googleapis.com/maps/api/place/details/json"
        params = {
            "place_id": place_id,
            "fields": "name,types,formatted_address,formatted_phone_number,website,opening_hours",
            "key": API_KEY
        }
        
        try:
            response = requests.get(url, params=params)
            data = response.json()
            
            if data["status"] == "OK" and "result" in data:
                result = data["result"]
                # Safely access fields, avoid missing field key errors
                details = {
                    "formatted_address": result.get("formatted_address", ""),
                    "formatted_phone_number": result.get("formatted_phone_number", ""),
                    "website": result.get("website", "")
                }
                
                # Handle complex fields separately
                if "opening_hours" in result and "weekday_text" in result["opening_hours"]:
                    details["opening_hours_text"] = json.dumps(result["opening_hours"]["weekday_text"])
                else:
                    details["opening_hours_text"] = "[]"
                    
                if "types" in result:
                    details["detail_types"] = json.dumps(result["types"])
                else:
                    details["detail_types"] = "[]"
                
                details_results[place_id] = details
            else:
                details_results[place_id] = {
                    "formatted_address": "",
                    "formatted_phone_number": "",
                    "website": "",
                    "opening_hours_text": "[]",
                    "detail_types": "[]"
                }
        except Exception as e:
            print(f"Error getting details for place {place_id}: {str(e)}")
            details_results[place_id] = {
                "formatted_address": "",
                "formatted_phone_number": "",
                "website": "",
                "opening_hours_text": "[]",
                "detail_types": "[]"
            }
    
    return details_results

# Main function
def main():
    # Create directory to save data
    os.makedirs("nyc_restaurants_data", exist_ok=True)
    
    all_restaurants = []
    
    # Search all areas
    for area_idx, area in enumerate(nyc_areas):
        print(f"\nProcessing area {area_idx+1}/{len(nyc_areas)}: {area['name']}")
        area_restaurants = []
        
        # First search for general restaurants
        general_results = search_restaurants_for_type(area['name'], area['lat'], area['lng'], "restaurant")
        area_restaurants.extend(general_results)
        
        # Then search for each specific restaurant type
        for type_idx, restaurant_type in enumerate(all_restaurant_types[1:]):  # Skip "restaurant" as we already did it
            results = search_restaurants_for_type(area['name'], area['lat'], area['lng'], restaurant_type)
            area_restaurants.extend(results)
            
            # Pause to avoid hitting API limits
            if (type_idx + 1) % 3 == 0 and type_idx + 1 < len(all_restaurant_types) - 1:
                print("Pausing for 10 seconds to avoid API limits...")
                time.sleep(10)
        
        # Process area results
        if area_restaurants:
            # Create DataFrame
            area_df = pd.DataFrame(area_restaurants)
            
            # Deduplicate and merge types
            area_df = deduplicate_and_merge_types(area_df)
            
            # Convert list fields to JSON strings for CSV storage
            area_df['all_search_types'] = area_df['all_search_types'].apply(json.dumps)
            area_df['regional_types'] = area_df['regional_types'].apply(json.dumps)
            area_df['specific_types'] = area_df['specific_types'].apply(json.dumps)
            
            # Save area results
            area_df.to_csv(f"nyc_restaurants_data/{area['name'].replace(' ', '_')}_restaurants.csv", index=False)
            print(f"Saved {len(area_df)} unique restaurants for {area['name']}")
        
        # Add to full collection
        all_restaurants.extend(area_restaurants)
        
        # Longer pause between areas
        if (area_idx + 1) < len(nyc_areas):
            print(f"Completed area {area_idx+1}/{len(nyc_areas)}. Pausing for 60 seconds...")
            time.sleep(60)
    
    # Process and save all data
    if all_restaurants:
        # Create DataFrame with all restaurants
        all_restaurants_df = pd.DataFrame(all_restaurants)
        
        # Deduplicate and merge types
        all_restaurants_df = deduplicate_and_merge_types(all_restaurants_df)
        print(f"Found {len(all_restaurants_df)} unique restaurants across all areas")
        
        # Optional: Get additional details for top-rated restaurants
        if len(all_restaurants_df) > 0:
            # Get details for top 500 restaurants by rating * number of ratings
            all_restaurants_df['popularity_score'] = all_restaurants_df['rating'] * all_restaurants_df['user_ratings_total'].apply(lambda x: min(x, 1000))
            top_restaurants = all_restaurants_df.sort_values(by='popularity_score', ascending=False).head(500)
            
            print(f"Getting additional details for top {len(top_restaurants)} restaurants...")
            place_details = get_restaurant_details(top_restaurants['place_id'].tolist())
            
            # Create details DataFrame
            details_list = []
            for place_id, details in place_details.items():
                details['place_id'] = place_id
                details_list.append(details)
            
            if details_list:
                details_df = pd.DataFrame(details_list)
                # Merge with main DataFrame
                all_restaurants_df = pd.merge(all_restaurants_df, details_df, on='place_id', how='left')
        
        # Convert list fields to JSON strings for CSV storage
        all_restaurants_df['all_search_types'] = all_restaurants_df['all_search_types'].apply(json.dumps)
        all_restaurants_df['regional_types'] = all_restaurants_df['regional_types'].apply(json.dumps)
        all_restaurants_df['specific_types'] = all_restaurants_df['specific_types'].apply(json.dumps)
        all_restaurants_df['all_types'] = all_restaurants_df['all_types'].apply(json.dumps)
        
        # Save all restaurants
        all_restaurants_df.to_csv("nyc_restaurants_data/all_nyc_restaurants.csv", index=False)
        
        # Save regional cuisine restaurants
        regional_df = all_restaurants_df[all_restaurants_df['regional_types'].apply(json.loads).apply(len) > 0]
        if not regional_df.empty:
            regional_df.to_csv("nyc_restaurants_data/regional_cuisine_restaurants.csv", index=False)
            print(f"Saved {len(regional_df)} restaurants with regional cuisine types")
        
        # Save specific food type restaurants
        food_type_df = all_restaurants_df[all_restaurants_df['specific_types'].apply(json.loads).apply(len) > 0]
        if not food_type_df.empty:
            food_type_df.to_csv("nyc_restaurants_data/specific_food_type_restaurants.csv", index=False)
            print(f"Saved {len(food_type_df)} restaurants with specific food types")
        
        # Create summary with counts by type and area
        print("\nCreating restaurant type summary by area...")
        summary_data = []
        
        for area in nyc_areas:
            area_name = area['name']
            area_restaurants = all_restaurants_df[all_restaurants_df['area'] == area_name]
            
            if not area_restaurants.empty:
                area_summary = {
                    "area": area_name,
                    "total_restaurants": len(area_restaurants)
                }
                
                # Count restaurants by regional type
                for r_type in regional_cuisine_types:
                    count = len(area_restaurants[area_restaurants['regional_types'].apply(
                        lambda x: r_type in json.loads(x) if isinstance(x, str) else r_type in x)])
                    area_summary[r_type] = count
                
                # Count restaurants by specific food type
                for s_type in specific_food_types:
                    count = len(area_restaurants[area_restaurants['specific_types'].apply(
                        lambda x: s_type in json.loads(x) if isinstance(x, str) else s_type in x)])
                    area_summary[s_type] = count
                
                summary_data.append(area_summary)
        
        # Save summary
        if summary_data:
            summary_df = pd.DataFrame(summary_data)
            summary_df.to_csv("nyc_restaurants_data/restaurant_summary_by_area.csv", index=False)
            print("Saved restaurant type summary by area")
        
        print("\nData collection and processing complete!")
        print(f"Total unique restaurants found: {len(all_restaurants_df)}")
        print(f"Data saved to 'nyc_restaurants_data/' directory")
    else:
        print("No restaurant data found")

if __name__ == "__main__":
    main()


Processing area 1/14: Manhattan Midtown
Searching for restaurant in Manhattan Midtown...
Processing page 1, containing 20 records
Processing page 2, containing 20 records
Processing page 3, containing 20 records
Found 60 places for restaurant in Manhattan Midtown
Searching for chinese_restaurant in Manhattan Midtown...
Processing page 1, containing 20 records
Processing page 2, containing 20 records
Processing page 3, containing 20 records
Found 60 places for chinese_restaurant in Manhattan Midtown
Searching for italian_restaurant in Manhattan Midtown...
Processing page 1, containing 20 records
Processing page 2, containing 20 records
Processing page 3, containing 20 records
Found 60 places for italian_restaurant in Manhattan Midtown
Searching for mexican_restaurant in Manhattan Midtown...
Processing page 1, containing 20 records
Processing page 2, containing 20 records
Processing page 3, containing 20 records
Found 60 places for mexican_restaurant in Manhattan Midtown
Pausing for 10 