In [1]:
import pandas as pd
import numpy as np
import os
import requests
import env

In [3]:
# Inputs for the download cell below.
max_req = 100  # Upper bound on the number of observations to retrieve
year_to_retrieve = '2023'
app_token = env.app_token


In [6]:
def get_health_inspection_data(year, app_token, max_observations=None):
    """Download one year of health inspection records, with a local CSV cache.

    If ``nyc_health_inspections_{year}.csv`` already exists in the current
    working directory it is loaded and returned without contacting the API.
    Otherwise the records are fetched page by page, collected into a
    DataFrame, written to that CSV file and returned.

    Note: the cache file name depends only on ``year``. A file produced by a
    call with a small ``max_observations`` is therefore reused by later calls
    for the same year, even if they ask for more rows. Delete the file to
    force a fresh download.

    Args:
        year: Year to retrieve; interpolated into the ``inspection_date``
            filter and into the cache file name.
        app_token: Application token sent as the ``$$app_token`` parameter.
        max_observations: Maximum number of rows to download, or ``None`` to
            download every matching row.

    Returns:
        A ``pandas.DataFrame`` with the retrieved (or cached) rows, or
        ``None`` if the API answered with a non-200 status code. An empty
        DataFrame is returned, and nothing is cached, when no rows match.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    # Base API URL for the health inspection dataset.
    base_url = 'https://data.cityofnewyork.us/resource/43nn-pn8j.json'

    # Serve from the local cache when a CSV for this year already exists.
    csv_filename = f'nyc_health_inspections_{year}.csv'
    if os.path.isfile(csv_filename):
        print(f"CSV file for {year} already exists. Loading data from the CSV.")
        return pd.read_csv(csv_filename)

    all_data = []
    offset = 0
    page_size = 1000
    request_timeout = 30  # seconds; prevents a stalled connection from hanging the kernel

    # The filter is identical for every page, so build it once.
    where_clause = f'inspection_date between "{year}-01-01T00:00:00.000" and "{year}-12-31T23:59:59.999"'

    while max_observations is None or len(all_data) < max_observations:
        # Never ask for more rows than are still needed.
        if max_observations is None:
            actual_page_size = page_size
        else:
            actual_page_size = min(page_size, max_observations - len(all_data))

        # Let requests URL-encode the query (the filter contains spaces and
        # quotes). An explicit ordering keeps $offset paging stable.
        params = {
            '$where': where_clause,
            '$$app_token': app_token,
            '$order': ':id',
            '$offset': offset,
            '$limit': actual_page_size,
        }
        response = requests.get(base_url, params=params, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve health inspection data for {year}. Status code: {response.status_code}")
            return None  # Exit the function with None if data retrieval fails

        data = response.json()
        if not data:
            break  # No more data

        all_data.extend(data)
        # Advance by what was actually received, not by what was requested.
        offset += len(data)

        if len(data) < actual_page_size:
            break  # A short page means the result set is exhausted

    df = pd.DataFrame(all_data)

    # Do not cache an empty result: an empty CSV cannot be parsed back by
    # pd.read_csv, so it would break every later call for this year.
    if df.empty:
        print(f"No health inspection data found for {year}; nothing was saved.")
        return df

    # Save the DataFrame to a CSV file for easy access
    df.to_csv(csv_filename, index=False)

    print(f"Health inspection data for {year} retrieved and saved to {csv_filename}.")

    return df

In [7]:
# Fetch the configured year's inspections (or load them from the local CSV cache).
df = get_health_inspection_data(
    year_to_retrieve,
    app_token,
    max_observations=max_req,
)

Health inspection data for 2023 retrieved and saved to nyc_health_inspections_2023.csv.


In [8]:
# Preview the first five rows of the retrieved data.
df.head(n=5)

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,...,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,50124301,YUMMY JUICE BAR,Bronx,737A,LYDIG AVENUE,10462,3472936151,"Juice, Smoothies, Fruit Salads",2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Pre-permit (Operational) / Re-inspection,40.854471364602,-73.866173952462,211,13,22403,2049410,2043190050,BX49
1,50106430,PAN TODO RICO,Queens,7617,ROOSEVELT AVE,11372,6466393116,Bakery Products/Desserts,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Re-inspection,40.747063755141,-73.889336695012,403,25,28700,4029880,4012870037,QN28
2,50129821,MEI JUNG MEI CHINESE RESTAURANT,Brooklyn,1402,FLATBUSH AVENUE,11210,9292509943,Chinese,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Pre-permit (Operational) / Re-inspection,40.636379950029,-73.951435060333,314,45,77000,3120854,3052260039,BK42
3,50057824,SERAFINA LUDLOW,Manhattan,98,RIVINGTON STREET,10002,2123589800,Italian,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Initial Inspection,40.720110803137,-73.988463050518,103,1,3001,1084639,1004110033,MN27
4,41399360,NEW RONG HANG RESTAURANT,Manhattan,38,ELDRIDGE STREET,10002,2126258999,Chinese,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Re-inspection,40.715711318929,-73.993203823468,103,1,1600,1003876,1003000003,MN27
