In [1]:
import pandas as pd
import numpy as np
import os
import requests
import env

In [2]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules (such as env) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Settings for the NYC health inspection download.
app_token = env.app_token  # API app token, read from the local env module rather than hardcoded
year_to_retrieve = '2023'  # calendar year to download, kept as a string for the query and file name
max_req = 100  # intended row cap; NOTE(review): not passed to the download calls visible in this notebook


In [4]:
def get_health_inspection_data(year, app_token, max_observations=None):
    """Return NYC restaurant inspection records for one calendar year.

    Rows are read from ``nyc_health_inspections_{year}.csv`` in the current
    working directory when that file exists; otherwise they are downloaded
    page by page from the NYC Open Data endpoint.

    Only complete, non-empty downloads (``max_observations is None``) are
    written to the CSV cache. A capped download is never cached, because a
    truncated file would later be returned as if it were the full year.

    Note that a fresh download builds the frame directly from the JSON
    payload, while the cached path goes through ``pd.read_csv`` type
    inference, so column dtypes may differ between the two paths.

    Parameters
    ----------
    year : str or int
        Calendar year to retrieve; interpolated into the date filter and the
        cache file name.
    app_token : str
        App token sent as the ``$$app_token`` query parameter.
    max_observations : int, optional
        Maximum number of rows to return. ``None`` (default) retrieves every
        row for the year. Values below zero are treated as zero. The cap is
        also applied when rows come from the CSV cache.

    Returns
    -------
    pandas.DataFrame or None
        The inspection rows, or ``None`` when the API answers with a
        non-200 status code.

    Raises
    ------
    requests.RequestException
        Propagated unchanged on connection errors or when a request exceeds
        the timeout.
    """
    base_url = 'https://data.cityofnewyork.us/resource/43nn-pn8j.json'
    csv_filename = f'nyc_health_inspections_{year}.csv'
    page_size = 1000      # rows requested per API call
    request_timeout = 60  # seconds; without a timeout a stalled connection blocks forever

    # A negative cap means "nothing to fetch", exactly like zero.
    if max_observations is not None:
        max_observations = max(max_observations, 0)

    # Serve from the local cache when available, still honouring the cap.
    if os.path.isfile(csv_filename):
        print(f"CSV file for {year} already exists. Loading data from the CSV.")
        df = pd.read_csv(csv_filename)
        if max_observations is not None:
            df = df.head(max_observations)
        return df

    # The date filter is identical for every page, so build it once.
    where_clause = (
        f'inspection_date between "{year}-01-01T00:00:00.000" '
        f'and "{year}-12-31T23:59:59.999"'
    )

    all_data = []
    offset = 0

    while max_observations is None or len(all_data) < max_observations:
        if max_observations is None:
            limit = page_size
        else:
            limit = min(page_size, max_observations - len(all_data))

        # Let requests URL-encode the query. An explicit $order keeps the
        # row order stable across pages, so paging neither skips nor repeats rows.
        params = {
            '$where': where_clause,
            '$order': ':id',
            '$offset': offset,
            '$limit': limit,
            '$$app_token': app_token,
        }
        response = requests.get(base_url, params=params, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve health inspection data for {year}. Status code: {response.status_code}")
            return None

        data = response.json()
        if not data:
            break  # No more rows on the server

        all_data.extend(data)
        offset += len(data)  # advance by what was actually received

        if len(data) < limit:
            break  # A short page is the last page; skip the extra empty request

    df = pd.DataFrame(all_data)

    # Cache only complete, non-empty downloads: a capped result would poison
    # the cache, and an empty CSV cannot be parsed by pd.read_csv later.
    if max_observations is None and not df.empty:
        df.to_csv(csv_filename, index=False)
        print(f"Health inspection data for {year} retrieved and saved to {csv_filename}.")
    else:
        print(
            f"Health inspection data for {year} retrieved ({len(df)} rows); "
            f"not saved to {csv_filename} because the download was capped or empty."
        )

    return df

In [5]:
# Load the NYC inspections for the configured year with no row cap.
df = get_health_inspection_data(
    year=year_to_retrieve,
    app_token=app_token,
)

CSV file for 2023 already exists. Loading data from the CSV.


In [6]:
# Preview the first rows of the NYC inspection frame.
df.head()

Unnamed: 0,camis,dba,boro,building,street,zipcode,phone,cuisine_description,inspection_date,action,...,record_date,inspection_type,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,50124301,YUMMY JUICE BAR,Bronx,737A,LYDIG AVENUE,10462.0,3472936151,"Juice, Smoothies, Fruit Salads",2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Pre-permit (Operational) / Re-inspection,40.854471,-73.866174,211.0,13.0,22403.0,2049410.0,2043190000.0,BX49
1,50106430,PAN TODO RICO,Queens,7617,ROOSEVELT AVE,11372.0,6466393116,Bakery Products/Desserts,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Re-inspection,40.747064,-73.889337,403.0,25.0,28700.0,4029880.0,4012870000.0,QN28
2,50129821,MEI JUNG MEI CHINESE RESTAURANT,Brooklyn,1402,FLATBUSH AVENUE,11210.0,9292509943,Chinese,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Pre-permit (Operational) / Re-inspection,40.63638,-73.951435,314.0,45.0,77000.0,3120854.0,3052260000.0,BK42
3,50057824,SERAFINA LUDLOW,Manhattan,98,RIVINGTON STREET,10002.0,2123589800,Italian,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Initial Inspection,40.720111,-73.988463,103.0,1.0,3001.0,1084639.0,1004110000.0,MN27
4,41399360,NEW RONG HANG RESTAURANT,Manhattan,38,ELDRIDGE STREET,10002.0,2126258999,Chinese,2023-01-03T00:00:00.000,Violations were cited in the following area(s).,...,2023-10-24T06:00:07.000,Cycle Inspection / Re-inspection,40.715711,-73.993204,103.0,1.0,1600.0,1003876.0,1003000000.0,MN27


In [None]:
# Call info() - a bare `df.info` only displays the bound-method repr,
# not the column/dtype summary.
df.info()

In [7]:
# Number of rows in the NYC inspection frame.
len(df)

73512

In [8]:
# Column names, non-null counts and dtypes of the NYC inspection frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73512 entries, 0 to 73511
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   camis                  73512 non-null  int64  
 1   dba                    73508 non-null  object 
 2   boro                   73512 non-null  object 
 3   building               73454 non-null  object 
 4   street                 73512 non-null  object 
 5   zipcode                72777 non-null  float64
 6   phone                  73512 non-null  object 
 7   cuisine_description    73512 non-null  object 
 8   inspection_date        73512 non-null  object 
 9   action                 73512 non-null  object 
 10  violation_code         73127 non-null  object 
 11  violation_description  73127 non-null  object 
 12  critical_flag          73512 non-null  object 
 13  score                  70147 non-null  float64
 14  grade                  41234 non-null  object 
 15  gr

In [31]:
# Settings for the Dallas health inspection download.
dallas_app_token = env.dallas_app_token  # Dallas API app token, read from the local env module
year_to_retrieve = '2023'  # calendar year to download, kept as a string for the query and file name
max_req = 100  # intended row cap; NOTE(review): not passed to the download calls visible in this notebook


In [32]:
def get_dallas_health_inspection_data(year, app_token, max_observations=None):
    """Return Dallas restaurant inspection records for one calendar year.

    Rows are read from ``dallas_health_inspections_{year}.csv`` in the
    current working directory when that file exists; otherwise they are
    downloaded page by page from the Dallas Open Data endpoint.

    Only complete, non-empty downloads (``max_observations is None``) are
    written to the CSV cache. A capped download is never cached, because a
    truncated file would later be returned as if it were the full year.

    Note that a fresh download builds the frame directly from the JSON
    payload, while the cached path goes through ``pd.read_csv`` type
    inference, so column dtypes may differ between the two paths.

    Parameters
    ----------
    year : str or int
        Calendar year to retrieve; interpolated into the date filter and the
        cache file name.
    app_token : str
        App token sent as the ``$$app_token`` query parameter.
    max_observations : int, optional
        Maximum number of rows to return. ``None`` (default) retrieves every
        row for the year. Values below zero are treated as zero. The cap is
        also applied when rows come from the CSV cache.

    Returns
    -------
    pandas.DataFrame or None
        The inspection rows, or ``None`` when the API answers with a
        non-200 status code.

    Raises
    ------
    requests.RequestException
        Propagated unchanged on connection errors or when a request exceeds
        the timeout.
    """
    base_url = 'https://www.dallasopendata.com/resource/dri5-wcct.json'
    csv_filename = f'dallas_health_inspections_{year}.csv'
    page_size = 1000      # rows requested per API call
    request_timeout = 60  # seconds; without a timeout a stalled connection blocks forever

    # A negative cap means "nothing to fetch", exactly like zero.
    if max_observations is not None:
        max_observations = max(max_observations, 0)

    # Serve from the local cache when available, still honouring the cap.
    if os.path.isfile(csv_filename):
        print(f"CSV file for {year} already exists. Loading data from the CSV.")
        df = pd.read_csv(csv_filename)
        if max_observations is not None:
            df = df.head(max_observations)
        return df

    # The date filter is identical for every page, so build it once.
    where_clause = (
        f'insp_date between "{year}-01-01T00:00:00.000" '
        f'and "{year}-12-31T23:59:59.999"'
    )

    all_data = []
    offset = 0

    while max_observations is None or len(all_data) < max_observations:
        if max_observations is None:
            limit = page_size
        else:
            limit = min(page_size, max_observations - len(all_data))

        # Let requests URL-encode the query. An explicit $order keeps the
        # row order stable across pages, so paging neither skips nor repeats rows.
        params = {
            '$where': where_clause,
            '$order': ':id',
            '$offset': offset,
            '$limit': limit,
            '$$app_token': app_token,
        }
        response = requests.get(base_url, params=params, timeout=request_timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve Dallas health inspection data for {year}. Status code: {response.status_code}")
            return None

        data = response.json()
        if not data:
            break  # No more rows on the server

        all_data.extend(data)
        offset += len(data)  # advance by what was actually received

        if len(data) < limit:
            break  # A short page is the last page; skip the extra empty request

    df = pd.DataFrame(all_data)

    # Cache only complete, non-empty downloads: a capped result would poison
    # the cache, and an empty CSV cannot be parsed by pd.read_csv later.
    if max_observations is None and not df.empty:
        df.to_csv(csv_filename, index=False)
        print(f"Dallas health inspection data for {year} retrieved and saved to {csv_filename}.")
    else:
        print(
            f"Dallas health inspection data for {year} retrieved ({len(df)} rows); "
            f"not saved to {csv_filename} because the download was capped or empty."
        )

    return df

In [33]:
# Load the Dallas inspections for the configured year with no row cap.
df_dallas = get_dallas_health_inspection_data(
    year=year_to_retrieve,
    app_token=dallas_app_token,
    max_observations=None,
)

Dallas health inspection data for 2023 retrieved and saved to dallas_health_inspections_2023.csv.


In [35]:
# Preview the first rows of the Dallas inspection frame.
df_dallas.head()

Unnamed: 0,program_identifier,type,insp_date,score,street_number,street_name,street_type,site_address,zip,violation1_description,...,violation19_text,violation19_memo,violation20_description,violation20_points,violation20_text,violation20_memo,violation21_description,violation21_points,violation21_text,violation21_memo
0,VELVET TACO,Routine,2023-01-03T00:00:00.000,98,4622,GREENVILLE,AVE,4622 GREENVILLE AVE,75206,*45 Lockers to be used to store personal items,...,,,,,,,,,,
1,MARILLAC SENIOR CNTR,Routine,2023-01-03T00:00:00.000,99,2843,LAPSLEY,ST,2843 LAPSLEY ST,75212,*39 Equipment in good repair and proper adjust...,...,,,,,,,,,,
2,THE CHEESECAKE FACTORY,Routine,2023-01-03T00:00:00.000,89,7700,NORTHWEST,HWY,7700 W NORTHWEST HWY #700,75225,*02 Cold Hold (41øF/45øF or below),...,,,,,,,,,,
3,KAFFEINE CAFEE LLC,Routine,2023-01-03T00:00:00.000,97,13440,TI BLVD UNIT #1,,13440 TI BLVD UNIT #1,75243,*02 Cold Hold (41øF/45øF or below),...,,,,,,,,,,
4,SHAPE UP DALLAS,Routine,2023-01-03T00:00:00.000,97,14902,PRESTON,RD,14902 PRESTON RD #510,75254,*10 Clean Sight and Touch,...,,,,,,,,,,
