<a href="https://colab.research.google.com/github/WS2319/nyc_evictions_data/blob/main/v2_API_call_%5BOpenData_NYC%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests

# Download every row exposed by the eviction-data endpoint by paging through
# it with $limit/$offset, then combine the pages into a single DataFrame
# named `final_df`.

# Define the API endpoint
api_url = "https://data.cityofnewyork.us/resource/6z8x-wfk4.json"

# Define the page size (how many rows to get per request)
page_size = 50000
offset = 0

# Seconds to wait on a single request before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
request_timeout = 60

# Create an empty list to store the DataFrames from each page
all_data_dfs = []

# Becomes True only when the loop reaches the real end of the data (an empty
# page or a short page). It stays False if an error aborted the loop, so the
# summary below never reports a partial download as a success.
fetch_complete = False

print("Starting to fetch all eviction data...")

while True:
    # Set the parameters for the current page.
    # "$order" pins a stable sort on the row identifier: without an explicit
    # order, consecutive $offset pages are not guaranteed to be consistent
    # and rows could be duplicated or skipped between requests.
    params = {
        "$limit": page_size,
        "$offset": offset,
        "$order": ":id",
    }

    print(f"Fetching data with offset: {offset}...")

    try:
        # Make the API request
        response = requests.get(api_url, params=params, timeout=request_timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the JSON body of the response
        data = response.json()

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error does not derive from RequestException.
        print(f"An error occurred: {e}")
        break

    # If the response is empty, we've reached the end of the data
    if not data:
        print("No more data to fetch. Loop finished.")
        fetch_complete = True
        break

    df_page = pd.DataFrame(data)

    # Add the current page's DataFrame to our list
    all_data_dfs.append(df_page)

    # If we received fewer rows than we asked for, it must be the last page
    if len(df_page) < page_size:
        print("Fetched the last page of data.")
        fetch_complete = True
        break

    # Increase the offset for the next iteration
    offset += page_size

# Concatenate all the page DataFrames into one final DataFrame
if all_data_dfs:
    final_df = pd.concat(all_data_dfs, ignore_index=True)
    if fetch_complete:
        print("\nAll data has been successfully fetched and combined!")
    else:
        # Some pages were fetched before the error; keep them, but say so.
        print("\nWARNING: fetching stopped early because of an error; "
              "final_df contains only PARTIAL data.")
    print(f"Total rows retrieved: {len(final_df)}")
    final_df.info()
else:
    # Bind final_df explicitly so a stale value from an earlier run of this
    # cell cannot be picked up by later cells.
    final_df = pd.DataFrame()
    print("\nNo data was retrieved.")

# When the run above reports success, 'final_df' contains ALL the eviction
# data from the API. This is the DataFrame you will process and load into
# BigQuery; do not load it if the partial-data warning was printed.

Starting to fetch all eviction data...
Fetching data with offset: 0...
Fetching data with offset: 50000...
Fetching data with offset: 100000...
Fetched the last page of data.

All data has been successfully fetched and combined!
Total rows retrieved: 118386
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118386 entries, 0 to 118385
Data columns (total 20 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   court_index_number          118386 non-null  object
 1   docket_number               118386 non-null  object
 2   eviction_address            118386 non-null  object
 3   eviction_apt_num            100999 non-null  object
 4   executed_date               118386 non-null  object
 5   marshal_first_name          118386 non-null  object
 6   marshal_last_name           118386 non-null  object
 7   residential_commercial_ind  118386 non-null  object
 8   borough                     118386 non-null  object
 9  