In [2]:
from pathlib import Path

import pandas as pd
import requests

In [32]:
# Folder the fetched datasets are written to (passed as `save_path` to dataframe_to_csv).
# The path is relative, so it resolves against the current working directory.
data_path: str = r"../data"


def dataframe_to_csv(df: pd.DataFrame, save_path: str, file_name: str) -> None:
    """ Writes a DataFrame to a comma-separated, UTF-8 encoded CSV file without the index column.

    The target folder is created (including missing parent folders) when it does not exist yet.

    Args:
        df (pd.DataFrame): DataFrame to be converted to CSV
        save_path (str): The folder in which the file needs to be saved
        file_name (str): The actual name of the file, including its extension
    """
    target_dir = Path(save_path)

    # to_csv does not create missing folders itself, so make sure the target exists first
    target_dir.mkdir(parents=True, exist_ok=True)

    # Everything after the path is passed by keyword: newer pandas releases deprecate
    # passing `sep` (and the other options) positionally
    df.to_csv(target_dir / file_name, sep=",", index=False, encoding="utf-8")

In [33]:
def get_odata(target_url: str, max_requests: int = 100, timeout: float = 30.0) -> pd.DataFrame:
    """ Retrieves data for a given dataset from the Central Bureau of Statistics (CBS).
    The API returns the data in a JSON format, which is converted to a pandas DataFrame.

    Each data request contains a maximum of 10.000 cells, therefore in order to retrieve the whole dataset
    the API is called repeatedly: every response holds its records under the "value" key and, when more data
    is available, the url of the next page under the "@odata.nextLink" key. Pages are followed until there is
    no next link or until `max_requests` requests have been made.

    Args:
        target_url (str): The target url which points to a dataset
        max_requests (int): The maximum number of requests that should be made. Defaults to 100
        timeout (float, optional): Number of seconds to wait for the server per request. Defaults to 30.0

    Returns:
        pd.DataFrame: DataFrame that contains all retrieved data from CBS (empty when no request was made)

    Raises:
        requests.HTTPError: When the API answers with an error status code
    """
    pages: list[pd.DataFrame] = []
    request_index: int = 1

    # Making the requests
    while target_url and (request_index <= max_requests):
        print(f"Starting request {request_index}/{max_requests}...")

        response = requests.get(target_url, timeout=timeout)
        # Fail loudly on an HTTP error instead of tripping over a missing key further down
        response.raise_for_status()
        payload = response.json()

        pages.append(pd.DataFrame(payload["value"]))

        # A missing next link yields None, which ends the loop
        target_url = payload.get("@odata.nextLink")
        request_index += 1

    print("Finished requests")

    if not pages:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing the frame on every iteration
    return pd.concat(pages, ignore_index=True)

In [41]:
# CBS datasets to download: a readable name mapped to the CBS table identifier
cbs_datasets: dict[str, str] = {"crime_types": "83648NED"}

# Download every configured dataset and store it as "<name>.csv" in the data folder
for name in cbs_datasets:
    identifier = cbs_datasets[name]
    print(f"Fetching data for the \"{name}\" dataset")

    target_url: str = f"https://odata4.cbs.nl/CBS/{identifier}/Observations"

    # At most 3 requests are made per dataset
    data: pd.DataFrame = get_odata(target_url, max_requests=3)

    dataframe_to_csv(df=data, save_path=data_path, file_name=f"{name}.csv")


Fetching data for the "crime_types" dataset
https://odata4.cbs.nl/83648NED/Observations
Starting request 1/3...
Starting request 2/3...
Starting request 3/3...
Finished requests


In [17]:
# Query parameters used when requesting wanted-persons data from the police API.
# Entries set to None are skipped when get_police_data builds the request,
# so with these values only "language" and "maxnumberofitems" are sent.
# NOTE(review): the annotation says dict[str, str] although several values are None.
wanted_persons_parameters: dict[str, str] = {
    "uid": None,
    "language": "nl",
    "query": None,
    "lat": None,
    "lon": None,
    "radius": None,
    "maxnumberofitems": "25"
}


def get_police_data(
    target_url: str,
    max_requests: int = 10,
    parameters: dict[str, str] = {},
    timeout: float = 30.0,
) -> pd.DataFrame:
    """ Gets the data from the police API back in a dataframe. Since the API only returns a limited number of
    records per call, the API gets queried for a specified number of times, each time with an "offset" parameter
    that skips the records that were already received. Querying stops early when a call returns no records.
    While making the API call, it is possible to add additional parameters.

    Args:
        target_url (str): The url for the specific data that you want to retrieve from the police
        max_requests (int, optional): Maximum number of requests made to the police API. Defaults to 10
        parameters (dict[str, str], optional): Additional parameters that can be added to the API request.
            Entries whose value is None are left out of the request. Defaults to {}.
        timeout (float, optional): Number of seconds to wait for the server per request. Defaults to 30.0

    Returns:
        pd.DataFrame: DataFrame containing the desired data (empty when no records were received)

    Raises:
        requests.HTTPError: When the API answers with an error status code
        KeyError: When a response does not contain the "opsporingsberichten" key
    """
    # `parameters` is only read, never mutated, so the shared default dict is safe here
    query = {key: value for key, value in parameters.items() if value is not None}

    pages: list[pd.DataFrame] = []
    offset: int = 0

    # Making the requests
    for request_index in range(1, max_requests + 1):
        print(f"Starting request {request_index}/{max_requests}...")

        # Let requests build and url-encode the query string; this also works without any extra parameters
        response = requests.get(target_url, params={**query, "offset": offset}, timeout=timeout)
        response.raise_for_status()
        records = response.json()["opsporingsberichten"]

        # An empty page means there is nothing left to fetch
        if not records:
            break

        pages.append(pd.DataFrame(records))

        # Advance by what was actually received, so no records are skipped when a page
        # holds fewer than 25 items (identical to the fixed step of 25 for full pages)
        offset += len(records)

    print("Finished requests")

    if not pages:
        return pd.DataFrame()

    # Concatenate once at the end instead of growing the frame on every iteration
    return pd.concat(pages, ignore_index=True)

In [22]:
# Endpoint of the police API that is queried below
target_url: str = "https://api.politie.nl/v4/gezocht"

# Columns that are removed from the fetched data
columns_to_drop: list[str] = [
    "displayName", "links", "availabletranslations",
    "zaaknummer", "introductie", "omschrijving",
    "meerafbeeldingen", "urltipformulier", "afbeeldingen",
    "verdachte", "voortvluchtige", "dossier",
]

# Fetch up to 20 pages, then strip the unwanted columns
data = get_police_data(target_url, max_requests=20, parameters=wanted_persons_parameters)
data = data.drop(columns_to_drop, axis="columns")

Starting request 1/20...
Starting request 2/20...
Starting request 3/20...
Starting request 4/20...
Starting request 5/20...
Starting request 6/20...
Starting request 7/20...
Starting request 8/20...
Starting request 9/20...
Starting request 10/20...
Starting request 11/20...
Starting request 12/20...
Starting request 13/20...
Starting request 14/20...
Starting request 15/20...
Starting request 16/20...
Starting request 17/20...
Starting request 18/20...
Starting request 19/20...
Starting request 20/20...
