# Download Data
This program downloads the needed data from the HOTOSM and Bunting Labs APIs and stores it in files for further processing.

## Global Parameters



In [None]:
# User-supplied settings read by the rest of the notebook.
# Replace both placeholders with your own tokens before running any download.

# Sent verbatim as the "Authorization" header of every HOTOSM request.
token_hotosm = "Token yourtokenhere"
# Sent as the "api_key" query parameter of every Bunting Labs request.
token_bunting_labs = "yourtokenhere"
# Prefix prepended to every input/output file name (plain string concatenation,
# so keep the trailing slash).
data_folder = "data/"

In [None]:
# Optionally mount Google Drive; set in_drive to False when not running in Colab,
# otherwise the google.colab import below will be attempted.
in_drive = True  # True to mount a drive while working in Google Colab
if in_drive:
    # Imported lazily so the notebook does not need google.colab when in_drive is False.
    from google.colab import drive
    # force_remount=True asks Colab to mount again even if the drive is already mounted.
    drive.mount("/content/drive", force_remount=True)

### How to get the necessary tokens

#### HOTOSM Tasking Manager API

1.   Create an account at https://tasks.hotosm.org/
2.   Go to settings and activate "Expert Mode"
3.   The token is available at the settings page, under "API Key"


#### Bunting Labs API

1.   Create an account at https://buntinglabs.com/
2.   The token is available at the dashboard, under "OpenStreetMap API"

## Setup

In [None]:
import requests
import time
import json
from urllib.parse import urljoin
import pandas as pd
import geopandas as gpd
from typing import Union, Iterable
from tqdm.autonotebook import tqdm
from IPython.display import Markdown, display
import ipywidgets as widgets
import os.path

# Base URL and request headers used by get_data() for HOTOSM Tasking Manager calls.
# Endpoints are joined onto the base with urljoin, so the trailing slash matters.
api_base_url_hotosm = "https://tasking-manager-tm4-production-api.hotosm.org/api/v2/"
headers_hotosm = {
    "Accept-Language": "en",
    "accept": "application/json",
    "Authorization": token_hotosm
}

# Base URL used by get_data() for Bunting Labs calls. No headers are sent:
# get_osm_extract() passes the token as the "api_key" query parameter instead.
api_base_url_bunting_labs = "https://osm.buntinglabs.com/v1/"
headers_bunting_labs = {}


def get_data(endpoint, payload, which="hotosm", *, retry_delay=3,
             max_retries=None, timeout=None):
    """Send a GET request to one of the two APIs and return the decoded JSON.

    Args:
        endpoint: Path relative to the API base URL (joined with ``urljoin``).
        payload: Mapping sent as the query string of the request.
        which: ``"bunting_labs"`` selects the Bunting Labs API; any other
            value selects the HOTOSM API.
        retry_delay: Seconds to wait before retrying after a 502 or 504.
        max_retries: Maximum number of retries after a 502/504, or ``None``
            (the default) to keep retrying indefinitely.
        timeout: Passed straight to ``requests.get``; ``None`` (the default)
            means no timeout.

    Returns:
        The JSON-decoded body of the first response with status code 200.

    Raises:
        Exception: For any status code other than 200, 502 or 504, carrying
            the decoded error body (or the raw text when the body is not
            JSON); also raised when ``max_retries`` is exhausted.
    """
    if which == "bunting_labs":
        api_base_url = api_base_url_bunting_labs
        headers = headers_bunting_labs
    else:  # hotosm
        api_base_url = api_base_url_hotosm
        headers = headers_hotosm

    url = urljoin(api_base_url, endpoint)
    # NOTE(review): certificate verification is disabled for Bunting Labs only;
    # see why Bunting Labs API fails verification and re-enable it if possible.
    verify = which != "bunting_labs"

    retries = 0
    while True:
        response = requests.get(
            url,
            headers=headers,
            params=payload,
            verify=verify,
            timeout=timeout,
        )

        if response.status_code == 200:
            return response.json()  # Success

        if response.status_code in (502, 504):
            # HOTOSM API gives 502 or 504 quite often for no apparent reason
            if max_retries is not None and retries >= max_retries:
                raise Exception(
                    f"Giving up after {retries} retries, "
                    f"last status code: {response.status_code}"
                )
            retries += 1
            print(f"Received a {response.status_code}, "
                  f"trying again in {retry_delay} seconds...")
            time.sleep(retry_delay)
            continue

        print(f"Request failed with status code: {response.status_code}")
        # Error bodies are not guaranteed to be JSON (e.g. an HTML error page);
        # fall back to the raw text so the real failure is not masked by a
        # JSON decoding error.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        raise Exception(details)


def get_project_ids(createdFrom: Union[str, None] = None) -> pd.DataFrame:
    """Download the list of archived projects, following pagination to the end.

    Args:
        createdFrom: Optional value forwarded as the ``createdFrom`` query
            parameter; omitted from the request when ``None``. The notebook
            passes the date picker value directly.

    Returns:
        A DataFrame with one row per entry of the ``results`` lists returned
        by the ``projects/`` endpoint.
    """
    endpoint = "projects/"
    payload = {
        "orderBy": "id",
        "orderByType": "ASC",
        "mappingTypesExact": False,
        "page": 1,
        "projectStatuses": "ARCHIVED",
        "createdByMe": False,
        "mappedByMe": False,
        "favoritedByMe": False,
        "managedByMe": False,
        "basedOnMyInterests": False,
        "omitMapResults": True
    }

    if createdFrom is not None:
        payload["createdFrom"] = createdFrom

    projects: list[dict] = []
    next_page = 1

    # The context manager closes the progress bar even when a request fails,
    # so no half-finished bar is left behind in the notebook output.
    with tqdm(unit="projects") as pbar:
        while next_page is not None:
            payload["page"] = int(next_page)
            page_data = get_data(endpoint, payload)
            pagination = page_data["pagination"]
            projects += page_data["results"]

            # The total is only known once the first page has arrived.
            if int(pagination["page"]) == 1:
                pbar.total = pagination["total"]
                pbar.refresh()
            pbar.update(len(page_data["results"]))

            # A nextNum of None marks the last page.
            next_page = pagination["nextNum"]

    return pd.DataFrame(projects)


def get_project_stats(project_id: Union[int, str]) -> dict:
    """Return the decoded response of ``projects/<project_id>/statistics/``."""
    return get_data(f"projects/{project_id}/statistics/", {})


def get_projects_stats(projectlist: Iterable[Union[int, str]]) -> pd.DataFrame:
    """Fetch the statistics of every project in ``projectlist``.

    Requests are issued one at a time, in iteration order, with a progress
    bar. Returns a DataFrame with one row per project.
    """
    collected: list[dict] = [
        get_project_stats(project)
        for project in tqdm(projectlist, unit="projects")
    ]
    return pd.DataFrame.from_dict(collected)


def get_project_activities(project_id: Union[int, str]) -> pd.DataFrame:
    """Download every activity entry of a project, following pagination.

    Args:
        project_id: Identifier inserted into the
            ``projects/<project_id>/activities/`` endpoint.

    Returns:
        A DataFrame with one row per entry of the ``activity`` lists returned
        by the endpoint.
    """
    endpoint = "projects/" + str(project_id) + "/activities/"
    payload = {}

    project_activity: list[dict] = []
    next_page = 1

    # The context manager closes the progress bar even when a request fails,
    # so no half-finished bar is left behind in the notebook output.
    with tqdm(unit="activities") as pbar:
        while next_page is not None:
            payload["page"] = int(next_page)
            page_data = get_data(endpoint, payload)
            pagination = page_data["pagination"]
            project_activity += page_data["activity"]

            # The total is only known once the first page has arrived.
            if int(pagination["page"]) == 1:
                pbar.total = pagination["total"]
                pbar.refresh()
            pbar.update(len(page_data["activity"]))

            # A nextNum of None marks the last page.
            next_page = pagination["nextNum"]

    return pd.DataFrame(project_activity)


def get_project_task_grid(project_id: Union[int, str]):
    """Return the decoded response of ``projects/<project_id>/tasks/``.

    The request carries ``as_file=False`` as its only query parameter.
    """
    tasks_endpoint = f"projects/{project_id}/tasks/"
    return get_data(tasks_endpoint, {"as_file": False})


def get_user_info(username: str) -> dict:
    """Return the decoded response of ``users/queries/<username>/``.

    Args:
        username: User name to look up. It is percent-encoded before being
            placed in the URL path, so characters such as ``?`` or ``#``
            cannot cut the path short or be read as a query/fragment.

    Returns:
        The JSON-decoded response body.
    """
    # Function-scope import: the notebook's import cell only brings in urljoin.
    from urllib.parse import quote

    endpoint = "users/queries/" + quote(username, safe="") + "/"
    payload = {}

    return get_data(endpoint, payload)


def get_users_info(userlist: Iterable[str]) -> pd.DataFrame:
    """Look up every user name in ``userlist``.

    Requests are issued one at a time, in iteration order, with a progress
    bar. Returns a DataFrame with one row per user.
    """
    collected: list[dict] = [
        get_user_info(name) for name in tqdm(userlist, unit="users")
    ]
    return pd.DataFrame.from_dict(collected)


def get_osm_extract(bbox: Iterable[float], tags="building=*"):
    """Request an OSM extract for a bounding box from the Bunting Labs API.

    Args:
        bbox: Coordinates of the bounding box; they are converted to strings
            and joined with commas in the order given.
        tags: Tag filter forwarded as the ``tags`` query parameter.

    Returns:
        The JSON-decoded response of the ``osm/extract`` endpoint.
    """
    # https://docs.buntinglabs.com/openstreetmap-api/extract
    bbox_param = ",".join(str(coordinate) for coordinate in bbox)
    query = {
        "tags": tags,
        "api_key": token_bunting_labs,
        "bbox": bbox_param,
    }
    return get_data("osm/extract", query, which="bunting_labs")


## Get archived projects

In [None]:
# Show a date picker; the "Get archived projects" download cell reads
# from_date_picker.value and forwards it to get_project_ids(createdFrom=...).
display(Markdown("Select parameters for project selection:"))

from_date_picker = widgets.DatePicker(
    description='Projects after',
    disabled=False
)
display(from_date_picker)

In [None]:
# Download the list of archived projects and store it as a CSV file.
output_filename = data_folder + "output_archived_projs.csv"

# Create the output folder before starting: otherwise a missing folder is only
# noticed when saving, after the whole (slow) download has already finished.
output_dir = os.path.dirname(output_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

display(Markdown("Downloading archived projects"))
# The picker value is None when no date was chosen, which disables the filter.
archived_projects = get_project_ids(createdFrom=from_date_picker.value)

display(Markdown("Saving output"))
archived_projects.to_csv(output_filename, index=False)

display(Markdown("Output saved to " + output_filename))
display(Markdown("Finished!"))

## Download stats from archived projects

In [None]:
# Read the archived projects list and download the statistics of each project.
input_filename = f"{data_folder}output_archived_projs.csv"
output_filename = f"{data_folder}output_archived_projs_stats.csv"

display(Markdown("Reading archived projects"))
input_data = pd.read_csv(input_filename)
# One statistics request per distinct project id.
projects = input_data["projectId"].unique()

display(Markdown("Downloading stats for those projects"))
projs_stats = get_projects_stats(projects)

display(Markdown("Saving output"))
projs_stats.to_csv(output_filename, index=False)

display(Markdown(f"Output saved to {output_filename}"))
display(Markdown("Finished!"))

## Select the projects with 100% validation

In [None]:
# Keep only the ids of the projects whose statistics report 100% validation.
input_filename = f"{data_folder}output_archived_projs_stats.csv"
output_filename = f"{data_folder}output_archived_projs_selected_ids.csv"

display(Markdown("Reading archived projects stats"))
input_data = pd.read_csv(input_filename)

display(Markdown("Select project ids where project is 100% validated"))
# Boolean-mask selection of the fully validated rows, keeping a one-column frame.
projs_sample = input_data.loc[input_data["percentValidated"] == 100, ["projectId"]]

display(Markdown("Saving output"))
projs_sample.to_csv(output_filename, index=False)

display(Markdown(f"Output saved to {output_filename}"))
display(Markdown("Finished!"))

## Get project activities

In [None]:
# Download the activity log of every selected project, one CSV file per project.
input_filename = f"{data_folder}output_archived_projs_selected_ids.csv"
overwrite_if_exists = False  # set to True to download again over existing files

display(Markdown("Reading selected project ids"))
input_data = pd.read_csv(input_filename)

display(Markdown("DOWNLOADING ACTIVITIES FOR THE SELECTED PROJECTS"))
for proj_id in tqdm(input_data["projectId"], unit="project(s)"):
    output_filename = f"{data_folder}output_proj_{proj_id}.csv"
    output_filename_exists = os.path.isfile(output_filename)

    if overwrite_if_exists or not output_filename_exists:
        display(Markdown(f"Downloading activities for [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id})"))
        proj_act = get_project_activities(proj_id)

        display(Markdown(f"Saving output of project #{proj_id}"))
        proj_act.to_csv(output_filename, index=False)

        display(Markdown(f"Output saved to {output_filename}"))
    else:
        # An earlier run already produced this file; leave it untouched.
        display(Markdown(f"Skipping [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id}) as output file exists already"))

display(Markdown("Finished!"))

## Get users info from each project activities

In [None]:
# For every selected project, look up the users found in its activity log and
# store their details in one CSV file per project.
input_ids_filename = data_folder + "output_archived_projs_selected_ids.csv"
overwrite_if_exists = False  # set to True to download again over existing files

display(Markdown("Reading selected project ids"))
input_data = pd.read_csv(input_ids_filename)

display(Markdown("DOWNLOADING USERS FOR THE SELECTED PROJECTS"))
for proj_id in tqdm(input_data['projectId'], unit="project(s)"):
    input_filename = data_folder + "output_proj_" + str(proj_id) + ".csv"
    output_filename = data_folder + "output_users_proj_" + str(proj_id) + ".csv"

    output_filename_exists = os.path.isfile(output_filename)

    if output_filename_exists and not overwrite_if_exists:
        display(Markdown("Skipping [project #" + str(proj_id) + "](https://tasks.hotosm.org/projects/" + str(proj_id) + ") as output file exists already"))
    else:
        display(Markdown("Reading activities for [project #" + str(proj_id) + "](https://tasks.hotosm.org/projects/" + str(proj_id) + ")"))
        # Read user names as text: with an inferred dtype, a column holding
        # only numeric names would come back as integers and break the
        # string concatenation in get_user_info().
        proj_info = pd.read_csv(input_filename, dtype={"actionBy": str})
        # Empty cells are read as NaN, which is not a user name; drop them
        # instead of letting the lookup fail on a float.
        users = proj_info['actionBy'].dropna().unique()

        display(Markdown("Downloading users details"))
        users_info = get_users_info(users)

        display(Markdown("Saving output"))
        users_info.to_csv(output_filename, index=False)

        display(Markdown("Output saved to " + output_filename))

display(Markdown("Finished!"))


## Get task grid for each project

In [None]:
# Download the task grid of every selected project, one GeoJSON file per project.
input_ids_filename = f"{data_folder}output_archived_projs_selected_ids.csv"
overwrite_if_exists = False  # set to True to download again over existing files

display(Markdown("Reading selected project ids"))
input_data = pd.read_csv(input_ids_filename)

display(Markdown("DOWNLOADING TASK GRIDS FOR THE SELECTED PROJECTS"))
for proj_id in tqdm(input_data["projectId"], unit="project(s)"):
    output_filename = f"{data_folder}output_proj_{proj_id}_grid.geojson"

    output_filename_exists = os.path.isfile(output_filename)

    if overwrite_if_exists or not output_filename_exists:
        display(Markdown(f"Downloading task grid for [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id})"))
        proj_grid = get_project_task_grid(proj_id)

        display(Markdown("Saving output"))
        # The API response is written back out as JSON without modification.
        with open(output_filename, mode="w") as of:
            json.dump(proj_grid, of)

        display(Markdown(f"Output saved to {output_filename}"))
    else:
        # An earlier run already produced this file; leave it untouched.
        display(Markdown(f"Skipping [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id}) as output file exists already"))

display(Markdown("Finished!"))

## OSM building data

In [None]:
# For every selected project, request the OSM extract covering the bounds of
# its task grid and store it as one GeoJSON file per project.
input_ids_filename = f"{data_folder}output_archived_projs_selected_ids.csv"
overwrite_if_exists = False  # set to True to download again over existing files

display(Markdown("Reading selected project ids"))
input_data = pd.read_csv(input_ids_filename)

display(Markdown("DOWNLOADING ASSOCIATED OSM DATA FOR EVERY PROJECT"))
for proj_id in tqdm(input_data["projectId"], unit="project(s)"):
    input_proj_grid_filename = f"{data_folder}output_proj_{proj_id}_grid.geojson"
    output_filename = f"{data_folder}output_proj_{proj_id}_osm.geojson"

    output_filename_exists = os.path.isfile(output_filename)

    if overwrite_if_exists or not output_filename_exists:
        display(Markdown(f"Reading input files for [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id})"))
        proj_grid = gpd.GeoDataFrame.from_file(input_proj_grid_filename)

        # The overall bounds of the task grid are used as the extract's bounding box.
        display(Markdown(f"Extracting OSM data for the area {proj_grid.geometry.total_bounds}"))
        extract = get_osm_extract(proj_grid.geometry.total_bounds)
        display(Markdown("Saving output"))
        with open(output_filename, mode="w") as of:
            json.dump(extract, of)

        display(Markdown(f"Output saved to {output_filename}"))
    else:
        # An earlier run already produced this file; leave it untouched.
        display(Markdown(f"Skipping [project #{proj_id}](https://tasks.hotosm.org/projects/{proj_id}) as output file exists already"))

display(Markdown("Finished!"))

### Why was the Bunting Labs API chosen?

There are several ways to download OSM data, each with its own advantages and disadvantages. Four of these were considered for the project:

* Planet OSM + Osmium extract
* Geofabrik extracts (+ optional Osmium extract)
* Overpass API
* Bunting Labs API

The first two options are good alternatives, but were ultimately discarded because the data is too big to handle comfortably with Python, and PyOsmium does not provide the extract function. On top of that, the Osmium CLI is available for macOS and GNU/Linux, but not for Windows, and we want this workflow to be reproducible for users irrespective of their operating system. You are still encouraged to use these two options if they fit your workflow better than our proposed way.

The Overpass API option was discarded as query limits are very low, and we would hit them multiple times with HOTOSM projects.

That leaves the Bunting Labs API as our option, with very generous limits, but the downside of having to register for a token.

