## Common imports

In [1]:
import requests
import io
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import itertools

# Goal
- Get list of all sensors
- For each sensor, retrieve data from 2018 to 2023
- Preprocess the data: handle the missing data as described in the PDF

# Retrieve all sensors

In [2]:
def get_all_sensors():
    """
    Retrieve all sensors from the Brussels Mobility API.

    Queries the ``devices`` endpoint and returns one identifier per entry of
    the ``features`` list found in the JSON answer.

    :return: list of sensor identifiers; each one is the feature ``id`` with
             its first 7 characters removed
    :raises requests.HTTPError: if the API answers with an error status code
    :raises requests.Timeout: if the API does not answer within the timeout
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    response = requests.get("https://data.mobility.brussels/bike/api/counts/?request=devices", timeout=60)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    # The first 7 characters of each id are dropped; presumably a fixed prefix
    # added by the API — TODO confirm against the actual payload.
    return [sensor['id'][7:] for sensor in data['features']]

In [3]:
# Fetch the sensor identifiers once; later cells iterate over this module-level list.
sensors = get_all_sensors()

# Retrieve data from all sensors
## Retrieve data for one sensor

In [4]:
def retrieve_historical_data_from_one_sensor(sensor_id: str, start_date="20181206", end_date="20230331", save=False,
                                             prefix_path="historical_data_csv/"):
    """
    Download the historical counts of one sensor as a DataFrame.

    The ``history`` endpoint of the Brussels Mobility API is queried in CSV
    format and the answer is parsed with pandas.

    :param sensor_id: identifier of the sensor, passed as ``featureID``
    :param start_date: first requested day, passed as ``startDate`` (``YYYYMMDD`` string)
    :param end_date: last requested day, passed as ``endDate`` (``YYYYMMDD`` string)
    :param save: when True, also write the data to ``<prefix_path>/<sensor_id>.csv``
    :param prefix_path: folder receiving the CSV file; created if it does not exist
    :return: DataFrame parsed from the CSV answer
    :raises Exception: if the API does not answer with status code 200
    :raises requests.Timeout: if the API does not answer within the timeout
    """
    url = f"https://data.mobility.brussels/bike/api/counts/?request=history&featureID={sensor_id}&startDate={start_date}&endDate={end_date}&outputFormat=csv"
    # A timeout keeps the download loop from hanging forever on one sensor.
    response = requests.get(url, timeout=120)
    if response.status_code != 200:
        raise Exception(f"Error request : might be that {sensor_id} is not a valid sensor")

    df = pd.read_csv(io.StringIO(response.content.decode()))
    if save:
        # An empty prefix means "current directory": nothing to create.
        if prefix_path and not os.path.exists(prefix_path):
            # exist_ok guards against the folder appearing between the check and the creation.
            os.makedirs(prefix_path, exist_ok=True)
            print("Folder created!")
        # os.path.join avoids the double separator produced by "<prefix>/" + "/<file>"
        # and never points at the filesystem root when the prefix is empty.
        # The index is written too (to_csv default), so the file starts with an unnamed index column.
        df.to_csv(os.path.join(prefix_path, f"{sensor_id}.csv"))
    return df


# Quick check on the first sensor: fetch its history with the default date range, without saving.
retrieve_historical_data_from_one_sensor(sensor_id=sensors[0])

Unnamed: 0,Date,Time gap,Count,Average speed
0,2020-10-27,93,0,-1
1,2020-10-27,94,0,-1
2,2020-10-27,95,0,-1
3,2020-10-27,96,0,-1
4,2020-10-28,1,0,-1
...,...,...,...,...
84955,2023-03-31,92,4,18
84956,2023-03-31,93,3,19
84957,2023-03-31,94,1,12
84958,2023-03-31,95,1,29


Note: for the first sensor, the returned data only starts in 2020 even though the requested start date is in 2018. We will need to handle this in the preprocessing.

## Retrieve Historical Data For All sensors

In [5]:
# Download the history of every sensor and save one CSV per sensor in the default folder.
for sensor in tqdm(sensors):
    retrieve_historical_data_from_one_sensor(sensor, save=True)

100%|██████████| 18/18 [00:45<00:00,  2.51s/it]


# Data Preprocessing
Combine all the CSV files together before preprocessing the data.

In [6]:
def get_sensor_csv(sensor_name: str, prefix_path="historical_data_csv/"):
    """
    Load the CSV file of one sensor and tag every row with the sensor name.

    :param sensor_name: name of the sensor; the file read is ``<prefix_path>/<sensor_name>.csv``
    :param prefix_path: folder containing the sensor CSV files, with or without a trailing separator
    :return: DataFrame read from the file (first column used as index) with an
             extra ``Sensor`` column holding ``sensor_name`` on every row
    """
    # os.path.join works whether or not prefix_path ends with a separator;
    # plain string concatenation silently built "<folder><name>.csv" otherwise.
    sf = pd.read_csv(os.path.join(prefix_path, f"{sensor_name}.csv"), index_col=0)
    sf["Sensor"] = sensor_name
    return sf


# Example: load the CSV previously saved for sensor CAT17 from the default folder.
get_sensor_csv('CAT17')

Unnamed: 0,Date,Time gap,Count,Average speed,Sensor
0,2020-10-27,93,0,-1,CAT17
1,2020-10-27,94,0,-1,CAT17
2,2020-10-27,95,0,-1,CAT17
3,2020-10-27,96,0,-1,CAT17
4,2020-10-28,1,0,-1,CAT17
...,...,...,...,...,...
84955,2023-03-31,92,4,18,CAT17
84956,2023-03-31,93,3,19,CAT17
84957,2023-03-31,94,1,12,CAT17
84958,2023-03-31,95,1,29,CAT17


Get all the dates and check, for every date and every sensor, that the time gap goes from 1 to 96; otherwise, fill in the missing rows (`Count` = 0, `Average speed` = -1).

In [7]:
def get_df_with_filled_missing_values(start_date="20181206", end_date="20230331", save=False,
                                      prefix_path="historical_data_csv/"):
    """
    Build one DataFrame with a row for every (date, time gap, sensor) combination.

    The per-sensor CSV files are read from ``prefix_path`` for every entry of
    the module-level ``sensors`` list, then outer-joined with the full grid of
    dates between ``start_date`` and ``end_date``, time gaps 1 to 96 and
    sensors. Combinations absent from the history get ``Count`` = 0 and
    ``Average speed`` = -1.

    :param start_date: first day of the grid (anything ``pd.date_range`` accepts)
    :param end_date: last day of the grid, inclusive
    :param save: when True, also write the result to ``<prefix_path>/data.csv``
    :param prefix_path: folder holding the sensor CSV files and receiving ``data.csv``
    :return: the merged and filled DataFrame; because the join is an outer join,
             history rows falling outside the grid are kept as well
    """
    # Read every sensor first and concatenate once: calling pd.concat inside the
    # loop re-copies the accumulated frame at each iteration.
    all_history = pd.concat(
        [get_sensor_csv(sensor, prefix_path=prefix_path) for sensor in tqdm(sensors)],
        ignore_index=True,
    )

    # Dates are formatted as strings so that they compare equal to the 'Date'
    # column read back from the CSV files.
    all_dates = pd.date_range(start=start_date, end=end_date).strftime('%Y-%m-%d')
    all_numbers = np.arange(1, 97)

    # all possible combinations of dates, time gaps, and sensors, built directly
    # by pandas (same ordering as itertools.product: the last key varies fastest)
    full_grid = pd.MultiIndex.from_product(
        [all_dates, all_numbers, sensors], names=['Date', 'Time gap', 'Sensor']
    ).to_frame(index=False)

    # perform an outer join on 'Date', 'Time gap', 'Sensor' columns
    df_merge = pd.merge(full_grid, all_history, on=['Date', 'Time gap', 'Sensor'], how='outer')

    # fill NaN values in column 'Count' with 0, and in column 'Average speed' with -1
    fill_values = {'Count': 0, 'Average speed': -1}
    df_filled = df_merge.fillna(value=fill_values)
    print("Done!")
    if save:
        # An empty prefix means "current directory": nothing to create.
        if prefix_path and not os.path.exists(prefix_path):
            # exist_ok guards against the folder appearing between the check and the creation.
            os.makedirs(prefix_path, exist_ok=True)
            print("Folder created!")
        # os.path.join avoids a doubled separator when prefix_path already ends with one.
        df_filled.to_csv(os.path.join(prefix_path, "data.csv"))
    return df_filled

In [8]:
# Build the gap-filled dataset with the default date range and save it as data.csv in the default folder.
get_df_with_filled_missing_values(save=True)

100%|██████████| 18/18 [00:00<00:00, 33.74it/s]
100%|██████████| 2725056/2725056 [00:00<00:00, 10320804.74it/s]


Done!


Unnamed: 0,Date,Time gap,Sensor,Count,Average speed
0,2018-12-06,1,CAT17,0.0,-1.0
1,2018-12-06,1,CB02411,0.0,-1.0
2,2018-12-06,1,CB1101,0.0,-1.0
3,2018-12-06,1,CB1142,0.0,-1.0
4,2018-12-06,1,CB1143,0.0,-1.0
...,...,...,...,...,...
2725051,2023-03-31,96,CJE181,1.0,4.0
2725052,2023-03-31,96,CJM90,10.0,17.0
2725053,2023-03-31,96,CLW239,0.0,-1.0
2725054,2023-03-31,96,COM205,2.0,13.0
