<a href="https://colab.research.google.com/github/SELF-msselve/UTN-DataEngineering/blob/main/CEL_Extracci%C3%B3n_y_almacenamiento_APIs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests
!pip install fastparquet



In [None]:
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests

In [None]:
def get_data(base_url, endpoint, data_field=None, params=None, headers=None, timeout=30):
    """
    Send a GET request to an API endpoint and return the decoded JSON payload.

    Parameters:
    base_url (str): Base URL of the API.
    endpoint (str): Endpoint, relative to ``base_url``, to request.
    data_field (str): Optional key of the JSON response that holds the list
    of records of interest. When given, only that value is returned.
    params (dict): Query parameters sent with the request.
    headers (dict): HTTP headers sent with the request.
    timeout (float or tuple): Seconds to wait for the server before giving
    up. Forwarded to ``requests.get``; without it a stalled server would
    block the caller indefinitely.

    Returns:
    The decoded JSON (a dict or list), or the value stored under
    ``data_field`` when one is given. Returns None, after printing a message,
    when the request fails or the response does not have the expected shape.
    """
    try:
        endpoint_url = f"{base_url}/{endpoint}"
        response = requests.get(
            endpoint_url,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        # Raise for 4xx/5xx responses so they are handled as failed requests.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses.
        print(f"La petición ha fallado. Código de error : {e}")
        return None

    try:
        data = response.json()
        if data_field:
            data = data[data_field]
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError: the body is not valid JSON.
        # KeyError / IndexError / TypeError: the payload has no usable
        # ``data_field`` entry (missing key, or not a mapping at all).
        print("El formato de respuesta no es el esperado")
        return None
    return data

def build_table(json_data, record_path=None):
    """
    Build a pandas DataFrame from JSON-like data.

    Nested objects are flattened by ``pd.json_normalize``, which names the
    resulting columns with dot-separated paths (e.g. ``geometry.type``).

    Parameters:
    json_data (dict or list of dict): JSON data, typically obtained from an API.
    record_path (str or list of str): Optional path inside each object to the
    list of records to turn into rows. When None, ``json_data`` itself is
    treated as the record (or list of records).

    Returns:
    DataFrame: The flattened data, or None (after printing a message) when
    the input cannot be normalized.
    """
    try:
        return pd.json_normalize(json_data, record_path)
    except Exception:
        # json_normalize raises several exception types depending on how the
        # input is malformed (KeyError, TypeError, NotImplementedError, ...).
        # Catching Exception rather than using a bare ``except`` keeps the
        # best-effort behaviour while letting KeyboardInterrupt and
        # SystemExit propagate.
        print("Los datos no están en el formato esperado")
        return None

def save_to_parquet(df, output_path, partition_cols=None):
    """
    Save a DataFrame in Parquet format using the fastparquet engine.

    The DataFrame should already be tabular (flat columns).

    Parameters:
    df (pd.DataFrame): DataFrame to save.
    output_path (str): Destination path. Its parent directory is created
    when it does not exist.
    partition_cols (list or str): Column(s) to partition the data by. When
    None, the data is written without partitioning.
    """
    # exist_ok=True makes directory creation a single call that tolerates an
    # already existing directory, avoiding a check-then-create race.
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    df.to_parquet(
        output_path,
        engine="fastparquet",
        partition_cols=partition_cols,
    )

In [None]:
# Base URL that every request in this notebook is built on
base_url = "https://api.luchtmeetnet.nl/open_api"

### Obtener datos de todas las estaciones

In [None]:
# Fetch all stations, filtered by organisation_id = 1
endpoint = "stations"
params = {"organisation_id": 1}

# Keep only the value under the "data" key of the response (None if the request failed)
stations = get_data(base_url, endpoint, data_field="data", params=params)

### Obtener detalles de cada estación

In [None]:
# Fetch the details of each station
all_stations = []

# get_data returns None when the station listing failed; fall back to an
# empty list so this cell yields no stations instead of raising TypeError.
for station in stations or []:
    endpoint = f"stations/{station['number']}"

    station_details = get_data(base_url, endpoint, "data")
    if station_details:
        # Copy the station number from the listing entry into the details
        # record so each row carries its own identifier.
        station_details["number"] = station["number"]
        all_stations.append(station_details)

In [None]:
# Flatten the collected station detail records into a DataFrame
df_stations = build_table(all_stations)

In [None]:
# Preview the first rows of the stations table
df_stations.head()

Unnamed: 0,type,components,municipality,url,province,organisation,location,year_start,number,geometry.type,geometry.coordinates,description.NL,description.EN
0,Regional,"[NO2, NO, LKI, PM10, PM25]",Rotterdam,,,DCMR (Rijnmond),Rotterdam-Maasvlakte,2020.0,NL01497,point,"[3.99972, 51.933517]",Rotterdam - Maasvlakte,Measuring site Maasvlakte
1,Other,"[C6H6, C7H8, FN, NO, NO2, O3, PM10, PM25, SO2]",Rotterdam,,,DCMR (Rijnmond),Rotterdam-HvHolland,,NL01496,point,"[4.121944, 51.977802999084986]",Rotterdam Hoek van Holland,Rotterdam Hoek van Holland
2,Regional,"[NO2, PM10, PM25]",Ridderkerk,,,DCMR (Rijnmond),Ridderkerk-Voorweg,2018.0,NL01912,point,"[4.563812, 51.861729]",Ridderkerk-Voorweg,Ridderkerk-Voorweg
3,Traffic,"[FN, NO, NO2, PM10, PM25]",Rotterdam,,,DCMR (Rijnmond),Overschie-A13,,NL01491,point,"[4.4307, 51.93858]",Overschie-A13,Overschie-A13
4,Municipal,"[C6H6, C7H8, NO, NO2, O3, PM10, SO2, PM25]",Rotterdam,,,DCMR (Rijnmond),Rotterdam-Hoogvliet,,NL01485,point,"[4.35524, 51.86742]",Rotterdam-Hoogvliet,Rotterdam-Hoogvliet


### Obtener mediciones de la última hora de las diferentes estaciones

In [None]:
# Fetch the measurements reported during the previous full UTC hour

endpoint = "measurements"

# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted strings below come out the same.
start_date = datetime.now(timezone.utc) - timedelta(hours=1)

# Build the window HH:00:00 .. HH:59:59 of that hour. end_date is formatted
# first because start_date is overwritten with its string form right after.
end_date = start_date.strftime("%Y-%m-%dT%H:59:59Z")
start_date = start_date.strftime("%Y-%m-%dT%H:00:00Z")

params = {
    "start": start_date,
    "end": end_date,
}

# NOTE(review): only a single response is read here; if the API paginates
# its results, later pages are not fetched. Confirm against the API docs.
measurements = get_data(base_url, endpoint, params=params)
# The records live under the "data" key of the response object.
df_measurements = build_table(measurements, "data")

In [None]:
# Show the measurements of station NL10934, sorted by measurement timestamp
df_measurements.sort_values("timestamp_measured", ascending=True).query("station_number == 'NL10934'")

Unnamed: 0,station_number,value,timestamp_measured,formula
0,NL10934,0.33,2024-03-14T21:00:00+00:00,SO2
383,NL10934,6.09,2024-03-14T21:00:00+00:00,PM25
382,NL10934,17.23,2024-03-14T21:00:00+00:00,PM10
381,NL10934,38.67,2024-03-14T21:00:00+00:00,O3
380,NL10934,6.76,2024-03-14T21:00:00+00:00,NO2
379,NL10934,0.15,2024-03-14T21:00:00+00:00,NO
378,NL10934,62.09,2024-03-14T21:00:00+00:00,NH3


In [None]:
# Base directory under which this notebook writes its Parquet output
bronze_dir = "datalake/bronze/luchtmeetnet_api"

In [None]:
# Write the stations table to a Parquet file, without partitioning
save_to_parquet(
    df_stations,
    f"{bronze_dir}/stations/data.parquet"
    )

In [None]:
# Write the measurements partitioned by the "formula" column
save_to_parquet(
    df_measurements,
    f"{bronze_dir}/measurements",
    "formula"
    )

In [None]:
# Parse the measurement timestamp, then derive the calendar date and the hour
# of day so the data can be partitioned by both.
df_measurements["timestamp_measured"] = pd.to_datetime(
    df_measurements["timestamp_measured"]
)
df_measurements["fecha"] = df_measurements["timestamp_measured"].dt.date
df_measurements["hora"] = df_measurements["timestamp_measured"].dt.hour

# Second layout of the measurements: partitioned by date and hour.
save_to_parquet(
    df_measurements,
    f"{bronze_dir}/measurements_v2",
    partition_cols=["fecha", "hora"],
)