In [1]:
import bisect
import os
import re
from datetime import datetime, timedelta
from pathlib import Path
from typing import TypedDict

import numpy as np
import pandas as pd
import pandera as pa
import xarray as xr
from tqdm import tqdm
from zoneinfo import ZoneInfo, available_timezones

# Seed NumPy's legacy global RNG so the random demo data generated further down is reproducible.
np.random.seed(42)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [4]:
def get_df_websirenes_from_parquet(parquet_file: str = "../websirenes_coords.parquet"):
    """Load the station metadata table and discard its description column.

    Args:
        parquet_file: Path of the parquet file holding the station metadata.

    Returns:
        The loaded DataFrame without the ``estacao_desc`` column.

    Raises:
        KeyError: If the file has no ``estacao_desc`` column.
    """
    stations = pd.read_parquet(parquet_file)
    return stations.drop(columns=["estacao_desc"])


# Load the station metadata and report its dimensions.
df_websirenes = get_df_websirenes_from_parquet()
print(df_websirenes.shape)

# Trim surrounding whitespace from station names so they can be compared exactly.
df_websirenes["estacao"] = df_websirenes["estacao"].str.strip()
print(
    "process df_websirenes['latitude'] df_websirenes['longitude'] and from float64 to str since it will be key:"
)
# The coordinates are turned into strings because they later form the dataset key.
for coord_column in ("latitude", "longitude"):
    df_websirenes[coord_column] = df_websirenes[coord_column].map(str)

for coord_column in ("latitude", "longitude"):
    print(df_websirenes[coord_column].dtype)

df_websirenes.head()

(83, 4)
process df_websirenes['latitude'] df_websirenes['longitude'] and from float64 to str since it will be key:
object
object


Unnamed: 0,id_estacao,estacao,latitude,longitude
0,33,Ladeira dos Tabajaras,-22.9617,-43.188
1,8,Cabritos 1,-22.9647,-43.195
2,26,Guararapes 1,-22.9447,-43.208
3,34,Liberdade 1,-22.9266,-43.218
4,64,Salgueiro 1,-22.9302,-43.226


In [5]:
def get_UTC_offset_from_timezone_name(timezone_name: str) -> str:
    """Return the UTC offset in force right now for an IANA timezone.

    The offset is evaluated at the current instant, so the value depends on
    when the function is called.

    Args:
        timezone_name: IANA timezone key, e.g. ``"America/Sao_Paulo"``.

    Returns:
        The offset formatted by ``strftime("%z")``, e.g. ``"-0300"``.
    """
    zone = ZoneInfo(timezone_name)
    return datetime.now(tz=zone).strftime("%z")


# Scratch cell: shows how an offset-aware string is parsed and how it looks
# once converted to UTC (aware) or to a timezone-naive UTC timestamp.
timezone_name = "America/Sao_Paulo"
print(f"The UTC offset for {timezone_name} is {get_UTC_offset_from_timezone_name(timezone_name)}")
print(available_timezones())

today_string = "2024-08-21 21:40:55-0300"
datetime_format = "%Y-%m-%d %H:%M:%S%z"

dt = pd.to_datetime(datetime.strptime(today_string, datetime_format))
dt_utc = dt.tz_convert("UTC")

print("Datetime object:", dt)
print("Timezone info:", dt.tzinfo)
print("Datetime object in UTC:", dt_utc)
print("Timezone info in UTC:", dt_utc.tzinfo)

# Same parse done by pandas directly, then viewed as aware-UTC and naive-UTC.
parsed = pd.to_datetime(today_string, format=datetime_format)
parsed_utc = parsed.tz_convert("UTC")
parsed_naive = parsed.tz_convert(None)

print(repr(dt))
print(repr(dt_utc))
print(repr(parsed_naive))
print(pd.Series([parsed_utc]).dtype)
print(pd.Series([parsed_naive]).dtype)

The UTC offset for America/Sao_Paulo is -0300
{'America/Kentucky/Monticello', 'US/Eastern', 'Etc/GMT-4', 'Europe/Amsterdam', 'Africa/Bamako', 'Antarctica/Troll', 'Etc/GMT+12', 'America/Cordoba', 'Europe/Chisinau', 'America/Cambridge_Bay', 'Pacific/Efate', 'Africa/Brazzaville', 'America/Indiana/Tell_City', 'Brazil/East', 'America/Araguaina', 'America/Mendoza', 'America/Blanc-Sablon', 'Asia/Beirut', 'Cuba', 'Europe/Gibraltar', 'Europe/Volgograd', 'Atlantic/Reykjavik', 'America/Dawson', 'Africa/Porto-Novo', 'Asia/Taipei', 'Africa/Monrovia', 'Europe/Moscow', 'Asia/Singapore', 'Pacific/Enderbury', 'Pacific/Apia', 'Pacific/Guam', 'Pacific/Tahiti', 'Zulu', 'Canada/Central', 'Africa/Kampala', 'Iran', 'Asia/Magadan', 'Europe/Zurich', 'America/North_Dakota/Center', 'Europe/Lisbon', 'Asia/Tomsk', 'Asia/Ashkhabad', 'America/Punta_Arenas', 'America/Paramaribo', 'Pacific/Kosrae', 'Africa/Accra', 'America/Kralendijk', 'Pacific/Chatham', 'Asia/Sakhalin', 'America/Rio_Branco', 'Africa/Ceuta', 'Asia/Dac

In [8]:
class WebSireneSchema(pa.DataFrameModel):
    """Pandera schema describing one station's parsed readings.

    ``WebSirenesParser.get_dataframe`` validates its frame against this schema
    before ``horaLeitura`` is moved into the index. Every measurement column is
    nullable because the source files may contain ``null`` entries, which the
    parser turns into NaN.
    """

    horaLeitura: pd.Timestamp  # timestamp of the reading
    nome: str  # station name
    # NOTE(review): the measurement columns presumably hold rainfall accumulated
    # over the window named by the column (15 min ... 96 h) - confirm with the data source.
    m15: float = pa.Field(nullable=True)  # no ge=0 check because the data contains -99.99 values
    m30: float = pa.Field(nullable=True)
    h01: float = pa.Field(nullable=True)
    h02: float = pa.Field(nullable=True)
    h03: float = pa.Field(nullable=True)
    h04: float = pa.Field(nullable=True)
    h24: float = pa.Field(nullable=True)
    h96: float = pa.Field(nullable=True)
    station_id: int  # station id as written in the text file (column "id" before renaming)


class WebSirenesParser:
    """Parser for the websirenes station text files.

    Each file starts with a whitespace-separated header line followed by one
    reading per line: station name, timestamp with an hour-only UTC offset,
    eight measurement values and the station id.
    """

    # Running bounds of the readings seen so far. They start inverted so that
    # the first real comparison replaces them.
    minimum_date = pd.Timestamp.max
    maximum_date = pd.Timestamp.min

    def list_files(self) -> list[str]:
        """Return the entry names found in the raw websirenes data directory."""
        return os.listdir("../../../data/ws/websirenes_defesa_civil")

    def _get_name_pattern(self) -> str:
        """
        Example:
            BARRA DA TIJUCA 3 2021-08-01 00:00:00-03 null 2 ... matches BARRA DA TIJUCA 3
        Returns:
            str: regex pattern to extract name from line
        """
        return r"^(?P<name>.+?)(?=\s+\d{4}-\d{2}-\d{2})"

    def _get_date_pattern(self) -> str:
        """
        Example:
            BARRA DA TIJUCA 3 2021-08-01 00:00:00-03 null 2 ... matches 2021-08-01 00:00:00-03
        Returns:
            str: regex pattern to extract date from line
        """
        return r"(?P<date>\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}-\d{2})"

    def _get_timeframe_pattern(self) -> str:
        """
        Example:
            BARRA DA TIJUCA 3 2021-08-01 00:00:00-03 null 2 3 4 5 6 7 8 ... matches null 2 3 4 5 6 7 8
        Returns:
            str: regex pattern to extract timeframe from line
        """
        return r"(?P<timeframe>.+)$"

    def _get_complete_pattern(self) -> str:
        """
        Example:
            BARRA DA TIJUCA 3 2021-08-01 00:00:00-03 null 2 3 4 5 6 7 8 ...
            matches:
                BARRA DA TIJUCA 3 at group 'name'
                2021-08-01 00:00:00-03 at group 'date'
                null 2 3 4 5 6 7 8 at group 'timeframe'
        Returns:
            str: regex pattern to extract name, date and timeframe from line
        """
        return rf"{self._get_name_pattern()}\s+{self._get_date_pattern()}\s+{self._get_timeframe_pattern()}"

    def _extract_features(self, line: str) -> tuple:
        """
        Extracts features from a line of the txt file using regex patterns.

        Returns:
            tuple: (name, date, m15, m30, h01, h02, h03, h04, h24, h96, station_id)
            where ``date`` is still a string, ``null`` measurements are NaN and
            ``station_id`` is an int.
        Raises:
            ValueError: if the line does not match the expected layout, does not
            carry exactly nine values after the date, or has a null station id.
        """
        match = re.search(self._get_complete_pattern(), line)

        if match is None:
            raise ValueError(f"Could not extract features from line: {line}")

        name = match.group("name")
        # The file writes the offset as hours only ("-03"); append the minutes so
        # it matches the "%z" format used in get_dataframe.
        date = match.group("date") + "00"

        # "null" marks a missing reading; decimals may use a comma separator.
        values = [
            np.nan if token == "null" else float(token.replace(",", "."))
            for token in match.group("timeframe").split()
        ]
        if len(values) != 9:
            raise ValueError(f"Expected 9 values after the date, got {len(values)} in line: {line}")
        m15, m30, h01, h02, h03, h04, h24, h96, station_id = values

        return (
            name,
            date,
            m15,
            m30,
            h01,
            h02,
            h03,
            h04,
            h24,
            h96,
            int(station_id),
        )

    def _parse_txt_file(self, file_path: str) -> tuple[list[str], list[tuple]]:
        """
        Reads a whole station file.

        Blank lines (for instance a trailing empty line) are skipped instead of
        aborting the parse.

        Returns:
            tuple: the header column names and one feature tuple per data line.
        Raises:
            Exception: any read or parse error, re-raised after being printed.
        """
        try:
            file_data: list[tuple] = []
            with open(file_path, "r", encoding="utf-8-sig") as file:
                header = file.readline().strip().split()
                for line in file:
                    if not line.strip():
                        continue
                    file_data.append(self._extract_features(line))
            return header, file_data
        except Exception as e:
            print(f"Error parsing file {file_path}: {e}")
            raise

    def read_station_name_id_txt_file(self, file_path: str) -> tuple[str, int]:
        """
        Reads only the first data line of a station file.

        Returns:
            tuple: station name and station id taken from the first non-blank
            line after the header.
        Raises:
            ValueError: if the file has no data line or the line is malformed.
        """
        try:
            with open(file_path, "r", encoding="utf-8-sig") as file:
                file.readline()  # skip the header line
                for line in file:
                    if line.strip():
                        features = self._extract_features(line)
                        return features[0], features[-1]
            raise ValueError(f"Could not extract features from line: (no data lines in {file_path})")
        except Exception as e:
            print(f"Error parsing file {file_path}: {e}")
            raise

    def get_dataframe(self, file_path: str) -> pd.DataFrame:
        """
        Builds the validated readings DataFrame of one station file.

        The timestamps are parsed with their offset and then converted to
        timezone-naive UTC; the ``id`` column is renamed to ``station_id`` and
        the frame is validated against WebSireneSchema.

        Returns:
            pd.DataFrame: readings indexed by ``horaLeitura``.
        """
        header, file_data = self._parse_txt_file(file_path)
        df = pd.DataFrame(file_data, columns=header)
        df["horaLeitura"] = pd.to_datetime(
            df["horaLeitura"], format="%Y-%m-%d %H:%M:%S%z"
        ).dt.tz_convert(None)
        df = df.rename(columns={"id": "station_id"})
        WebSireneSchema.validate(df)
        return df.set_index("horaLeitura")

    def assert_is_sorted_by_date(self, df: pd.DataFrame) -> None:
        """Raise AssertionError unless the index of ``df`` is monotonically increasing."""
        assert df.index.is_monotonic_increasing, "DataFrame index is not sorted by date"


# Shared parser instance used by the cells below; it also accumulates the overall min/max reading dates.
websirenes_parser = WebSirenesParser()

In [10]:
class StationNameId(TypedDict):
    """Name and id of a station as read from the first data line of its text file."""

    name: str  # station name as written in the text file
    station_id: int  # station id as written in the text file


def get_stations_name_id() -> list[StationNameId]:
    """Collect the name and id of every station file in the raw data directory.

    Only the first data line of each file is read.

    Returns:
        One ``StationNameId`` per file, in the order returned by the parser's
        file listing.
    """

    def _describe(file_name: str) -> StationNameId:
        # Read just enough of the file to learn which station it belongs to.
        full_path = os.path.join("../../../data/ws/websirenes_defesa_civil", file_name)
        station_name, station_id = websirenes_parser.read_station_name_id_txt_file(full_path)
        return StationNameId(name=station_name, station_id=station_id)

    return [_describe(file_name) for file_name in websirenes_parser.list_files()]


# Read the (name, id) pair of every station file and preview the first five.
stations_name_id: list[StationNameId] = get_stations_name_id()
stations_name_id[:5]

[{'name': 'Juramento 2', 'station_id': 1},
 {'name': 'Cachoeirinha 1', 'station_id': 100},
 {'name': 'Cantagalo 1', 'station_id': 101},
 {'name': 'Engenho da Rainha 2', 'station_id': 102},
 {'name': 'Travessa Antonina 1', 'station_id': 106}]

In [11]:
def get_stations_not_found_in_parquet(
    stations_name_id: list[StationNameId], df_websirenes: pd.DataFrame
) -> list[StationNameId]:
    """Return the stations whose name is absent from the metadata table.

    Args:
        stations_name_id: Stations read from the text files.
        df_websirenes: Metadata table with the station names in ``estacao``.

    Returns:
        The entries of ``stations_name_id`` with no matching ``estacao`` value,
        in their original order.
    """
    known_names = df_websirenes.estacao.values
    return [entry for entry in stations_name_id if entry["name"] not in known_names]


def get_stations_not_matching_id_by_name(
    stations_name_id: list[StationNameId], df_websirenes: pd.DataFrame
) -> list:
    """List the stations whose text-file id differs from the metadata-table id.

    Stations are matched by name. A station missing from the table is reported
    with a printed message and left out of the result.

    Args:
        stations_name_id: Stations read from the text files.
        df_websirenes: Metadata table with ``estacao`` and ``id_estacao`` columns.

    Returns:
        One dict per mismatch with the keys ``name_in_txt``, ``name_in_parquet``,
        ``station_id_in_txt`` and ``station_id_in_parquet``.
    """
    mismatches: list = []
    for entry in stations_name_id:
        name_in_txt, station_id_in_txt = entry["name"], entry["station_id"]

        if name_in_txt not in df_websirenes.estacao.values:
            print(f"Station {name_in_txt} not found in parquet")
            continue

        # When several rows share the name, the first one is used.
        parquet_row = df_websirenes.loc[df_websirenes.estacao == name_in_txt].iloc[0]
        station_id_in_parquet = parquet_row["id_estacao"]

        if station_id_in_txt != station_id_in_parquet:
            mismatches.append(
                {
                    "name_in_txt": name_in_txt,
                    "name_in_parquet": parquet_row["estacao"],
                    "station_id_in_txt": station_id_in_txt,
                    "station_id_in_parquet": station_id_in_parquet,
                }
            )
    return mismatches


# Compare the text files against the metadata table: first the id mismatches ...
not_matching_id_by_name = get_stations_not_matching_id_by_name(stations_name_id, df_websirenes)
print(f"""
    Stations not matching id by name:
    {not_matching_id_by_name}
""")

# Every reported mismatch must still refer to the same station name on both sides.
for mismatch in not_matching_id_by_name:
    assert mismatch["name_in_txt"] == mismatch["name_in_parquet"]

# ... then the stations that are missing from the table altogether.
not_founds_in_parquet = get_stations_not_found_in_parquet(stations_name_id, df_websirenes)
print(f"""
    Stations not found in parquet:
    {not_founds_in_parquet}
""")

Station Travessa Antonina 1 not found in parquet

    Stations not matching id by name:
    [{'name_in_txt': 'Juramento 2', 'name_in_parquet': 'Juramento 2', 'station_id_in_txt': 1, 'station_id_in_parquet': 32}, {'name_in_txt': 'Cachoeirinha 1', 'name_in_parquet': 'Cachoeirinha 1', 'station_id_in_txt': 100, 'station_id_in_parquet': 9}, {'name_in_txt': 'Cantagalo 1', 'name_in_parquet': 'Cantagalo 1', 'station_id_in_txt': 101, 'station_id_in_parquet': 10}, {'name_in_txt': 'Engenho da Rainha 2', 'name_in_parquet': 'Engenho da Rainha 2', 'station_id_in_txt': 102, 'station_id_in_parquet': 20}, {'name_in_txt': 'São João 4', 'name_in_parquet': 'São João 4', 'station_id_in_txt': 109, 'station_id_in_parquet': 70}, {'name_in_txt': 'Andaraí 1', 'name_in_parquet': 'Andaraí 1', 'station_id_in_txt': 11, 'station_id_in_parquet': 3}, {'name_in_txt': 'Parque Nova Maracá 1', 'name_in_parquet': 'Parque Nova Maracá 1', 'station_id_in_txt': 111, 'station_id_in_parquet': 51}, {'name_in_txt': 'São João 2', '

In [13]:
class WebSirenesBuilder:
    """Builds and stores one parquet dataset per station, keyed by its coordinates."""

    def __init__(self) -> None:
        """Point the builder at ``./websirenes_datasets`` and make sure the directory exists."""
        self.websirenes_datasets_path = Path("./websirenes_datasets")
        # exist_ok avoids failing when the directory appears between a check and the mkdir.
        self.websirenes_datasets_path.mkdir(parents=True, exist_ok=True)

    def merge_by_name(
        self, df_websirenes: pd.DataFrame, df_websirenes_defesa_civil: pd.DataFrame
    ) -> pd.DataFrame:
        """Attach station coordinates to a station's readings, matching by name.

        The readings frame is not modified: its index is reset on a copy, so the
        caller's ``horaLeitura`` index is left intact.

        Args:
            df_websirenes: Metadata table with ``estacao``, ``id_estacao`` and
                the coordinate columns.
            df_websirenes_defesa_civil: Readings indexed by ``horaLeitura`` with
                the station name in ``nome``.

        Returns:
            The inner join of both frames, indexed by ``horaLeitura``, without
            the ``estacao`` and ``id_estacao`` columns.
        """
        readings = df_websirenes_defesa_civil.reset_index()
        merged = pd.merge(
            df_websirenes,
            readings,
            left_on="estacao",
            right_on="nome",
            how="inner",
        )
        return merged.drop(columns=["estacao", "id_estacao"]).set_index("horaLeitura")

    def create_key(self, df: pd.DataFrame) -> str:
        """Return ``"<latitude>_<longitude>"`` built from the first row of ``df``.

        Raises:
            IndexError: If ``df`` has no rows.
        """
        row = df.iloc[0]
        return f"{row['latitude']}_{row['longitude']}"

    def write_dataset(self, df: pd.DataFrame, key: str):
        """Write ``df`` to ``<key>.parquet`` unless that file already exists.

        An existing file is never overwritten; a message is printed instead.
        """
        self.websirenes_datasets_path.mkdir(parents=True, exist_ok=True)
        dataset_file = self.websirenes_datasets_path / f"{key}.parquet"
        if dataset_file.exists():
            print(f"Dataset {key}.parquet already exists")
            return
        df.to_parquet(dataset_file)


# Shared builder instance; creating it also creates ./websirenes_datasets when missing.
websirenes_builder = WebSirenesBuilder()


def build_websirenes_datasets():
    """Turn every raw station file into a parquet dataset keyed by coordinates.

    For each file the readings are parsed and validated, stations missing from
    the metadata table are skipped with a printed message, the parser's overall
    minimum/maximum reading dates are updated, the readings are checked to be
    sorted by date, and the frame merged with the station coordinates is
    written through the builder.
    """
    raw_data_dir = "../../../data/ws/websirenes_defesa_civil"

    for file_name in tqdm(websirenes_parser.list_files()):
        station_df = websirenes_parser.get_dataframe(os.path.join(raw_data_dir, file_name))

        station_name = station_df[WebSireneSchema.nome].iloc[0]
        if any(missing["name"] == station_name for missing in not_founds_in_parquet):
            print(f"Station {station_name} not found in parquet")
            continue

        # Track the earliest and latest reading across all stations.
        first_reading = station_df.index.min()
        last_reading = station_df.index.max()
        if first_reading < websirenes_parser.minimum_date:
            websirenes_parser.minimum_date = first_reading
        if last_reading > websirenes_parser.maximum_date:
            websirenes_parser.maximum_date = last_reading

        websirenes_parser.assert_is_sorted_by_date(station_df)
        merged = websirenes_builder.merge_by_name(df_websirenes, station_df)
        websirenes_builder.write_dataset(merged, websirenes_builder.create_key(merged))


# Build the per-station parquet datasets (slow: parses every raw station file).
build_websirenes_datasets()

  6%|▌         | 5/84 [00:14<03:13,  2.46s/it]

Station Travessa Antonina 1 not found in parquet


100%|██████████| 84/84 [04:37<00:00,  3.31s/it]


In [14]:
# Earliest and latest reading recorded while building the datasets (timezone-naive after tz_convert(None)).
websirenes_parser.minimum_date, websirenes_parser.maximum_date

(Timestamp('2011-04-12 20:30:00'), Timestamp('2022-06-02 21:30:00'))

In [15]:
# A "YYYY-MM-DD-HH" string builds a timezone-naive Timestamp at that hour,
# here Timestamp('2011-04-12 20:00:00').
date_string = "2011-04-12-20"
date = pd.Timestamp(date_string)
repr(date)

"Timestamp('2011-04-12 20:00:00')"

In [16]:
def load_websirene_dataset(key: str) -> pd.DataFrame:
    """Read the station dataset stored as ``./websirenes_datasets/<key>.parquet``.

    Args:
        key: Dataset key, i.e. the file name without the ``.parquet`` suffix.

    Returns:
        The DataFrame stored in that file.
    """
    dataset_file = "./websirenes_datasets/" + f"{key}.parquet"
    return pd.read_parquet(dataset_file)


# glob() is lazy: printing it only shows the generator object.
print(Path("./websirenes_datasets").glob("*.parquet"))

# The dataset keys are the parquet file names without their suffix.
keys = [dataset_file.stem for dataset_file in Path("./websirenes_datasets").glob("*.parquet")]
print(keys)

# Load one station as a worked example, with horaLeitura back as a regular column.
df_example = load_websirene_dataset(keys[0]).reset_index()
print(df_example.head())

# A station dataset must carry a single coordinate pair.
assert (df_example.latitude == df_example.latitude.iloc[0]).all()
assert (df_example.longitude == df_example.longitude.iloc[0]).all()

<generator object Path.glob at 0x7f01c317dd60>
['-22.8344_-43.2953', '-22.8414_-43.2933', '-22.8443_-43.2905', '-22.8456_-43.2978', '-22.8483_-43.2805', '-22.8514_-43.3045', '-22.8522_-43.2783', '-22.8547_-43.2725', '-22.8575_-43.313', '-22.8575_-43.333', '-22.8595_-43.2657', '-22.8598_-43.2795', '-22.8606_-43.3027', '-22.8606_-43.3317', '-22.8633_-43.3111', '-22.8636_-43.2636', '-22.8641_-43.283', '-22.8645_-43.2756', '-22.8651_-43.2586', '-22.8652_-43.2805', '-22.8683_-43.3133', '-22.86_-43.2864', '-22.8711_-43.2894', '-22.875278_-43.305833', '-22.8766_-43.3011', '-22.8886_-43.3569', '-22.8928_-43.3432', '-22.8985_-43.2347', '-22.89_-43.3439', '-22.9017_-43.3594', '-22.9023_-43.3434', '-22.9043_-43.2388', '-22.9046_-43.264', '-22.9056_-43.2341', '-22.908_-43.2667', '-22.9119_-43.2635', '-22.9122_-43.2038', '-22.912_-43.2526', '-22.9135_-43.2942', '-22.9142_-43.2852', '-22.9147_-43.2757', '-22.914_-43.2822', '-22.9159_-43.2591', '-22.9164_-43.3422', '-22.9166_-43.2979', '-22.9178_-43.

In [17]:
# Display the example station dataset.
df_example

Unnamed: 0,horaLeitura,latitude,longitude,nome,m15,m30,h01,h02,h03,h04,h24,h96,station_id
0,2012-05-21 20:45:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,,,,,,,,,35
1,2012-05-21 21:00:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,,,,,,,,35
2,2012-05-21 21:15:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,,,,,,,,35
3,2012-05-21 21:30:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,,,,,,,,35
4,2012-05-21 21:45:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,,,,,,,,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
344071,2022-06-02 20:15:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
344072,2022-06-02 20:30:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
344073,2022-06-02 20:45:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
344074,2022-06-02 21:00:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35


In [18]:
# Scratch cell: checks that a right-closed, right-labelled hourly resample of
# 15-minute data equals the sum of the four readings ending at each full hour.
green_color_terminal = "\033[92m"
reset_color_terminal = "\033[0m"
print(
    f"{green_color_terminal}abordagem correta, pegar as medidas do passado para montar o de 15:00h{reset_color_terminal}"
)

timestamps = pd.date_range(start="2022-01-09 15:00:00", end="2022-01-09 17:00:00", freq="15min")

# Random 15-minute "precipitation" values, one per timestamp.
df_example2 = pd.DataFrame({"precipitation": np.random.rand(len(timestamps))}, index=timestamps)

print(df_example2)

df_example2_resampled = df_example2.resample("1h", closed="right", label="right").sum()
print(df_example2_resampled)

# Manual equivalent: sum the readings in (hour - 45 min, hour] for each hour.
hourly_windows = (
    ("2022-01-09 14:15:00", "2022-01-09 15:00:00"),
    ("2022-01-09 15:15:00", "2022-01-09 16:00:00"),
    ("2022-01-09 16:15:00", "2022-01-09 17:00:00"),
)
for window_start, window_end in hourly_windows:
    dalezada = df_example2[
        (df_example2.index >= window_start) & (df_example2.index <= window_end)
    ]
    print(round(dalezada["precipitation"].sum(), 6))

[92mabordagem correta, pegar as medidas do passado para montar o de 15:00h[0m
                     precipitation
2022-01-09 15:00:00       0.374540
2022-01-09 15:15:00       0.950714
2022-01-09 15:30:00       0.731994
2022-01-09 15:45:00       0.598658
2022-01-09 16:00:00       0.156019
2022-01-09 16:15:00       0.155995
2022-01-09 16:30:00       0.058084
2022-01-09 16:45:00       0.866176
2022-01-09 17:00:00       0.601115
                     precipitation
2022-01-09 15:00:00       0.374540
2022-01-09 16:00:00       2.437385
2022-01-09 17:00:00       1.681369
0.37454
2.437385
1.681369


In [19]:
# Scratch cell: how pandas and the built-in max() treat NaN values.
data = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, 7, 8],
    C=[np.nan, 10, 11, 12],
    D=[np.nan] * 4,
)
df_with_nan = pd.DataFrame(data)
a_values = df_with_nan["A"]
all_nan_column = df_with_nan["D"]

print(a_values.isnull().any())
print(a_values.isnull().all())
print(a_values.sum())  # NaN entries are skipped by sum()
print(all_nan_column.sum())  # an all-NaN column sums to 0.0
print(all_nan_column.isnull().all())
print(df_with_nan["C"].isnull().all())
print(max([np.nan, np.nan]))

True
False
7.0
0.0
True
False
nan


In [20]:
# Latest reading timestamp in the example station dataset.
df_example.horaLeitura.max()

Timestamp('2022-06-02 21:15:00')

In [21]:
print(type(df_example.horaLeitura.dt.date[0]))


def _to_naive_utc(text: str) -> pd.Timestamp:
    """Parse an offset-aware "YYYY-mm-dd HH:MM:SS+HH:MM" string into a timezone-naive UTC Timestamp."""
    return pd.to_datetime(datetime.strptime(text, "%Y-%m-%d %H:%M:%S%z")).tz_convert(None)


# The stored readings are timezone-naive UTC, so the local (-03:00) bounds are
# converted before filtering.
start_date = _to_naive_utc("2022-01-09 14:00:00-03:00")
end_date = _to_naive_utc("2022-01-09 15:00:00-03:00")

dalezada = df_example[df_example.horaLeitura.between(start_date, end_date)]
m15_values = dalezada["m15"].values
print(m15_values.sum())


# Same idea with the window used for the target: the 45 minutes before a timestamp.
current_date = _to_naive_utc("2022-01-09 14:00:00-03:00")
new_date = current_date - timedelta(minutes=45)
print(current_date)
print(new_date)

dalezada = df_example[df_example.horaLeitura.between(str(new_date), str(current_date))]

m15_values = dalezada["m15"].values
print(m15_values.sum())
dalezada

<class 'datetime.date'>
0.2
2022-01-09 17:00:00
2022-01-09 16:15:00
0.0


Unnamed: 0,horaLeitura,latitude,longitude,nome,m15,m30,h01,h02,h03,h04,h24,h96,station_id
330647,2022-01-09 16:15:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,6.0,50.8,35
330648,2022-01-09 16:30:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,5.8,50.8,35
330649,2022-01-09 16:45:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,5.8,50.8,35
330650,2022-01-09 17:00:00,-22.8344,-43.2953,Guaíba 1 / Vila Pequiri,0.0,0.0,0.0,0.0,0.0,0.0,5.8,50.8,35


In [22]:
# Open the sample NetCDF file and keep only the variables used below.
filename = (
    "../adaptor.mars.internal-1715727447.0335772-3538-17-4fd0fd8b-6cbd-4465-949a-7ae25c73fad9.nc"
)
kept_variables = ["u10", "v10", "d2m", "t2m", "sp", "tp"]

ds = xr.open_dataset(filename)
if "expver" in ds.coords:
    # Collapse the expver coordinate: take expver=1 and fill its gaps from expver=5.
    print(">>>Oops! expver dimension found. Going to remove it.<<<")
    ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
    ds = ds_combine.load()
ds = ds[kept_variables]
ds

In [27]:
# List the variables kept in the dataset.
ds.data_vars

Data variables:
    u10      (time, latitude, longitude) float64 1MB ...
    v10      (time, latitude, longitude) float64 1MB ...
    d2m      (time, latitude, longitude) float64 1MB ...
    t2m      (time, latitude, longitude) float64 1MB ...
    sp       (time, latitude, longitude) float64 1MB ...
    tp       (time, latitude, longitude) float64 1MB ...

In [16]:
# Grid axes of the dataset.
lats = ds.coords["latitude"].values
lons = ds.coords["longitude"].values

# Take the time step closest to the requested instant and flatten it into a table.
xd = ds.sel(time="2023-01-09T15:00:00.000000000", method="nearest")
df_era5land = xd.to_dataframe().reset_index()

# Round-trip the time column through strings (the type is printed in between).
df_era5land["time"] = df_era5land["time"].dt.strftime("%Y-%m-%d %H:%M:%S")
print(df_era5land.head(5))
print(type(df_era5land["time"][0]))
df_era5land["time"] = pd.to_datetime(df_era5land["time"])

# One row per grid point is expected.
print(len(df_era5land))
print(lats.size * lons.size)
assert (
    len(df_era5land) == lats.size * lons.size
), f"Expected {lats.size * lons.size} got {len(df_era5land)}"
print(f"latitude type: {type(df_era5land['latitude'][0])}")

   latitude  longitude       u10       v10         d2m         t2m  \
0     -22.0 -44.000000  0.563038 -1.104868  291.277377  294.777236   
1     -22.0 -43.900002  0.478085 -0.925669  291.482460  295.077731   
2     -22.0 -43.799999  0.336979 -0.871397  291.707201  295.388566   
3     -22.0 -43.700001  0.106079 -0.971544  291.875198  295.599235   
4     -22.0 -43.599998 -0.123369 -1.100977  292.015718  295.712648   

             sp        tp                 time  
0  90882.795613  0.003355  2023-01-09 15:00:00  
1  91509.727846  0.003943  2023-01-09 15:00:00  
2  92337.726202  0.004726  2023-01-09 15:00:00  
3  93143.781930  0.005202  2023-01-09 15:00:00  
4  93648.910186  0.005397  2023-01-09 15:00:00  
<class 'str'>
231
231
latitude type: <class 'numpy.float32'>


In [17]:
# Scratch cell: what the built-in max() returns for NaN inputs and how to test for it.
nan_df_era5land = df_era5land[df_era5land["tp"].isna()]
nan_df_era5land_4_random_rows = nan_df_era5land.sample(4)
nan_df_era5land_4_random_rows_tp = nan_df_era5land_4_random_rows["tp"]
(
    nan_df_era5land_4_random_rows_tp_1,
    nan_df_era5land_4_random_rows_tp_2,
    nan_df_era5land_4_random_rows_tp_3,
    nan_df_era5land_4_random_rows_tp_4,
) = (nan_df_era5land_4_random_rows_tp.iloc[position] for position in range(4))

max_nan_df_era5land_4_random_rows_tp = max(
    [
        nan_df_era5land_4_random_rows_tp_1,
        nan_df_era5land_4_random_rows_tp_2,
        nan_df_era5land_4_random_rows_tp_3,
        nan_df_era5land_4_random_rows_tp_4,
    ]
)
print(type(max_nan_df_era5land_4_random_rows_tp))
print(max_nan_df_era5land_4_random_rows_tp)
# Equality never detects NaN; np.isnan / pd.isna do.
print(max_nan_df_era5land_4_random_rows_tp == np.nan)
print(max_nan_df_era5land_4_random_rows_tp == pd.NA)
print(np.isnan(max_nan_df_era5land_4_random_rows_tp))
print(pd.isna(max_nan_df_era5land_4_random_rows_tp))


<class 'numpy.float64'>
nan
False
<NA>
True
True


In [18]:
def get_upper_neighbor(lat: float, lon: float, sorted_latitudes_ascending: np.ndarray):
    """Return the point with the next larger latitude at the same longitude.

    Args:
        lat: Latitude of the starting point.
        lon: Longitude of the starting point, returned unchanged.
        sorted_latitudes_ascending: Latitudes sorted in ascending order.

    Returns:
        ``(next_latitude, lon)``, or ``None`` when no latitude is larger than ``lat``.
    """
    next_idx = bisect.bisect_right(sorted_latitudes_ascending, lat)
    if next_idx >= len(sorted_latitudes_ascending):
        return None
    return sorted_latitudes_ascending[next_idx], lon


def get_bottom_neighbor(lat: float, lon: float, sorted_latitudes_ascending: np.ndarray):
    """Return the point with the next smaller latitude at the same longitude.

    Args:
        lat: Latitude of the starting point.
        lon: Longitude of the starting point, returned unchanged.
        sorted_latitudes_ascending: Latitudes sorted in ascending order.

    Returns:
        ``(previous_latitude, lon)``, or ``None`` when no latitude is smaller than ``lat``.
    """
    previous_idx = bisect.bisect_left(sorted_latitudes_ascending, lat) - 1
    if previous_idx < 0:
        return None
    return sorted_latitudes_ascending[previous_idx], lon


def get_left_neighbor(lat: float, lon: float, sorted_longitudes_ascending: np.ndarray):
    """Return the point with the next smaller longitude at the same latitude.

    Args:
        lat: Latitude of the starting point, returned unchanged.
        lon: Longitude of the starting point.
        sorted_longitudes_ascending: Longitudes sorted in ascending order.

    Returns:
        ``(lat, previous_longitude)``, or ``None`` when no longitude is smaller than ``lon``.
    """
    previous_idx = bisect.bisect_left(sorted_longitudes_ascending, lon) - 1
    if previous_idx < 0:
        return None
    return lat, sorted_longitudes_ascending[previous_idx]


def get_right_neighbor(lat: float, lon: float, sorted_longitudes_ascending: np.ndarray):
    """Return the point with the next larger longitude at the same latitude.

    Args:
        lat: Latitude of the starting point, returned unchanged.
        lon: Longitude of the starting point.
        sorted_longitudes_ascending: Longitudes sorted in ascending order.

    Returns:
        ``(lat, next_longitude)``, or ``None`` when no longitude is larger than ``lon``.
    """
    next_idx = bisect.bisect_right(sorted_longitudes_ascending, lon)
    if next_idx >= len(sorted_longitudes_ascending):
        return None
    return lat, sorted_longitudes_ascending[next_idx]


In [19]:
from pydantic import BaseModel


class Square(BaseModel):
    """One grid cell described by its four corner points.

    Every corner is a ``(latitude, longitude)`` pair, which is how
    ``process_target`` builds it and how the consumers unpack it.
    """

    top_left: tuple[float, float]  # (latitude, longitude) of the upper-left corner
    bottom_left: tuple[float, float]  # (latitude, longitude) of the lower-left corner
    bottom_right: tuple[float, float]  # (latitude, longitude) of the lower-right corner
    top_right: tuple[float, float]  # (latitude, longitude) of the upper-right corner


# Target grid with one float32 cell per (latitude, longitude) grid point, initialised to zero.
target = np.zeros((len(lats), len(lons)), dtype=np.float32)


def get_websirenes_keys_in_square(builder: WebSirenesBuilder, square: Square) -> list[str]:
    """Return the keys of the station datasets located inside a grid cell.

    The keys are the stems of the ``*.parquet`` files in the builder's dataset
    directory and are parsed as ``"<latitude>_<longitude>"``. Points lying on
    the border of the square count as inside.

    Args:
        builder: Provides ``websirenes_datasets_path``.
        square: Grid cell; its corners give the latitude and longitude bounds.

    Returns:
        The matching keys, in directory listing order.
    """
    lat_min, lat_max = square.bottom_left[0], square.top_left[0]
    lon_min, lon_max = square.top_left[1], square.top_right[1]

    keys_inside = []
    for dataset_file in list(Path(builder.websirenes_datasets_path).glob("*.parquet")):
        key = dataset_file.stem
        key_lat, key_lon = (float(part) for part in key.split("_"))

        outside_lat = key_lat < lat_min or key_lat > lat_max
        outside_lon = key_lon < lon_min or key_lon > lon_max
        if outside_lat or outside_lon:
            continue
        keys_inside.append(key)
    return keys_inside


def _get_max_era5land_precipitation_in_square(square: Square, ds_time: xr.Dataset) -> float:
    """
    Return the largest ``tp`` value found at the four corners of the square.

    NaN corners are ignored. Only when every corner is NaN does the function
    return 0.0. The built-in ``max`` is not used on the raw values because its
    result depends on argument order when NaN is present (``max(nan, 2.0)`` is
    nan), which would discard valid corners whenever the first one is NaN.

    Args:
        square: Grid cell whose corners are (latitude, longitude) pairs.
        ds_time: Dataset for a single time step; it is selected by exact
            ``latitude``/``longitude`` and must expose a ``tp`` variable.

    Returns:
        The maximum non-NaN ``tp`` as a Python float, or 0.0 if all four are NaN.

    Raises:
        AssertionError: If a corner selection does not yield exactly one value.
    """
    corners = (square.top_left, square.bottom_left, square.bottom_right, square.top_right)

    corner_values: list[float] = []
    for corner_lat, corner_lon in corners:
        tp_at_corner = ds_time.sel(latitude=corner_lat, longitude=corner_lon)["tp"]
        assert tp_at_corner.size == 1
        # .item() turns the single-element array into a plain Python float.
        corner_values.append(float(np.asarray(tp_at_corner.data).item()))

    valid_values = [value for value in corner_values if not np.isnan(value)]
    if not valid_values:
        # No corner has data for this square.
        return 0.0
    return max(valid_values)


def get_precipitation_from_websirenes_keys_at_date(
    square: Square,
    websirenes_keys: list[str],
    timestamp: pd.Timestamp,
    ds_time: xr.Dataset,
) -> float:
    """Return the precipitation assigned to a grid cell at ``timestamp``.

    For every station in the cell, the ``m15`` readings from 45 minutes before
    ``timestamp`` up to ``timestamp`` (both inclusive) are summed. A station
    whose readings in that window are all NaN (or absent) contributes the
    maximum ``tp`` of the square instead. The result is the largest
    per-station value. When the cell holds no station, the maximum ``tp`` of
    the square is returned directly.

    NOTE(review): station sums and the ``tp`` fallback are compared as-is;
    confirm that both are expressed in the same unit.

    Args:
        square: Grid cell being filled.
        websirenes_keys: Dataset keys of the stations inside the cell.
        timestamp: Upper bound of the aggregation window.
        ds_time: Dataset for the matching time step, used for the fallback.

    Returns:
        The precipitation value for the cell.
    """
    if not len(websirenes_keys):
        return _get_max_era5land_precipitation_in_square(square, ds_time)

    window_end = timestamp
    window_start = timestamp - timedelta(minutes=45)

    def _station_total(key: str) -> float:
        # Sum of the station's 15-minute readings in the window, or the
        # square's tp maximum when the station has no usable reading there.
        station_df = load_websirene_dataset(key)
        in_window = (station_df.index >= window_start) & (station_df.index <= window_end)
        m15_in_window = station_df[in_window]["m15"]
        if m15_in_window.isnull().all():
            return _get_max_era5land_precipitation_in_square(square, ds_time)
        return m15_in_window.sum()

    per_station_totals: list[float] = [_station_total(key) for key in websirenes_keys]
    return max(per_station_totals)


# Lookup axes for the neighbor helpers: both sorted in ascending order.
sorted_latitudes_ascending = np.sort(lats)
sorted_longitudes_ascending = np.sort(lons)

# Traversal axes for the target matrix, walked from its top-left to its
# bottom-right cell: latitudes in descending order, longitudes in ascending order.
sorted_lats = np.flip(np.sort(lats))
sorted_lons = np.sort(lons)


def process_target(target: np.ndarray, ds_time: xr.Dataset, timestamp: pd.Timestamp):
    """Fill ``target`` in place with one precipitation value per grid cell.

    Each grid point is taken as the top-left corner of a square whose other
    corners are its bottom, bottom-right and right neighbors. Points lacking
    any of those neighbors are skipped and keep their current value in
    ``target``.

    Args:
        target: Matrix written at ``[i, j]``, where ``i`` follows ``sorted_lats``
            and ``j`` follows ``sorted_lons``.
        ds_time: Dataset for the time step being processed.
        timestamp: Instant for which the station readings are aggregated.
    """
    for row, top_lat in enumerate(sorted_lats):
        for col, left_lon in enumerate(sorted_lons):
            # Walk the square clockwise from below: down, right, then back up.
            bottom_left = get_bottom_neighbor(top_lat, left_lon, sorted_latitudes_ascending)
            if not bottom_left:
                continue
            bottom_right = get_right_neighbor(*bottom_left, sorted_longitudes_ascending)
            if not bottom_right:
                continue
            top_right = get_upper_neighbor(*bottom_right, sorted_latitudes_ascending)
            if not top_right:
                continue

            square = Square(
                top_left=(top_lat, left_lon),
                bottom_left=bottom_left,
                bottom_right=bottom_right,
                top_right=top_right,
            )

            stations_in_square = get_websirenes_keys_in_square(websirenes_builder, square)
            target[row, col] = get_precipitation_from_websirenes_keys_at_date(
                square, stations_in_square, timestamp, ds_time
            )


def write_target(target: np.ndarray, timestamp: pd.Timestamp):
    """Save ``target`` as ``./target/<YYYY_MM_DD_HH>.npy`` unless the file exists.

    The ``./target`` directory is created when missing. An existing file is
    left untouched.

    Args:
        target: Matrix to store.
        timestamp: Instant the matrix belongs to; only its date and hour go
            into the file name.
    """
    output_dir = Path("./target")
    if not output_dir.exists():
        output_dir.mkdir()

    output_file = output_dir / (timestamp.strftime("%Y_%m_%d_%H") + ".npy")
    if not output_file.exists():
        np.save(output_file, target)


# One target matrix per hour between the first and the last station reading.
# NOTE(review): the range starts at minimum_date as-is, so if that date is not
# on a full hour every timestamp carries the same minute offset, while the
# dataset time step and the output file name only use the hour - confirm
# whether the start should be floored to the hour.
timestamps = pd.date_range(
    start=websirenes_parser.minimum_date, end=websirenes_parser.maximum_date, freq="h"
)

current_year_month = None
ds = None

for timestamp in tqdm(timestamps):
    # NOTE(review): this `break` ends the loop on its first iteration, so nothing
    # below runs; presumably left in to disable the long-running target
    # generation - remove it to actually build the targets.
    break
    year = timestamp.year
    month = timestamp.month

    # The monthly file is (re)opened only when the month changes.
    if current_year_month != (year, month):
        current_year_month = (year, month)
        df_era5land_path = f"./ERA5Land/monthly_data/RJ_{year}_{month}.nc"
        if not os.path.exists(df_era5land_path):
            raise FileNotFoundError(f"File {df_era5land_path} not found")
        # Open the monthly file that was just checked, not the sample `filename`
        # loaded in an earlier cell.
        ds = xr.open_dataset(df_era5land_path)
        if "expver" in list(ds.coords.keys()):
            # Collapse the expver coordinate: take expver=1 and fill its gaps from expver=5.
            ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
            ds_combine.load()
            ds = ds_combine
        ds = ds[["u10", "v10", "d2m", "t2m", "sp", "tp"]]

    # Select by the hour-floored Timestamp instead of a hand-built, unpadded string.
    ds_time = ds.sel(time=timestamp.floor("h"), method="nearest")  # type: ignore
    target = np.zeros((len(lats), len(lons)), dtype=np.float32)
    process_target(target, ds_time, timestamp)
    write_target(target, timestamp)


  0%|          | 0/97658 [00:00<?, ?it/s]
