In [55]:
import pandas as pd
import os
from collections.abc import KeysView

In [56]:
df = pd.DataFrame({
    'name': ['a', 'a', 'b', 'b', 'c', 'c'],
    'speed': [1, 2, 3, 4, 5, 6],
})
print(df[(df['name'] == 'a') | (df['name'] == 'b')])
# or print(df[df['name'].isin(['a', 'b'])])

  name  speed
0    a      1
1    a      2
2    b      3
3    b      4


In [57]:
def check_if_data_folder_exists(folder: str) -> None:
  if not os.path.exists(folder):
    raise FileNotFoundError(f"The folder {folder} does not exist.")

check_if_data_folder_exists('alertario_rain_gauge/alertario')
df_alertario_gauge = pd.read_parquet('alertario_rain_gauge/alertario')
print(df_alertario_gauge.shape)
print(df_alertario_gauge.columns)

(8268690, 11)
Index(['station', 'datetime', 'precipitation', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'latitude', 'longitude', 'year', 'month'],
      dtype='object')


In [58]:
# Please see atmoseer/src/globals:ALERTARIO_GAUGE_STATION_IDS
ALERTARIO_GAUGE_STATION_IDS = [
    'anchieta', 'av_brasil_mendanha', 'bangu', 
    'barrinha', 'campo_grande', 'cidade_de_deus', 
    'copacabana', 'grajau_jacarepagua', 'grajau', 
    'grande_meier', 'grota_funda', 'ilha_do_governador', 
    'laranjeiras', 'madureira', 'penha', 
    'piedade', 'recreio', 'rocinha',
    'santa_teresa', 'saude', 'sepetiba', 
    'tanque', 'tijuca_muda', 'tijuca', 
    'urca', 'alto_da_boa_vista', 'iraja',
    'jardim_botanico', 'riocentro', 'santa_cruz',
    'vidigal'
]

set(df_alertario_gauge.station.unique()) - set(ALERTARIO_GAUGE_STATION_IDS)

{'guaratiba', 'sao_cristovao'}

In [59]:
df_alertario_gauge = df_alertario_gauge[df_alertario_gauge.station.isin(ALERTARIO_GAUGE_STATION_IDS)]
print(set(df_alertario_gauge.station.unique()) - set(ALERTARIO_GAUGE_STATION_IDS))
df_alertario_gauge.head()

set()


Unnamed: 0,station,datetime,precipitation,hour_sin,hour_cos,month_sin,month_cos,latitude,longitude,year,month
0,vidigal,2016-01-01 02:00:00+00:00,0.0,0.5,0.866025,0.5,0.866025,-22.9925,-43.23306,2016,1
1,vidigal,2016-01-01 02:15:00+00:00,0.0,0.55557,0.83147,0.5,0.866025,-22.9925,-43.23306,2016,1
2,vidigal,2016-01-01 02:30:00+00:00,0.0,0.608761,0.793353,0.5,0.866025,-22.9925,-43.23306,2016,1
3,vidigal,2016-01-01 02:45:00+00:00,0.0,0.659346,0.75184,0.5,0.866025,-22.9925,-43.23306,2016,1
4,vidigal,2016-01-01 03:00:00+00:00,0.0,0.707107,0.707107,0.5,0.866025,-22.9925,-43.23306,2016,1


In [60]:
class DataFrameHelper:
    def get_dataframe_with_selected_columns(self, df: pd.DataFrame, column_names: KeysView) -> pd.DataFrame:
        selected_columns = []
        for column_name in column_names:
            if column_name not in df.columns: raise ValueError(f"The column {column_name} does not exist in the df")
            selected_columns.append(column_name)
        return df[selected_columns].copy()

    def rename_dataframe_column_names(self, df: pd.DataFrame, column_name_mapping: dict[str, str]) -> pd.DataFrame:
        new_columns = []
        for old_column, new_column in column_name_mapping.items():
            if old_column not in df.columns: raise ValueError(f"The column {old_column} does not exist in the df")
            new_columns.append(new_column)
        df.columns = new_columns
        return df

    def get_column_name_mapping(self) -> dict[str, str]:
        return {
            'datetime': 'datetime',
            'station': 'station',
            'precipitation': 'precipitation_sum',
        }
    
    def filter_dataframe_by_station(self, df: pd.DataFrame, station: str ) -> pd.DataFrame:
        return df[df['station'] == station]
    
    def save_dataframe_as_parquet(self, df: pd.DataFrame, station_id: str) -> None:
        df.to_parquet(f"{station_id}.parquet")

In [61]:
dataframe_helper = DataFrameHelper()

column_name_mapping = {
    'station': 'estacao_desc',
    'latitude': 'latitude',
    'longitude': 'longitude'
}

alertario_stations = dataframe_helper.get_dataframe_with_selected_columns(
    df=df_alertario_gauge,
    column_names=column_name_mapping.keys()
)
alertario_stations = dataframe_helper.rename_dataframe_column_names(
    df=alertario_stations,
    column_name_mapping=column_name_mapping
)
alertario_stations.drop_duplicates(inplace=True)
print(len(alertario_stations))
alertario_stations.to_parquet('alertario_stations.parquet')

31


In [62]:
import numpy as np
np.random.seed(42)
random_numbers = np.random.rand(10) * 0.5
random_numbers = np.round(random_numbers, 2)
print(random_numbers)
index = pd.date_range('1/1/2000', periods=10, freq='min')
series = pd.Series(random_numbers, index=index)
print(series)
series.resample('2min').sum()

[0.19 0.48 0.37 0.3  0.08 0.08 0.03 0.43 0.3  0.35]
2000-01-01 00:00:00    0.19
2000-01-01 00:01:00    0.48
2000-01-01 00:02:00    0.37
2000-01-01 00:03:00    0.30
2000-01-01 00:04:00    0.08
2000-01-01 00:05:00    0.08
2000-01-01 00:06:00    0.03
2000-01-01 00:07:00    0.43
2000-01-01 00:08:00    0.30
2000-01-01 00:09:00    0.35
Freq: min, dtype: float64


2000-01-01 00:00:00    0.67
2000-01-01 00:02:00    0.67
2000-01-01 00:04:00    0.16
2000-01-01 00:06:00    0.46
2000-01-01 00:08:00    0.65
Freq: 2min, dtype: float64

In [63]:
import pandas as pd

pd.set_option('future.no_silent_downcasting', True)

data = {
    'datetime': [
        '2016-01-01 02:00:00',
        '2016-01-01 02:15:00',
        '2016-01-01 02:30:00',
        '2016-01-01 02:45:00',
        '2016-01-01 03:00:00',
        '2016-01-01 03:15:00',
        '2016-01-01 03:30:00',
        '2016-01-01 03:45:00',
        '2016-01-01 04:00:00',
        '2016-01-01 04:15:00'
    ],
    'precipitation': [0.4, 1.0, pd.NA, pd.NA, pd.NA, pd.NA, 12.2, 7.4, pd.NA, 8.3],
    "somename": ['ana', 'julia', 'ana', 'julia', 'ana', 'ana', 'ana', 'ana', 'ana', 'ana'],
    "latitude": [0.98, 0.51, 0.98, 0.32, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98],
}
df = pd.DataFrame(data)

df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
df['precipitation'] = df['precipitation'].ffill(limit_area="inside", limit=2)
df['precipitation'] = df['precipitation'].bfill(limit_area="inside", limit=2)
df = df[df.index.minute == 0]
df

Unnamed: 0_level_0,precipitation,somename,latitude
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-01 02:00:00,0.4,ana,0.98
2016-01-01 03:00:00,12.2,ana,0.98
2016-01-01 04:00:00,7.4,ana,0.98


In [64]:
column_name_mapping = dataframe_helper.get_column_name_mapping()

for station_id in ALERTARIO_GAUGE_STATION_IDS:
    df_station = dataframe_helper.filter_dataframe_by_station(df=df_alertario_gauge, station=station_id)

    df_station = dataframe_helper.get_dataframe_with_selected_columns(
        df=df_station,
        column_names=column_name_mapping.keys()
    )
    df_station = dataframe_helper.rename_dataframe_column_names(
        df=df_station,
        column_name_mapping=column_name_mapping
    )

    df_station['datetime'] = pd.to_datetime(df_station['datetime'])
    df_station.set_index('datetime', inplace=True)

    df_station['precipitation_sum'] = df_station['precipitation_sum'].ffill(limit_area="inside", limit=4)
    df_station['precipitation_sum'] = df_station['precipitation_sum'].bfill(limit_area="inside", limit=4)
    df_station = df_station[df_station.index.minute == 0]

    if df_station['precipitation_sum'].isnull().values.any().any():
        print(f"Station {station_id} has missing precipitation {(df_station.isnull().mean() * 100).mean()}")

    df_station.reset_index(inplace=True)
    dataframe_helper.save_dataframe_as_parquet(df_station, station_id)

Station av_brasil_mendanha has missing precipitation 0.032764876052871324
Station barrinha has missing precipitation 0.0007991560911677269
Station campo_grande has missing precipitation 0.006393248729341815
Station cidade_de_deus has missing precipitation 0.02715307947866087
Station copacabana has missing precipitation 0.011991749676222758
Station grajau_jacarepagua has missing precipitation 0.013585653549851357
Station grajau has missing precipitation 0.01038902918518045
Station grota_funda has missing precipitation 0.3430095915645049
Station ilha_do_governador has missing precipitation 0.004794936547006362
Station laranjeiras has missing precipitation 0.35397387260115243
Station madureira has missing precipitation 0.011188185276348175
Station penha has missing precipitation 0.007991560911677268
Station piedade has missing precipitation 0.002397468273503181
Station recreio has missing precipitation 0.002397468273503181
Station rocinha has missing precipitation 0.34451553495165665
Stat

In [65]:
df_station['precipitation_sum'].isnull().values.any().any()

True

In [66]:
df_station.isnull().values.any().any()

True

In [67]:
(df_station.isnull().mean() * 100).mean()

0.002663853637225756

In [68]:
df_station['precipitation_sum'].isnull().sum().sum()

5