In [21]:
import pandas as pd
import os
from collections.abc import KeysView

In [22]:
print(os.environ.get('CONDA_DEFAULT_ENV'))

atmoseer


In [23]:
df = pd.DataFrame({
    'name': ['a', 'a', 'b', 'b', 'c', 'c'],
    'speed': [1, 2, 3, 4, 5, 6],
})
print(df[(df['name'] == 'a') | (df['name'] == 'b')])
print(df[df['name'].isin(['a', 'b'])])

  name  speed
0    a      1
1    a      2
2    b      3
3    b      4
  name  speed
0    a      1
1    a      2
2    b      3
3    b      4


In [24]:
def check_if_data_folder_exists(folder: str) -> None:
  if not os.path.exists(folder):
    raise FileNotFoundError(f"The folder {folder} does not exist.")

check_if_data_folder_exists('alertario_rain_gauge/alertario')
df_alertario_gauge = pd.read_parquet('alertario_rain_gauge/alertario')
print(df_alertario_gauge.shape)
print(df_alertario_gauge.columns)
df_alertario_gauge.head()

(8268690, 11)
Index(['station', 'datetime', 'precipitation', 'hour_sin', 'hour_cos',
       'month_sin', 'month_cos', 'latitude', 'longitude', 'year', 'month'],
      dtype='object')


Unnamed: 0,station,datetime,precipitation,hour_sin,hour_cos,month_sin,month_cos,latitude,longitude,year,month
0,vidigal,2016-01-01 02:00:00+00:00,0.0,0.5,0.866025,0.5,0.866025,-22.9925,-43.23306,2016,1
1,vidigal,2016-01-01 02:15:00+00:00,0.0,0.55557,0.83147,0.5,0.866025,-22.9925,-43.23306,2016,1
2,vidigal,2016-01-01 02:30:00+00:00,0.0,0.608761,0.793353,0.5,0.866025,-22.9925,-43.23306,2016,1
3,vidigal,2016-01-01 02:45:00+00:00,0.0,0.659346,0.75184,0.5,0.866025,-22.9925,-43.23306,2016,1
4,vidigal,2016-01-01 03:00:00+00:00,0.0,0.707107,0.707107,0.5,0.866025,-22.9925,-43.23306,2016,1


In [25]:
# Please see atmoseer/src/globals:ALERTARIO_GAUGE_STATION_IDS
ALERTARIO_GAUGE_STATION_IDS = [
    'anchieta', 'av_brasil_mendanha', 'bangu', 
    'barrinha', 'campo_grande', 'cidade_de_deus', 
    'copacabana', 'grajau_jacarepagua', 'grajau', 
    'grande_meier', 'grota_funda', 'ilha_do_governador', 
    'laranjeiras', 'madureira', 'penha', 
    'piedade', 'recreio', 'rocinha',
    'santa_teresa', 'saude', 'sepetiba', 
    'tanque', 'tijuca_muda', 'tijuca', 
    'urca', 'alto_da_boa_vista', 'iraja',
    'jardim_botanico', 'riocentro', 'santa_cruz',
    'vidigal'
]

set(df_alertario_gauge.station.unique()) - set(ALERTARIO_GAUGE_STATION_IDS)

{'guaratiba', 'sao_cristovao'}

In [26]:
df_alertario_gauge = df_alertario_gauge[df_alertario_gauge.station.isin(ALERTARIO_GAUGE_STATION_IDS)]
print(set(df_alertario_gauge.station.unique()) - set(ALERTARIO_GAUGE_STATION_IDS))
df_alertario_gauge.head()

set()


Unnamed: 0,station,datetime,precipitation,hour_sin,hour_cos,month_sin,month_cos,latitude,longitude,year,month
0,vidigal,2016-01-01 02:00:00+00:00,0.0,0.5,0.866025,0.5,0.866025,-22.9925,-43.23306,2016,1
1,vidigal,2016-01-01 02:15:00+00:00,0.0,0.55557,0.83147,0.5,0.866025,-22.9925,-43.23306,2016,1
2,vidigal,2016-01-01 02:30:00+00:00,0.0,0.608761,0.793353,0.5,0.866025,-22.9925,-43.23306,2016,1
3,vidigal,2016-01-01 02:45:00+00:00,0.0,0.659346,0.75184,0.5,0.866025,-22.9925,-43.23306,2016,1
4,vidigal,2016-01-01 03:00:00+00:00,0.0,0.707107,0.707107,0.5,0.866025,-22.9925,-43.23306,2016,1


In [27]:
class DataFrameHelper:
    def get_dataframe_with_selected_columns(self, df: pd.DataFrame, column_names: KeysView) -> pd.DataFrame:
        selected_columns = []
        for column_name in column_names:
            if column_name not in df.columns: raise ValueError(f"The column {column_name} does not exist in the df")
            selected_columns.append(column_name)
        return df[selected_columns]

    def rename_dataframe_column_names(self, df: pd.DataFrame, column_name_mapping: dict[str, str]) -> pd.DataFrame:
        new_columns = []
        for old_column, new_column in column_name_mapping.items():
            if old_column not in df.columns: raise ValueError(f"The column {old_column} does not exist in the df")
            new_columns.append(new_column)
        df.columns = new_columns
        return df

    def get_column_name_mapping(self, df_alertario: pd.DataFrame) -> dict[str, str]:
        column_name_mapping = {
            'station': 'station',

            'datetime': 'datetime',
            'precipitation': 'precipitation_sum',

            'hour_sin': 'hour_sin',
            'hour_cos': 'hour_cos',
            'month_sin': 'month_sin',
            'month_cos': 'month_cos',
            'latitude': 'latitude',
            'longitude': 'longitude',
            'year': 'year',
            'month': 'month'
        }

        if set(column_name_mapping.keys()) != set(df_alertario.columns):
            raise ValueError(
               "The column names in the mapping do not match the column names in the DataFrame."
            )
        return column_name_mapping
    
    def filter_dataframe_by_station(self, df: pd.DataFrame, station: str ) -> pd.DataFrame:
        return df[df['station'] == station]
    
    def save_dataframe_as_parquet(self, df: pd.DataFrame, station_id: str) -> None:
        df.to_parquet(f"{station_id}.parquet")

In [28]:
dataframe_helper = DataFrameHelper()

column_name_mapping = {
    'station': 'estacao_desc',
    'latitude': 'latitude',
    'longitude': 'longitude'
}

alertario_stations = dataframe_helper.get_dataframe_with_selected_columns(
    df=df_alertario_gauge,
    column_names=column_name_mapping.keys()
)
alertario_stations = dataframe_helper.rename_dataframe_column_names(
    df=alertario_stations,
    column_name_mapping=column_name_mapping
)
alertario_stations.head()

Unnamed: 0,estacao_desc,latitude,longitude
0,vidigal,-22.9925,-43.23306
1,vidigal,-22.9925,-43.23306
2,vidigal,-22.9925,-43.23306
3,vidigal,-22.9925,-43.23306
4,vidigal,-22.9925,-43.23306


In [29]:

# subset=['estacao_desc'], keep='first',

alertario_stations.head()
alertario_stations.drop_duplicates(inplace=True)
print(len(alertario_stations))
alertario_stations.head()


31


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alertario_stations.drop_duplicates(inplace=True)


Unnamed: 0,estacao_desc,latitude,longitude
0,vidigal,-22.9925,-43.23306
2968,urca,-22.95583,-43.16667
5936,rocinha,-22.98583,-43.245
8904,tijuca,-22.93194,-43.22167
11872,santa_teresa,-22.93167,-43.19639


In [30]:
alertario_stations.to_parquet('alertario_stations.parquet')

In [31]:
column_name_mapping = dataframe_helper.get_column_name_mapping(df_alertario_gauge)

for station_id in ALERTARIO_GAUGE_STATION_IDS:
    df_station = dataframe_helper.filter_dataframe_by_station(df=df_alertario_gauge, station=station_id)

    df_station = dataframe_helper.get_dataframe_with_selected_columns(
        df=df_station,
        column_names=column_name_mapping.keys()
    )
    df_station = dataframe_helper.rename_dataframe_column_names(
        df=df_station,
        column_name_mapping=column_name_mapping
    )

    dataframe_helper.save_dataframe_as_parquet(df_station, station_id)

In [32]:
df_station.isnull()

Unnamed: 0,station,datetime,precipitation_sum,hour_sin,hour_cos,month_sin,month_cos,latitude,longitude,year,month
68264,False,False,False,False,False,False,False,False,False,False,False
68265,False,False,False,False,False,False,False,False,False,False,False
68266,False,False,False,False,False,False,False,False,False,False,False
68267,False,False,False,False,False,False,False,False,False,False,False
68268,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
8260828,False,False,False,False,False,False,False,False,False,False,False
8260829,False,False,False,False,False,False,False,False,False,False,False
8260830,False,False,False,False,False,False,False,False,False,False,False
8260831,False,False,False,False,False,False,False,False,False,False,False


In [33]:
df_station.isnull().values.any().any()

True

In [34]:
(df_station.isnull().mean() * 100).mean()

0.002797046319087044

In [35]:
df_station.isnull().sum().sum()

77

: 