In [1]:
import pandas as pd
import os
from collections.abc import KeysView

In [2]:
# Stations exported by main(), one parquet file per id.
# NOTE(review): presumably mirrors the definition in atmoseer/src/globals.py —
# confirm against that file before editing this list.
ALERTARIO_WEATHER_STATION_IDS = ['guaratiba', 'sao_cristovao']

In [3]:
def check_if_data_folder_exists(folder: str) -> None:
    """Fail fast when the expected data location is missing.

    Args:
        folder: Path to check, absolute or relative to the current working
            directory. Any existing path (file or directory) is accepted.

    Raises:
        FileNotFoundError: if nothing exists at ``folder``. The message shows
            the path joined onto the current working directory.
    """
    if os.path.exists(folder):
        return

    resolved_path = os.path.join(os.getcwd(), folder)
    raise FileNotFoundError(f"The folder {resolved_path} does not exist.")

In [4]:
class DataFrameHelper:
    """Stateless helpers to select, rename, filter and save station DataFrames.

    None of the methods read or write instance attributes.
    """

    def get_dataframe_with_selected_columns(self, df: pd.DataFrame, column_names: KeysView) -> pd.DataFrame:
        """Return ``df`` restricted to ``column_names``, in the order given.

        Args:
            df: Source frame.
            column_names: Iterable of column labels to keep.

        Raises:
            ValueError: on the first requested column that is not in ``df``.
        """
        selected_columns = []
        for column_name in column_names:
            if column_name not in df.columns:
                raise ValueError(f"The column {column_name} does not exist in df")
            selected_columns.append(column_name)
        return df[selected_columns]

    def rename_dataframe_column_names(self, df: pd.DataFrame, column_name_mapping: dict[str, str]) -> pd.DataFrame:
        """Return a copy of ``df`` with columns renamed per ``column_name_mapping``.

        Columns are matched by name, so the result is correct regardless of
        the column order of ``df``; columns absent from the mapping keep their
        current name. The input frame is left untouched.

        Args:
            df: Source frame.
            column_name_mapping: ``{old_name: new_name}``.

        Raises:
            ValueError: on the first mapping key that is not a column of ``df``.
        """
        for old_column_name in column_name_mapping:
            if old_column_name not in df.columns:
                raise ValueError(f"The column {old_column_name} does not exist in df")
        # Rename by label rather than overwriting `df.columns` positionally:
        # a positional assignment mislabels data whenever the frame's column
        # order differs from the mapping's iteration order.
        return df.rename(columns=column_name_mapping)

    def get_column_name_mapping(self, df_alertario: pd.DataFrame) -> dict[str, str]:
        """Return the fixed old-name -> new-name mapping for the dataset.

        Args:
            df_alertario: Frame whose columns must be exactly the mapping keys
                (order is ignored).

        Raises:
            ValueError: if the set of mapping keys differs from the set of
                columns in ``df_alertario``.
        """
        column_name_mapping = {
            'station': 'station',

            'datetime': 'datetime',
            'temperature': 'temperature_mean',
            'humidity': 'humidity_mean',
            'pressure': 'pressure_mean',
            'wind_speed': 'wind_speed_mean',
            'wind_dir': 'wind_dir_mean',
            'precipitation': 'precipitation_sum',

            'wind_u': 'wind_u',
            'wind_v': 'wind_v',
            'hour_sin': 'hour_sin',
            'hour_cos': 'hour_cos',
            'month_sin': 'month_sin',
            'month_cos': 'month_cos',
            'latitude': 'latitude',
            'longitude': 'longitude',
            'year': 'year',
            'month': 'month'
        }

        if set(column_name_mapping.keys()) != set(df_alertario.columns):
            raise ValueError(
               "The column names in the mapping do not match the column names in the DataFrame."
            )
        return column_name_mapping

    def filter_dataframe_by_station(self, df: pd.DataFrame, station: str) -> pd.DataFrame:
        """Return the rows of ``df`` whose ``'station'`` column equals ``station``."""
        return df[df['station'] == station]

    def save_dataframe_as_parquet(self, df: pd.DataFrame, station_id: str) -> None:
        """Write ``df`` to ``"<station_id>.parquet"`` (path relative to the working directory)."""
        df.to_parquet(f"{station_id}.parquet")

In [5]:
def main():
    """Split the AlertaRio dataset into one parquet file per configured station.

    Reads the parquet data at ``alertario_weather_station/alertario``, prints a
    short overview (shape, columns, station ids, first rows), then for every id
    in ``ALERTARIO_WEATHER_STATION_IDS`` filters that station's rows, selects
    and renames the mapped columns, and saves the result as
    ``<station_id>.parquet``.
    """
    data_path = 'alertario_weather_station/alertario'
    check_if_data_folder_exists(data_path)

    df_alertario = pd.read_parquet(data_path)

    # Overview of what was loaded.
    print(df_alertario.shape)
    print(df_alertario.columns)
    print(df_alertario['station'].unique())
    print(df_alertario.head())

    helper = DataFrameHelper()
    # Also validates that the dataset's columns are exactly the expected ones.
    mapping = helper.get_column_name_mapping(df_alertario)

    for station_id in ALERTARIO_WEATHER_STATION_IDS:
        station_rows = helper.filter_dataframe_by_station(df_alertario, station_id)
        station_selected = helper.get_dataframe_with_selected_columns(station_rows, mapping.keys())
        station_renamed = helper.rename_dataframe_column_names(station_selected, mapping)
        helper.save_dataframe_as_parquet(station_renamed, station_id)

# Entry point: run the per-station export when this cell/script is executed
# directly (not when the code is imported as a module).
if __name__ == '__main__':
  main()

(1913637, 18)
Index(['station', 'datetime', 'precipitation', 'wind_dir', 'wind_speed',
       'temperature', 'pressure', 'humidity', 'wind_u', 'wind_v', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'latitude', 'longitude', 'year',
       'month'],
      dtype='object')
['iraja' 'jardim_botanico' 'riocentro' 'guaratiba' 'santa_cruz'
 'alto_da_boa_vista' 'sao_cristovao' 'vidigal']
  station                  datetime  precipitation  wind_dir  wind_speed  \
0   iraja 2016-01-01 02:00:00+00:00            0.0       NaN         NaN   
1   iraja 2016-01-01 02:15:00+00:00            0.0       NaN         NaN   
2   iraja 2016-01-01 02:30:00+00:00            0.0       NaN         NaN   
3   iraja 2016-01-01 02:45:00+00:00            0.0       NaN         NaN   
4   iraja 2016-01-01 03:00:00+00:00            0.0       NaN         NaN   

   temperature  pressure  humidity  wind_u  wind_v  hour_sin  hour_cos  \
0         30.8       NaN      60.0     NaN     NaN  0.500000  0.866025   
1