In [1]:
import paths

# Concat INMET files

In [2]:
import pandas as pd
import os

code_meta_label = 'CODIGO (WMO)'
METADATA_LINES = 8  # Each raw file starts with this many "KEY:;VALUE" metadata lines


def read_station_metadata(file_path, n_lines=METADATA_LINES):
    """Parse the leading "KEY:;VALUE" lines of a raw INMET csv into a dict.

    Only the first ``n_lines`` lines are read; the rest of the file is not
    loaded into memory.

    Args:
        file_path: Path of the csv file to read.
        n_lines: Number of leading metadata lines to parse.

    Returns:
        dict mapping each stripped key to its stripped value.

    Raises:
        ValueError: If one of the metadata lines has no ':;' separator.
    """
    metadata = {}
    # NOTE(review): no explicit encoding is passed here (platform default) nor
    # to pd.read_csv below (utf-8 default) -- confirm both match the raw files.
    with open(file_path, 'r') as f:
        # zip stops after n_lines, so the data rows are never pulled from the file
        for _, line in zip(range(n_lines), f):
            key, separator, value = line.strip().partition(':;')
            if not separator:
                raise ValueError(f'Malformed metadata line in {file_path}: {line!r}')
            metadata[key.strip()] = value.strip()
    return metadata


is_first_write = True
station_records = []  # One metadata dict per file; turned into a DataFrame once, after the loop
for root, _, files in os.walk(paths.raw_inmet_folder):  # Visit every csv below the raw folder
    for file in files:
        if not file.lower().endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        station_dict = read_station_metadata(file_path)
        station_records.append(station_dict)

        curr_df = (
            pd.read_csv(file_path, skiprows=METADATA_LINES, sep=';', decimal=',')
            .dropna(axis=1, how='all')  # Drop columns that are completely empty in this file
            .replace(-9999.0, pd.NA)    # -9999.0 is treated as a missing measurement
            .dropna()                   # Keep only rows without any missing value
            .assign(**{code_meta_label: station_dict[code_meta_label]})  # Tag rows with the station code
        )

        # The first write truncates the output so that re-running the cell does
        # not append a second copy of the data (and a second header row) to the
        # file left by a previous run; later writes append without header.
        curr_df.to_csv(
            paths.inmet_concat_file,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        is_first_write = False

# Build the stations table in one go instead of concatenating inside the loop
stations_df = pd.DataFrame(station_records)

# Collapse duplicated stations into one row each, keeping for every column the
# first non-null value found among the duplicates. GroupBy.first() skips nulls
# by itself; the previous lambda used `.any()`, which tests truthiness rather
# than presence and therefore also discarded non-null but falsy values.
stations_df = (
    stations_df
    .groupby(code_meta_label, as_index=False)
    .first()
)

# Remove columns whose name contains '?' or the standalone word DATA (case-insensitive)
unwanted_columns = stations_df.columns.str.contains(r'\?|\bDATA\b', regex=True, case=False)
stations_df = stations_df.loc[:, ~unwanted_columns]
stations_df.to_csv(paths.inmet_stations_file, index=False)

# Concat INPE All Satellites files

In [4]:
import pandas as pd
import os


# Columns of the raw INPE files that are not carried over to the concatenated output
inpe_dropped_columns = ['satelite', 'pais', 'bioma', 'numero_dias_sem_chuva', 'precipitacao', 'risco_fogo', 'id_area_industrial']

is_first_write = True
for root, _, files in os.walk(paths.raw_inpe_all_folder):  # Visit every csv below the raw folder
    for file in files:
        if not file.lower().endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        curr_df = (
            pd.read_csv(file_path, sep=',', decimal='.')
            .drop(columns=inpe_dropped_columns)
            .replace(-999.0, pd.NA)  # -999.0 is treated as a missing value
            .dropna()                # Keep only rows without any missing value
        )

        # The first write truncates the output so that re-running the cell does
        # not append a second copy of the data (and a second header row) to the
        # file left by a previous run; later writes append without header.
        curr_df.to_csv(
            paths.inpe_all_concat_file,
            mode='w' if is_first_write else 'a',
            header=is_first_write,
            index=False,
        )
        is_first_write = False