In [1]:
import os


# Root folder for every dataset (a relative path, so it resolves against the current working directory)
data_path = os.path.join('data')

# ---------- Raw Folders
raw_path = os.path.join(data_path, 'raw_data')
raw_inmet_folder, raw_inpe_folder = (
    os.path.join(raw_path, source_name) for source_name in ('inmet', 'inpe')
)
raw_inpe_all_folder = os.path.join(raw_inpe_folder, 'todos-sats')

# ---------- Concat Folder
concat_path = os.path.join(data_path, 'concat')
os.makedirs(concat_path, exist_ok=True)  # Created eagerly; not an error if it already exists

# Output files written by the concatenation cells, all placed inside `concat_path`
inmet_concat_file, inmet_stations_file, inpe_all_concat_file = (
    os.path.join(concat_path, file_name)
    for file_name in ('inmet_concat.csv', 'inmet_stations.csv', 'inpe_all_concat.csv')
)

# Gate for the "Fix INMET files format" cell: its body only runs when this is True
run_fix_inmet_files_format = False

# Fix INMET file format (re-encode the raw Latin-1 CSVs as plain ASCII)

In [4]:
if run_fix_inmet_files_format:
    import shutil
    from unidecode import unidecode
    from pathlib import Path
    import os


    def get_ascii_char(c: str) -> str:
        """Return the one-character `unidecode` transliteration of `c`, or '' if there is none.

        Only 1-to-1 transliterations are kept. A character whose transliteration
        is longer than one character is dropped, and so is a character whose
        transliteration is empty (the original `> 1` check let the empty case
        through to `ord('')`, which raises TypeError).
        """
        decoded = unidecode(c)
        if len(decoded) != 1:
            return ''
        # unidecode is documented to emit ASCII only, so no further validation is done here.
        return decoded


    def get_ascii_content(content: str) -> str:
        """Apply `get_ascii_char` to every character of `content` and join the results."""
        return ''.join(get_ascii_char(c) for c in content)


    # Staging folder: converted files are written here, then swapped in for the raw folder.
    fixed_folder = os.path.join(raw_path, 'inmet_fixed')

    converted_files = 0
    for root, _, files in os.walk(raw_inmet_folder):
        for file in files:
            if not file.lower().endswith('.csv'):
                continue

            file_path = os.path.join(root, file)
            # Mirror the file's location relative to the raw folder inside the staging folder.
            # (relpath only strips the leading folder, unlike str.replace which rewrites every match.)
            new_file_path = os.path.join(fixed_folder, os.path.relpath(file_path, raw_inmet_folder))
            Path(new_file_path).parent.mkdir(parents=True, exist_ok=True)

            with open(file_path, 'r', encoding='latin-1') as latin:
                content = latin.read()
            with open(new_file_path, 'w', encoding='utf-8') as fixed:
                fixed.write(get_ascii_content(content))
            converted_files += 1

    # Never delete the raw folder unless there is converted output to replace it with;
    # otherwise rmtree would destroy the data and the rename below would then fail.
    if converted_files == 0:
        raise FileNotFoundError(f'No CSV files found under {raw_inmet_folder!r}; nothing was converted')

    # NOTE: files that are not CSVs are not copied to the staging folder, so they are lost in this swap.
    shutil.rmtree(raw_inmet_folder)
    os.rename(fixed_folder, raw_inmet_folder)

# Concatenate INMET files

In [2]:
import pandas as pd
import os
from itertools import islice

code_meta_label = 'CODIGO (WMO)'
n_metadata_lines = 8  # Each file starts with this many "KEY:;VALUE" station metadata lines


is_first_write = True
station_records = []  # One metadata dict per file; converted to a DataFrame once, after the loop
for root, dirs, files in os.walk(raw_inmet_folder):  # Visit every csv inside the folder and read it
    dirs.sort()  # os.walk order is filesystem-dependent; sort so the output is reproducible
    for file in sorted(files):
        if not file.lower().endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        # Explicit encoding so this matches pd.read_csv below (which decodes as UTF-8 by default)
        with open(file_path, 'r', encoding='utf-8') as f:
            station_dict = {}
            # islice stops after the metadata lines instead of loading the whole file
            for line in islice(f, n_metadata_lines):
                key, value = line.strip().split(':;', 1)
                station_dict[key.strip()] = value.strip()
        station_records.append(station_dict)

        # NOTE(review): rows are appended to the output positionally, without a header, so this
        # assumes every file has the same column layout once all-empty columns are dropped - verify.
        curr_df = (
            pd.read_csv(file_path, skiprows=n_metadata_lines, sep=';', decimal=',')
            .dropna(axis=1, how='all')  # Drop columns that are entirely empty
            .replace(-9999.0, pd.NA)  # -9999 is treated as a missing value
            .dropna()  # Keep only rows without any missing value
            .assign(**{code_meta_label: station_dict[code_meta_label]})  # Tag rows with the station code
        )

        # Truncate on the first write so re-running the cell does not append duplicate
        # rows (and a second header) to an output file left over from a previous run.
        curr_df.to_csv(inmet_concat_file, mode='w' if is_first_write else 'a', header=is_first_write, index=False)
        is_first_write = False

stations_df = (  # Keeps the first non-null value for each column among duplicates
    pd.DataFrame(station_records)
    .groupby(code_meta_label, as_index=False)
    .first()
)
stations_df = stations_df.loc[:, ~stations_df.columns.str.contains(r'\?|\bDATA\b', regex=True, case=False)]  # Remove columns with '?' or the word 'DATA' in the name
stations_df.to_csv(inmet_stations_file, index=False)

# Concatenate INPE all-satellites files

In [4]:
import pandas as pd
import os


is_first_write = True
for root, dirs, files in os.walk(raw_inpe_all_folder):  # Visit every csv inside the folder and read it
    dirs.sort()  # os.walk order is filesystem-dependent; sort so the output is reproducible
    for file in sorted(files):
        if not file.lower().endswith('.csv'):
            continue

        file_path = os.path.join(root, file)
        curr_df = (
            pd.read_csv(file_path, sep=',', decimal='.')
            # Raises KeyError if any of these columns is missing from the file
            .drop(columns=['satelite', 'pais', 'bioma', 'numero_dias_sem_chuva', 'precipitacao', 'risco_fogo', 'id_area_industrial'])
            .replace(-999.0, pd.NA)  # -999 is treated as a missing value
            .dropna()  # Keep only rows without any missing value
        )

        # Truncate on the first write so re-running the cell does not append duplicate
        # rows (and a second header) to an output file left over from a previous run.
        curr_df.to_csv(inpe_all_concat_file, mode='w' if is_first_write else 'a', header=is_first_write, index=False)
        is_first_write = False