In [1]:
!pip install huggingface

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Installing collected packages: huggingface
Successfully installed huggingface-0.0.1


In [2]:
!pip install dask dask[dataframe] pyarrow



In [3]:
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [4]:
import os, datetime
from google.colab import files
import shutil, os, datetime
import os, glob, datetime
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import re
from google.colab import drive
import gc

import dask.dataframe as dd
import dask_cudf

In [5]:
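# Optional: mount Google Drive (currently disabled).
# NOTE: while the call below stays commented out, the "/content/drive/..." paths used by later
# cells are created as ordinary local folders unless Drive has been mounted some other way.
# drive.mount('/content/drive')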

In [6]:
# --- Bronze-stage configuration: reference date and working directories ---
hoje = datetime.date.today()  # reference date of this run

BASE = "/content/projeto_pcb"
BRONZE_RAW = BASE + "/dados/bronze/raw"        # "raw" files, stored per partition
BRONZE_CUR = BASE + "/dados/bronze/current"    # consolidated dataset
DRIVE_CSV_PATH = "/content/drive/My Drive/8_periodo/Big Data/datasets"

# Create every working directory up front (no error if it already exists).
for _pasta in (BRONZE_RAW, BRONZE_CUR, DRIVE_CSV_PATH):
    os.makedirs(_pasta, exist_ok=True)

In [7]:
# 1) Descobrir todos os CSVs diretamente de um link ou ID de pasta compartilhada do Google Drive

shared_folder_link_or_id = 'https://drive.google.com/drive/folders/1hRgLe3t-zGgWqFH7ZAtW0m6XQdgOW3bh?usp=sharing'
download_dir = f"{DRIVE_CSV_PATH}/downloaded_csvs"
os.makedirs(download_dir, exist_ok=True)

print(f"üì¶ Attempting to download all files from shared Google Drive folder: {shared_folder_link_or_id}")

try:
    # Use gdown to download all files from the folder
    # The --folder flag indicates that the provided link/ID is for a folder
    # The -O flag specifies the output directory
    !gdown --no-cookies --no-check-certificate --folder "{shared_folder_link_or_id}" -O "{download_dir}"

    # After downloading, list the files in the download directory to process them
    arquivos_csv = [f for f in os.listdir(download_dir) if f.endswith(".csv") and os.path.isfile(os.path.join(download_dir, f))]
    print(f"üì¶ Downloaded {len(arquivos_csv)} CSVs to {download_dir}.\n")

except Exception as e:
    print(f"‚ö†Ô∏è Error during download: {e}")
    print("Please ensure gdown is installed (`!pip install gdown`), the folder link/ID is correct, and the folder is shared with 'Anyone with the link'.")
    arquivos_csv = [] # Ensure arquivos_csv is empty if download fails


# 2) Convert each downloaded CSV to Parquet inside the partitioned Bronze/raw layout
for nome in arquivos_csv:
    caminho_src = os.path.join(download_dir, nome)

    # Partition key: the first run of six digits in the file name is taken as AAAAMM
    # (e.g. focos_mensal_br_202311.csv); without one, fall back to the current year/month.
    m = re.search(r'(\d{6})', nome)
    aaaamm = m.group(1) if m else f"{hoje.year}{hoje:%m}"
    ano, mes = aaaamm[:4], aaaamm[4:]

    # Partitioned destination: <BRONZE_RAW>/ano=YYYY/mes=MM/<name>.parquet
    dest_dir = os.path.join(BRONZE_RAW, f"ano={ano}", f"mes={mes}")
    os.makedirs(dest_dir, exist_ok=True)
    # Swap only the trailing extension; str.replace would also rewrite a ".csv"
    # appearing in the middle of the file name.
    nome_parquet = os.path.splitext(nome)[0] + ".parquet"
    caminho_dst_parquet = os.path.join(dest_dir, nome_parquet)

    try:
        df_temp = pd.read_csv(caminho_src)
        df_temp["origem_arquivo"] = nome  # provenance: which CSV each row came from

        # Persist as Parquet in Bronze/raw
        df_temp.to_parquet(caminho_dst_parquet, index=False)
        print(f"‚úÖ Processado e salvo como Parquet: {nome}  ‚ûú  {caminho_dst_parquet}")

        # Release the frame before loading the next file to keep peak memory low.
        del df_temp
        gc.collect()

    except Exception as e:
        # Best effort: report the failing file and continue with the remaining ones.
        print(f"‚ö†Ô∏è Erro ao processar {caminho_src}: {e}")


# 3) Consolidate every Parquet file under Bronze/raw into one lazy dask_cudf DataFrame (Bronze/current).
#    Runs only when the download step listed at least one CSV.

if arquivos_csv:
    try:
        # The glob descends two directory levels below BRONZE_RAW, so Parquet files left
        # there by earlier runs are read as well.
        bronze_ddf = dask_cudf.read_parquet(os.path.join(BRONZE_RAW, "*/*/*.parquet"))

        # Cast 'ano' and 'mes' to Int64: per the original author's note they arrive as a
        # category dtype that the Parquet writer does not accept.
        # NOTE(review): presumably both columns are inferred from the ano=/mes= directory names — confirm.
        bronze_ddf["ano"] = bronze_ddf["ano"].astype("Int64")
        bronze_ddf["mes"] = bronze_ddf["mes"].astype("Int64")

        # Simple idempotence: remove exact duplicate rows
        bronze_ddf = bronze_ddf.drop_duplicates()

        # len() and .head() below each need concrete values, so each one triggers computation.
        print(f"\nüß± Bronze consolidated: {len(bronze_ddf):,} linhas, {bronze_ddf.shape[1]} colunas")
        display(bronze_ddf.head())

        # 4) Save the consolidated dataset (Parquet)
        out_parquet = os.path.join(BRONZE_CUR, "focos_bronze_consolidado.parquet")

        # Repartition to a single partition so only one part file is written.
        # NOTE(review): the upload cell addresses a `part.0.parquet` inside the Silver output
        # written the same way, so `out_parquet` is presumably a directory, not one file — confirm.
        bronze_ddf.repartition(npartitions=1).to_parquet(out_parquet)

        print(f"\nüíæ Consolidado salvo em:\n - {out_parquet}")

    except Exception as e:
        print(f"‚ö†Ô∏è Erro ao consolidar ou salvar Bronze: {e}")
        # Extra hint when the error text mentions an empty dataset (no Parquet files found).
        if "Empty dataset" in str(e):
             print("No parquet files were created. Please check the download and processing steps.")

else:
    print("\nSkipping consolidation and saving Bronze as no CSV files were downloaded or processed.")

üì¶ Attempting to download all files from shared Google Drive folder: https://drive.google.com/drive/folders/1hRgLe3t-zGgWqFH7ZAtW0m6XQdgOW3bh?usp=sharing
Retrieving folder contents
Retrieving folder 1ovB3SBy-8dycREpOpS6R8IvZf1M9Pucf silver
Processing file 1C0kSP8H1texrVhDk3phK3JpbFzobhOvn focos_mensal_br_202401.csv
Processing file 1A5R0ddrYc9h1mdC13BcZgionHE2DOBqG focos_mensal_br_202402.csv
Processing file 1FfQ3dMqa4P8nJsj6aduzmuRK7KTGIic9 focos_mensal_br_202403.csv
Processing file 1xPIWC_TV-mF3Va_N4x2DpXtzqmMv-Uxt focos_mensal_br_202404.csv
Processing file 1mqey7AzyRkuBped-T_Qx5hXl2EfE4WA0 focos_mensal_br_202405.csv
Processing file 1NMvoGVZM2txO2O_31i28oBEfABs2eT9A focos_mensal_br_202406.csv
Processing file 1oysu2-kQSqWy8RnHU08SLefVi2hzV3SY focos_mensal_br_202407.csv
Processing file 11UijgNQ7DXDfulVRP4lXS05tbqmrjg-n focos_mensal_br_202408.csv
Processing file 1Byntu2ERioK27Mj8-lRY8Ylq7tHGZuZO focos_mensal_br_202409.csv
Processing file 1O8YNTuMNGJ-8ZrVZTHUM9IHK-ckgX_v6 focos_mensal_br

Unnamed: 0,id,lat,lon,data_hora_gmt,satelite,municipio,estado,pais,municipio_id,estado_id,pais_id,numero_dias_sem_chuva,precipitacao,risco_fogo,bioma,frp,origem_arquivo,ano,mes
17728,da3ad74e-cf90-3d13-a010-c8bd269a9730,-9.23975,-35.69029,2024-01-06 16:17:00,NOAA-20,S√ÉO LU√çS DO QUITUNDE,ALAGOAS,Brasil,2708501,27,33,15.0,0.0,1.0,Mata Atl√¢ntica,5.1,focos_mensal_br_202401.csv,2024,1
17732,e91a1ce3-5980-3f5c-89f0-39d7a7bc1bf1,-9.29241,-35.67792,2024-01-06 16:17:00,NOAA-20,S√ÉO LU√çS DO QUITUNDE,ALAGOAS,Brasil,2708501,27,33,15.0,0.0,1.0,Mata Atl√¢ntica,3.2,focos_mensal_br_202401.csv,2024,1
17742,ce24faa0-c547-3d7a-8212-629af92ec70c,-7.52567,-37.62928,2024-01-06 16:17:00,NOAA-20,√ÅGUA BRANCA,PARA√çBA,Brasil,2500106,25,33,5.0,0.0,0.57,Caatinga,30.1,focos_mensal_br_202401.csv,2024,1
48545,49539484-48b1-3320-9ed0-1234e0249020,-7.02693,-40.14935,2024-01-10 16:43:00,NOAA-20,SALITRE,CEAR√Å,Brasil,2311959,23,33,3.0,1.0,0.86,Caatinga,5.6,focos_mensal_br_202401.csv,2024,1
17743,d950bda4-0ebe-362b-8c81-1520be19fb7f,-7.52445,-37.62645,2024-01-06 16:17:00,NOAA-20,√ÅGUA BRANCA,PARA√çBA,Brasil,2500106,25,33,5.0,0.0,0.57,Caatinga,21.4,focos_mensal_br_202401.csv,2024,1



üíæ Consolidado salvo em:
 - /content/projeto_pcb/dados/bronze/current/focos_bronze_consolidado.parquet


In [8]:
# Schema overview of the consolidated Bronze frame (column count and dtype summary).
bronze_ddf.info()

<class 'dask_cudf._expr.collection.DataFrame'>
Columns: 19 entries, id to mes
dtypes: object(8), float64(5), int64(6)

In [9]:
# Missing values per column; .compute() evaluates the lazy result so the actual counts are shown.
bronze_ddf.isna().sum().compute()

id                            0
lat                           0
lon                           0
data_hora_gmt                 0
satelite                      0
municipio                     0
estado                        0
pais                          0
municipio_id                  0
estado_id                     0
pais_id                       0
numero_dias_sem_chuva    125852
precipitacao             125852
risco_fogo               125852
bioma                         4
frp                      374354
origem_arquivo                0
ano                           0
mes                           0
dtype: int64

In [10]:
# --- Silver-stage configuration: reference date, input and output folders ---
hoje = datetime.date.today()
BASE = "/content/projeto_pcb"
BRONZE = f"{BASE}/dados/bronze/current"   # input: consolidated Bronze dataset
SILVER = f"{BASE}/dados/silver"           # output: cleaned Silver dataset
os.makedirs(SILVER, exist_ok=True)

In [11]:
# Unbind the last per-file pandas frame (whether or not the name is still defined)
# and force a garbage-collection pass to reclaim its memory.
globals().pop("df_temp", None)
gc.collect()

41

In [12]:
# Drop identifier, coordinate and provenance columns from the in-memory Bronze frame.
# NOTE(review): the next cell rebinds `bronze_ddf` from Parquet and applies this same drop,
# so the result of this cell looks unused; re-running it on the already-trimmed frame would
# presumably fail on the missing columns — confirm and consider removing this cell.
bronze_ddf = bronze_ddf.drop(columns=['id', 'lat','lon','municipio_id','estado_id','pais_id','pais','origem_arquivo', 'ano', 'mes'])

In [14]:

# Silver stage: reload the consolidated Bronze dataset, clean it, derive date parts and
# write a dated Parquet snapshot into SILVER.
paths = sorted(glob.glob(os.path.join(BRONZE, "*.parquet")))
assert paths, "Nenhum arquivo Parquet encontrado em bronze."

bronze_ddf = dd.read_parquet(paths[0]) # first match in sorted order; assumed to be the consolidated dataset

esperadas = {"data_hora_gmt","satelite","municipio","estado"}
# Fail early if any column required by the steps below is missing.
faltando = esperadas - set(bronze_ddf.columns)
assert not faltando, f"Faltam colunas no dataset: {faltando}"

# Cast the timestamp column to timezone-aware (UTC) datetimes.
bronze_ddf["data_hora_gmt"] = bronze_ddf["data_hora_gmt"].astype('datetime64[ns, UTC]')

# Drop identifier, coordinate and provenance columns; 'ano'/'mes' are re-derived from the timestamp below.
# NOTE(review): 'lat' and 'lon' are removed here, yet the Gold cell further down reads
# ddf_silver["lat"] / ddf_silver["lon"] from this stage's output — confirm which is intended.
bronze_ddf = bronze_ddf.drop(columns=['id', 'lat','lon','municipio_id','estado_id','pais_id','pais','origem_arquivo', 'ano', 'mes'])

# Normalise the text columns: cast to str and trim surrounding whitespace.
# NOTE(review): astype(str) may turn missing values into literal text, hiding them from the
# dropna() calls below — verify.
for c in ["satelite","municipio","estado"]:
      bronze_ddf[c] = bronze_ddf[c].astype(str).str.strip()

# Rows without a timestamp cannot be placed in time; discard them.
bronze_ddf = bronze_ddf.dropna(subset=["data_hora_gmt"])

# Replace -999 with NaN in every numeric column
# (presumably -999 is the source's missing-value sentinel — TODO confirm).
numeric_cols = bronze_ddf.select_dtypes(include=np.number).columns
# One replaced Series per numeric column, applied in a single assign().
replace_dict = {col: bronze_ddf[col].replace(-999, np.nan) for col in numeric_cols}
bronze_ddf = bronze_ddf.assign(**replace_dict)


# Percentage of NaN per column.
# NOTE(review): each len(bronze_ddf) / .compute() below evaluates the lazy frame again.
nan_proportion_per_column = bronze_ddf.isnull().sum().compute() / len(bronze_ddf) * 100
print("\nPropor√ß√£o de NaN por coluna (%):")
print(nan_proportion_per_column)

# Percentage of rows that contain at least one NaN.
rows_with_nan = bronze_ddf.isnull().any(axis=1).sum().compute()
proportion_rows_with_nan = rows_with_nan / len(bronze_ddf) * 100
print(f"\nPropor√ß√£o de linhas com NaN no total do dataset (%): {proportion_rows_with_nan:.2f}%")

# Keep only fully populated rows (any remaining NaN in any column removes the row).
bronze_ddf = bronze_ddf.dropna()

# Derive year / month / day from the timestamp via the .dt accessor.
bronze_ddf["ano"] = bronze_ddf["data_hora_gmt"].dt.year.astype("Int64")
bronze_ddf["mes"] = bronze_ddf["data_hora_gmt"].dt.month.astype("Int64")
bronze_ddf["dia"] = bronze_ddf["data_hora_gmt"].dt.day.astype("Int64")

# Load date (ISO string), as requested by the course instructor.
bronze_ddf["dt_carga"] = datetime.date.today().isoformat()

# Output location: <SILVER>/focos_silver_YYYYMMDD.parquet, named after today's date.
SILVER_FILE =  f"focos_silver_{datetime.date.today():%Y%m%d}.parquet"
silver_parquet = os.path.join(SILVER,SILVER_FILE)
os.makedirs(SILVER, exist_ok=True)

# Repartition to one partition so a single part file is written; the upload cell addresses it
# as <silver_parquet>/part.0.parquet, i.e. `silver_parquet` itself is a directory.
bronze_ddf.repartition(npartitions=1).to_parquet(silver_parquet)


# NOTE(review): len() here evaluates the cleaned frame once more after it has been saved.
print("Silver salvo:", silver_parquet, "| linhas:", len(bronze_ddf))

display(bronze_ddf.head())


Propor√ß√£o de NaN por coluna (%):
data_hora_gmt            0.000000
satelite                 0.000000
municipio                0.000000
estado                   0.000000
numero_dias_sem_chuva    1.916553
precipitacao             1.180835
risco_fogo               2.209782
bioma                    0.000038
frp                      3.512460
dtype: float64

Propor√ß√£o de linhas com NaN no total do dataset (%): 6.23%
Silver salvo: /content/projeto_pcb/dados/silver/focos_silver_20251013.parquet | linhas: 9993721


Unnamed: 0,data_hora_gmt,satelite,municipio,estado,numero_dias_sem_chuva,precipitacao,risco_fogo,bioma,frp,ano,mes,dia,dt_carga
41572,2024-01-09 16:46:57+00:00,GOES-16,MIRANDA,MATO GROSSO DO SUL,5.0,0.0,0.08,Pantanal,60.0,2024,1,9,2025-10-13
41574,2024-01-09 16:46:57+00:00,GOES-16,MIRANDA,MATO GROSSO DO SUL,6.0,0.0,0.05,Pantanal,43.4,2024,1,9,2025-10-13
51587,2024-01-10 20:25:39+00:00,GOES-16,MOMBA√áA,CEAR√Å,9.0,0.0,1.0,Caatinga,99.7,2024,1,10,2025-10-13
41576,2024-01-09 16:46:57+00:00,GOES-16,MIRANDA,MATO GROSSO DO SUL,6.0,0.0,0.09,Pantanal,49.2,2024,1,9,2025-10-13
51592,2024-01-10 20:26:53+00:00,GOES-16,CORUMB√Å,MATO GROSSO DO SUL,5.0,0.0,0.05,Pantanal,147.7,2024,1,10,2025-10-13


In [None]:
# HfFolder is not imported: it was only referenced by commented-out code.
from huggingface_hub import HfApi
import os

# Never hard-code the token. Read it from the environment; when HF_TOKEN is unset this stays
# None and huggingface_hub falls back to whatever credentials it can locate on its own.
token = os.environ.get("HF_TOKEN")

# Dataset repository on the Hub (must match the name created on the website).
repo_id = "Rob-A-B/wildfire-focus"  # adjust to your user and dataset name
api = HfApi(token=token)

# Upload the part file of the Silver snapshot written by the Silver cell (`silver_parquet`)
# instead of a path frozen to one specific run date.
file_path = os.path.join(silver_parquet, "part.0.parquet")
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"Silver part file not found: {file_path}")

api.upload_file(
    path_or_fileobj=file_path,
    path_in_repo=os.path.basename(file_path),  # file name inside the repository
    repo_id=repo_id,
    repo_type="dataset",
)

print(f"‚úÖ Enviado! Veja em: https://huggingface.co/datasets/{repo_id}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...13.parquet/part.0.parquet:   0%|          |  531kB /  145MB            

‚úÖ Enviado! Veja em: https://huggingface.co/datasets/Rob-A-B/wildfire-focus


In [None]:
# Missing values per column after cleaning. Without .compute() the cell would only display
# the lazy Dask object instead of the counts.
bronze_ddf.isna().sum().compute()

In [None]:
# Unbind the working frame (whether or not the name is still defined) and force a
# garbage-collection pass before the next stage.
globals().pop("bronze_ddf", None)
gc.collect()

In [None]:
# Output folder for the Gold layer; created if it does not exist yet.
GOLD = "/content/projeto_pcb/dados/gold"
os.makedirs(GOLD, exist_ok=True)

In [None]:
import dask.dataframe as dd
import os
import glob
import datetime
import pandas as pd # the aggregated results below are pandas DataFrames
import dask_cudf # GPU-backed Dask collections

# Gold stage: aggregate the newest Silver snapshot into small summary tables and save them.
GOLD = "/content/projeto_pcb/dados/gold"
os.makedirs(GOLD, exist_ok=True)


def _contar_focos(ddf, chaves):
    """Count rows per group and return the result as a pandas DataFrame.

    Parameters
    ----------
    ddf : Dask / dask_cudf DataFrame
        Lazy frame to aggregate.
    chaves : str or list of str
        Grouping column name(s).

    Returns
    -------
    pandas.DataFrame
        One row per group: the grouping column(s) plus a ``qtd_focos`` count column.
    """
    contagem = ddf.groupby(chaves).size().compute()
    # A dask_cudf collection is expected to compute to a cudf object; move it to pandas so
    # the post-processing below (assign with a callable, query, to_string) runs on pandas.
    if hasattr(contagem, "to_pandas"):
        contagem = contagem.to_pandas()
    # Naming the count column explicitly avoids relying on the implicit column label `0`.
    return contagem.reset_index(name="qtd_focos")


padrao_silver = "focos_silver_*.parquet"   # Silver outputs are named focos_silver_YYYYMMDD.parquet

paths = sorted(glob.glob(os.path.join(SILVER, padrao_silver)))
assert paths, f"Nenhum arquivo Parquet Silver encontrado em: {SILVER}"

# Every Silver output is a complete dated snapshot, so reading all matches would count each
# fire once per snapshot. Use only the newest one (YYYYMMDD names sort chronologically).
silver_mais_recente = paths[-1]
ddf_silver = dask_cudf.read_parquet(silver_mais_recente)

# Date parts used as grouping keys (derived from the timestamp via the .dt accessor).
ddf_silver["ano"]  = ddf_silver["data_hora_gmt"].dt.year.astype("Int64")
ddf_silver["mes"]  = ddf_silver["data_hora_gmt"].dt.month.astype("Int64")
ddf_silver["dia"]  = ddf_silver["data_hora_gmt"].dt.day.astype("Int64")
ddf_silver["hora"] = ddf_silver["data_hora_gmt"].dt.hour.astype("Int64")

# Calendar date as a string, used for the per-day grouping.
ddf_silver["data"] = ddf_silver["data_hora_gmt"].dt.strftime('%Y-%m-%d')


# ------------------------------
# Aggregations
# ------------------------------
por_estado_mes = (
    _contar_focos(ddf_silver, ["estado","ano","mes"])
    .sort_values(["ano","mes","qtd_focos"], ascending=[True,True,False])
)

por_municipio_dia = (
    _contar_focos(ddf_silver, ["estado","municipio","data"])
    .sort_values(["data","qtd_focos"], ascending=[True,False])
)

por_satelite_mes = (
    _contar_focos(ddf_silver, ["satelite","ano","mes"])
    .sort_values(["ano","mes","qtd_focos"], ascending=[True,True,False])
)

por_satelite_total = (
    _contar_focos(ddf_silver, "satelite")
    .assign(perc=lambda x: (x["qtd_focos"] / x["qtd_focos"].sum())*100)
    .sort_values("qtd_focos", ascending=False)
    .round({"perc": 2})
)

# Spatial grid: counts per (lat, lon) cell rounded to 2 decimals. The Silver cell drops
# 'lat'/'lon', so build the grid only when both columns are actually present instead of
# failing the whole cell with a KeyError before anything is saved.
if {"lat", "lon"} <= set(ddf_silver.columns):
    ddf_silver["lat_cell"] = ddf_silver["lat"].round(2)
    ddf_silver["lon_cell"] = ddf_silver["lon"].round(2)
    grade_espacial = (
        _contar_focos(ddf_silver, ["lat_cell","lon_cell"])
        .sort_values("qtd_focos", ascending=False)
    )
else:
    grade_espacial = None
    print("[Aviso] Colunas lat/lon ausentes no Silver; grade espacial nao foi gerada.")

# ------------------------------
# Save to GOLD
# ------------------------------
# Kept in its own name: rebinding `hoje` (a date object in earlier cells) to a string would
# break those cells on re-run.
data_ref = datetime.date.today().isoformat()

g1 = os.path.join(GOLD, f"gold_focos_por_estado_mes_{data_ref}.parquet")
g2 = os.path.join(GOLD, f"gold_focos_por_municipio_dia_{data_ref}.parquet")
g3 = os.path.join(GOLD, f"gold_focos_por_satelite_mes_{data_ref}.parquet")
g3b= os.path.join(GOLD, f"gold_focos_por_satelite_total_{data_ref}.parquet")
g4 = os.path.join(GOLD, f"gold_grade_espacial_{data_ref}.parquet")

saidas = [
    (g1, por_estado_mes),
    (g2, por_municipio_dia),
    (g3, por_satelite_mes),
    (g3b, por_satelite_total),
]
if grade_espacial is not None:
    saidas.append((g4, grade_espacial))

print("Gold salvo:")
for caminho, tabela in saidas:
    tabela.to_parquet(caminho, index=False)
    print(" -", caminho)

# Quick look: top 10 states in the most recent (year, month) present in the data.
try:
    ult_ano  = por_estado_mes["ano"].max()
    ult_mes  = por_estado_mes.query("ano == @ult_ano")["mes"].max()
    top_estados = (
        por_estado_mes
        .query("ano == @ult_ano and mes == @ult_mes")
        .nlargest(10, "qtd_focos")
    )
    print(f"\nTOP estados no m√™s mais recente ({ult_ano}-{ult_mes:02d}):")
    print(top_estados.to_string())
except Exception as e:
    # Best effort: this summary is informational only and must not fail the cell.
    print("\n[Aviso] N√£o foi poss√≠vel imprimir TOP estados do m√™s mais recente:", e)

print("\n% por satelite (total):")
print(por_satelite_total.to_string())

In [None]:
# cont = df["Grey"].value_counts().sort_index()
# plt.bar(cont.index.astype(str), cont.values)
# plt.title("Distribui√ß√£o da classe Grey")
# plt.xlabel("Grey (0=n√£o cinza, 1=cinza)")
# # plt.ylabel("n")
# plt.show()


In [None]:
# agg = df.groupby("Grey")[["R","G","B"]].mean().reset_index()
# for ch in ["R","G","B"]:
#     plt.bar(agg["Grey"].astype(str), agg[ch], label=ch)
#     plt.title(f"M√©dia do canal {ch} por Grey")
#     plt.xlabel("Grey")
#     plt.ylabel(f"M√©dia {ch} (escala {'0‚Äì255' if df['R'].max()>1 else '0‚Äì1'})")
#     plt.show()
