In [None]:
import columns
import paths

# Linking INPE data to INMET Stations
### Using Nearest Neighbors (KD-tree) for efficient spatial joins

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Purpose: tag every INPE record with the code of its nearest INMET station and
# stream the result to `paths.inpe_stations_linked_file`, one chunk at a time.

# --------------------------------------------
# 1. Load stations (616 rows)
# --------------------------------------------
stations = pd.read_csv(paths.inmet_stations_file)

station_coords = stations[['LATITUDE', 'LONGITUDE']].to_numpy()
station_codes = stations[columns.code_meta_label].to_numpy()

# Nearest-station index using great-circle (haversine) distance.
# Plain Euclidean distance on (lat, lon) degrees treats one degree of longitude
# as long as one degree of latitude, which can pick the wrong station away from
# the equator. scikit-learn's KD-tree does not support the haversine metric, so
# a BallTree is used instead; haversine expects [lat, lon] in radians.
# NOTE(review): assumes both files store coordinates in decimal degrees — confirm.
nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric='haversine')
nn.fit(np.radians(station_coords))

# --------------------------------------------
# 2. Read big CSV in chunks (31M rows)
# --------------------------------------------
chunk_size = 500_000   # adjust depending on RAM (~24GB allows 1M)

# Columns copied from the INPE file; the station code is appended after them.
linked_columns = ['latitude', 'longitude', columns.unix_date_time_column, 'frp']

first = True
# The context manager closes the underlying file handle even if a chunk fails.
with pd.read_csv(paths.inpe_all_concat_file, chunksize=chunk_size) as reader:
    for chunk in reader:
        # Same [lat, lon] order and unit (radians) as the fitted station coordinates.
        fire_coords = np.radians(chunk[['latitude', 'longitude']].to_numpy())

        # Only the index of the nearest station is needed, not the distance.
        nearest = nn.kneighbors(fire_coords, return_distance=False).ravel()

        # Keep the output columns and append the matched station code.
        linked = chunk[linked_columns].assign(
            **{columns.code_meta_label: station_codes[nearest]}
        )

        # First chunk creates/overwrites the file with a header; later chunks append.
        linked.to_csv(
            paths.inpe_stations_linked_file,
            mode='w' if first else 'a',
            index=False,
            header=first,
        )
        first = False

# Linking INPE data to INMET
### Adds FRP columns to the INMET records by matching on station and timestamp

In [None]:
import pandas as pd

# Purpose: left-join the station-linked INPE records onto the INMET dataset by
# timestamp and station code, fall back to station coordinates where no INPE
# record matched, add a 0/1 `fire_occurred` flag, and export the result.

# INPE records already tagged with a station code.
inpe_stations_df = pd.read_csv(paths.inpe_stations_linked_file)

# INMET dataset that receives the FRP columns.
merged_df = pd.read_csv(paths.inmet_concat_file)

# Station code -> station coordinates lookup.
station_coords = pd.read_csv(paths.inmet_stations_file)[[columns.code_meta_label, 'LATITUDE', 'LONGITUDE']]

# Attach each station's own coordinates to its INMET rows.
merged_df = merged_df.merge(station_coords, on=columns.code_meta_label, how='left')

# Merge on timestamp AND station code.
# NOTE(review): a left merge emits one output row per matching INPE record, so an
# INMET row with several matches for the same station/timestamp is repeated —
# confirm this is the intended shape.
merged_df = merged_df.merge(
    inpe_stations_df,
    on=[columns.unix_date_time_column, columns.code_meta_label],
    how='left'
)

# Use the INPE coordinates when present, otherwise the station ones.
# Assign the result back instead of calling `fillna(..., inplace=True)` on a
# column selection: that chained in-place form is deprecated and does not
# update `merged_df` when pandas Copy-on-Write is enabled.
merged_df['latitude'] = merged_df['latitude'].fillna(merged_df['LATITUDE'])
merged_df['longitude'] = merged_df['longitude'].fillna(merged_df['LONGITUDE'])
merged_df = merged_df.drop(columns=[columns.code_meta_label, 'LATITUDE', 'LONGITUDE'])

# 1 where an INPE record matched (frp is not NaN), 0 otherwise.
merged_df['fire_occurred'] = merged_df['frp'].notna().astype(int)

# Export final dataset with FRP added
merged_df.to_csv(paths.inmet_inpe_linked_file, index=False)