In [1]:
from pyproj import datadir, CRS
import os

# Point pyproj, and anything that reads the PROJ environment variables, at one
# explicit PROJ data directory.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
proj_path = r"C:\Users\Shivam\miniconda3\envs\myenv\Library\share\proj"
datadir.set_data_dir(proj_path)
os.environ.update({"PROJ_LIB": proj_path, "PROJ_DATA": proj_path})

# Sanity check: report the active data directory and resolve a well-known CRS.
print("Using PROJ data from:", datadir.get_data_dir())
print("CRS check:", CRS.from_epsg(4326))

Using PROJ data from: C:\Users\Shivam\miniconda3\envs\myenv\Library\share\proj
CRS check: EPSG:4326


  _set_context_ca_bundle_path(ca_bundle_path)


In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.geometry as sg
from rasterstats import zonal_stats

## Paths for All Datasets

In [5]:
# All dataset locations are relative to the notebook's working directory.

# Tabular inputs
ERA5_CSV_FILE = "../../data/preprocessed/era5-datasets/era5_merged.csv"
FIRMS_CSV_FILE = "../../data/preprocessed/firms/fires_viirs_2016_2019.csv"

# Raster inputs
DEM_RASTER = "../../data/raw/P5_PAN_CD_N30_000_E078_000_30m/P5_PAN_CD_N30_000_E078_000_DEM_30m.tif"
SLOPE_RASTER = "../../data/preprocessed/DEM/slope_wgs84.tif"
ASPECT_RASTER = "../../data/preprocessed/DEM/aspect_wgs84.tif"
LULC_RASTER = "../../data/preprocessed/LULC/uk_lulc_classified.tif"

# Output written at the end of the notebook
FINAL_OUTPUT_FILE = "../../data/preprocessed/final_training_dataset.csv"

## Load ERA5 Data

In [6]:
# Read the merged ERA5 table and preview its first five rows.
era5_df = pd.read_csv(filepath_or_buffer=ERA5_CSV_FILE)
era5_df.head(5)

Unnamed: 0,valid_time,latitude,longitude,u10,v10,d2m,t2m,lai_hv,lai_lv,wind_speed,rh,vpd,number,expver,year
0,2016-01-01 00:00:00,31.3,77.5,-1.796256,-0.72764,-15.016418,-1.26651,2.174683,1.621314,1.938039,34.337837,3.659109,0,1,2016
1,2016-01-01 00:00:00,31.3,77.75,-2.54003,-1.461649,-20.002747,-6.835846,2.435181,1.207129,2.930558,34.264458,2.411721,0,1,2016
2,2016-01-01 00:00:00,31.3,78.0,-2.340568,-1.2595,-23.637512,-11.565338,3.491089,1.080359,2.657931,36.10997,1.617916,0,1,2016
3,2016-01-01 00:00:00,31.3,78.25,-1.654532,-0.617289,-29.244934,-14.932526,4.195557,0.847266,1.765934,28.436842,1.378873,0,1,2016
4,2016-01-01 00:00:00,31.3,78.5,-1.472403,-0.341532,-34.576965,-19.01065,3.301758,0.702429,1.511495,23.977932,1.040542,0,1,2016


In [7]:
# Parse the 'valid_time' strings into datetimes ...
era5_df['valid_time'] = era5_df['valid_time'].pipe(pd.to_datetime)
# ... and derive the calendar month from the parsed timestamps.
era5_df['month'] = era5_df['valid_time'].dt.month

## Define Rectangular Grid

In [8]:
# One row per distinct (longitude, latitude) pair present in the ERA5 table.
grid_points = (
    era5_df.loc[:, ['longitude', 'latitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Found {len(grid_points)} unique ERA5 grid cells.")

Found 165 unique ERA5 grid cells.


In [9]:
# Grid spacing in the units of the coordinates (EPSG:4326, i.e. degrees).
# Each grid point is treated as the centre of a square cell, so the cell
# extends half a step in every direction.
grid_res = 0.25
half_res = grid_res / 2

# One axis-aligned box (minx, miny, maxx, maxy) per grid point.
geometry = [
    sg.box(lon - half_res, lat - half_res, lon + half_res, lat + half_res)
    for lon, lat in zip(grid_points['longitude'], grid_points['latitude'])
]

grid_poly = gpd.GeoDataFrame(grid_points, geometry=geometry, crs=CRS.from_epsg(4326))
# Report the resolution that is actually used for the polygons; the previous
# message claimed "30m", which does not match the 0.25-degree cells built here.
print(f"Created {len(grid_poly)} grid polygons at {grid_res}-degree resolution.")

Created grid poly. 30m resolution:


## Calculate Zonal Statistics for Static Rasters 

In [10]:
# Mean raster value per grid cell for each terrain raster, computed with the
# same settings for all three.
# NOTE(review): assumes every raster uses -32768 as its nodata value - TODO
# confirm for the derived slope/aspect rasters.
# NOTE(review): aspect is presumably an angle; an arithmetic mean of angles can
# be misleading around the 0/360 wrap - confirm this is intended.
dem_stats, slope_stats, aspect_stats = (
    zonal_stats(grid_poly, raster_path, stats="mean", nodata=-32768)
    for raster_path in (DEM_RASTER, SLOPE_RASTER, ASPECT_RASTER)
)


In [11]:
# Start from the grid coordinates (geometry removed) and attach the per-cell
# mean of each terrain raster; the stats lists are in grid_poly row order.
static_features_df = grid_poly.drop(columns='geometry').assign(
    elevation_mean=[cell['mean'] for cell in dem_stats],
    slope_mean=[cell['mean'] for cell in slope_stats],
    aspect_mean=[cell['mean'] for cell in aspect_stats],
)

### Categorical Data (LULC)

In [12]:
# Categorical zonal statistics of the LULC raster for every grid cell.
# Unlike the mean statistics, this call is made with all_touched=True.
lulc_stats = zonal_stats(
    grid_poly,
    LULC_RASTER,
    categorical=True,
    all_touched=True,
)



In [13]:
# One row per grid cell, one column per LULC class; a class that is absent
# from a cell comes out as NaN and is replaced by 0.
lulc_df = pd.DataFrame(lulc_stats).fillna(0)

In [14]:
# Row-wise total over all LULC class columns (one total per grid cell).
lulc_total_pixels = lulc_df.sum(axis="columns")

In [15]:
# Convert per-class totals into per-cell shares (fractions, not 0-100 values).
# Cells whose total is 0 are divided by 1 instead, which avoids a division by
# zero and leaves all of their shares at 0.
lulc_percent_df = lulc_df.div(
    lulc_total_pixels.mask(lulc_total_pixels == 0, 1),
    axis="index",
)

In [16]:
# Prefix every class column label with "lulc_class_" so the features are
# recognisable after they are joined with the other static features.
lulc_percent_df = lulc_percent_df.add_prefix("lulc_class_")

In [17]:
print(f"Processed {len(lulc_percent_df.columns)} LULC classes.")
# Add LULC features to the static features DataFrame.
# Any LULC columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not fail with an overlapping-columns
# error from join(); on a first run nothing is dropped.
static_features_df = static_features_df.drop(
    columns=lulc_percent_df.columns, errors="ignore"
).join(lulc_percent_df)

Processed 21 LULC classes.


In [18]:
# Attach the static per-cell features to every ERA5 row; a left join keeps
# all ERA5 rows, matched on the grid-cell coordinates.
X_df = era5_df.merge(
    static_features_df,
    how='left',
    on=['longitude', 'latitude'],
)

In [19]:
# Preview the merged feature table. A bare last-line expression uses the
# notebook's rich table display instead of the wrapped plain text that
# print() produces for wide frames.
X_df.head()

  valid_time  latitude  longitude       u10       v10        d2m        t2m  \
0 2016-01-01      31.3      77.50 -1.796256 -0.727640 -15.016418  -1.266510   
1 2016-01-01      31.3      77.75 -2.540030 -1.461649 -20.002747  -6.835846   
2 2016-01-01      31.3      78.00 -2.340568 -1.259500 -23.637512 -11.565338   
3 2016-01-01      31.3      78.25 -1.654532 -0.617289 -29.244934 -14.932526   
4 2016-01-01      31.3      78.50 -1.472403 -0.341532 -34.576965 -19.010650   

     lai_hv    lai_lv  wind_speed  ...  lulc_class_1  lulc_class_2  \
0  2.174683  1.621314    1.938039  ...           0.0           0.0   
1  2.435181  1.207129    2.930558  ...           0.0           0.0   
2  3.491089  1.080359    2.657931  ...           0.0           0.0   
3  4.195557  0.847266    1.765934  ...           0.0           0.0   
4  3.301758  0.702429    1.511495  ...           0.0           0.0   

   lulc_class_3  lulc_class_5  lulc_class_15  lulc_class_19  lulc_class_9  \
0           0.0           0

In [20]:
# Replace the index with a fresh 0..n-1 range, discarding the old index.
X_df = X_df.reset_index(drop=True)

In [21]:
# Read the FIRMS fire detections table.
firms_df = pd.read_csv(filepath_or_buffer=FIRMS_CSV_FILE)

In [22]:
# Parse the acquisition date strings into datetimes.
firms_df['acq_date'] = firms_df['acq_date'].pipe(pd.to_datetime)

In [23]:
# Turn every fire detection into a point geometry (x = longitude,
# y = latitude) in the same CRS as the grid polygons (EPSG:4326).
firms_gdf = gpd.GeoDataFrame(
    firms_df,
    geometry=gpd.points_from_xy(
        x=firms_df['longitude'],
        y=firms_df['latitude'],
    ),
    crs=CRS.from_epsg(4326),
)

In [24]:
# Assign each fire point to the grid cell that contains it; the inner join
# discards points that are not within any cell.
# NOTE(review): the suffixes already start with '_', and the joined columns
# come out with a double underscore (e.g. 'latitude__fire', 'index__grid').
fires_in_grid = firms_gdf.sjoin(
    grid_poly,
    how='inner',
    predicate='within',
    lsuffix='_fire',
    rsuffix='_grid',
)

In [25]:
# Report how many fire detections were mapped onto grid cells.
if fires_in_grid.empty:
    # Do not pre-create 'fire_occurred' on X_df here: the labelling step
    # further down derives it from a left merge with the fire log (rows
    # without a match become 0), and a pre-existing 'fire_occurred' column
    # would end up duplicated once 'fire_today' is renamed.
    print("Warning: No fire points were found within your ERA5 grid cells.")
else:
    print(f"Successfully mapped {len(fires_in_grid)} fire hotspots to grid cells.")


Successfully mapped 100662 fire hotspots to grid cells.


In [26]:
# Give both tables a plain calendar-date column to use as a join key:
# fires from their acquisition date, ERA5 rows from their timestamp.
for frame, time_col in ((fires_in_grid, 'acq_date'), (X_df, 'valid_time')):
    frame['date'] = frame[time_col].dt.date

In [27]:
# Show the column names produced by the spatial join (including suffixes).
print(
    "\nfires_in_grid columns after spatial join:\n",
    list(fires_in_grid.columns),
)


fires_in_grid columns after spatial join:
 ['latitude__fire', 'longitude__fire', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'instrument', 'confidence', 'version', 'bright_t31', 'frp', 'daynight', 'type', 'geometry', 'index__grid', 'longitude__grid', 'latitude__grid', 'date']


In [28]:
# Locate the grid-cell coordinate columns produced by the spatial join by
# substring match, taking the first column that mentions both the coordinate
# name and 'grid'.
lon_col = next((c for c in fires_in_grid.columns if 'longitude' in c and 'grid' in c), None)
lat_col = next((c for c in fires_in_grid.columns if 'latitude' in c and 'grid' in c), None)

# Fail here with a clear message instead of letting a None column name
# surface later as an obscure indexing error.
if lon_col is None or lat_col is None:
    raise KeyError(
        "Grid longitude/latitude columns not found in fires_in_grid; "
        f"available columns: {fires_in_grid.columns.tolist()}"
    )

In [29]:
# One row per (grid cell, date) that had at least one fire detection, with
# the grid coordinate columns renamed back to plain 'longitude'/'latitude'
# and a constant flag column marking the fire.
fire_log = (
    fires_in_grid[[lon_col, lat_col, 'date']]
    .drop_duplicates()
    .rename(columns={lon_col: 'longitude', lat_col: 'latitude'})
    .assign(fire_today=1)
)

In [30]:
# Number of distinct (grid cell, date) combinations with a fire.
print(f"Found {fire_log.shape[0]} unique fire-day grid cell entries.")

Found 19220 unique fire-day grid cell entries.


In [31]:
# Label every feature row: left-join the fire log on cell and date, turn
# missing matches into 0, expose the flag as 'fire_occurred', and drop the
# helper 'date' key.
final_df = (
    X_df.merge(
        fire_log,
        how='left',
        on=['longitude', 'latitude', 'date'],
    )
    .assign(fire_today=lambda merged: merged['fire_today'].fillna(0))
    .rename(columns={'fire_today': 'fire_occurred'})
    .drop(columns='date')
)

In [32]:
# Persist the labelled training table as CSV without the index column.
final_df.to_csv(path_or_buf=FINAL_OUTPUT_FILE, index=False)