In [44]:
import os
import sys
import time
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
from dbfread import DBF
import rasterio
from tqdm import tqdm

# Add project path to os.path
current_path = os.path.abspath('.')
project_name = 'California_Wild_Fire_Prediction'
project_path = os.path.join(current_path.split(project_name)[0], project_name)
print(project_path)
sys.path.append(project_path)
load_dotenv('.env')

/Users/shawn/Documents/sjsu/2025-1/ML_CMPE257/California_Wild_Fire_Prediction


False

In [19]:
fire_df = pd.read_csv("/Users/shawn/Documents/sjsu/2025-1/ML_CMPE257/California_Wild_Fire_Prediction/data/calfire_cimis_data/processed/finalized/calfire_cimis_all_rows.csv")
fire_df.head(3)

Unnamed: 0,incident_id,incident_name,incident_date_last_update,incident_date_created,incident_acres_burned,incident_longitude,incident_latitude,incident_dateonly_created,DayAirTmpAvg01,DayPrecip01,...,DayAirTmpAvg13,DayPrecip13,DayRelHumAvg13,DaySoilTmpAvg13,DayWindSpdAvg13,DayAirTmpAvg14,DayPrecip14,DayRelHumAvg14,DaySoilTmpAvg14,DayWindSpdAvg14
0,2ca11d45-8139-4c16-8af0-880d99b21e82,Bridge Fire,2018-01-09T13:46:00Z,2017-10-31T11:22:00Z,37.0,-122.309,40.774,2017-10-31,9.6,0.0,...,10.2,0.0,62.0,12.1,0.8,9.5,0.0,66.0,11.9,2.8
1,f904489b-0488-4764-9a98-bd3b818d1756,111 Fire,2022-05-01T22:29:02Z,2022-04-30T20:33:00Z,15.0,-115.916687,33.510716,2022-04-30,22.3,0.0,...,22.4,0.0,38.0,22.2,3.2,22.8,0.0,33.0,22.7,2.0
2,f9e85371-fc46-4f54-822c-7dcdaadde551,152 Fire,2013-06-05T08:30:00Z,2013-06-04T15:42:00Z,124.0,-121.006939,37.03776,2013-06-04,20.2,0.0,...,23.8,0.0,43.0,20.5,4.0,22.0,0.0,50.0,20.7,4.2


In [20]:
import folium

lat_list = fire_df.incident_latitude
lng_list = fire_df.incident_longitude

m = folium.Map(location=[lat_list[0], lng_list[0]], zoom_start=6)

for lat, lng in zip(lat_list, lng_list):
    folium.Marker(location=[lat, lng]).add_to(m)

m.save("map.html")
m

# Extract data from LANDFIRE.gov

In [59]:
data_dir = "/Users/shawn/Downloads/landfire_bdq7AFNARJrHJo3233me/"
data_hashmap = {}

for _data in os.listdir(data_dir):
    if _data == '.DS_Store':
        continue
    
    data_name = _data.split('_')[1]
    dbf_path, tif_path = '', ''
    for _file in os.listdir(os.path.join(data_dir, _data)):
        if 'vat.dbf' in _file:
            dbf_path = os.path.join(data_dir, _data, _file)
        if _file.split('.')[-1] == 'tif':
            tif_path = os.path.join(data_dir, _data, _file)
    
    if not (len(dbf_path) or len(tif_path)):
        continue

    data_hashmap[data_name] = {
        'dbf': pd.DataFrame(iter(DBF(dbf_path, load=True))),
        'tif': tif_path
    }
    
print(data_hashmap.keys())

dict_keys(['FDist', 'FVC', 'CBD', 'FBFM40', 'EVC', 'FBFM13'])


## CBD (Canopy Bulk Density) data

How dense the forest canopy fuels are, vertically. This is important for modeling crown fire intensity.

In [79]:
# Load VALUE → label mapping
vat_df, tif_path = data_hashmap['CBD'].values()
value_to_label = dict(zip(vat_df["VALUE"], vat_df["KGM3"]))

# Load raster and extract valid pixels
with rasterio.open(tif_path) as src:
    band = src.read(1)
    transform = src.transform
    nodata = src.nodata

    mask = (band != nodata) & (band != 0)
    rows, cols = np.where(mask)
    
    # Sample fewer if needed (for speed)
    SAMPLE_SIZE = 500_000
    if len(rows) > SAMPLE_SIZE:
        indices = np.random.choice(len(rows), size=SAMPLE_SIZE, replace=False)
        rows = rows[indices]
        cols = cols[indices]

    lats, lngs, vals = [], [], []

    for r, c in tqdm(zip(rows, cols), total=len(rows)):
        lng, lat = rasterio.transform.xy(transform, r, c)
        v = band[r, c]
        label = value_to_label.get(v)
        if label:
            lats.append(lat)
            lngs.append(lng)
            vals.append(v)

# Create cbd_df
cbd_df = pd.DataFrame({
    "latitude": lats,
    "longitude": lngs,
    "CBD_VALUE": vals
})

100%|██████████| 500000/500000 [00:05<00:00, 85018.36it/s]


In [88]:
cbd_df.to_csv(os.path.join(project_path, "data/landfire_data/processed/cbd_data.csv"))

In [80]:
# Build KDTree on cbd_df
cbd_coords = np.radians(cbd_df[["latitude", "longitude"]].values)
tree = sklearn.neighbors.BallTree(cbd_coords, metric='haversine')

# Query nearest neighbors for fire_df coords
fire_coords = np.radians(fire_df[["incident_latitude", "incident_longitude"]].values)
dist, idx = tree.query(fire_coords, k=1)

# Add matched CBD info to fire_df
matched = cbd_df.iloc[idx.flatten()].reset_index(drop=True)
fire_df["CBD_VALUE"] = matched["CBD_VALUE"]
fire_df["CBD_LABEL"] = matched["CBD_VALUE"].map(value_to_label)
fire_df["CBD_DISTANCE_KM"] = dist.flatten() * 6371  # Convert from radians to km


## Existing Vegetation Cover (EVC)

How much ground is covered by vegetation, includes grasses, shrubs, trees

In [68]:
vat_df, tif_path = data_hashmap['EVC'].values()
vat_df.head(4)

Unnamed: 0,VALUE,COUNT,CLASSNAMES,R,G,B,RED,GREEN,BLUE
0,-9999,1512576,Fill-NoData,255,255,255,1.0,1.0,1.0
1,11,9615374,Open Water,0,0,255,0.0,0.0,1.0
2,12,40405,Snow/Ice,159,161,240,0.623529,0.631373,0.941176
3,13,301351,Developed-Upland Deciduous Forest,64,61,168,0.25098,0.239216,0.658824


In [81]:
vat_df, tif_path = data_hashmap['EVC'].values()
value_to_label = dict(zip(vat_df["VALUE"], vat_df["CLASSNAMES"]))

# Load raster and extract valid pixels
with rasterio.open(tif_path) as src:
    band = src.read(1)
    transform = src.transform
    nodata = src.nodata

    mask = (band != nodata) & (band != 0)
    rows, cols = np.where(mask)
    
    # Sample fewer if needed (for speed)
    SAMPLE_SIZE = 500_000
    if len(rows) > SAMPLE_SIZE:
        indices = np.random.choice(len(rows), size=SAMPLE_SIZE, replace=False)
        rows = rows[indices]
        cols = cols[indices]

    lats, lngs, vals = [], [], []

    for r, c in tqdm(zip(rows, cols), total=len(rows)):
        lng, lat = rasterio.transform.xy(transform, r, c)
        v = band[r, c]
        label = value_to_label.get(v)
        if label:
            lats.append(lat)
            lngs.append(lng)
            vals.append(v)


evc_df = pd.DataFrame({
    "latitude": lats,
    "longitude": lngs,
    "EVC_VALUE": vals
})

100%|██████████| 500000/500000 [00:06<00:00, 80709.65it/s]


In [87]:
evc_df.to_csv(os.path.join(project_path, "data/landfire_data/processed/evc_data.csv"))

In [82]:
evc_coords = np.radians(evc_df[["latitude", "longitude"]].values)
tree = sklearn.neighbors.BallTree(evc_coords, metric='haversine')

# Query nearest neighbors for fire_df coords
fire_coords = np.radians(fire_df[["incident_latitude", "incident_longitude"]].values)
dist, idx = tree.query(fire_coords, k=1)

# Add matched CBD info to fire_df
matched = evc_df.iloc[idx.flatten()].reset_index(drop=True)
fire_df["EVC_VALUE"] = matched["EVC_VALUE"]
fire_df["EVC_LABEL"] = matched["EVC_VALUE"].map(value_to_label)
fire_df["EVC_DISTANCE_KM"] = dist.flatten() * 6371  # Convert from radians to km


## FBFM13

Fire Behavior Fuel Model 13, classic 13 fire fuel models based on vegetation type and fire bahvior.

- 1: Short grass
- 2: Timber
- 3: Tall grass
- 4: Chaparral
- 5: Brush
- 6: Dormant brush, hardwood slash
- 7: Southern rough
- 8: Closed timber litter
- 9: Hardwood litter
- 10: Timber (litter and understory)
- 11: Light logging slash
- 12: Medium logging slash
- 13: Heavy logging slash

In [84]:
vat_df, tif_path = data_hashmap['FBFM13'].values()
vat_df.head(4)

Unnamed: 0,VALUE,COUNT,FBFM13,R,G,B,RED,GREEN,BLUE
0,-9999,1512576,Fill-NoData,255,255,255,1.0,1.0,1.0
1,2,74531382,FBFM2,255,255,0,1.0,1.0,0.0
2,4,12164024,FBFM4,255,211,127,1.0,0.827451,0.498039
3,5,83824454,FBFM5,255,170,102,1.0,0.666667,0.4


In [85]:
vat_df, tif_path = data_hashmap['FBFM13'].values()

# Load raster and extract valid pixels
with rasterio.open(tif_path) as src:
    band = src.read(1)
    transform = src.transform
    nodata = src.nodata

    mask = (band != nodata) & (band != 0)
    rows, cols = np.where(mask)
    
    # Sample fewer if needed (for speed)
    SAMPLE_SIZE = 500_000
    if len(rows) > SAMPLE_SIZE:
        indices = np.random.choice(len(rows), size=SAMPLE_SIZE, replace=False)
        rows = rows[indices]
        cols = cols[indices]

    lats, lngs, vals = [], [], []

    for r, c in tqdm(zip(rows, cols), total=len(rows)):
        lng, lat = rasterio.transform.xy(transform, r, c)
        v = band[r, c]
        label = value_to_label.get(v)
        if label:
            lats.append(lat)
            lngs.append(lng)
            vals.append(v)


fbfm_df = pd.DataFrame({
    "latitude": lats,
    "longitude": lngs,
    "FBFM_VALUE": vals
})

100%|██████████| 500000/500000 [00:05<00:00, 83757.87it/s]


In [89]:
fbfm_df.to_csv(os.path.join(project_path, "data/landfire_data/processed/fbfm_data.csv"))

In [90]:
fbfm_coords = np.radians(fbfm_df[["latitude", "longitude"]].values)
tree = sklearn.neighbors.BallTree(fbfm_coords, metric='haversine')

fire_coords = np.radians(fire_df[["incident_latitude", "incident_longitude"]].values)
dist, idx = tree.query(fire_coords, k=1)

matched = fbfm_df.iloc[idx.flatten()].reset_index(drop=True)
fire_df["FBFM_VALUE"] = matched["FBFM_VALUE"]
fire_df["FBFM_DISTANCE_KM"] = dist.flatten() * 6371  # Convert from radians to km

## Fuel Distrubance (FDist)

Fuel Disturbance, Recent disturbance to fuels like thinning, harvest, fire, windothrow, etc.

In [92]:
vat_df, tif_path = data_hashmap['FDist'].values()
vat_df.head(4)

Unnamed: 0,VALUE,COUNT,D_TYPE,D_SEVERITY,D_TIME,R,G,B,RED,GREEN,BLUE
0,-9999,1512576,Fill-NoData,,,255,255,255,1.0,1.0,1.0
1,0,308221079,No Disturbance,,,0,0,0,0.0,0.0,0.0
2,111,1279155,Fire,Low,One Year,214,96,77,0.839216,0.376471,0.301961
3,112,18626896,Fire,Low,Two to Five Years,214,96,77,0.839216,0.376471,0.301961


In [98]:
vat_df, tif_path = data_hashmap['FDist'].values()
value_to_label = dict(zip(vat_df["VALUE"], vat_df['D_TYPE'] + '/' + vat_df['D_SEVERITY'] + '/' + vat_df['D_TIME']))

# Load raster and extract valid pixels
with rasterio.open(tif_path) as src:
    band = src.read(1)
    transform = src.transform
    nodata = src.nodata

    mask = (band != nodata) & (band != 0)
    rows, cols = np.where(mask)
    
    # Sample fewer if needed (for speed)
    SAMPLE_SIZE = 500_000
    if len(rows) > SAMPLE_SIZE:
        indices = np.random.choice(len(rows), size=SAMPLE_SIZE, replace=False)
        rows = rows[indices]
        cols = cols[indices]

    lats, lngs, vals = [], [], []

    for r, c in tqdm(zip(rows, cols), total=len(rows)):
        lng, lat = rasterio.transform.xy(transform, r, c)
        v = band[r, c]
        label = value_to_label.get(v)
        if label:
            lats.append(lat)
            lngs.append(lng)
            vals.append(v)


fdist_df = pd.DataFrame({
    "latitude": lats,
    "longitude": lngs,
    "FDIST_VALUE": vals
})

fdist_df.to_csv(os.path.join(project_path, "data/landfire_data/processed/fdist_data.csv"))

100%|██████████| 500000/500000 [00:05<00:00, 84572.92it/s]


In [99]:
fdist_coords = np.radians(fdist_df[["latitude", "longitude"]].values)
tree = sklearn.neighbors.BallTree(fdist_coords, metric='haversine')

fire_coords = np.radians(fire_df[["incident_latitude", "incident_longitude"]].values)
dist, idx = tree.query(fire_coords, k=1)

matched = fdist_df.iloc[idx.flatten()].reset_index(drop=True)
fire_df["FDIST_VALUE"] = matched["FDIST_VALUE"]
fire_df["FDIST_DISTANCE_KM"] = dist.flatten() * 6371  # Convert from radians to km

## Fuel Vegetation Cover (FVC)

Similar to EVC, but more focused on fuel-relevant vegetation, helps estimate surface fuel continuity

In [100]:
vat_df, tif_path = data_hashmap['FVC'].values()
vat_df.head(4)

Unnamed: 0,VALUE,COUNT,CLASSNAMES,R,G,B,RED,GREEN,BLUE
0,-9999,1512576,Fill-NoData,255,255,255,1.0,1.0,1.0
1,11,9615374,Open Water,0,0,255,0.0,0.0,1.0
2,12,40405,Snow/Ice,159,161,240,0.623529,0.631373,0.941176
3,13,347404,Developed-Upland Deciduous Forest,64,61,168,0.25098,0.239216,0.658824


In [101]:
vat_df, tif_path = data_hashmap['FVC'].values()
value_to_label = dict(zip(vat_df["VALUE"], vat_df["CLASSNAMES"]))

# Load raster and extract valid pixels
with rasterio.open(tif_path) as src:
    band = src.read(1)
    transform = src.transform
    nodata = src.nodata

    mask = (band != nodata) & (band != 0)
    rows, cols = np.where(mask)
    
    # Sample fewer if needed (for speed)
    SAMPLE_SIZE = 500_000
    if len(rows) > SAMPLE_SIZE:
        indices = np.random.choice(len(rows), size=SAMPLE_SIZE, replace=False)
        rows = rows[indices]
        cols = cols[indices]

    lats, lngs, vals = [], [], []

    for r, c in tqdm(zip(rows, cols), total=len(rows)):
        lng, lat = rasterio.transform.xy(transform, r, c)
        v = band[r, c]
        label = value_to_label.get(v)
        if label:
            lats.append(lat)
            lngs.append(lng)
            vals.append(v)


fvc_df = pd.DataFrame({
    "latitude": lats,
    "longitude": lngs,
    "FVC_VALUE": vals
})

fvc_df.to_csv(os.path.join(project_path, "data/landfire_data/processed/fvc_data.csv"))

100%|██████████| 500000/500000 [00:06<00:00, 82968.30it/s]


In [102]:
fvc_coords = np.radians(fvc_df[["latitude", "longitude"]].values)
tree = sklearn.neighbors.BallTree(fvc_coords, metric='haversine')

fire_coords = np.radians(fire_df[["incident_latitude", "incident_longitude"]].values)
dist, idx = tree.query(fire_coords, k=1)

matched = fvc_df.iloc[idx.flatten()].reset_index(drop=True)
fire_df["FVC_VALUE"] = matched["FVC_VALUE"]
fire_df["FVC_DISTANCE_KM"] = dist.flatten() * 6371  # Convert from radians to km

In [103]:
fire_df.head(4)

Unnamed: 0,incident_id,incident_name,incident_date_last_update,incident_date_created,incident_acres_burned,incident_longitude,incident_latitude,incident_dateonly_created,DayAirTmpAvg01,DayPrecip01,...,CBD_DISTANCE_KM,EVC_VALUE,EVC_LABEL,EVC_DISTANCE_KM,FBFM_VALUE,FBFM_DISTANCE_KM,FDIST_VALUE,FDIST_DISTANCE_KM,FVC_VALUE,FVC_DISTANCE_KM
0,2ca11d45-8139-4c16-8af0-880d99b21e82,Bridge Fire,2018-01-09T13:46:00Z,2017-10-31T11:22:00Z,37.0,-122.309,40.774,2017-10-31,9.6,0.0,...,1216.028099,251,Shrub Cover = 51%,1216.028099,-9999,1216.028099,112,1216.028099,125,1216.028099
1,f904489b-0488-4764-9a98-bd3b818d1756,111 Fire,2022-05-01T22:29:02Z,2022-04-30T20:33:00Z,15.0,-115.916687,33.510716,2022-04-30,22.3,0.0,...,548.933201,125,Tree Cover = 25%,548.933201,-9999,548.933201,112,548.933201,106,548.933201
2,f9e85371-fc46-4f54-822c-7dcdaadde551,152 Fire,2013-06-05T08:30:00Z,2013-06-04T15:42:00Z,124.0,-121.006939,37.03776,2013-06-04,20.2,0.0,...,788.094762,31,Barren,788.094762,-9999,788.094762,113,788.094762,111,788.094762
3,ca7da36b-1951-4d52-92b2-1a0b04aab586,241 Fire,2015-07-15T18:45:00Z,2015-07-13T10:47:00Z,214.0,-117.7274,33.7626,2015-07-13,27.4,0.0,...,470.160274,166,Tree Cover = 66%,470.160274,-9999,470.160274,112,470.160274,106,470.160274


In [None]:
fire_df.to_csv(os.path.join(project_path, "data/landfire_data/finalized/calfire_landfire_cimis_merged.csv"))