In [None]:
import os
import random

import numpy as np
import pandas as pd

# NOTE(review): BASE_PATH is not referenced in this cell; kept in case other cells use it.
BASE_PATH = "data/processed/non_wildfire/"
RAW_INCIDENTS_PATH = 'data/raw/mapdataall.csv'
WILDFIRE_DEBUG_PATH = 'data/calfire_cimis_non_wildfire_data/wildfire_records(debug).csv'

# Load the raw incident table.
df = pd.read_csv(RAW_INCIDENTS_PATH)

# Keep only rows whose incident_type mentions "wildfire" (case-insensitive);
# rows with a missing incident_type are treated as non-matching (na=False).
wildfire_data = df[df['incident_type'].str.contains('wildfire', case=False, na=False)]

# Create the output directory first so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(WILDFIRE_DEBUG_PATH), exist_ok=True)

# Save the filtered rows (without the positional index) for inspection.
wildfire_data.to_csv(WILDFIRE_DEBUG_PATH, index=False)

# Report the path that was actually written.
print(f"Filtered wildfire data saved to '{WILDFIRE_DEBUG_PATH}'.")

Filtered wildfire data saved to 'wildfire_records.csv'.


In [17]:
# Inspect the dtype pandas inferred for every column of the filtered table.
column_dtypes = wildfire_data.dtypes
print(column_dtypes)

incident_name                        object
incident_is_final                    object
incident_date_last_update            object
incident_date_created                object
incident_administrative_unit         object
incident_administrative_unit_url    float64
incident_county                      object
incident_location                    object
incident_acres_burned               float64
incident_containment                float64
incident_control                     object
incident_cooperating_agencies        object
incident_longitude                  float64
incident_latitude                   float64
incident_type                        object
incident_id                          object
incident_url                         object
incident_date_extinguished           object
incident_dateonly_extinguished       object
incident_dateonly_created            object
is_active                            object
calfire_incident                       bool
notification_desired            

In [18]:
# Map each county value to the list of (created, extinguished) date pairs
# taken from its wildfire records.
county_intervals = {}
incident_columns = zip(
    wildfire_data['incident_county'],
    wildfire_data['incident_dateonly_created'],
    wildfire_data['incident_dateonly_extinguished'],
)
for county, created_raw, extinguished_raw in incident_columns:
    # Unparseable values become NaT instead of raising.
    start = pd.to_datetime(created_raw, errors='coerce')
    end = pd.to_datetime(extinguished_raw, errors='coerce')
    # Only records with both dates valid contribute an interval.
    if pd.notnull(start) and pd.notnull(end):
        county_intervals.setdefault(county, []).append((start, end))

# Helper function: Check if a candidate date falls within any interval in a list
def is_in_any_interval(date, intervals):
    """Return True if `date` lies inside at least one (start, end) pair.

    Both endpoints are inclusive. An empty `intervals` yields False.
    """
    for lower, upper in intervals:
        if lower <= date <= upper:
            return True
    return False

In [19]:
# One seeded generator drives both the incident draw and the date draw, so a
# fresh-kernel run reproduces exactly the same sample.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Number of non-wildfire samples to generate per wildfire incident.
NON_WILDFIRE_PER_INCIDENT = 2
# Draw budget per requested sample; stops the rejection-sampling loop from
# spinning forever if the wildfire intervals leave (almost) no free dates.
MAX_DRAWS_PER_SAMPLE = 1000

# Define overall date range (one candidate per calendar day, both ends included)
start_date = pd.to_datetime("2010-01-01")
end_date = pd.to_datetime("2025-03-20")
date_range = pd.date_range(start=start_date, end=end_date)

# Determine how many non-wildfire samples are needed
num_wildfire_incidents = len(wildfire_data)
target_count = NON_WILDFIRE_PER_INCIDENT * num_wildfire_incidents
max_draws = MAX_DRAWS_PER_SAMPLE * target_count
random_missing_data = []

# Rejection sampling: draw (incident, date) pairs and keep those whose date is
# outside every wildfire interval recorded for the incident's county.
draws = 0
while len(random_missing_data) < target_count:
    if draws >= max_draws:
        raise RuntimeError(
            f"Only {len(random_missing_data)} of {target_count} non-wildfire "
            f"samples found after {draws} draws."
        )
    draws += 1

    # Uniformly pick one wildfire incident to supply the county and coordinates
    random_wildfire = wildfire_data.iloc[rng.integers(num_wildfire_incidents)]
    county = random_wildfire['incident_county']
    latitude = random_wildfire['incident_latitude']
    longitude = random_wildfire['incident_longitude']

    # Uniformly pick a candidate date from the global date range
    candidate_date = date_range[rng.integers(len(date_range))]

    # Counties with no recorded interval accept every candidate date
    intervals = county_intervals.get(county, [])

    # Accept the candidate only if it is not within any wildfire interval
    if not is_in_any_interval(candidate_date, intervals):
        random_missing_data.append([candidate_date, county, latitude, longitude])

In [None]:
# Assemble the accepted samples into a table, one row per [date, county, latitude, longitude] entry.
sample_columns = ["date", "county", "latitude", "longitude"]
missing_df = pd.DataFrame(data=random_missing_data, columns=sample_columns)

# Write the table to disk without the positional index.
non_wildfire_csv = 'data/calfire_cimis_non_wildfire_data/non_wildfire_dates.csv'
missing_df.to_csv(non_wildfire_csv, index=False)
print("Randomly selected non-wildfire dates saved to 'non_wildfire_dates.csv'.")

Randomly selected non-wildfire dates saved to 'non_wildfire_dates.csv'.


In [21]:
# Report how many rows the non-wildfire table contains.
num_rows = missing_df.shape[0]
print(f"Total rows: {num_rows}")

Total rows: 3142
