In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [2]:
# Load datasets
population_data = pd.read_csv('california_population_by_county.csv')
wildfire_data = pd.read_csv('map_data_final.csv')
temperature_data = pd.read_csv('temperature_map_data.csv')

In [3]:
# Clean and preprocess population data
# Remove the suffix ' County, California' from the County column to match with the incident_county column in wildfire_data
population_data['County'] = population_data['County'].str.replace(' County, California', '')

In [4]:
# Merge wildfire and population data
# Combine the wildfire and population data based on the incident county to get population information for each wildfire incident
merged_data = pd.merge(wildfire_data, population_data, left_on='incident_county', right_on='County', how='left')

In [5]:
# Drop unnecessary columns
# Remove columns that are not needed for the analysis
merged_data = merged_data.drop(['state', 'county', 'County', 'incident_control', 'incident_dateonly_extinguished', 'incident_dateonly_created', 'incident_type'], axis=1)

In [6]:
# Rename columns for clarity
merged_data = merged_data.rename(columns={'TotalPopulation': 'county_population'})

In [7]:
# Merge temperature data
# Add temperature data to the merged_data based on incident_id to include weather information
final_data = pd.merge(merged_data, temperature_data[['incident_id', 'mean_temperature']], on='incident_id', how='left')
final_data.head()

Unnamed: 0,incident_name,incident_is_final,incident_date_last_update,incident_date_created,incident_administrative_unit,incident_county,incident_acres_burned,incident_containment,incident_cooperating_agencies,incident_longitude,incident_latitude,incident_id,incident_date_extinguished,is_active,county_population,mean_temperature
0,Bridge Fire,Y,2018-01-09 13:46:00+00:00,2017-10-31 11:22:00+00:00,Shasta-Trinity National Forest,Shasta,37.0,100.0,Shasta-Trinity National Forest,-122.309,40.774,2ca11d45-8139-4c16-8af0-880d99b21e82,2018-01-09 13:46:00+00:00,N,181852.0,9.712425
1,Pala Fire,Y,2020-09-16 14:07:35+00:00,2009-05-24 14:56:00+00:00,CAL FIRE San Diego Unit,San Diego,122.0,100.0,CAL FIRE San Diego Unit,1.0,1.0,8f61f461-552d-4538-b186-35ab030da416,2009-05-25 00:00:00+00:00,N,3289701.0,26.150545
2,River Fire,Y,2022-10-24 11:39:23+00:00,2013-02-24 08:16:00+00:00,CAL FIRE San Bernardino Unit,Inyo,407.0,100.0,"CAL FIRE San Bernardino Unit, Inyo County Sher...",-118.01651,36.602575,094719ba-a47b-4abb-9ec5-a506b2b9fd23,2013-02-28 20:00:00+00:00,N,18829.0,18.313614
3,Fawnskin Fire,Y,2013-04-22 09:00:00+00:00,2013-04-20 17:30:00+00:00,San Bernardino National Forest,San Bernardino,30.0,100.0,San Bernardino National Forest,-116.941311,34.288877,58f89ff8-bd3e-4355-b1c0-8fa05c747d3f,2013-04-22 09:00:00+00:00,N,2180563.0,11.841945
4,Gold Fire,Y,2013-05-01 07:00:00+00:00,2013-04-30 12:59:00+00:00,CAL FIRE Madera-Mariposa-Merced Unit,Madera,274.0,100.0,CAL FIRE Madera-Mariposa-Merced Unit,-119.635004,37.116295,357ffc13-bef9-48eb-810f-c5de851972eb,2013-05-01 07:00:00+00:00,N,157243.0,21.468794


In [8]:
# Create relevant features
# Convert date columns to datetime format and calculate the containment time in hours
final_data['incident_date_created'] = pd.to_datetime(final_data['incident_date_created'])
final_data['incident_date_extinguished'] = pd.to_datetime(final_data['incident_date_extinguished'])
final_data['containment_time'] = (final_data['incident_date_extinguished'] - final_data['incident_date_created']).dt.total_seconds() / 3600


In [9]:
# Extract additional date features from incident_date_created
# Create new features based on the creation date of the incident for better temporal analysis
final_data['day_of_year_created'] = final_data['incident_date_created'].dt.dayofyear
final_data['day_of_week_created'] = final_data['incident_date_created'].dt.dayofweek
final_data['month_created'] = final_data['incident_date_created'].dt.month
final_data['year_created'] = final_data['incident_date_created'].dt.year

In [10]:
# Extract additional date features from incident_date_extinguished
# Create new features based on the extinguished date of the incident for better temporal analysis
final_data['day_of_year_extinguished'] = final_data['incident_date_extinguished'].dt.dayofyear
final_data['day_of_week_extinguished'] = final_data['incident_date_extinguished'].dt.dayofweek
final_data['month_extinguished'] = final_data['incident_date_extinguished'].dt.month
final_data['year_extinguished'] = final_data['incident_date_extinguished'].dt.year


In [11]:
# Drop rows with missing target values
# Remove rows where containment_time is NaN
final_data = final_data.dropna(subset=['containment_time'])

# Drop rows with missing feature values
# Remove rows where any of the key feature columns have NaN values
final_data = final_data.dropna(subset=['incident_acres_burned', 'county_population', 'mean_temperature'])


In [12]:
# Classify containment time into bins
# Create a new column 'containment_time_class' by binning the containment_time into 'short', 'medium', and 'long'
bins = [0, 24, 72, float('inf')]
labels = ['short', 'medium', 'long']
final_data['containment_time_class'] = pd.cut(final_data['containment_time'], bins=bins, labels=labels, include_lowest=True)


In [13]:
# Remove any rows where the target is NaN
# Ensure that there are no NaN values in the containment_time_class column
final_data = final_data.dropna(subset=['containment_time_class'])

In [14]:
# Select features and target variable
features = [
    'incident_acres_burned', 'county_population', 'mean_temperature',
    'incident_latitude', 'incident_longitude', 'day_of_year_created',
    'day_of_week_created', 'month_created', 'year_created',
    'day_of_year_extinguished', 'day_of_week_extinguished',
    'month_extinguished', 'year_extinguished'
]
target = 'containment_time_class'

X = final_data[features]
y = final_data[target]

# Encode categorical target
y = y.astype('category').cat.codes


In [15]:
# Ensure correct unique mapping of labels
# Create a mapping from numerical codes to original class labels for interpretation
unique_classes = y.unique()
unique_classes_labels = {code: labels[code] for code in range(len(labels))}

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [17]:
# Adjust the hyperparameters for tuning
model = RandomForestClassifier(
    n_estimators=1000,           # Number of trees in the forest
    max_depth=50,               # Maximum depth of the tree
    min_samples_split=10,        # Minimum number of samples required to split an internal node
    min_samples_leaf=5,         # Minimum number of samples required to be at a leaf node
    max_features='log2',        # Number of features to consider when looking for the best split
    random_state=42
)
model.fit(X_train, y_train)

In [18]:
# Make predictions
y_pred = model.predict(X_test)


In [19]:
# Evaluate the model
print(classification_report(y_test, y_pred, target_names=labels))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

       short       0.62      0.57      0.59        84
      medium       0.48      0.44      0.46        88
        long       0.85      0.89      0.87       304

    accuracy                           0.75       476
   macro avg       0.65      0.64      0.64       476
weighted avg       0.74      0.75      0.75       476

[[ 48  21  15]
 [ 18  39  31]
 [ 12  21 271]]


In [20]:
# Use cross-validation for a more reliable performance estimate
# Perform cross-validation and print the average accuracy and standard deviation
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f'Cross-validated accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}')

Cross-validated accuracy: 0.77 ± 0.01


In [21]:
# Save the model
joblib_file = "cawildfire_random_forest_model.pkl"
joblib.dump(model, joblib_file)

['cawildfire_random_forest_model.pkl']

In [22]:
# Save the final_data DataFrame to a CSV file
final_data.to_csv('cawildfire_percent75_data.csv', index=False)

In [23]:
final_data.head()

Unnamed: 0,incident_name,incident_is_final,incident_date_last_update,incident_date_created,incident_administrative_unit,incident_county,incident_acres_burned,incident_containment,incident_cooperating_agencies,incident_longitude,...,containment_time,day_of_year_created,day_of_week_created,month_created,year_created,day_of_year_extinguished,day_of_week_extinguished,month_extinguished,year_extinguished,containment_time_class
0,Bridge Fire,Y,2018-01-09 13:46:00+00:00,2017-10-31 11:22:00+00:00,Shasta-Trinity National Forest,Shasta,37.0,100.0,Shasta-Trinity National Forest,-122.309,...,1682.4,304,1,10,2017,9.0,1.0,1.0,2018.0,long
1,Pala Fire,Y,2020-09-16 14:07:35+00:00,2009-05-24 14:56:00+00:00,CAL FIRE San Diego Unit,San Diego,122.0,100.0,CAL FIRE San Diego Unit,1.0,...,9.066667,144,6,5,2009,145.0,0.0,5.0,2009.0,short
2,River Fire,Y,2022-10-24 11:39:23+00:00,2013-02-24 08:16:00+00:00,CAL FIRE San Bernardino Unit,Inyo,407.0,100.0,"CAL FIRE San Bernardino Unit, Inyo County Sher...",-118.01651,...,107.733333,55,6,2,2013,59.0,3.0,2.0,2013.0,long
3,Fawnskin Fire,Y,2013-04-22 09:00:00+00:00,2013-04-20 17:30:00+00:00,San Bernardino National Forest,San Bernardino,30.0,100.0,San Bernardino National Forest,-116.941311,...,39.5,110,5,4,2013,112.0,0.0,4.0,2013.0,medium
4,Gold Fire,Y,2013-05-01 07:00:00+00:00,2013-04-30 12:59:00+00:00,CAL FIRE Madera-Mariposa-Merced Unit,Madera,274.0,100.0,CAL FIRE Madera-Mariposa-Merced Unit,-119.635004,...,18.016667,120,1,4,2013,121.0,2.0,5.0,2013.0,short
