In [1]:
import pandas as pd
import math

fire_locations = pd.read_csv('../data/Wildland_Fire_Incident_Locations.csv')

fire_locations.head()

Unnamed: 0,IncidentName,UniqueFireIdentifier,FireDiscoveryDateTime,InitialResponseAcres,CreatedOnDateTime_dt,GACC,ContainmentDateTime,ControlDateTime,FireOutDateTime,DiscoveryAcres,FinalAcres,IncidentSize,InitialLatitude,InitialLongitude,Latitude,Longitude
0,Grand Pager,2014-AKFAS-411093,1530/08/16,0.1,2014-05-21 04:09:45+00:00,AKCC,2014/05/12 05:17:21+00,2014/05/12 05:17:35+00,2014/05/13 01:53:46+00,0.1,0.1,0.1,64.80265,-147.744683,64.802702,-147.745026
1,Glenn Alps,2014-AKMSS-401077,2014/05/09 03:22:55+00,0.1,2014-05-21 04:10:05+00:00,AKCC,2014/05/09 07:09:00+00,2014/05/09 07:09:31+00,2014/05/10 20:27:38+00,0.1,0.7,0.7,61.102967,-149.662833,61.103002,-149.663023
2,Johnson Lake,2014-AKKKS-403067,2014/05/05 00:18:25+00,0.1,2014-05-21 04:10:06+00:00,AKCC,2014/05/05 00:28:35+00,2014/05/05 00:28:00+00,2014/05/05 00:33:26+00,0.1,0.1,0.1,60.295033,-151.262467,60.295001,-151.262022
3,Mile 9 Talkeetna Spur,2014-AKMSS-401137,2014/05/19 21:45:02+00,0.1,2014-05-21 04:10:07+00:00,AKCC,2014/05/19 22:02:00+00,2014/05/19 22:02:31+00,2014/05/25 20:02:25+00,0.1,0.1,0.1,62.422633,-150.081217,62.422602,-150.081024
4,Jim Lake,2014-AKMSS-401131,2014/05/19 00:01:00+00,0.1,2014-05-21 04:10:07+00:00,AKCC,2014/05/19 16:14:45+00,2014/05/19 16:14:49+00,2014/05/29 23:42:41+00,0.1,1.0,1.0,61.543767,-148.879433,61.543802,-148.879023


In [2]:
total_instances = fire_locations.shape[0]
print("Total instances: " + str(total_instances))
missing_values = fire_locations.isnull().sum()
print("Missing values for each attribute:")
for attribute, count in missing_values.items():
    print(f"{attribute}: {count} ({int(count/total_instances * 10000)/100}%)")
    # Note: I'm sure there's a cleaner way to concatenate to two decmial values, but this all I could come up with

Total instances: 207301
Missing values for each attribute:
IncidentName: 40 (0.01%)
UniqueFireIdentifier: 0 (0.0%)
FireDiscoveryDateTime: 0 (0.0%)
InitialResponseAcres: 133197 (64.25%)
CreatedOnDateTime_dt: 0 (0.0%)
GACC: 58 (0.02%)
ContainmentDateTime: 81565 (39.34%)
ControlDateTime: 92348 (44.54%)
FireOutDateTime: 81482 (39.3%)
DiscoveryAcres: 59460 (28.68%)
FinalAcres: 195128 (94.12%)
IncidentSize: 65217 (31.46%)
InitialLatitude: 59051 (28.48%)
InitialLongitude: 59051 (28.48%)
Latitude: 0 (0.0%)
Longitude: 0 (0.0%)


In [3]:
# remove instances with missing values for FireOutDateTime and IncidentSize because we need both for calculations
fire_locations.dropna(subset=["FireOutDateTime"], inplace=True)
fire_locations.dropna(subset=["IncidentSize"], inplace=True)

print("Total instances: " + str(fire_locations.shape[0]))

Total instances: 119225


In [4]:
# remove UniqueFireID 2014-IDNCF-000609 and 2014-AKFAS-411093 because they have dates in 1530
fire_locations.drop(fire_locations[fire_locations['UniqueFireIdentifier'] == '2014-AKFAS-411093'].index, inplace=True)
fire_locations.drop(fire_locations[fire_locations['UniqueFireIdentifier'] == '2014-IDNCF-000609'].index, inplace=True)

fire_locations['FireDiscoveryDateTime'] = pd.to_datetime(fire_locations['FireDiscoveryDateTime'], format='%Y/%m/%d %H:%M:%S+00')
fire_locations['FireOutDateTime'] = pd.to_datetime(fire_locations['FireOutDateTime'], format='%Y/%m/%d %H:%M:%S+00')

# Calculate the difference between 'FireOutDateTime' and 'FireDiscoveryDateTime'
fire_locations['TimeBurned'] = fire_locations['FireOutDateTime'] - fire_locations['FireDiscoveryDateTime']

# Displaying the first few rows to verify the new column
print(fire_locations[['FireDiscoveryDateTime', 'FireOutDateTime', 'TimeBurned']].head())

  FireDiscoveryDateTime     FireOutDateTime       TimeBurned
1   2014-05-09 03:22:55 2014-05-10 20:27:38  1 days 17:04:43
2   2014-05-05 00:18:25 2014-05-05 00:33:26  0 days 00:15:01
3   2014-05-19 21:45:02 2014-05-25 20:02:25  5 days 22:17:23
4   2014-05-19 00:01:00 2014-05-29 23:42:41 10 days 23:41:41
5   2014-05-17 01:37:14 2014-05-20 00:51:20  2 days 23:14:06


In [5]:
fire_locations['HoursBurned'] = fire_locations['TimeBurned'].dt.total_seconds() / (60*60)

# To get an idea of what we're working with: range and average of both metrics we'll be working with for supression result
summary = fire_locations[['HoursBurned', 'IncidentSize']].describe()

time_burned_range = summary.loc['max', 'HoursBurned'] - summary.loc['min', 'HoursBurned']
acres_burned_range = summary.loc['max', 'IncidentSize'] - summary.loc['min', 'IncidentSize']
average_time_burned = summary.loc['mean', 'HoursBurned']
average_acres_burned = summary.loc['mean', 'IncidentSize']

print("HoursBurned Range:", time_burned_range)
print("HoursBurned Average:", average_time_burned)
print("IncidentSize Range:", acres_burned_range)
print("IncidentSize Average:", average_acres_burned)


# Log option
log_time_range = math.log2(time_burned_range)
log_acres_range = math.log2(acres_burned_range)

print("log time: " + str(log_time_range) + " log acres: " + str(log_acres_range))

fire_locations["LNormalizedTime"] = fire_locations["HoursBurned"].apply(lambda x: math.log2(x+1)/log_time_range if x > 0 else 0)
fire_locations["LNormalizedAcreage"] = fire_locations["IncidentSize"].apply(lambda x: math.log2(x+1)/log_acres_range if x > 0 else 0)
fire_locations['LSupressionResult'] = (1 - (fire_locations['LNormalizedTime'] + fire_locations['LNormalizedAcreage'])/2) * 100

filtered_fire_locations = fire_locations[fire_locations['IncidentSize'] > 500]
print(filtered_fire_locations[['FireDiscoveryDateTime', 'HoursBurned', 'IncidentSize', 'LNormalizedTime', 'LNormalizedAcreage', 'LSupressionResult']].head())

# print(fire_locations[['FireDiscoveryDateTime', 'HoursBurned', 'IncidentSize', 'NormalizedTime', 'NormalizedAcreage']].head())

HoursBurned Range: 28090.043611111112
HoursBurned Average: 295.28396181478774
IncidentSize Range: 589368.0
IncidentSize Average: 435.8770563314125
log time: 14.777771243867264 log acres: 19.168809205250618
    FireDiscoveryDateTime  HoursBurned  IncidentSize  LNormalizedTime  \
11    2014-05-19 22:46:36  1104.806944        1906.0         0.684195   
17    2014-05-20 00:03:50  4866.938889      196610.0         0.828887   
132   2014-05-11 21:02:00  5274.466667        5484.0         0.836735   
143   2014-04-19 23:30:00  2089.500000       73622.0         0.746366   
145   2014-05-17 21:00:00  1420.000000        1482.0         0.708679   

     LNormalizedAcreage  LSupressionResult  
11             0.568480          37.366216  
17             0.917375          12.686929  
132            0.647994          25.763528  
143            0.843447          20.509343  
145            0.549554          37.088349  


In [8]:
# Exponent option
exp_time_range = (0.005 * time_burned_range) ** 2
exp_acres_range = (0.0003 * acres_burned_range) ** 2

print("exp time: " + str(exp_time_range) + " exp acres: " + str(exp_acres_range))

fire_locations["ENormalizedTime"] = fire_locations["HoursBurned"].apply(lambda x: ((0.005 * x) ** 2) / exp_time_range if x > 0 else 0)
fire_locations["ENormalizedAcreage"] = fire_locations["IncidentSize"].apply(lambda x: ((0.0003 * x) ** 2) / exp_acres_range if x > 0 else 0)
fire_locations['ESupressionResult'] = (fire_locations['ENormalizedTime'] + fire_locations['ENormalizedAcreage']/2) * 100

filtered_fire_locations = fire_locations[fire_locations['IncidentSize'] > 500]
print(filtered_fire_locations[['FireDiscoveryDateTime', 'HoursBurned', 'IncidentSize', 'ENormalizedTime', 'ENormalizedAcreage', 'ESupressionResult']].head())

exp time: 19726.263751853105 exp acres: 31261.917548159996
    FireDiscoveryDateTime  HoursBurned  IncidentSize  ENormalizedTime  \
11    2014-05-19 22:46:36  1104.806944        1906.0         0.001547   
17    2014-05-20 00:03:50  4866.938889      196610.0         0.030020   
132   2014-05-11 21:02:00  5274.466667        5484.0         0.035258   
143   2014-04-19 23:30:00  2089.500000       73622.0         0.005533   
145   2014-05-17 21:00:00  1420.000000        1482.0         0.002555   

     ENormalizedAcreage  ESupressionResult  
11             0.000010           0.155215  
17             0.111285           8.566243  
132            0.000087           3.530085  
143            0.015604           1.333536  
145            0.000006           0.255864  


In [7]:
# We need to normalize the bell curve? Find a way to visualize this distribution