In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats

# Read Data and Create Dataframe

'df' consists of combined wildfire and weather data -- each row has weather data for each day, and NaN values for the rest of the rows if fire event is 0 (wildfire did not occur on that day). Of the 309,512K rows in the dataset, there were 23,472 wildfire events

In [3]:
df = pd.read_csv('Wildfire_Weather_Merged_new.csv', low_memory=False)
df.head()

Unnamed: 0,Serial,FireCategory,FireYear,Area,DistrictName,UnitName,FullFireNumber,FireName,Size_class,EstTotalAcres,...,Industrial_Restriction,Ign_DateTime,ReportDateTime,Discover_DateTime,Control_DateTime,CreationDate,ModifiedDate,DistrictCode,UnitCode,DistFireNumber
0,58256,STAT,2000,EOA,Central Oregon,John Day,00-952011-01,Slick Ear #2,B,0.75,...,Does Not Apply - Eastern OR,07/18/2000 07:00:00 PM,07/19/2000 01:20:00 PM,07/19/2000 01:15:00 PM,07/20/2000 12:50:00 AM,07/20/2000 09:13:00 AM,11/14/2000 09:16:00 AM,95,952,11
1,59312,STAT,2000,EOA,Northeast Oregon,La Grande,00-971024-01,Woodley,C,80.0,...,Does Not Apply - Eastern OR,08/24/2000 05:30:00 AM,08/24/2000 01:07:00 PM,08/24/2000 01:07:00 PM,09/01/2000 21:30,08/29/2000 03:59:00 PM,12/21/2000 04:22:00 PM,97,971,24
2,61657,STAT,2001,SOA,Southwest Oregon,Grants Pass,01-712133-02,QUEENS BRANCH,A,0.1,...,Lvl 3 Restricted Shutdown,08/10/2001 17:40,08/10/2001 17:47,08/10/2001 17:45,08/10/2001 18:30,08/10/2001 18:42,08/17/2001 11:45:00 AM,71,712,133
3,63735,STAT,2002,NOA,West Oregon,Philomath,02-551001-03,WREN,A,0.01,...,Lvl 1 Fire Season Only,07/06/2002 13:01,07/06/2002 13:04,07/06/2002 13:02,07/06/2002 13:07,07/07/2002 09:16,07/28/2002 10:08:00 AM,55,551,1
4,68019,STAT,2003,NOA,West Oregon,Dallas,03-552013-04,Ritner Creek,A,0.01,...,Lvl 3 Restricted Shutdown,08/22/2003 04:00:00 AM,08/22/2003 05:00:00 AM,08/22/2003 05:00:00 AM,08/22/2003 09:30:00 AM,08/22/2003 04:12:00 PM,09/04/2003 09:39,55,552,13


In [4]:
df.shape

(23490, 38)

# Clean and Prepare Data


In [5]:
df.duplicated().sum()

0

In [6]:
cols_to_convert = ['ReportDateTime', 'Control_DateTime']
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_datetime)

In [7]:
df.dtypes

Serial                             int64
FireCategory                      object
FireYear                           int64
Area                              object
DistrictName                      object
UnitName                          object
FullFireNumber                    object
FireName                          object
Size_class                        object
EstTotalAcres                    float64
Protected_Acres                  float64
HumanOrLightning                  object
CauseBy                           object
GeneralCause                      object
SpecificCause                     object
Cause_Comments                    object
Lat_DD                           float64
Long_DD                          float64
LatLongDD                         object
FO_LandOwnType                    object
Twn                               object
Rng                               object
Sec                              float64
Subdiv                            object
LandmarkLocation

In [8]:
df.head()

Unnamed: 0,Serial,FireCategory,FireYear,Area,DistrictName,UnitName,FullFireNumber,FireName,Size_class,EstTotalAcres,...,Industrial_Restriction,Ign_DateTime,ReportDateTime,Discover_DateTime,Control_DateTime,CreationDate,ModifiedDate,DistrictCode,UnitCode,DistFireNumber
0,58256,STAT,2000,EOA,Central Oregon,John Day,00-952011-01,Slick Ear #2,B,0.75,...,Does Not Apply - Eastern OR,07/18/2000 07:00:00 PM,2000-07-19 13:20:00,07/19/2000 01:15:00 PM,2000-07-20 00:50:00,07/20/2000 09:13:00 AM,11/14/2000 09:16:00 AM,95,952,11
1,59312,STAT,2000,EOA,Northeast Oregon,La Grande,00-971024-01,Woodley,C,80.0,...,Does Not Apply - Eastern OR,08/24/2000 05:30:00 AM,2000-08-24 13:07:00,08/24/2000 01:07:00 PM,2000-09-01 21:30:00,08/29/2000 03:59:00 PM,12/21/2000 04:22:00 PM,97,971,24
2,61657,STAT,2001,SOA,Southwest Oregon,Grants Pass,01-712133-02,QUEENS BRANCH,A,0.1,...,Lvl 3 Restricted Shutdown,08/10/2001 17:40,2001-08-10 17:47:00,08/10/2001 17:45,2001-08-10 18:30:00,08/10/2001 18:42,08/17/2001 11:45:00 AM,71,712,133
3,63735,STAT,2002,NOA,West Oregon,Philomath,02-551001-03,WREN,A,0.01,...,Lvl 1 Fire Season Only,07/06/2002 13:01,2002-07-06 13:04:00,07/06/2002 13:02,2002-07-06 13:07:00,07/07/2002 09:16,07/28/2002 10:08:00 AM,55,551,1
4,68019,STAT,2003,NOA,West Oregon,Dallas,03-552013-04,Ritner Creek,A,0.01,...,Lvl 3 Restricted Shutdown,08/22/2003 04:00:00 AM,2003-08-22 05:00:00,08/22/2003 05:00:00 AM,2003-08-22 09:30:00,08/22/2003 04:12:00 PM,09/04/2003 09:39,55,552,13


In [12]:
# Calculate and add a new column for Fire Duration (hour)
df['FireDuration_hrs'] = df['EstTotalAcres']
df['FireDuration_hrs'] = (df['Control_DateTime'] - df['ReportDateTime'])

In [14]:

df.dtypes

Serial                             int64
FireCategory                      object
FireYear                           int64
Area                              object
DistrictName                      object
UnitName                          object
FullFireNumber                    object
FireName                          object
Size_class                        object
EstTotalAcres                    float64
Protected_Acres                  float64
HumanOrLightning                  object
CauseBy                           object
GeneralCause                      object
SpecificCause                     object
Cause_Comments                    object
Lat_DD                           float64
Long_DD                          float64
LatLongDD                         object
FO_LandOwnType                    object
Twn                               object
Rng                               object
Sec                              float64
Subdiv                            object
LandmarkLocation

In [16]:
df.isnull().sum().sort_values(ascending=False)

Cause_Comments            13684
LandmarkLocation           1282
SpecificCause               224
CauseBy                     128
Control_DateTime             95
Discover_DateTime            94
Ign_DateTime                 94
Industrial_Restriction       89
RegUseRestriction            89
EstTotalAcres                79
RegUseZone                   50
Subdiv                       31
Rng                          15
FireDuration_hrs             12
Twn                          12
Sec                          12
Lat_DD                       10
Long_DD                      10
LatLongDD                    10
CreationDate                  3
DistrictCode                  0
DistFireNumber                0
ModifiedDate                  0
ReportDateTime                0
UnitCode                      0
Serial                        0
County                        0
FireCategory                  0
GeneralCause                  0
HumanOrLightning              0
Protected_Acres               0
Size_cla

There are over 13K null values in the Cause_Comments column, so we will not use this in our analysis. Our main focus for reviewing causality will be looking at 'GeneralCause' as there are 0 null values present. As for fire date, we will focus on 'ReportDateTime' since there are no values missing. This is the date that the fire was first reported.

In [22]:
# Accurate location is important for this analysis, so we will drop the rows 13684 or more with NA values from lat/long columns and cause comments
df = df.dropna(subset=['Lat_DD', 'Long_DD', 'LatLongDD', 'Cause_Comments'])

In [23]:
count_removed = len(df[(df['EstTotalAcres'].isna()) & (df['Protected_Acres'] == 0.00)])
print("Count of rows to be removed:", count_removed)

# Here we will remove (21) rows with NaN values for 'EstTotalAcres' And '0.00' protected acres burned. 
#These fire events do not contain acres burned data so they will not be usefull in this analysis since we do not know the size

df = df.loc[~((df['EstTotalAcres'].isna()) & (df['Protected_Acres'] == 0.00))]

Count of rows to be removed: 0
