In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import reverse_geocoder as rg

Helper Functions

In [2]:
def removeCoordinatesOutOfBounds(df, location):
    """Drop rows whose coordinates do not reverse-geocode to ``location``.

    Two filters are applied, in this order:

    1. Rows whose exact (Lat, Long) pair appears in the hard-coded
       ``falsePositives`` set are removed (presumably points the geocoder
       misattributes to ``location`` -- verify against the data source).
    2. The remaining coordinates are passed to ``rg.search`` and every row
       whose result has a ``'cc'`` value different from ``location`` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Lat`` and ``Long`` columns. The caller's frame is not
        modified; a new frame is returned.
    location : str
        Value compared against the ``'cc'`` field of each geocoder result
        (the caller in this notebook passes ``"US"``).

    Returns
    -------
    pandas.DataFrame
        The surviving rows. The index is renumbered from 0 after step 1; rows
        removed in step 2 leave gaps in that numbering.
    """
    falsePositives = {(42.86744, -86.81662), (41.275833, -70.055833), (35.71333, -75.4865), (35.85408, -75.57812),
                      (28.59278, -80.40258), (25.31667, -82.53156), (29.07127, -83.39877), (30, -88.97737),
                      (32.4944, -121.4582), (53, -148), (33.9897, -119.7199), (40.00139, -124.6116),
                      (41.703833, -124.711667), (41.99665, -124.661)}

    # A row is a false positive only when BOTH of its coordinates match the same
    # listed pair. Keeping rows with `Lat != a AND Long != b` would also discard
    # every row that merely shares one coordinate with a listed pair (for
    # example any row at Lat == 30), so build an explicit "both match" mask.
    isFalsePositive = np.zeros(len(df), dtype=bool)
    for lat, lon in falsePositives:
        isFalsePositive |= ((df['Lat'] == lat) & (df['Long'] == lon)).to_numpy()

    # reset_index makes row labels equal to row positions, which is what lets
    # the positions returned by enumerate(results) below be used as drop labels.
    df = df[~isFalsePositive].reset_index(drop=True)

    # Nothing left to look up: skip the geocoder instead of passing it an empty query.
    if len(df) == 0:
        return df

    # Built in one pass; appending to a tuple inside a loop is quadratic.
    coordinates = tuple(zip(df['Lat'], df['Long']))

    results = rg.search(coordinates)
    indexes = [idx for idx, result in enumerate(results) if result['cc'] != location]

    return df.drop(indexes, axis=0)

In [3]:
def durationDays(start, end):
    """Return the time between two ISO-format timestamps as fractional days.

    Parameters
    ----------
    start, end : str
        Timestamps accepted by ``datetime.fromisoformat``.

    Returns
    -------
    float
        ``end - start`` in days: the whole-day part plus the leftover seconds
        converted to a fraction of a day. Sub-second precision is not included.
    """
    began, finished = datetime.fromisoformat(start), datetime.fromisoformat(end)
    elapsed = finished - began
    # timedelta stores whole days and leftover seconds separately.
    partialDay = elapsed.seconds / 60 / 60 / 24
    return elapsed.days + partialDay

Clean Columns

In [4]:
def getRelevantColumns(df_original):
    """Return one row per fire, restricted to the columns used downstream.

    Rows are de-duplicated on ``UniqueFireIdentifier`` (the first occurrence of
    each identifier is kept, per the pandas default), then only the columns
    listed below are selected. ``df_original`` itself is left untouched.
    """
    keptColumns = [
        'InitialLatitude',
        'InitialLongitude',
        'POOState',
        'FireCause',
        'InitialResponseAcres',
        'IsFSAssisted',
        'FireDiscoveryDateTime',
        'FireOutDateTime',
    ]
    return df_original.drop_duplicates(subset='UniqueFireIdentifier').copy()[keptColumns]

In [5]:
def fixColumns(df):
    """Normalise column names and string formats; mutates ``df`` and returns it.

    * Renames ``InitialLatitude`` / ``InitialLongitude`` / ``POOState`` to
      ``Lat`` / ``Long`` / ``State``.
    * Removes the text ``US-`` from ``State`` values.
    * Relabels the fire cause ``Undetermined`` as ``Unknown``.
    * In both date-time columns, replaces ``/`` with ``-`` and keeps only the
      text before the first ``+``.
    """
    shortNames = {"InitialLatitude":"Lat", "InitialLongitude":"Long", "POOState":"State"}
    df.rename(columns=shortNames, inplace=True)

    df["State"] = df["State"].str.replace("US-", "")

    isUndetermined = df.FireCause == "Undetermined"
    df.loc[isUndetermined, "FireCause"] = "Unknown"

    for stamp in ("FireDiscoveryDateTime", "FireOutDateTime"):
        # Swap the date separators first, then cut everything from '+' onward.
        df[stamp] = df[stamp].str.replace('/','-').str.split('+', expand = True)[0]

    return df

In [6]:
def addColumns(df):
    """Add derived columns to ``df`` in place and return it.

    * ``FireDurationDays`` -- ``durationDays`` applied to each row's
      ``FireDiscoveryDateTime`` / ``FireOutDateTime`` pair.
    * ``FireCauseNum`` -- 0 where ``FireCause`` is ``"Human"``, 1 where it is
      ``"Natural"``; rows with any other cause are not assigned a value here.
    """
    # Duration of the fire in days, from discovery until the fire was out.
    discovered = df['FireDiscoveryDateTime']
    extinguished = df['FireOutDateTime']
    df["FireDurationDays"] = [durationDays(began, ended) for began, ended in zip(discovered, extinguished)]

    # Numeric encoding of the cause, to help with EDA and ML.
    for cause, code in (("Human", 0), ("Natural", 1)):
        df.loc[df.FireCause == cause, "FireCauseNum"] = code

    return df

Clean Rows

In [7]:
def dropRows(df):
    """Remove unusable rows and return the filtered frame.

    Rows containing any missing value are dropped from ``df`` in place, then
    the result is passed through ``removeCoordinatesOutOfBounds`` with the
    location code ``"US"``.
    """
    df.dropna(inplace=True, how="any")
    return removeCoordinatesOutOfBounds(df, "US")

Program

In [12]:
def runProgram(csvPath="Full_Wildland_Fires.csv"):
    """Load the wildfire CSV and run the whole cleaning pipeline.

    Parameters
    ----------
    csvPath : str or path-like, optional
        Location of the raw CSV passed to ``pd.read_csv``. Defaults to
        ``"Full_Wildland_Fires.csv"``, so calling ``runProgram()`` with no
        argument reads the same file as before.

    Returns
    -------
    pandas.DataFrame
        The frame after, in order: ``getRelevantColumns``, ``fixColumns``,
        ``dropRows`` and ``addColumns``.
    """
    df_original = pd.read_csv(csvPath)
    df = getRelevantColumns(df_original)
    df = fixColumns(df)
    # Rows are dropped before the derived columns are added, so addColumns
    # never sees the missing values that dropRows removes.
    df = dropRows(df)
    df = addColumns(df)
    return df

In [9]:
# Run the full cleaning pipeline; the bare `df` on the last line makes the
# resulting DataFrame this cell's displayed output.
df = runProgram()
df

Loading formatted geocoded file...


Unnamed: 0,Lat,Long,State,FireCause,InitialResponseAcres,IsFSAssisted,FireDiscoveryDateTime,FireOutDateTime,FireDurationDays,FireCauseNum
0,48.07167,-114.83030,MT,Human,1.00,0.0,2017-10-17 20:20:24,2017-11-09 21:59:59,23.069155,0.0
1,44.65363,-111.56360,MT,Natural,0.10,0.0,2020-08-27 14:06:38,2020-08-27 20:52:59,0.282187,1.0
2,34.40333,-112.43940,AZ,Natural,0.50,0.0,2019-09-05 19:17:00,2019-09-09 17:00:00,3.904861,1.0
3,37.30310,-113.20540,UT,Natural,0.10,1.0,2016-06-13 12:29:00,2016-07-04 18:14:59,21.240266,1.0
4,38.27257,-119.25090,CA,Human,0.10,1.0,2016-09-02 22:49:00,2016-09-04 00:59:59,1.090961,0.0
...,...,...,...,...,...,...,...,...,...,...
68041,43.35600,-72.90700,VT,Human,30.00,0.0,2021-04-27 18:28:39,2021-05-03 15:00:00,5.855104,0.0
68042,43.39500,-72.83000,VT,Human,21.10,0.0,2021-04-19 17:19:54,2021-04-29 15:59:59,9.944502,0.0
68043,43.36300,-72.89900,VT,Human,23.00,0.0,2021-05-14 19:47:45,2021-05-23 14:35:00,8.782813,0.0
68044,43.40900,-72.92200,VT,Human,10.00,0.0,2021-04-27 18:30:00,2021-05-03 15:00:00,5.854167,0.0
