In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import reverse_geocoder as rg

Helper Functions

In [None]:
def removeCoordinatesOutOfBounds(df, location):
    """Drop rows whose coordinates are blacklisted or fall outside a country.

    Two filters are applied in order:

    1. Rows whose (Lat, Long) pair exactly equals one of the hard-coded
       ``falsePositives`` pairs are removed.
    2. The remaining coordinates are reverse-geocoded with ``rg.search`` and
       rows whose result country code (``cc``) differs from ``location`` are
       removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Lat`` and ``Long`` columns. The caller's frame is not
        modified.
    location : str
        Country code to keep, compared against the ``cc`` field of each
        geocoder result (the pipeline passes ``"US"``).

    Returns
    -------
    pandas.DataFrame
        The surviving rows. The index is the 0..n-1 numbering assigned after
        step 1, so it has gaps wherever step 2 removed a row.
    """
    # NOTE(review): presumably points the geocoder attributes to the target
    # country even though they should be excluded -- confirm with the author.
    falsePositives = {(42.86744, -86.81662), (41.275833, -70.055833), (35.71333, -75.4865), (35.85408, -75.57812),
                      (28.59278, -80.40258), (25.31667, -82.53156), (29.07127, -83.39877), (30, -88.97737),
                      (32.4944, -121.4582), (53, -148), (33.9897, -119.7199), (40.00139, -124.6116),
                      (41.703833, -124.711667), (41.99665, -124.661)}

    # A row is blacklisted only when BOTH coordinates match the same pair.
    # (Keeping rows where "Lat != a AND Long != b" would also discard every
    # row that merely shares a latitude or a longitude with a listed point.)
    isFalsePositive = np.zeros(len(df), dtype=bool)
    for lat, long in falsePositives:
        isFalsePositive |= ((df['Lat'] == lat) & (df['Long'] == long)).to_numpy()

    # Renumber so row labels line up with positions in the geocoder results.
    df = df[~isFalsePositive].reset_index(drop=True)

    # Nothing left to geocode; avoid calling the geocoder with no coordinates.
    if len(df) == 0:
        return df

    coordinates = tuple(zip(df['Lat'], df['Long']))
    results = rg.search(coordinates)

    inLocation = np.array([result['cc'] == location for result in results], dtype=bool)
    return df[inLocation]

In [None]:
def durationDays(start, end):
    """Return the time from ``start`` to ``end`` as a fractional number of days.

    Both arguments are ISO-format strings accepted by
    ``datetime.fromisoformat``. Only the whole-day and whole-second parts of
    the difference are used; any microsecond remainder is ignored.
    """
    began = datetime.fromisoformat(start)
    finished = datetime.fromisoformat(end)
    elapsed = finished - began
    partialDay = elapsed.seconds / 60 / 60 / 24
    return elapsed.days + partialDay

In [None]:
def combineFullAndRecent(df_full, df_recent):
    """Stack the recent-fires frame on top of the full-history frame.

    ``df_recent`` is first passed through ``convertDateTime``; ``df_full`` is
    used as given. Recent rows come first in the result, followed by the
    full-history rows, and the result gets a fresh 0..n-1 index.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Full-history records.
    df_recent : pandas.DataFrame
        Recent records; passed to ``convertDateTime`` before stacking.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the converted recent rows and the full rows.
    """
    df_recent = convertDateTime(df_recent)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent, and ignore_index=True replaces the separate reset_index call.
    return pd.concat([df_recent, df_full], ignore_index=True)

Clean Columns

In [None]:
def getRelevantColumns(df_original):
    """Return one row per fire, restricted to the columns used downstream.

    Rows are de-duplicated on ``UniqueFireIdentifier`` (the first occurrence
    is kept), then only the columns listed below are retained.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Source frame; must contain ``UniqueFireIdentifier`` and every column
        listed in ``relevantColumns``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy, safe to modify in place.
    """
    relevantColumns = ['InitialLatitude', 'InitialLongitude', 'POOState', 'FireCause', 'InitialResponseAcres',
                       'IsFSAssisted', 'FireDiscoveryDateTime', 'FireOutDateTime']

    deduplicated = df_original.drop_duplicates(subset='UniqueFireIdentifier')
    # Copy AFTER selecting the columns: the selection itself yields a derived
    # frame, and assigning into it later can raise SettingWithCopyWarning.
    return deduplicated[relevantColumns].copy()

In [None]:
def fixColumns(df):
    """Normalise column names and a few categorical values, in place.

    * ``InitialLatitude`` / ``InitialLongitude`` / ``POOState`` are renamed to
      ``Lat`` / ``Long`` / ``State``.
    * Every occurrence of ``"US-"`` is removed from the ``State`` values.
    * ``FireCause`` values equal to ``"Undetermined"`` become ``"Unknown"``.

    The frame passed in is modified and the same object is returned.
    """
    shorterNames = {"InitialLatitude": "Lat", "InitialLongitude": "Long", "POOState": "State"}
    df.rename(columns=shorterNames, inplace=True)

    stateCodes = df["State"].str.replace("US-", "")
    df["State"] = stateCodes

    isUndetermined = df.FireCause == "Undetermined"
    df.loc[isUndetermined, "FireCause"] = "Unknown"
    return df

In [None]:
def addColumns(df):
    """Add the derived ``FireDurationDays`` and ``FireCauseNum`` columns in place.

    ``FireDurationDays`` is ``durationDays(discovery, out)`` for each row,
    i.e. the time from discovery until the fire was out, in days.
    ``FireCauseNum`` is a numeric code for ``FireCause`` intended for EDA and
    ML: Human -> 0, Natural -> 1, Unknown -> 2. Rows with any other cause are
    not assigned a code.

    The frame passed in is modified and the same object is returned.
    """
    discovered = df['FireDiscoveryDateTime']
    extinguished = df['FireOutDateTime']
    df["FireDurationDays"] = list(map(durationDays, discovered, extinguished))

    # Numeric encoding of the cause labels.
    causeCodes = {"Human": 0, "Natural": 1, "Unknown": 2}
    for cause, code in causeCodes.items():
        df.loc[df.FireCause == cause, "FireCauseNum"] = code
    return df

In [None]:
def convertDateTime(df):
    """Rewrite text date columns as ``YYYY-MM-DD HH:MM:SS`` strings, in place.

    Every column whose name contains ``"Date"`` and that holds text (object
    or pandas string dtype) is processed: ``/`` is replaced by ``-`` and each
    value is parsed with the format ``%m-%d-%Y, %I:%M %p`` (for example
    ``06-15-2020, 01:30 PM``) and re-emitted in 24-hour form. Non-string
    cells, such as missing values, become NaN. Other columns are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified and returned.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the date columns rewritten.

    Raises
    ------
    ValueError
        If a string cell does not match the expected input format.
    """
    def convert(s):
        # Non-string cells (e.g. missing values) have nothing to reformat.
        if not isinstance(s, str):
            return np.nan  # np.NaN was removed in NumPy 2.0
        parsed = datetime.strptime(s, '%m-%d-%Y, %I:%M %p')
        return parsed.strftime('%Y-%m-%d %H:%M:%S')

    # Guard on str so non-string column labels do not break the "in" test.
    dateTimes = [col for col in df if isinstance(col, str) and "Date" in col]

    for col in dateTimes:
        dtype = df[col].dtype
        # Accept the dedicated pandas string dtype as well as plain object
        # columns, so text columns are converted whichever dtype they have.
        if dtype == np.dtype("O") or isinstance(dtype, pd.StringDtype):
            # Literal replacement; "/" is not meant as a regular expression.
            normalized = df[col].str.replace('/', '-', regex=False)
            df[col] = [convert(x) for x in normalized]
    return df

Clean Rows

In [None]:
def dropRows(df):
    """Remove incomplete rows, then rows located outside the United States.

    Rows containing any missing value are dropped from ``df`` in place; the
    remaining rows are handed to ``removeCoordinatesOutOfBounds`` with the
    country code ``"US"`` and its result is returned.
    """
    countryCode = "US"
    df.dropna(axis=0, how="any", inplace=True)
    return removeCoordinatesOutOfBounds(df, countryCode)

Program

In [None]:
def runProgram():
    """Load both wildfire CSV exports and run the full cleaning pipeline.

    Reads ``Full_Wildland_Fires.csv`` and ``Recent_Wildland_Fires.csv`` from
    the working directory, merges them, and then applies, in order:
    ``getRelevantColumns``, ``fixColumns``, ``dropRows`` and ``addColumns``.

    Returns the cleaned DataFrame.
    """
    fullFires = pd.read_csv("Full_Wildland_Fires.csv")
    recentFires = pd.read_csv("Recent_Wildland_Fires.csv")

    fires = combineFullAndRecent(fullFires, recentFires)
    # Each stage takes the frame produced by the previous one.
    for stage in (getRelevantColumns, fixColumns, dropRows, addColumns):
        fires = stage(fires)
    return fires

Driver Code

In [None]:
# Build the cleaned dataset, then save it as CSV without the index column.
df = runProgram()
df.to_csv(path_or_buf="Cleaned_Wildland_Fires.csv", index=False)