# Data Cleaning

In [None]:
import json
import requests
import numpy as np
import pandas as pd
from datetime import datetime
import reverse_geocoder as rg

## Weather
##### Receive weather data for a state on the dates of the fires.
##### Sending each (lat, long) pair would take too long and cost a lot of money, so for temperature we take the highest temperature of that day, which is what's relevant for a fire.

In [None]:
# visualcrossing API for weather data
def historyWeatherAPI(location, date):
    """Fetch one day of historical weather from the Visual Crossing service.

    Parameters
    ----------
    location : str
        Value sent as the ``location`` query parameter.
    date : str
        Value sent as both ``startDateTime`` and ``endDateTime``, so the
        query covers a single day (``aggregateHours`` is "24").

    Returns
    -------
    list of str
        ``[MaxTemp, Humidity, WindSpeed, Precipitation]`` as raw text fields
        taken from the comma-split response body.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    IndexError
        If the response has fewer comma-separated fields than expected.
    """
    import os

    # Allow the credential to be supplied through the environment so it does
    # not have to live in the source; the embedded key stays as the fallback
    # so existing runs keep working.
    key = os.environ.get("VISUALCROSSING_API_KEY", "UQSRASVQA4PFTETMQEG3QMZX7")
    url = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/weatherdata/history?"

    # NOTE(review): the "contentType " key carries a trailing space, so the
    # service presumably ignores it and answers in its default format. The
    # positional parsing below relies on a comma-separated body - confirm the
    # response layout before correcting the key.
    querystring = {
        "location": location,
        "contentType ": "JSON",
        "key": key,
        "lang": "en",
        "aggregateHours": "24",
        "startDateTime": date,
        "endDateTime": date,
    }

    # A timeout keeps one stalled request from hanging the whole batch.
    response = requests.request("GET", url, params=querystring, timeout=30)
    # Fail loudly on HTTP errors instead of slicing an error message as if it
    # were weather data.
    response.raise_for_status()

    fields = response.text.split(",")
    # Fixed positions of the wanted values within the comma-split body.
    MaxTemp, Humidity, WindSpeed, Precipitation = fields[27], fields[30], fields[32], fields[36]
    return [MaxTemp, Humidity, WindSpeed, Precipitation]

In [None]:
def runWeather(df):
    """Query the weather API once per distinct (State, Date) pair in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``State`` and ``Date`` columns; ``Date`` values are
        '-'-separated strings with at least three parts.

    Returns
    -------
    pandas.DataFrame
        The distinct (State, Date) pairs plus ``MaxTemp``, ``Humidity``,
        ``WindSpeed`` and ``Precipitation`` columns. ``MaxTemp`` is numeric,
        converted from Fahrenheit to Celsius and rounded to one decimal
        (NaN where the API value is not a number); the other three columns
        hold the raw values returned by ``historyWeatherAPI``.
    """
    weather_cols = ['MaxTemp', 'Humidity', 'WindSpeed', 'Precipitation']
    df_weather = df[['State', 'Date']].drop_duplicates()

    # The API is called with the three '-'-separated date parts in reverse
    # order (third-second-first).
    # NOTE(review): confirm this is the date layout the service expects.
    dates = []
    for d in df_weather["Date"]:
        parts = d.split('-')
        dates.append(parts[2] + '-' + parts[1] + '-' + parts[0])
    locations = [state + ',' + "USA" for state in df_weather["State"]]

    res = [historyWeatherAPI(l, d) for l, d in zip(locations, dates)]

    # Build the result columns on the same index so they line up row by row;
    # this also works when there are no rows at all.
    fetched = pd.DataFrame(res, columns=weather_cols, index=df_weather.index)
    df_weather = pd.concat([df_weather, fetched], axis=1)

    # The API hands back text, so convert before doing arithmetic.
    fahrenheit = pd.to_numeric(df_weather["MaxTemp"], errors="coerce").astype(float)
    df_weather["MaxTemp"] = ((fahrenheit - 32) * (5.0 / 9.0)).round(1)  # Fahrenheit to Celsius

    return df_weather

In [None]:
def insertWeatherInfo(df, df_weather):
    """Attach weather columns to ``df`` by matching on (State, Date).

    Parameters
    ----------
    df : pandas.DataFrame
        Fire records with ``State`` and ``Date`` columns. Modified in place.
    df_weather : pandas.DataFrame
        One row per (State, Date) with ``MaxTemp``, ``Humidity``,
        ``WindSpeed`` and ``Precipitation`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Each weather column is created from the
        lookup when absent; when it already exists, only its missing values
        are filled. Rows without a match in ``df_weather`` stay NaN.
    """
    weather_cols = ["MaxTemp", "Humidity", "WindSpeed", "Precipitation"]

    # "State,Date" join keys, kept in local Series rather than in a
    # temporary column on df.
    row_keys = df["State"].astype(str) + ',' + df["Date"].astype(str)
    weather_keys = df_weather["State"].astype(str) + ',' + df_weather["Date"].astype(str)

    for col in weather_cols:
        lookup = dict(zip(weather_keys, df_weather[col]))
        matched = row_keys.map(lookup)
        if col in df.columns:
            # Keep values that are already present; only fill the gaps.
            df[col] = df[col].fillna(matched)
        else:
            df[col] = matched

    return df

## Row Cleaning

In [None]:
def removeCoordinatesOutOfBounds(df, location):
    """Drop rows whose coordinates are invalid or fall outside ``location``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``Lat`` and ``Long`` columns.
    location : str
        Country code compared against the ``'cc'`` field of each
        reverse-geocoder result (e.g. "US").

    Returns
    -------
    pandas.DataFrame
        Rows that (1) are not one of the listed false-positive points,
        (2) have Lat in [-90, 90] and Long in [-180, 180], and (3) are
        reverse-geocoded to ``location``. The index is reset before
        geocoding, so the result keeps those positional labels with gaps
        where rows were rejected.
    """
    # Exact (Lat, Long) points that are always discarded.
    # NOTE(review): presumably points the geocoder assigns to the wrong
    # country - confirm with the data owner.
    falsePositives = {(42.86744, -86.81662), (41.275833, -70.055833), (35.71333, -75.4865), (35.85408, -75.57812),
                      (28.59278, -80.40258), (25.31667, -82.53156), (29.07127, -83.39877), (30, -88.97737),
                      (32.4944, -121.4582), (53, -148), (33.9897, -119.7199), (40.00139, -124.6116),
                      (41.703833, -124.711667), (41.99665, -124.661)}

    lat, lon = df['Lat'], df['Long']

    # A row is a false positive only when BOTH coordinates match a listed
    # point; matching just the latitude or just the longitude must not
    # remove it.
    known_bad = np.zeros(len(df), dtype=bool)
    for bad_lat, bad_lon in falsePositives:
        known_bad |= ((lat == bad_lat) & (lon == bad_lon)).to_numpy()

    in_range = (lat.between(-90, 90) & lon.between(-180, 180)).to_numpy()
    df = df[in_range & ~known_bad].reset_index(drop=True)

    # Nothing left to geocode; the geocoder cannot take an empty input.
    if df.empty:
        return df

    coordinates = tuple(zip(df.Lat, df.Long))
    results = rg.search(coordinates)
    inside = [result['cc'] == location for result in results]

    return df[inside]

In [None]:
def dropRows(df):
    """Remove rows that are incomplete, too small, or outside the US.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Acres``, ``Lat`` and ``Long`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with rows containing any missing value removed, rows
        with ``Acres`` below 0.1 removed, and the remainder passed through
        ``removeCoordinatesOutOfBounds(df, "US")``. The input frame is left
        untouched.
    """
    # Non-in-place dropna: the incoming frame is often a filtered slice, and
    # mutating it in place would only half-apply the cleaning to the caller's
    # object.
    df = df.dropna(how="any")
    df = df[df.Acres >= 0.1]
    return removeCoordinatesOutOfBounds(df, "US")

## Column Cleaning

In [None]:
def dropNoLongerRelevantColumns(df):
    """Remove the helper columns that are not part of the cleaned output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object without the ``Date``, ``FireOutDateTime`` and
        ``ID`` columns. Columns that are already absent are skipped.
    """
    # errors="ignore": not every pipeline path leaves all three columns in
    # place (e.g. "ID" may never have been created), and a missing one should
    # not abort the run.
    df.drop(columns=['Date', 'FireOutDateTime', 'ID'], errors="ignore", inplace=True)
    return df

In [None]:
def combineFullAndRecent(df_full, df_recent):
    """Stack the recent records on top of the full data set.

    Parameters
    ----------
    df_full : pandas.DataFrame
        The full historical data.
    df_recent : pandas.DataFrame
        Recently gathered data; passed through ``convertDateTime`` first.

    Returns
    -------
    pandas.DataFrame
        Rows of the converted ``df_recent`` followed by the rows of
        ``df_full``, with a fresh 0..n-1 index.
    """
    df_recent = convertDateTime(df_recent)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed;
    # ignore_index=True gives the same result as reset_index(drop=True).
    return pd.concat([df_recent, df_full], ignore_index=True)

In [None]:
def durationDays(start, end):
    """Return the time between two ISO-format timestamps in fractional days.

    Both arguments must be strings accepted by ``datetime.fromisoformat``;
    if either one is not a string, NaN is returned instead. The result is
    the whole-day part of the difference plus its seconds part expressed as
    a fraction of a day.
    """
    # Guard clause: anything that is not a pair of strings has no duration.
    if not (isinstance(start, str) and isinstance(end, str)):
        return np.nan

    began = datetime.fromisoformat(start)
    finished = datetime.fromisoformat(end)
    elapsed = finished - began
    return elapsed.days + (elapsed.seconds/60/60/24)

In [None]:
def changeAbbreviations(df):
    """Replace two-letter state codes in the ``State`` column with full names.

    The replacement is applied in place and the same frame is returned.
    Values in ``State`` that are not one of the 50 listed codes are left
    unchanged, as are all other columns.
    """
    state_names = {
        "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
        "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
        "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
        "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas",
        "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland",
        "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi",
        "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada",
        "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York",
        "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
        "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina",
        "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah",
        "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia",
        "WI": "Wisconsin", "WY": "Wyoming",
    }
    # Nested mapping limits the replacement to the State column only.
    per_column = {"State": state_names}
    df.replace(per_column, inplace=True)

    return df

In [None]:
def fixColumns(df):
    """Rename the raw columns and derive the State, Date and Duration fields.

    Steps, in order:
      * rename the coordinate, state and acreage columns (in place on ``df``);
      * remove "US-" from ``State``, discard the listed non-state codes and
        expand the remaining abbreviations via ``changeAbbreviations``;
      * take ``Date`` as the part of ``FireDiscoveryDateTime`` before the
        first space and keep rows dated "2014-01-01" or later;
      * compute ``Duration`` with ``durationDays`` and keep rows whose
        duration is at least 0.01 days.

    Returns the filtered frame.
    """
    # Columns
    df.rename(
        columns={
            'InitialLatitude': 'Lat',
            'InitialLongitude': 'Long',
            'POOState': 'State',
            'DailyAcres': 'Acres',
        },
        inplace=True,
    )

    # State
    df["State"] = df["State"].str.replace("US-", "")
    unwanted_codes = ["PR", "GU", 'MX-CA', 'MX-SO', 'MX-BN', 'CA-BC', 'MX-SON', 'MX-BCN']
    df = df[~df.State.isin(unwanted_codes)]
    df = changeAbbreviations(df)

    # Date
    date_and_time = df.FireDiscoveryDateTime.str.split(" ", expand=True)
    df["Date"] = date_and_time[0]
    df = df[df.Date >= "2014-01-01"]

    # Duration
    df["Duration"] = list(map(durationDays, df['FireDiscoveryDateTime'], df['FireOutDateTime']))
    long_enough = df.Duration >= 0.01  # roughly 15 minutes
    df = df[long_enough]

    return df

In [None]:
def getRelevantColumns(df_original):
    """Return the columns used by the cleaning pipeline, one row per fire.

    Rows are de-duplicated on ``UniqueFireIdentifier`` (first occurrence
    kept) and only the coordinate, state, discovery/out timestamp, cause and
    acreage columns are retained. ``df_original`` is not modified.
    """
    wanted = [
        'InitialLatitude',
        'InitialLongitude',
        'POOState',
        'FireDiscoveryDateTime',
        'FireCause',
        'FireOutDateTime',
        'DailyAcres',
    ]
    unique_fires = df_original.drop_duplicates(subset='UniqueFireIdentifier').copy()
    return unique_fires[wanted]

## Program

In [None]:
def runProgram(recent=False, weatherAPI=False):
    """Run the full cleaning pipeline and return the cleaned frame.

    Parameters
    ----------
    recent : bool, default False
        When true, also read "Recent_Wildland_Fires.csv" and combine it with
        the full data set before cleaning.
    weatherAPI : bool, default False
        When true, query the weather API for every distinct (State, Date)
        and merge the results into the frame.

    Returns
    -------
    pandas.DataFrame
        The data from "Full_Wildland_Fires.csv" (plus the recent file when
        requested) after column selection, column fixes, row filtering,
        optional weather enrichment and removal of helper columns.
    """
    df = pd.read_csv("Full_Wildland_Fires.csv")

    # Connect recent data gathered from crawling to current df
    if recent:
        df_recent = pd.read_csv("Recent_Wildland_Fires.csv")
        # The full data set is the frame just loaded into `df`.
        df = combineFullAndRecent(df, df_recent)

    df = getRelevantColumns(df)
    df = fixColumns(df)
    df = dropRows(df)

    # Run the weather API to gather historic weather info
    if weatherAPI:
        df_weather = runWeather(df)
        df = insertWeatherInfo(df, df_weather)

    df = dropNoLongerRelevantColumns(df)

    return df

In [None]:
# Run the cleaning pipeline with its default options (recent=False,
# weatherAPI=False) and write the result to CSV without the index column.
df = runProgram()
df.to_csv("Cleaned_Wildland_Fires.csv", index=False)