# Explore Data
This notebook serves to begin examining datasets, determine their data accessibility, and determine how best to read them into pandas.

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import xarray as xr
import pandas as pd

data_dir = "/pool0/home/steinadi/data/drought/DatabaseDrafts"

Populating the interactive namespace from numpy and matplotlib


## Config Reading Settings

In [9]:
# Contains Data, good to go
# -------------------------
# pd.read_excel(f"{data_dir}/DroughtDeclarations_WA_OR.xlsx", skiprows=np.arange(1,38), index_col=0, header=1, usecols="A:M")
# pd.read_excel(f"{data_dir}/DroughtPermits_WA_1994_to_2019.xlsx", skiprows=np.arange(1,7), index_col=0, header=1)
# pd.read_excel(f"{data_dir}/FireDangerRating_WA_2010_2020.xlsx", skiprows=np.arange(1,13),index_col=0,header=1)
# pd.read_excel(f"{data_dir}/FireDangerRatings_OR_ODF_2020.xlsx", skiprows=np.arange(1,15), index_col=0, header=1)
# pd.read_excel(f"{data_dir}/FireClosures_OR_ODF_2011-2020.xlsx", skiprows=np.arange(1,13), header=1, sheet_name="2011-2020") # object index
# pd.read_excel(f"{data_dir}/RedFlagWarnings_WA_OR.xlsx", skiprows=np.arange(1,32), header=1) # object index
# pd.read_excel(f"{data_dir}/WaterRightsTransfers_OR.xlsx", skiprows=np.arange(1,7), header=1, index_col=[0,1])
# pd.read_excel(f"{data_dir}/NationalPark_visitations.xlsx", skiprows=np.arange(1,18), header=1, sheet_name="Recreation Visits", index_col=[0,1,2], usecols='B:Q')
# pd.read_excel(f"{data_dir}/NationalPark_visitations.xlsx", skiprows=np.arange(1,20), header=1, sheet_name="Overnight", usecols='B:N', index_col=[0,1,2])
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="WA Overnight", skiprows=np.arange(1,5), usecols='B:Q', index_col=[0,1,2], header=1)
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="WA Total", skiprows=np.arange(1,5), usecols='B:Q', index_col=[0,1,2], header=1)
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="WA Day Use", skiprows=np.arange(1,5), usecols='B:Q', index_col=[0,1,2], header=1)
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="OR Day User Monthly", skiprows=np.arange(1,4), index_col=[0,1,2], header=1)
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="OR Overnight Monthly", skiprows=np.arange(1,4), index_col=[0,1,2], header=1)

# Contains Data, can be spliced in pandas
# ---------------------------------------
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="OR Annual Day Use", skiprows=np.arange(1,6), index_col=0, header=1) # some totals that I think can be cut out?
# pd.read_excel(f"{data_dir}/StatePark_Visitations.xlsx", sheet_name="OR Annual Overnight", skiprows=np.arange(1,7), index_col=2, header=1) # also some strange totals

# Contains Data, needs editing outside pandas
# -------------------------------------------
# pd.read_excel(f"{data_dir}/FireClosures_Fire Restrictions-USFS.xlsx", skiprows=np.arange(1,16), header=1) # needs cleaning up
# pd.read_excel(f"{data_dir}/FireClosures_OR_ODF_2011-2020.xlsx", sheet_name="2018 Days Per Level") # needs cleaning up and context
# pd.read_excel(f"{data_dir}/FireClosures_Restrictions-BLM.xlsx") # needs cleaning up
# pd.read_excel(f"{data_dir}/FireSeason_OR.xlsx", skiprows=np.arange(1,9), index_col=0, usecols="A,B:V", header=1, sheet_name="By District") # needs to be split into different datasets
# pd.read_excel(f"{data_dir}/FireSeason_OR.xlsx", sheet_name="Avg") # needs transposition and header names

# Empty
# -----
# pd.read_excel(f"{data_dir}/BurnRestrictions_OR.xlsx")
# pd.read_excel(f"{data_dir}/BurnRestrictions_WA.xlsx")
# pd.read_excel(f"{data_dir}/Database - FireDangerRestrictions.xlsx")

# copy a file from above to read it!
test = pd.read_excel(f"{data_dir}/WaterRightsTransfers_OR.xlsx", skiprows=np.arange(1,7), header=1, index_col=[0,1])
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Applications Rec'd,Applications Denied,Applications Approved,Acres Approved,Transfers Rec'd,Transfers Denied,Transfers Approved
Drought Year,County,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005.0,Baker,4.0,1.0,3.0,1081.33,,,
2005.0,Klamath,6.0,1.0,5.0,28933.68,,,
2005.0,Umatilla,2.0,1.0,1.0,67.4,,,
2005.0,Wasco,1.0,,1.0,120.8,,,
2010.0,Klamath,60.0,2.0,58.0,36096.66,1.0,,1.0
2012.0,Klamath,1.0,,1.0,145.2,,,
2013.0,Klamath,17.0,2.0,15.0,5711.61,,,
2013.0,Malheur,4.0,,4.0,1304.3,,,
2014.0,Harney,3.0,1.0,2.0,717.0,1.0,,1.0
2014.0,Klamath,46.0,6.0,40.0,18760.52,,,


## List Counties
(only currently relevant for one of the fire datasets)

In [40]:
# https://www.officialusa.com/stateguides/washington/county/
wa_counties = ["Adams", "Asotin", "Benton", "Chelan", "Clallam", "Clark", "Columbia", "Cowlitz", "Douglas", "Ferry", "Franklin", "Garfield", "Grant", "Grays Harbor", "Island", "Jefferson", "King", "Kitsap", "Kittitas", "Klickitat", "Lewis", "Lincoln", "Mason", "Okanogan", "Pacific", "Pend Oreille", "Pierce", "San Juan", "Skagit", "Skamania", "Snohomish", "Spokane", "Stevens", "Thurston", "Wahkiakum", "Walla Walla", "Whatcom", "Whitman", "Yakima"]

or_counties = ["Baker", "Benton", "Clackamas", "Clatsop", "Columbia", "Coos", "Crook", "Curry", "Deschutes", "Douglas", "Gilliam", "Grant", "Harney", "Hood River", "Jackson", "Jefferson", "Josephine", "Klamath", "Lake", "Lane", "Lincoln", "Linn", "Malheur", "Marion", "Morrow", "Multnomah", "Polk", "Sherman", "Tillamook", "Umatilla", "Union", "Wallowa", "Wasco", "Washington", "Wheeler", "Yamhill"]

Check what counties are in the dataset and what we are missing in the above list. 

In [52]:
wa_rep = []
or_rep = []
not_found = []

wa_counties_lower = [wa_county.lower() for wa_county in wa_counties]
or_counties_lower = [or_county.lower() for or_county in or_counties]

for col in test.columns:
    if "County" in col or "COUNTY"in col:
        col = col[:-7]
    col = col.lower()
    if col in wa_counties_lower:
        wa_rep.append(col)
    elif col in or_counties_lower:
        or_rep.append(col)
    else:
        not_found.append(col)

print(len(wa_rep))
print(len(or_rep))
print(len(not_found))


57
0
23


-----------------------------------------------------
## Other counties found: 


**FireDangerRating_WA_2010_2020.xlsx**

['valley fdra', 'chelan fdra', 'blue mountain grass and brush fdra', 'foothills fdra', 'highlands fdra', 'kaniksu fdra', 'lower basin fdra', 'methow fdra', 'lower yakima fdra', 'northern blue mountains fdra', 'upper yakima fdra', 'upper basin fdra', 'blue mountain grass and brush', 'northern blue mountains ', 'foothills ', 'highlands ', 'kaniksu ', 'methow ', 'upper basin ', 'chelan ', 'valley ', 'lower yakima ', 'upper yakima ']