# Working with WY Aggregated Data

Preprocessing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shp file, and converting to WKT for the ReportingUnit geometry.

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
# Let pandas display up to 999 columns so wide DataFrames are not truncated in the notebook.
pd.options.display.max_columns = 999

In [2]:
# Setting work directory, reading inputs, creating dataframe
# The raw-input folder defaults to the original author's local path. Set the
# WADE_WY_AGG_RAW_DIR environment variable to point at the folder on another
# machine instead of editing this cell.
workingDir = os.environ.get(
    "WADE_WY_AGG_RAW_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Wyoming/AggregatedAmounts/RawInputData",
)
os.chdir(workingDir)  # later cells read inputs and write outputs relative to this folder

# Water Use Data

In [3]:
# Input Dataframe: load the raw water-use table from the working directory.
fileInput = "WYAggData_input.csv"
df = pd.read_csv(fileInput)
print(df.shape[0])  # number of input rows
df.head(1)

57


Unnamed: 0,Water Use by Basin,UseType,Source,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Bear River Planning Basin,Agricultural Consumptive Use,Ground Water,1334.272368,1498.365835,1226.612777,1617.425105,1846.693415,2059.991249,1826.625609,1417.511879,1863.082731,2106.901111,1934.922563,2258.502278,1526.792191,1546.331897,2196.058986,1746.463647,2270.430057,2287.420314,2110.106133


In [4]:
# temp database #1: base elements
# The per-year amount columns ("2000" .. "2018") are removed so only the
# descriptive columns remain; reset_index() keeps the old row label in an
# 'index' column.
droplist = [str(year) for year in range(2000, 2019)]

dftempK = df.drop(columns=droplist).reset_index()
print(len(dftempK.index))
dftempK.head(1)

57


Unnamed: 0,index,Water Use by Basin,UseType,Source
0,0,Bear River Planning Basin,Agricultural Consumptive Use,Ground Water


In [5]:
# temp database #2: time series amounts
# Restructure the wide year columns into long form: one (Amount, ReportYear)
# row per input row and year. The input row label is left as the index
# (repeated once per year), which the next cell relies on for alignment.
dftempY = (
    df.drop(columns=['Water Use by Basin', 'UseType', 'Source'])
    .T
    .unstack()
    .reset_index(level=1, name='Amount')
    .rename(columns={'level_1': 'ReportYear'})
    [['Amount', 'ReportYear']]
)

print(len(dftempY.index))
dftempY.head(1)

1083


Unnamed: 0,Amount,ReportYear
0,1334.272368,2000


In [6]:
# Combine database #1 to database #2.
# Column assignment aligns on index labels. This assumes dftempY is still
# indexed by the input row label (repeated per year) and dftempK by the same
# unique labels, so every amount row picks up its own descriptive values.
for baseColumn in ('Water Use by Basin', 'UseType', 'Source'):
    dftempY[baseColumn] = dftempK[baseColumn]

print(len(dftempY.index))
dftempY.head(1)

1083


Unnamed: 0,Amount,ReportYear,Water Use by Basin,UseType,Source
0,1334.272368,2000,Bear River Planning Basin,Agricultural Consumptive Use,Ground Water


In [7]:
# Fixing UseType errors
# Unnecessary spaces & white space.

def fixUseType(val):
    """Return a cleaned UseType label, or "Unspecified" when none is given.

    Parameters
    ----------
    val : object
        Raw UseType value. Anything that is not missing is converted with ``str``.

    Returns
    -------
    str
        "Unspecified" for missing (None / NaN) or blank input; otherwise the
        stripped text with one pass of double-space collapsing and the
        characters , . ; - / ( ) removed.
    """
    # Test for missing values BEFORE str(): str(NaN) is "nan", which would
    # otherwise slip through as if it were a real label.
    if pd.isnull(val):
        return "Unspecified"

    val = str(val).strip()
    if val == "":
        return "Unspecified"

    # Cleaning text / simple search format
    val = val.replace("  ", " ")
    for symbol in (",", ".", ";", "-", "/", "(", ")"):
        val = val.replace(symbol, "")

    # Returned at function level so every path yields a string (never None).
    return val

# Clean every UseType entry, then list the distinct cleaned values.
dftempY['UseType'] = dftempY['UseType'].map(fixUseType)
dftempY['UseType'].unique()

array(['Agricultural Consumptive Use', 'Domestic Use', 'Industrial Use',
       'Municipal Use'], dtype=object)

## WaDE Custom Elements (due to missing state info)

In [8]:
# temp watersource type

def tempWaterSourceType(WST):
    """Return the water source label with "Ground Water" rewritten as "Groundwater".

    The input is converted with ``str`` and stripped of surrounding whitespace;
    any label other than "Ground Water" is returned in that stripped form.
    """
    sourceLabel = str(WST).strip()
    return "Groundwater" if sourceLabel == "Ground Water" else sourceLabel

# Derive the temporary water source type from the raw Source column.
dftempY['tempWST'] = dftempY['Source'].map(tempWaterSourceType)
dftempY['tempWST'].unique()

array(['Groundwater', 'Surface Water', 'Cross Basin Diversion'],
      dtype=object)

In [9]:
# Creating VariableSpecificCV

def createVariableSpecificCV(UT, WST):
    """Build the VariableSpecificCV string for a use type and water source type.

    Both arguments are converted with ``str`` and stripped, then appended to the
    fixed "Consumptive Use_Annual_" prefix, separated by an underscore.
    """
    useTypeText = str(UT).strip()
    sourceTypeText = str(WST).strip()
    return f"Consumptive Use_Annual_{useTypeText}_{sourceTypeText}"

# Build one VariableSpecificCV per row from its use type and temporary water source type.
dftempY['in_VariableSpecificCV'] = [
    createVariableSpecificCV(useType, sourceType)
    for useType, sourceType in zip(dftempY['UseType'], dftempY['tempWST'])
]
dftempY['in_VariableSpecificCV'].unique()

array(['Consumptive Use_Annual_Agricultural Consumptive Use_Groundwater',
       'Consumptive Use_Annual_Agricultural Consumptive Use_Surface Water',
       'Consumptive Use_Annual_Domestic Use_Groundwater',
       'Consumptive Use_Annual_Domestic Use_Surface Water',
       'Consumptive Use_Annual_Industrial Use_Groundwater',
       'Consumptive Use_Annual_Industrial Use_Surface Water',
       'Consumptive Use_Annual_Municipal Use_Groundwater',
       'Consumptive Use_Annual_Municipal Use_Surface Water',
       'Consumptive Use_Annual_Municipal Use_Cross Basin Diversion'],
      dtype=object)

In [10]:
# Creating WaDE Custom reporting unit native ID for easy identification
# ----------------------------------------------------------------------------------------------------

# Create temp ReportingUnit native id dataframe of unique site.
def assignReportNID(colrowValue):
    """Return the custom ReportingUnit native ID: "WaDEWY_RU" followed by str(colrowValue)."""
    return f"WaDEWY_RU{colrowValue!s}"

# Lookup table: one row per distinct basin name, in order of first appearance.
dfReportNativeID = (
    dftempY[['Water Use by Basin']]
    .rename(columns={'Water Use by Basin': 'in_ReportingUnitName'})
    .drop_duplicates()
)

# Number the distinct names 1..n and turn each number into a native ID.
dftemp = pd.DataFrame({"Count": range(1, len(dfReportNativeID.index) + 1)},
                      index=dfReportNativeID.index)
dfReportNativeID['in_ReportingUnitNativeID'] = dftemp['Count'].map(assignReportNID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom ReportingUnit native id
def retrieveReportNID(A):
    """Look up the custom ReportingUnit native ID for reporting unit name ``A``.

    Reads the module-level ``dfReportNativeID`` lookup frame. Returns '' when
    ``A`` is empty or null, or when no row has that name; otherwise returns the
    ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    nameMatches = dfReportNativeID['in_ReportingUnitName'] == A
    matchedIDs = dfReportNativeID.loc[nameMatches, 'in_ReportingUnitNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the reporting unit native ID to every row via its basin name.
dftempY['in_ReportingUnitNativeID'] = dftempY['Water Use by Basin'].map(retrieveReportNID)
dftempY['in_ReportingUnitNativeID'].unique()

array(['WaDEWY_RU1', 'WaDEWY_RU2', 'WaDEWY_RU3', 'WaDEWY_RU4',
       'WaDEWY_RU5', 'WaDEWY_RU6', 'WaDEWY_RU7'], dtype=object)

In [11]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: "WaDEWY_WS" followed by str(colrowValue)."""
    return f"WaDEWY_WS{colrowValue!s}"

# Lookup table: one row per distinct Source value, in order of first appearance.
dfWaterSourceNativeID = (
    dftempY[['Source']]
    .rename(columns={'Source': 'in_WaterSourceTypeCV'})
    .drop_duplicates()
)

# Number the distinct sources 1..n and turn each number into a native ID.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
                      index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom water source native ID for source value ``A``.

    Reads the module-level ``dfWaterSourceNativeID`` lookup frame. Returns ''
    when ``A`` is empty or null, or when no row has that source; otherwise
    returns the ID of the first matching row.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    sourceMatches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A
    matchedIDs = dfWaterSourceNativeID.loc[sourceMatches, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the water source native ID to every row via its raw Source value.
dftempY['in_WaterSourceNativeID'] = dftempY['Source'].map(retrieveWaterSourceNativeID)
dftempY['in_WaterSourceNativeID'].unique()

array(['WaDEWY_WS1', 'WaDEWY_WS2', 'WaDEWY_WS3'], dtype=object)

# Shapefile Data

In [12]:
# Shapefile input: read the basin shapefile (relative to the working directory) with geopandas.
shpInput = "WYBasinShapefile/BasinShapefile.shp"
BasinShape = gpd.read_file(shpInput)
print(BasinShape.shape[0])  # number of features read
BasinShape.head(1)

7


Unnamed: 0,ReportingU,Reportin_1,Reportin_2,Reportin_3,Reportin_4,StateCV,EPSGCodeCV,Shape_Leng,Shape_Area,TypeIDNum,TypeNameNu,OBJECTID,Reportin_5,Reportin_6,Reportin_7,Reportin_8,Reportin_9,StateCV_1,EPSGCode_1,TypeName_1,TypeIDNum_,geometry
0,20222,WY_1,1,Bear River Planning Basin,Basin,WY,EPSG:4326,4.782684,0.421512,3_1_47,3_Bear River Planning Basin_47,1213,21511,WY_1,1,Bear River Planning Basin,Basin,WY,EPSG:4326,3_Bear River Planning Basin_47,3_1_47,"POLYGON ((-110.74937 42.60565, -110.72887 42.5..."


In [13]:
# Create geo output: keep only the reporting unit name and its geometry.
columnsList = ['RU_Name', 'geometry']
dfshape = pd.DataFrame(columns=columnsList)
# Fill each output column from its shapefile attribute.
for outColumn, shpColumn in zip(columnsList, ['Reportin_3', 'geometry']):
    dfshape[outColumn] = BasinShape[shpColumn]
dfshape = dfshape.drop_duplicates()  # defaults: all columns, keep first, original index kept
print(len(dfshape.index))
dfshape.head(1)

7


Unnamed: 0,RU_Name,geometry
0,Bear River Planning Basin,"POLYGON ((-110.74937 42.60565, -110.72887 42.5..."


### Inspect Output Data & Export

In [14]:
# List the dtype of every dftempY column with pandas' row/column display limits lifted.
fullDisplay = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*fullDisplay):
    print(dftempY.dtypes)

Amount                      float64
ReportYear                   object
Water Use by Basin           object
UseType                      object
Source                       object
tempWST                      object
in_VariableSpecificCV        object
in_ReportingUnitNativeID     object
in_WaterSourceNativeID       object
dtype: object


In [15]:
# List the dtype of every dfshape column with pandas' row/column display limits lifted.
fullDisplay = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*fullDisplay):
    print(dfshape.dtypes)

RU_Name       object
geometry    geometry
dtype: object


In [16]:
# Export out to CSV (written to the current working directory, without the index).
for outFrame, outFile in (
    (dftempY, 'P_wyAggMaster.csv'),   # The output.
    (dfshape, 'P_wyGeometry.csv'),    # The output geometry.
):
    outFrame.to_csv(outFile, index=False)