# Pre-processing Aggregated Water Use data for WaDE upload.
Purpose:  To pre-process the aggregated water use data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/GreatLakes/WaterUse_AggregatedArea" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/GreatLakes/WaterUse_AggregatedArea


## Input Source Data

In [3]:
# Time series info
inputFile ="RawInputData/WSWC Great Lakes Data 2012-2022.zip"
df_ts = pd.read_csv(inputFile, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_ts:
    df_ts['WaDEUUID'] = "glag" + df_ts.index.astype(str)
    df_ts.to_csv('RawInputData/WSWC Great Lakes Data 2012-2022.zip', compression=dict(method='zip', archive_name='WSWC Great Lakes Data 2012-2022.csv'), index=False)

print(len(df_ts))
df_ts.head()

5189


Unnamed: 0,sector,source,basin,jurisdiction,unit,with_amt,consump,consump_percent,consump_coefficient,facit_num,intrabasin_transfer_amt,intrabasin_return,net_intrabasin_change,intrabasin_consump,intrabasin_num,diversion_amt,diversion_return,net_diversion_change,diversion_num,method,all_with_amts,all_consump,year,WaDEUUID
0,1,1,2,1,1,783.81,0.0,0.0,0,16.0,0.0,0.0,0.0,0.0,0.0,783.81,0.0,783.81,16.0,1.0,783.81,0.0,2022,glag0
1,2,1,2,1,1,1.834,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0,1.834,0.0,1.834,2.0,1.0,1.834,0.0,2022,glag1
2,2,3,2,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,2022,glag2
3,3,1,2,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,2022,glag3
4,3,3,2,1,1,0.082,0.082,100.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.082,0.082,2022,glag4


In [4]:
# create unique id to combine with shp file data

df_ts['shpID'] = "j" + df_ts['jurisdiction'].astype(float).astype(int).astype(str).str.strip() + "_b" + df_ts['basin'].astype(float).astype(int).astype(str).str.strip()
df_ts['shpID'].unique()

array(['j1_b2', 'j2_b2', 'j2_b4', 'j3_b1', 'j3_b2', 'j3_b3', 'j3_b4',
       'j4_b1', 'j5_b4', 'j5_b5', 'j5_b6', 'j6_b4', 'j7_b1', 'j7_b3',
       'j7_b4', 'j7_b5', 'j7_b6', 'j8_b4', 'j8_b5', 'j9_b6', 'j10_b1',
       'j10_b2'], dtype=object)

In [5]:
# Polygon shp info
inputFile ="RawInputData/shapefiles/Exploded_GLCompact_Basins_Jurisdictions.zip"
df_poly = gpd.read_file(inputFile)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_poly:
    df_poly['WaDEUUID'] = "glag" + df_poly.index.astype(str)
    df_poly.to_csv('RawInputData/Exploded_GLCompact_Basins_Jurisdictions.zip', compression=dict(method='zip', archive_name='Exploded_GLCompact_Basins_Jurisdictions.csv'), index=False)

print(len(df_poly))
df_poly.head()

22


Unnamed: 0,Shape_Leng,jurisdic,basin,wadeID,jur_name,basin_name,notes,Shape_Le_1,Shape_Area,geometry,WaDEUUID
0,34.25696,7.0,4.0,j7_b4,Ontario,Lake Erie,,15.01843,2.5708,"POLYGON ((-80.14996 43.93119, -80.14998 43.929...",glag0
1,47.39882,7.0,3.0,j7_b3,Ontario,Lake Huron,,47.69183,10.17209,"POLYGON ((-81.31089 47.44333, -81.31057 47.442...",glag1
2,60.62183,7.0,1.0,j7_b1,Ontario,Lake Superior,,49.62184,12.48842,"POLYGON ((-89.13481 51.12819, -89.09873 51.108...",glag2
3,34.16079,7.0,5.0,j7_b5,Ontario,Lake Ontario,,23.80428,3.1934,"POLYGON ((-76.57685 44.21177, -76.59855 44.211...",glag3
4,58.22232,7.0,6.0,j7_b6,Ontario,St. Lawrence River,,33.43084,6.31962,"POLYGON ((-79.51780 48.07788, -79.51781 48.074...",glag4


In [6]:
# merge input data into single dataframe

dfIn = pd.merge(df_ts, df_poly, left_on='shpID', right_on='wadeID', how='left')
print(len(dfIn))
dfIn.head()

5189


Unnamed: 0,sector,source,basin_x,jurisdiction,unit,with_amt,consump,consump_percent,consump_coefficient,facit_num,intrabasin_transfer_amt,intrabasin_return,net_intrabasin_change,intrabasin_consump,intrabasin_num,diversion_amt,diversion_return,net_diversion_change,diversion_num,method,all_with_amts,all_consump,year,WaDEUUID_x,shpID,Shape_Leng,jurisdic,basin_y,wadeID,jur_name,basin_name,notes,Shape_Le_1,Shape_Area,geometry,WaDEUUID_y
0,1,1,2,1,1,783.81,0.0,0.0,0,16.0,0.0,0.0,0.0,0.0,0.0,783.81,0.0,783.81,16.0,1.0,783.81,0.0,2022,glag0,j1_b2,40.14281,1.0,2.0,j1_b2,Illinois,Lake Michigan,IL is exempt from some parts of the Great Lake...,2.33655,0.02798,"POLYGON ((-87.82568 42.49206, -87.81587 42.491...",glag21
1,2,1,2,1,1,1.834,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0,1.834,0.0,1.834,2.0,1.0,1.834,0.0,2022,glag1,j1_b2,40.14281,1.0,2.0,j1_b2,Illinois,Lake Michigan,IL is exempt from some parts of the Great Lake...,2.33655,0.02798,"POLYGON ((-87.82568 42.49206, -87.81587 42.491...",glag21
2,2,3,2,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,2022,glag2,j1_b2,40.14281,1.0,2.0,j1_b2,Illinois,Lake Michigan,IL is exempt from some parts of the Great Lake...,2.33655,0.02798,"POLYGON ((-87.82568 42.49206, -87.81587 42.491...",glag21
3,3,1,2,1,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,2022,glag3,j1_b2,40.14281,1.0,2.0,j1_b2,Illinois,Lake Michigan,IL is exempt from some parts of the Great Lake...,2.33655,0.02798,"POLYGON ((-87.82568 42.49206, -87.81587 42.491...",glag21
4,3,3,2,1,1,0.082,0.082,100.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.082,0.082,2022,glag4,j1_b2,40.14281,1.0,2.0,j1_b2,Illinois,Lake Michigan,IL is exempt from some parts of the Great Lake...,2.33655,0.02798,"POLYGON ((-87.82568 42.49206, -87.81587 42.491...",glag21


In [7]:
# determine WaDE MethodTypeCV and Name from dictionary

MethodDict = {
"1":"Measured",
"2":"Partially Measured",
"3":"Calculated"}

def DetermineMethodFunc(valA):  
    if valA == '' or pd.isnull(valA):
        outString = 'WaDE Blank'
    else:
        try:
            valA = str(int(float(valA))).strip()
            outString = MethodDict[valA]
        except:
            outString = 'WaDE Blank'
    return outString

dfIn['in_MethodTypeCV'] = dfIn.apply(lambda row: DetermineMethodFunc(row['method']), axis=1)
dfIn['in_MethodTypeCV'].value_counts()

in_MethodTypeCV
Partially Measured    2670
Measured               973
Calculated             791
WaDE Blank             755
Name: count, dtype: int64

In [8]:
# determine WaDE iWaterSourceTypeCV dictionary

WaterSourceDict = {
"1":"Great Lakes St Lawrence Surface Water",
"2":"Other Surface Water",
"3":"Groundwater"}

def DetermineWaterSourceFunc(valA):  
    if valA == '' or pd.isnull(valA):
        outString = 'WaDE Blank'
    else:
        try:
            valA = str(int(float(valA))).strip()
            outString = WaterSourceDict[valA]
        except:
            outString = 'WaDE Blank'
    return outString

dfIn['in_WaterSourceTypeCV'] = dfIn.apply(lambda row: DetermineWaterSourceFunc(row['source']), axis=1)
dfIn['in_WaterSourceTypeCV'].value_counts()

in_WaterSourceTypeCV
Other Surface Water                      1790
Great Lakes St Lawrence Surface Water    1704
Groundwater                              1695
Name: count, dtype: int64

In [9]:
# determine WaDE Benefical Use from dictionary

BenUseDict = {
"1":"Public Water Supply",
"2":"Self Supply Commercial and Institutional",
"3":"Self Supply Irrigation",
"4":"Self Supply Livestock",
"5":"Self Supply Industrial",
"6":"Self Supply Thermoelectric Power Production Once Through Cooling",
"7":"Self Supply Thermoelectric Power Production Recirculated Cooling",
"8":"Off Stream Hydroelectric Power Production",
"9":"In Stream Hydroelectric Water Use",
"10":"Other Self Supply"}

def DetermineBenUseFunc(valA):
    if valA == '' or pd.isnull(valA):
        outString = 'WaDE Blank'
    else:
        try:
            valA = str(int(float(valA))).strip()
            outString = BenUseDict[valA]
        except:
            outString = 'WaDE Blank'
    return outString

dfIn['in_BeneficialUseCategory'] = dfIn.apply(lambda row: DetermineBenUseFunc(row['sector']), axis=1)
dfIn['in_BeneficialUseCategory'].value_counts()

in_BeneficialUseCategory
Public Water Supply                                                 671
Self Supply Industrial                                              662
Self Supply Irrigation                                              639
Self Supply Commercial and Institutional                            597
Other Self Supply                                                   528
Self Supply Livestock                                               526
Self Supply Thermoelectric Power Production Once Through Cooling    483
In Stream Hydroelectric Water Use                                   428
Self Supply Thermoelectric Power Production Recirculated Cooling    336
Off Stream Hydroelectric Power Production                           319
Name: count, dtype: int64

In [10]:
# determine WaDE Amount
# need to convert to an MG/year value to be specific with the annual timestep
# "1" : MG/day
# "2" : ML/day

def ConvertAmountToAfFunc(valAmount, valUnit):
    valUnit = str(valUnit).strip()
    if valUnit == "1": 
        outValue = valAmount * 365
    elif valUnit == "2":
        outValue = valAmount * 0.264172 * 365
    else:
        outValue = valAmount
    return outValue

dfIn['in_Amount'] = dfIn.apply(lambda row: ConvertAmountToAfFunc(row['with_amt'], row['unit']), axis=1)
dfIn['in_Amount'].unique()

array([286090.64999999997, 669.4100000000001, 0.0, ..., 4253.3085,
       502.386, 639.918], dtype=object)

In [11]:
# Transfer input data to WaDE specific output

# create dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfIn['WaDEUUID_x']

# Method Info
df['in_MethodTypeCV'] = dfIn['in_MethodTypeCV']
df['in_MethodUUID'] = "" # determine below

# Variable Info
df['in_VariableSpecificUUID'] = "" # determine below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Withdrawal"
df['in_VariableSpecificCV'] = "" # determine below

# Organization Info
df['in_OrganizationUUID'] = "GLwuag_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = "WaDE Blank"
df['in_WaterSourceNativeID'] = "" # determine below if not provided
df["in_WaterSourceTypeCV"] = dfIn['in_WaterSourceTypeCV']

# ReportingUnits Info
df['in_EPSGCodeCV'] = "4326"
df['in_ReportingUnitName'] = dfIn['jur_name'] + dfIn['basin_name']
df['in_ReportingUnitNativeID'] = "GL_" + dfIn['shpID'].astype(str).str.strip()
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Basin"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "GL"

# AggregatedAmounts Info
df['in_AllocationCropDutyAmount'] = ""
df['in_Amount'] = dfIn['in_Amount']
df['in_BeneficialUseCategory'] = dfIn['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_InterbasinTransferFromID'] = ""
df['in_InterbasinTransferToID'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategoryCV'] = ""
df['in_ReportYearCV'] = dfIn['year'].astype(float).astype(int).astype(str).str.strip()
df['in_SDWISIdentifierCV'] = ""
df['in_TimeframeEnd'] = "12/31/" + dfIn['year'].astype(float).astype(int).astype(str).str.strip()
df['in_TimeframeStart'] = "01/01/" + dfIn['year'].astype(float).astype(int).astype(str).str.strip()

outdf = df.copy()
outdf = outdf.drop_duplicates().reset_index(drop=True)
print(len(outdf))
outdf.head(5)

5189


Unnamed: 0,WaDEUUID,in_MethodTypeCV,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategoryCV,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,glag0,Measured,,,Annual,Withdrawal,,GLwuag_O1,,,,WaDE Blank,,Great Lakes St Lawrence Surface Water,4326,IllinoisLake Michigan,GL_j1_b2,,Basin,,GL,,286090.65,Public Water Supply,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
1,glag1,Measured,,,Annual,Withdrawal,,GLwuag_O1,,,,WaDE Blank,,Great Lakes St Lawrence Surface Water,4326,IllinoisLake Michigan,GL_j1_b2,,Basin,,GL,,669.41,Self Supply Commercial and Institutional,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
2,glag2,WaDE Blank,,,Annual,Withdrawal,,GLwuag_O1,,,,WaDE Blank,,Groundwater,4326,IllinoisLake Michigan,GL_j1_b2,,Basin,,GL,,0.0,Self Supply Commercial and Institutional,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
3,glag3,WaDE Blank,,,Annual,Withdrawal,,GLwuag_O1,,,,WaDE Blank,,Great Lakes St Lawrence Surface Water,4326,IllinoisLake Michigan,GL_j1_b2,,Basin,,GL,,0.0,Self Supply Irrigation,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
4,glag4,Measured,,,Annual,Withdrawal,,GLwuag_O1,,,,WaDE Blank,,Groundwater,4326,IllinoisLake Michigan,GL_j1_b2,,Basin,,GL,,29.93,Self Supply Irrigation,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022


## Concatenate Data (if needed).

In [12]:
# # Concatenate dataframes

# frames = [df1, df2]  # list all out dataframes here
# outdf = pd.concat(frames)
# outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
# print(len(outdf))

## Clean Data / Data Types

In [13]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

  Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')


In [14]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [15]:
outdf['in_ReportingUnitName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_ReportingUnitName']), axis=1)
outdf['in_ReportingUnitName'].unique()

array(['Illinoislake Michigan', 'Indianalake Michigan',
       'Indianalake Erie', 'Michiganlake Superior',
       'Michiganlake Michigan', 'Michiganlake Huron', 'Michiganlake Erie',
       'Minnesotalake Superior', 'New Yorklake Erie',
       'New Yorklake Ontario', 'New Yorkst Lawrence River',
       'Ohiolake Erie', 'Ontariolake Superior', 'Ontariolake Huron',
       'Ontariolake Erie', 'Ontariolake Ontario',
       'Ontariost Lawrence River', 'Pennsylvanialake Erie',
       'Pennsylvanialake Ontario', 'Quebecst Lawrence River',
       'Wisconsinlake Superior', 'Wisconsinlake Michigan'], dtype=object)

In [16]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [17]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Wade Blank'], dtype=object)

In [18]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Great Lakes St Lawrence Surface Water', 'Groundwater',
       'Other Surface Water'], dtype=object)

In [19]:
outdf['in_ReportingUnitName'] = outdf.apply(lambda row: ensureEmptyString(row['in_ReportingUnitName']), axis=1)
outdf['in_ReportingUnitName'].unique()

array(['Illinoislake Michigan', 'Indianalake Michigan',
       'Indianalake Erie', 'Michiganlake Superior',
       'Michiganlake Michigan', 'Michiganlake Huron', 'Michiganlake Erie',
       'Minnesotalake Superior', 'New Yorklake Erie',
       'New Yorklake Ontario', 'New Yorkst Lawrence River',
       'Ohiolake Erie', 'Ontariolake Superior', 'Ontariolake Huron',
       'Ontariolake Erie', 'Ontariolake Ontario',
       'Ontariost Lawrence River', 'Pennsylvanialake Erie',
       'Pennsylvanialake Ontario', 'Quebecst Lawrence River',
       'Wisconsinlake Superior', 'Wisconsinlake Michigan'], dtype=object)

In [20]:
outdf['in_ReportingUnitTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_ReportingUnitTypeCV']), axis=1)
outdf['in_ReportingUnitTypeCV'].unique()

array(['Basin'], dtype=object)

In [21]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['In Stream Hydroelectric Water Use',
 'Off Stream Hydroelectric Power Production',
 'Other Self Supply',
 'Public Water Supply',
 'Self Supply Commercial and Institutional',
 'Self Supply Industrial',
 'Self Supply Irrigation',
 'Self Supply Livestock',
 'Self Supply Thermoelectric Power Production Once Through Cooling',
 'Self Supply Thermoelectric Power Production Recirculated Cooling']

In [22]:
# Fixing n_Amount entry is either numireic or a 0

outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array([286090.65, 669.41, '', ..., 4253.31, 502.39, 639.92], dtype=object)

In [23]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3'], dtype=object)

In [24]:
# determine in_VariableSpecificCV

outdf['in_VariableSpecificCV'] = outdf['in_VariableCV'].astype(str) + "_" + outdf['in_AggregationIntervalUnitCV'].astype(str) + "_" + outdf['in_BeneficialUseCategory'].astype(str) + "_" + outdf['in_WaterSourceTypeCV'].astype(str)
outdf['in_VariableSpecificCV'].unique()

array(['Withdrawal_Annual_Public Water Supply_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Commercial and Institutional_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Commercial and Institutional_Groundwater',
       'Withdrawal_Annual_Self Supply Irrigation_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Irrigation_Groundwater',
       'Withdrawal_Annual_Self Supply Livestock_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Industrial_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Thermoelectric Power Production Once Through Cooling_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Self Supply Thermoelectric Power Production Recirculated Cooling_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_Off Stream Hydroelectric Power Production_Great Lakes St Lawrence Surface Water',
       'Withdrawal_Annual_I

In [25]:
# Create MethodUUID

Methodict = {
"Measured" : " GLwuag_M1",
"Partially Measured" : " GLwuag_M2",
"Calculated" : " GLwuag_M3",
"WaDE Blank" : " GLwuag_M4"}

def DetermineMethodUUIDFunc(valA):
    if valA == '' or pd.isnull(valA):
        outString = 'WaDE Blank'
    else:
        try:
            valA = str(valA).strip()
            outString = Methodict[valA]
        except:
            outString = 'WaDE Blank'
    return outString

outdf['in_MethodUUID'] = outdf.apply(lambda row: DetermineMethodUUIDFunc(row['in_MethodTypeCV']), axis=1)
outdf['in_MethodUUID'].value_counts()

in_MethodUUID
GLwuag_M2    2670
GLwuag_M1     973
GLwuag_M3     791
GLwuag_M4     755
Name: count, dtype: int64

In [26]:
%%time

# Creating WaDE VariableSpecificUUID for easy VariableSpecificCV identification 
# use these inputs: VariableCV_AggregationIntervalUnitCV_BeneficalUse_WaterSourceTypeCV
# ----------------------------------------------------------------------------------------------------

# Create temp VariableSpecificUUID dataframe of unique water source.
def assignVariableSpecificUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "NDwuag_V" + string1
    return outstring

dfVariableSpecificUUID = pd.DataFrame()
dfVariableSpecificUUID['in_VariableCV'] = outdf['in_VariableCV']
dfVariableSpecificUUID['in_AggregationIntervalUnitCV'] = outdf['in_AggregationIntervalUnitCV']
dfVariableSpecificUUID['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory']
dfVariableSpecificUUID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfVariableSpecificUUID = dfVariableSpecificUUID.drop_duplicates()

dftemp = pd.DataFrame(index=dfVariableSpecificUUID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfVariableSpecificUUID['in_VariableSpecificUUID'] = dftemp.apply(lambda row: assignVariableSpecificUUID(row['Count']), axis=1)
dfVariableSpecificUUID['linkKey'] = dfVariableSpecificUUID['in_VariableCV'].astype(str) + dfVariableSpecificUUID['in_AggregationIntervalUnitCV'].astype(str) + dfVariableSpecificUUID['in_BeneficialUseCategory'].astype(str) + dfVariableSpecificUUID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom variable native ID
VariableSpecificUUIDdict = pd.Series(dfVariableSpecificUUID.in_VariableSpecificUUID.values, index=dfVariableSpecificUUID.linkKey.astype(str)).to_dict()
def retrieveVariableSpecificUUID(A, B, C, D):
    if (A == '' and B == '' and C == '' and D == '') or (pd.isnull(A) and pd.isnull(B) and pd.isnull(C) and pd.isnull(D)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip() + str(C).strip() + str(D).strip()
        try:
            outList = VariableSpecificUUIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_VariableSpecificUUID'] = outdf.apply(lambda row: retrieveVariableSpecificUUID(row['in_VariableCV'], row['in_AggregationIntervalUnitCV'], row['in_BeneficialUseCategory'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificUUID'].unique()

CPU times: total: 78.1 ms
Wall time: 80.6 ms


array(['NDwuag_V1', 'NDwuag_V2', 'NDwuag_V3', 'NDwuag_V4', 'NDwuag_V5',
       'NDwuag_V6', 'NDwuag_V7', 'NDwuag_V8', 'NDwuag_V9', 'NDwuag_V10',
       'NDwuag_V11', 'NDwuag_V12', 'NDwuag_V13', 'NDwuag_V14',
       'NDwuag_V15', 'NDwuag_V16', 'NDwuag_V17', 'NDwuag_V18',
       'NDwuag_V19', 'NDwuag_V20', 'NDwuag_V21', 'NDwuag_V22',
       'NDwuag_V23', 'NDwuag_V24', 'NDwuag_V25', 'NDwuag_V26',
       'NDwuag_V27', 'NDwuag_V28', 'NDwuag_V29', 'NDwuag_V30'],
      dtype=object)

## Shapefile Data
- For attaching geometry to csv inputs.

In [27]:
# Shapefile Data (can just be copy of above input)\

# shapefileInput = gpd.read_file('RawInputData/shapefiles/{enter file name here}.zip')
# dfshapetemp = gpd.read_file(shapefileInput)

dfshapetemp = df_poly.copy()
print(len(dfshapetemp))
dfshapetemp.head()

22


Unnamed: 0,Shape_Leng,jurisdic,basin,wadeID,jur_name,basin_name,notes,Shape_Le_1,Shape_Area,geometry,WaDEUUID
0,34.25696,7.0,4.0,j7_b4,Ontario,Lake Erie,,15.01843,2.5708,"POLYGON ((-80.14996 43.93119, -80.14998 43.929...",glag0
1,47.39882,7.0,3.0,j7_b3,Ontario,Lake Huron,,47.69183,10.17209,"POLYGON ((-81.31089 47.44333, -81.31057 47.442...",glag1
2,60.62183,7.0,1.0,j7_b1,Ontario,Lake Superior,,49.62184,12.48842,"POLYGON ((-89.13481 51.12819, -89.09873 51.108...",glag2
3,34.16079,7.0,5.0,j7_b5,Ontario,Lake Ontario,,23.80428,3.1934,"POLYGON ((-76.57685 44.21177, -76.59855 44.211...",glag3
4,58.22232,7.0,6.0,j7_b6,Ontario,St. Lawrence River,,33.43084,6.31962,"POLYGON ((-79.51780 48.07788, -79.51781 48.074...",glag4


In [28]:
columnsList = ['in_ReportingUnitNativeID', 'geometry']
outshape = pd.DataFrame(columns=columnsList)
outshape['in_ReportingUnitNativeID'] = "GL_" + "j" + dfshapetemp['jurisdic'].astype(float).astype(int).astype(str).str.strip() + "_b" + dfshapetemp['basin'].astype(float).astype(int).astype(str).str.strip()
outshape['geometry'] = dfshapetemp['geometry']
outshape = outshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
outshape.head(20)

Unnamed: 0,in_ReportingUnitNativeID,geometry
0,GL_j7_b4,"POLYGON ((-80.14996 43.93119, -80.14998 43.929..."
1,GL_j7_b3,"POLYGON ((-81.31089 47.44333, -81.31057 47.442..."
2,GL_j7_b1,"POLYGON ((-89.13481 51.12819, -89.09873 51.108..."
3,GL_j7_b5,"POLYGON ((-76.57685 44.21177, -76.59855 44.211..."
4,GL_j7_b6,"POLYGON ((-79.51780 48.07788, -79.51781 48.074..."
5,GL_j3_b4,"POLYGON ((-82.42207 43.00626, -82.42173 42.998..."
6,GL_j3_b3,"POLYGON ((-84.18079 46.53084, -84.17163 46.528..."
7,GL_j3_b2,"POLYGON ((-88.13544 46.73357, -88.13491 46.733..."
8,GL_j3_b1,"POLYGON ((-87.89235 47.47407, -87.86253 47.471..."
9,GL_j6_b4,"POLYGON ((-80.51867 41.68897, -80.52239 41.688..."


# Export Data

In [29]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5189 entries, 0 to 5188
Data columns (total 41 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   WaDEUUID                        5189 non-null   object
 1   in_MethodTypeCV                 5189 non-null   object
 2   in_MethodUUID                   5189 non-null   object
 3   in_VariableSpecificUUID         5189 non-null   object
 4   in_AggregationIntervalUnitCV    5189 non-null   object
 5   in_VariableCV                   5189 non-null   object
 6   in_VariableSpecificCV           5189 non-null   object
 7   in_OrganizationUUID             5189 non-null   object
 8   in_Geometry                     5189 non-null   object
 9   in_GNISFeatureNameCV            5189 non-null   object
 10  in_WaterQualityIndicatorCV      5189 non-null   object
 11  in_WaterSourceName              5189 non-null   object
 12  in_WaterSourceNativeID          5189 non-null   

In [30]:
outdf

Unnamed: 0,WaDEUUID,in_MethodTypeCV,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategoryCV,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,glag0,Measured,GLwuag_M1,NDwuag_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Great La...,GLwuag_O1,,,,Wade Blank,wadeId1,Great Lakes St Lawrence Surface Water,4326,Illinoislake Michigan,GL_j1_b2,,Basin,,GL,,286090.65000,Public Water Supply,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
1,glag1,Measured,GLwuag_M1,NDwuag_V2,Annual,Withdrawal,Withdrawal_Annual_Self Supply Commercial and I...,GLwuag_O1,,,,Wade Blank,wadeId1,Great Lakes St Lawrence Surface Water,4326,Illinoislake Michigan,GL_j1_b2,,Basin,,GL,,669.41000,Self Supply Commercial and Institutional,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
2,glag2,WaDE Blank,GLwuag_M4,NDwuag_V3,Annual,Withdrawal,Withdrawal_Annual_Self Supply Commercial and I...,GLwuag_O1,,,,Wade Blank,wadeId2,Groundwater,4326,Illinoislake Michigan,GL_j1_b2,,Basin,,GL,,,Self Supply Commercial and Institutional,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
3,glag3,WaDE Blank,GLwuag_M4,NDwuag_V4,Annual,Withdrawal,Withdrawal_Annual_Self Supply Irrigation_Great...,GLwuag_O1,,,,Wade Blank,wadeId1,Great Lakes St Lawrence Surface Water,4326,Illinoislake Michigan,GL_j1_b2,,Basin,,GL,,,Self Supply Irrigation,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
4,glag4,Measured,GLwuag_M1,NDwuag_V5,Annual,Withdrawal,Withdrawal_Annual_Self Supply Irrigation_Groun...,GLwuag_O1,,,,Wade Blank,wadeId2,Groundwater,4326,Illinoislake Michigan,GL_j1_b2,,Basin,,GL,,29.93000,Self Supply Irrigation,,,,,,,,,,,,,,2022,,12/31/2022,01/01/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5184,glag5184,Calculated,GLwuag_M3,NDwuag_V21,Annual,Withdrawal,Withdrawal_Annual_Self Supply Thermoelectric P...,GLwuag_O1,,,,Wade Blank,wadeId3,Other Surface Water,4326,Wisconsinlake Michigan,GL_j10_b2,,Basin,,GL,,109550.37000,Self Supply Thermoelectric Power Production On...,,,,,,,,,,,,,,2012,,12/31/2012,01/01/2012
5185,glag5185,Calculated,GLwuag_M3,NDwuag_V9,Annual,Withdrawal,Withdrawal_Annual_Self Supply Thermoelectric P...,GLwuag_O1,,,,Wade Blank,wadeId1,Great Lakes St Lawrence Surface Water,4326,Wisconsinlake Michigan,GL_j10_b2,,Basin,,GL,,4253.31000,Self Supply Thermoelectric Power Production Re...,,,,,,,,,,,,,,2012,,12/31/2012,01/01/2012
5186,glag5186,WaDE Blank,GLwuag_M4,NDwuag_V30,Annual,Withdrawal,Withdrawal_Annual_Other Self Supply_Groundwater,GLwuag_O1,,,,Wade Blank,wadeId2,Groundwater,4326,Wisconsinlake Superior,GL_j10_b1,,Basin,,GL,,,Other Self Supply,,,,,,,,,,,,,,2012,,12/31/2012,01/01/2012
5187,glag5187,Calculated,GLwuag_M3,NDwuag_V29,Annual,Withdrawal,Withdrawal_Annual_Other Self Supply_Other Surf...,GLwuag_O1,,,,Wade Blank,wadeId3,Other Surface Water,4326,Wisconsinlake Michigan,GL_j10_b2,,Basin,,GL,,502.39000,Other Self Supply,,,,,,,,,,,,,,2012,,12/31/2012,01/01/2012


In [31]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwuag_Main.zip', compression=dict(method='zip', archive_name='Pag_Main.csv'), index=False)   # The output, save as a zip
outshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.