# Pre-processing Delaware Riber Basin Commission Aggregated Amounts data for WaDE upload.
Date Updated: 06/05/2023
Purpose:  To pre-process the DRBC data into one master file for simple DataFrame creation and extraction

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/DelawareRiverBasinCommission/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

## Input Source Data

In [3]:
# timeseries data, xlsx file
# we only want the historic surface water and groundwater data. Which can be found on the following sheets: A-1, A-6, A-9, A-11, A-14, A-17, A-22
dfA1 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-1')
dfA6 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-6')
dfA9 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-9')
dfA11 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-11')
dfA14 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-14')
dfA17 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-17')
dfA22 = pd.read_excel(io="2060report_data-release_v2110.xlsx", sheet_name='A-22')

# Concatenate dataframes
frames = [dfA1, dfA6, dfA9, dfA11, dfA14, dfA17, dfA22] 
dfIn = pd.concat(frames)
dfIn = dfIn.drop_duplicates().reset_index(drop=True).replace(np.nan, "")


# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfIn:
    dfIn['WaDEUUID'] = "drbc" + dfIn.index.astype(str)
    dfIn.to_csv('2060report_data-release_v2110_A1_6_9_11_14_17_22.zip', compression=dict(method='zip', archive_name='2060report_data-release_v2110_A1_6_9_11_14_17_22.csv'), index=False)

print(len(dfIn))
dfIn.head(1)

21468


Unnamed: 0,SECTOR,CATEGORY,STATE,BASIN_ID,GWPA_ID,DESIGNATION,YEAR,WD_MGD,CU_MGD,NUCLEAR_FACILITY,COOLING_TYPE,HYDRO_TYPE,WaDEUUID
0,Public Water Supply,Public Water Supply,PA,DB-074,1.0,GW,1990,0.04144,0.00414,,,,drbc0


In [4]:
# shapefile info
gdf_drb147 = gpd.read_file('shapefile/drb147.shp', crs="EPSG:4326")
print(len(gdf_drb147))
gdf_drb147.head(1)

147


Unnamed: 0,OBJECTID,SQM,STATEID,BASIN_ID,STREAMS,Shape_Leng,HUC8_Name,Shape_Le_1,Shape_Area,geometry
0,1,143.786,NY,DB-001,Upper West Br Delaware River,118207.57278,Upper Delaware,1.26136,0.04069,"POLYGON ((-74.82876 42.39163, -74.82836 42.392..."


In [5]:
# left merge timeseries info with shapefile info.
dfIn2 = pd.DataFrame()
dfIn2 = pd.merge(dfIn, gdf_drb147, on='BASIN_ID', how='left')
dfIn2 = dfIn2.drop_duplicates().replace(np.nan, "").replace("nan,nan", "").reset_index(drop=True)
print(len(dfIn2))
dfIn2.head(1)

21468


Unnamed: 0,SECTOR,CATEGORY,STATE,BASIN_ID,GWPA_ID,DESIGNATION,YEAR,WD_MGD,CU_MGD,NUCLEAR_FACILITY,COOLING_TYPE,HYDRO_TYPE,WaDEUUID,OBJECTID,SQM,STATEID,STREAMS,Shape_Leng,HUC8_Name,Shape_Le_1,Shape_Area,geometry
0,Public Water Supply,Public Water Supply,PA,DB-074,1.0,GW,1990,0.04144,0.00414,,,,drbc0,82,112.155,PA,Tohichon Cr,116919.21975,Middle Delaware-Musconetcong,1.2317,0.03083,"POLYGON ((-75.34706 40.51062, -75.34622 40.509..."


## Convert to WaDE Input Data

In [6]:
# covert MGPD to AFY to
# 1000000 US liquid gallons / Day = 1120.14406 Acre-foot / year.

def convertMGPDtoAFYFunc(Val):
    Val = str(Val).strip()
    if (Val == "" ) or (pd.isnull(Val)):
        outList = ""
    else:
        outList = float(Val) * 1120.14406 
    return outList

In [7]:
# withdrawal values
dfIn2['in_WD_AFY'] = dfIn2.apply(lambda row: convertMGPDtoAFYFunc(row['WD_MGD']), axis=1)
dfIn2['in_WD_AFY'].unique()

array([  46.41692851,  401.47190666,  320.76015658, ..., 1025.87273591,
        158.88123347,  855.96932172])

In [8]:
# consumptive values
dfIn2['in_CU_AFY'] = dfIn2.apply(lambda row: convertMGPDtoAFYFunc(row['CU_MGD']), axis=1)
dfIn2['in_CU_AFY'].unique()

array([ 4.64169285, 40.14719067, 32.07601566, ..., 51.2936368 ,
        7.94406167, 42.79846609])

In [9]:
# Withdrawal dataframe
# --------------------------

# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfIn2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "DRBCag_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "" # determine below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Withdrawal"
df['in_VariableSpecificCV'] = "" # determine below

# Organization Info
df['in_OrganizationUUID'] = "DRBCag_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = ""
df['in_WaterSourceNativeID'] = ""
df["in_WaterSourceTypeCV"] = dfIn2['DESIGNATION']

# ReportingUnits Info
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_ReportingUnitName'] = dfIn2['STREAMS']
df['in_ReportingUnitNativeID'] = dfIn2['BASIN_ID'].replace("", 0).fillna(0).astype(str).str.strip()
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Subbasin"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "DE"

# AggregatedAmounts Info
df['in_AllocationCropDutyAmount'] = ""
df['in_Amount'] = dfIn2['in_WD_AFY'] #use withdrawl value here, see above for conversion
df['in_BeneficialUseCategory'] = dfIn2['CATEGORY']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_InterbasinTransferFromID'] = ""
df['in_InterbasinTransferToID'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfIn2['YEAR']
df['in_SDWISIdentifierCV'] = ""
df['in_TimeframeEnd'] = "" # determine below
df['in_TimeframeStart'] = "" # determine below

outWith = df.copy()
outWith = outWith.drop_duplicates().reset_index(drop=True)
print(len(outWith))
outWith.head()

21468


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,drbc0,DRBCag_M1,,Annual,Withdrawal,,DRBCag_O1,,,,,,GW,4326,Tohichon Cr,DB-074,,Subbasin,,DE,,46.41693,Public Water Supply,,,,,,,,,,,,,,1990,,,
1,drbc1,DRBCag_M1,,Annual,Withdrawal,,DRBCag_O1,,,,,,GW,4326,Neshaminy Cr below Little Neshaminy Creek,DB-084,,Subbasin,,DE,,401.47191,Public Water Supply,,,,,,,,,,,,,,1990,,,
2,drbc2,DRBCag_M1,,Annual,Withdrawal,,DRBCag_O1,,,,,,GW,4326,"Crum Cr, Ridley Cr, Marcus Hook Cr",DB-114,,Subbasin,,DE,,320.76016,Public Water Supply,,,,,,,,,,,,,,1990,,,
3,drbc3,DRBCag_M1,,Annual,Withdrawal,,DRBCag_O1,,,,,,GW,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,107.65659,Public Water Supply,,,,,,,,,,,,,,1990,,,
4,drbc4,DRBCag_M1,,Annual,Withdrawal,,DRBCag_O1,,,,,,GW,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,354.37369,Public Water Supply,,,,,,,,,,,,,,1990,,,


In [10]:
# Consumptive dataframe
# --------------------------

# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfIn2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "DRBCag_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "" # determine below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Consumptive"
df['in_VariableSpecificCV'] = "" # determine below

# Organization Info
df['in_OrganizationUUID'] = "DRBCag_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = ""
df['in_WaterSourceNativeID'] = ""
df["in_WaterSourceTypeCV"] = dfIn2['DESIGNATION']

# ReportingUnits Info
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_ReportingUnitName'] = dfIn2['STREAMS']
df['in_ReportingUnitNativeID'] = dfIn2['BASIN_ID'].replace("", 0).fillna(0).astype(str).str.strip()
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = "Subbasin"
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = "DE"

# AggregatedAmounts Info
df['in_AllocationCropDutyAmount'] = ""
df['in_Amount'] = dfIn2['in_CU_AFY'] #use consumptive value here, see above for conversion
df['in_BeneficialUseCategory'] = dfIn2['CATEGORY']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_InterbasinTransferFromID'] = ""
df['in_InterbasinTransferToID'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfIn2['YEAR']
df['in_SDWISIdentifierCV'] = ""
df['in_TimeframeEnd'] = "" # determine below
df['in_TimeframeStart'] = "" # determine below

outCons = df.copy()
outCons = outCons.drop_duplicates().reset_index(drop=True)
print(len(outCons))
outCons.head()

21468


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,drbc0,DRBCag_M1,,Annual,Consumptive,,DRBCag_O1,,,,,,GW,4326,Tohichon Cr,DB-074,,Subbasin,,DE,,4.64169,Public Water Supply,,,,,,,,,,,,,,1990,,,
1,drbc1,DRBCag_M1,,Annual,Consumptive,,DRBCag_O1,,,,,,GW,4326,Neshaminy Cr below Little Neshaminy Creek,DB-084,,Subbasin,,DE,,40.14719,Public Water Supply,,,,,,,,,,,,,,1990,,,
2,drbc2,DRBCag_M1,,Annual,Consumptive,,DRBCag_O1,,,,,,GW,4326,"Crum Cr, Ridley Cr, Marcus Hook Cr",DB-114,,Subbasin,,DE,,32.07602,Public Water Supply,,,,,,,,,,,,,,1990,,,
3,drbc3,DRBCag_M1,,Annual,Consumptive,,DRBCag_O1,,,,,,GW,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,10.76566,Public Water Supply,,,,,,,,,,,,,,1990,,,
4,drbc4,DRBCag_M1,,Annual,Consumptive,,DRBCag_O1,,,,,,GW,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,35.43737,Public Water Supply,,,,,,,,,,,,,,1990,,,


In [11]:
# Concatenate dataframes together
frames = [outWith, outCons] #withdrawl, consumptive
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

42936


## WaDE Custom Elements (due to missing sate info)

In [12]:
# determine water source type from provided abbreviations.

def determineWaterSourceTypeCVFunc(Val):
    Val = str(Val).strip()
    if (Val == "GW"):
        outString = "Groundwater"
    elif (Val == "SW"):
        outString = "Surface Water"
    else:
        outString = ""
    return outString

outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: determineWaterSourceTypeCVFunc(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [13]:
# create in_TimeframeEnd & in_TimeframeStart, use in_ReportYearCV as input.

def determineReportYearCVFunc(mdVal, Val):
    Val = str(int(Val)).strip()
    if (Val == "" ) or (pd.isnull(Val)):
        outString = ""
    else:
        outString = mdVal + Val
    return outString

In [14]:
# in_TimeframeEnd values
outdf['in_TimeframeEnd'] = outdf.apply(lambda row: determineReportYearCVFunc("12/31/", row['in_ReportYearCV']), axis=1)
outdf['in_TimeframeEnd'].unique()

array(['12/31/1990', '12/31/1991', '12/31/1992', '12/31/1993',
       '12/31/1994', '12/31/1995', '12/31/1996', '12/31/1997',
       '12/31/1998', '12/31/1999', '12/31/2000', '12/31/2001',
       '12/31/2002', '12/31/2003', '12/31/2004', '12/31/2005',
       '12/31/2006', '12/31/2007', '12/31/2008', '12/31/2009',
       '12/31/2010', '12/31/2011', '12/31/2012', '12/31/2013',
       '12/31/2014', '12/31/2015', '12/31/2016', '12/31/2017'],
      dtype=object)

In [15]:
# in_TimeframeStart values
outdf['in_TimeframeStart'] = outdf.apply(lambda row: determineReportYearCVFunc("01/01/", row['in_ReportYearCV']), axis=1)
outdf['in_TimeframeStart'].unique()

array(['01/01/1990', '01/01/1991', '01/01/1992', '01/01/1993',
       '01/01/1994', '01/01/1995', '01/01/1996', '01/01/1997',
       '01/01/1998', '01/01/1999', '01/01/2000', '01/01/2001',
       '01/01/2002', '01/01/2003', '01/01/2004', '01/01/2005',
       '01/01/2006', '01/01/2007', '01/01/2008', '01/01/2009',
       '01/01/2010', '01/01/2011', '01/01/2012', '01/01/2013',
       '01/01/2014', '01/01/2015', '01/01/2016', '01/01/2017'],
      dtype=object)

In [16]:
# Clean owner name up
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;,/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [17]:
outdf['in_ReportingUnitName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_ReportingUnitName']), axis=1)
outdf['in_ReportingUnitName'].unique()

array(['Tohichon Cr', 'Neshaminy Cr Below Little Neshaminy Creek',
       'Crum Cr Ridley Cr Marcus Hook Cr', 'West Branch Perkiomen Cr',
       'Neshaminy Cr Above Little Neshaminy Creek',
       'Perkiomen Cr Above Northeast Branch',
       'Tribs To Middle Schuylkill River Lower',
       'Lower Schuylkill Tribs Above Skippack Cr',
       'Poquessing Cr Pennypack Cr Tribs To Delaware River',
       'Lower Schuylkill Tribs Below Skippack Cr',
       'East Br Brandywine Cr', 'Chester Cr', 'West Br Brandywine Cr',
       'Brandywine Cr Main Stem', 'Wissahickon Cr', 'French Cr',
       'Upper West Br Delaware River',
       'Third Branch West Branch East Branch', 'Oquaga Cr',
       'Upper East Br Delaware River Above Platte Kill', 'Willowemoc Cr',
       'Lower East Br Delaware River',
       'Hankins Crbasket Crhoolahan Crabe Lord Crhumphries Crblue Mill Stream Tribs To Delaware River',
       'North Br Callicoon Cr', 'Unamed Tributaries To Delaware River',
       'W Branch Lackawaxen 

In [18]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [19]:
def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [20]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [21]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [22]:
outdf['in_ReportingUnitName'] = outdf.apply(lambda row: ensureEmptyString(row['in_ReportingUnitName']), axis=1)
outdf['in_ReportingUnitName'].unique()

array(['Tohichon Cr', 'Neshaminy Cr Below Little Neshaminy Creek',
       'Crum Cr Ridley Cr Marcus Hook Cr', 'West Branch Perkiomen Cr',
       'Neshaminy Cr Above Little Neshaminy Creek',
       'Perkiomen Cr Above Northeast Branch',
       'Tribs To Middle Schuylkill River Lower',
       'Lower Schuylkill Tribs Above Skippack Cr',
       'Poquessing Cr Pennypack Cr Tribs To Delaware River',
       'Lower Schuylkill Tribs Below Skippack Cr',
       'East Br Brandywine Cr', 'Chester Cr', 'West Br Brandywine Cr',
       'Brandywine Cr Main Stem', 'Wissahickon Cr', 'French Cr',
       'Upper West Br Delaware River',
       'Third Branch West Branch East Branch', 'Oquaga Cr',
       'Upper East Br Delaware River Above Platte Kill', 'Willowemoc Cr',
       'Lower East Br Delaware River',
       'Hankins Crbasket Crhoolahan Crabe Lord Crhumphries Crblue Mill Stream Tribs To Delaware River',
       'North Br Callicoon Cr', 'Unamed Tributaries To Delaware River',
       'W Branch Lackawaxen 

In [23]:
outdf['in_ReportingUnitTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_ReportingUnitTypeCV']), axis=1)
outdf['in_ReportingUnitTypeCV'].unique()

array(['Subbasin'], dtype=object)

In [24]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
for x in outdf['in_BeneficialUseCategory'].sort_values().unique():
    print(f'"' + x + '",')

"Agriculture",
"Bottled Water",
"Commercial",
"Cranberries",
"Fish Hatchery",
"Golf/CC",
"Hospital/Health",
"Hydroelectric Power",
"Industrial",
"Military",
"Mining",
"Non-Agricultural Irrigation",
"Nursery",
"Parks/Recreation",
"Prison",
"Public Water Supply",
"Refinery",
"Remediation",
"School",
"Ski/Snowmaking",
"Thermoelectric Power",
"Unassociated GW",
"Unassociated SW",


In [25]:
# Fixing n_Amount datatype
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').replace(0,"").fillna("")
outdf['in_Amount'].unique()

array([46.41692851369868, 401.471906655343, 320.76015657863053, ...,
       51.29363679552, 7.944061673520001, 42.798466085812684],
      dtype=object)

In [26]:
%%time

# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = WaterSourceNativeIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

Wall time: 521 ms


array(['wadeID1', 'wadeID2'], dtype=object)

In [27]:
# determine in_VariableSpecificCV
outdf['in_VariableSpecificCV'] = outdf['in_VariableCV'].astype(str) + "_" + outdf['in_AggregationIntervalUnitCV'].astype(str) + "_" + outdf['in_BeneficialUseCategory'].astype(str) + "_" + outdf['in_WaterSourceTypeCV'].astype(str)
outdf['in_VariableSpecificCV'].unique()

array(['Withdrawal_Annual_Public Water Supply_Groundwater',
       'Withdrawal_Annual_Public Water Supply_Surface Water',
       'Withdrawal_Annual_Thermoelectric Power_Surface Water',
       'Withdrawal_Annual_Thermoelectric Power_Groundwater',
       'Withdrawal_Annual_Hydroelectric Power_Surface Water',
       'Withdrawal_Annual_Industrial_Groundwater',
       'Withdrawal_Annual_Remediation_Groundwater',
       'Withdrawal_Annual_Refinery_Groundwater',
       'Withdrawal_Annual_Industrial_Surface Water',
       'Withdrawal_Annual_Remediation_Surface Water',
       'Withdrawal_Annual_Refinery_Surface Water',
       'Withdrawal_Annual_Mining_Groundwater',
       'Withdrawal_Annual_Mining_Surface Water',
       'Withdrawal_Annual_Agriculture_Groundwater',
       'Withdrawal_Annual_Cranberries_Groundwater',
       'Withdrawal_Annual_Golf/CC_Groundwater',
       'Withdrawal_Annual_Non-Agricultural Irrigation_Groundwater',
       'Withdrawal_Annual_Nursery_Groundwater',
       'Withdrawal

In [28]:
%%time

# Creating WaDE VariableSpecificUUID for easy VariableSpecificCV identification 
# use these inputs: VariableCV_AggregationIntervalUnitCV_BeneficalUse_WaterSourceTypeCV
# ----------------------------------------------------------------------------------------------------

# Create temp VariableSpecificUUID dataframe of unique water source.
def assignVariableSpecificUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "DRBCwr_V" + string1
    return outstring

dfVariableSpecificUUID = pd.DataFrame()
dfVariableSpecificUUID['in_VariableCV'] = outdf['in_VariableCV']
dfVariableSpecificUUID['in_AggregationIntervalUnitCV'] = outdf['in_AggregationIntervalUnitCV']
dfVariableSpecificUUID['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory']
dfVariableSpecificUUID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfVariableSpecificUUID = dfVariableSpecificUUID.drop_duplicates()

dftemp = pd.DataFrame(index=dfVariableSpecificUUID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfVariableSpecificUUID['in_VariableSpecificUUID'] = dftemp.apply(lambda row: assignVariableSpecificUUID(row['Count']), axis=1)
dfVariableSpecificUUID['linkKey'] = dfVariableSpecificUUID['in_VariableCV'].astype(str) + dfVariableSpecificUUID['in_AggregationIntervalUnitCV'].astype(str) + dfVariableSpecificUUID['in_BeneficialUseCategory'].astype(str) + dfVariableSpecificUUID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
VariableSpecificUUIDdict = pd.Series(dfVariableSpecificUUID.in_VariableSpecificUUID.values, index=dfVariableSpecificUUID.linkKey.astype(str)).to_dict()
def retrieveVariableSpecificUUID(A, B, C, D):
    if (A == '' and B == '' and C == '' and D == '') or (pd.isnull(A) and pd.isnull(B) and pd.isnull(C) and pd.isnull(D)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip() + str(C).strip() + str(D).strip()
        try:
            outList = VariableSpecificUUIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_VariableSpecificUUID'] = outdf.apply(lambda row: retrieveVariableSpecificUUID(row['in_VariableCV'], row['in_AggregationIntervalUnitCV'], row['in_BeneficialUseCategory'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificUUID'].unique()

Wall time: 646 ms


array(['DRBCwr_V1', 'DRBCwr_V2', 'DRBCwr_V3', 'DRBCwr_V4', 'DRBCwr_V5',
       'DRBCwr_V6', 'DRBCwr_V7', 'DRBCwr_V8', 'DRBCwr_V9', 'DRBCwr_V10',
       'DRBCwr_V11', 'DRBCwr_V12', 'DRBCwr_V13', 'DRBCwr_V14',
       'DRBCwr_V15', 'DRBCwr_V16', 'DRBCwr_V17', 'DRBCwr_V18',
       'DRBCwr_V19', 'DRBCwr_V20', 'DRBCwr_V21', 'DRBCwr_V22',
       'DRBCwr_V23', 'DRBCwr_V24', 'DRBCwr_V25', 'DRBCwr_V26',
       'DRBCwr_V27', 'DRBCwr_V28', 'DRBCwr_V29', 'DRBCwr_V30',
       'DRBCwr_V31', 'DRBCwr_V32', 'DRBCwr_V33', 'DRBCwr_V34',
       'DRBCwr_V35', 'DRBCwr_V36', 'DRBCwr_V37', 'DRBCwr_V38',
       'DRBCwr_V39', 'DRBCwr_V40', 'DRBCwr_V41', 'DRBCwr_V42',
       'DRBCwr_V43', 'DRBCwr_V44', 'DRBCwr_V45', 'DRBCwr_V46',
       'DRBCwr_V47', 'DRBCwr_V48', 'DRBCwr_V49', 'DRBCwr_V50',
       'DRBCwr_V51', 'DRBCwr_V52', 'DRBCwr_V53', 'DRBCwr_V54',
       'DRBCwr_V55', 'DRBCwr_V56', 'DRBCwr_V57', 'DRBCwr_V58',
       'DRBCwr_V59', 'DRBCwr_V60', 'DRBCwr_V61', 'DRBCwr_V62',
       'DRBCwr_V63', 'DRBCwr_V64', '

## Shapefile Data
- For attaching geometry to csv inputs.

In [29]:
# PoU Shapefile Data
# Shapefile input
dfshapetemp = gpd.read_file('shapefile/drb147.shp', crs="EPSG:4326")
print(len(dfshapetemp))
dfshapetemp.head()

147


Unnamed: 0,OBJECTID,SQM,STATEID,BASIN_ID,STREAMS,Shape_Leng,HUC8_Name,Shape_Le_1,Shape_Area,geometry
0,1,143.786,NY,DB-001,Upper West Br Delaware River,118207.57278,Upper Delaware,1.26136,0.04069,"POLYGON ((-74.82876 42.39163, -74.82836 42.392..."
1,2,210.122,NY,DB-010,Upper East Br Delaware River above Platte Kill,122934.23131,East Branch Delaware,1.28764,0.05932,"POLYGON ((-74.55828 42.36695, -74.55811 42.366..."
2,3,82.85,NY,DB-003,Middle West Br Delaware River,99450.67916,Upper Delaware,1.05879,0.0234,"POLYGON ((-75.04722 42.30325, -75.04739 42.304..."
3,4,52.263,NY,DB-002,Little Delaware River,67203.77633,Upper Delaware,0.72892,0.01477,"POLYGON ((-74.66058 42.29526, -74.66106 42.294..."
4,5,53.084,NY,DB-004,"Third Branch, West Branch, East Branch",58100.6679,Upper Delaware,0.61221,0.01499,"POLYGON ((-75.20544 42.25283, -75.20539 42.252..."


In [30]:
columnsList = ['in_ReportingUnitNativeID', 'geometry']
outshape = pd.DataFrame(columns=columnsList)
outshape['in_ReportingUnitNativeID'] = dfshapetemp['BASIN_ID'].replace("", 0).fillna(0).astype(str).str.strip()
outshape['geometry'] = dfshapetemp['geometry']
outshape = outshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
outshape.head()

Unnamed: 0,in_ReportingUnitNativeID,geometry
0,DB-001,"POLYGON ((-74.82876 42.39163, -74.82836 42.392..."
1,DB-010,"POLYGON ((-74.55828 42.36695, -74.55811 42.366..."
2,DB-003,"POLYGON ((-75.04722 42.30325, -75.04739 42.304..."
3,DB-002,"POLYGON ((-74.66058 42.29526, -74.66106 42.294..."
4,DB-004,"POLYGON ((-75.20544 42.25283, -75.20539 42.252..."


# Export the Output 

In [31]:
outdf.info

<bound method DataFrame.info of         WaDEUUID in_MethodUUID in_VariableSpecificUUID  \
0          drbc0     DRBCag_M1               DRBCwr_V1   
1          drbc1     DRBCag_M1               DRBCwr_V1   
2          drbc2     DRBCag_M1               DRBCwr_V1   
3          drbc3     DRBCag_M1               DRBCwr_V1   
4          drbc4     DRBCag_M1               DRBCwr_V1   
...          ...           ...                     ...   
42931  drbc21463     DRBCag_M1              DRBCwr_V70   
42932  drbc21464     DRBCag_M1              DRBCwr_V70   
42933  drbc21465     DRBCag_M1              DRBCwr_V70   
42934  drbc21466     DRBCag_M1              DRBCwr_V70   
42935  drbc21467     DRBCag_M1              DRBCwr_V70   

      in_AggregationIntervalUnitCV in_VariableCV  \
0                           Annual    Withdrawal   
1                           Annual    Withdrawal   
2                           Annual    Withdrawal   
3                           Annual    Withdrawal   
4          

In [32]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_VariableSpecificCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_EPSGCodeCV,in_ReportingUnitName,in_ReportingUnitNativeID,in_ReportingUnitProductVersion,in_ReportingUnitTypeCV,in_ReportingUnitUpdateDate,in_StateCV,in_AllocationCropDutyAmount,in_Amount,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_InterbasinTransferFromID,in_InterbasinTransferToID,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifierCV,in_TimeframeEnd,in_TimeframeStart
0,drbc0,DRBCag_M1,DRBCwr_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Groundwater,DRBCag_O1,,,,,wadeID1,Groundwater,4326,Tohichon Cr,DB-074,,Subbasin,,DE,,46.41693,Public Water Supply,,,,,,,,,,,,,,1990,,12/31/1990,01/01/1990
1,drbc1,DRBCag_M1,DRBCwr_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Groundwater,DRBCag_O1,,,,,wadeID1,Groundwater,4326,Neshaminy Cr Below Little Neshaminy Creek,DB-084,,Subbasin,,DE,,401.47191,Public Water Supply,,,,,,,,,,,,,,1990,,12/31/1990,01/01/1990
2,drbc2,DRBCag_M1,DRBCwr_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Groundwater,DRBCag_O1,,,,,wadeID1,Groundwater,4326,Crum Cr Ridley Cr Marcus Hook Cr,DB-114,,Subbasin,,DE,,320.76016,Public Water Supply,,,,,,,,,,,,,,1990,,12/31/1990,01/01/1990
3,drbc3,DRBCag_M1,DRBCwr_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Groundwater,DRBCag_O1,,,,,wadeID1,Groundwater,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,107.65659,Public Water Supply,,,,,,,,,,,,,,1990,,12/31/1990,01/01/1990
4,drbc4,DRBCag_M1,DRBCwr_V1,Annual,Withdrawal,Withdrawal_Annual_Public Water Supply_Groundwater,DRBCag_O1,,,,,wadeID1,Groundwater,4326,West Branch Perkiomen Cr,DB-106,,Subbasin,,DE,,354.37369,Public Water Supply,,,,,,,,,,,,,,1990,,12/31/1990,01/01/1990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42931,drbc21463,DRBCag_M1,DRBCwr_V70,Annual,Consumptive,Consumptive_Annual_Unassociated SW_Surface Water,DRBCag_O1,,,,,wadeID2,Surface Water,4326,Maiden Cr Below Saucony Creek,DB-099,,Subbasin,,DE,,0.00000,Unassociated SW,,,,,,,,,,,,,,2017,,12/31/2017,01/01/2017
42932,drbc21464,DRBCag_M1,DRBCwr_V70,Annual,Consumptive,Consumptive_Annual_Unassociated SW_Surface Water,DRBCag_O1,,,,,wadeID2,Surface Water,4326,Upper Tulpehocken Cr Above Blue Marsh Res,DB-100,,Subbasin,,DE,,51.29364,Unassociated SW,,,,,,,,,,,,,,2017,,12/31/2017,01/01/2017
42933,drbc21465,DRBCag_M1,DRBCwr_V70,Annual,Consumptive,Consumptive_Annual_Unassociated SW_Surface Water,DRBCag_O1,,,,,wadeID2,Surface Water,4326,Lower Tulpehocken Cr Below Blue Marsh Res,DB-101,,Subbasin,,DE,,7.94406,Unassociated SW,,,,,,,,,,,,,,2017,,12/31/2017,01/01/2017
42934,drbc21466,DRBCag_M1,DRBCwr_V70,Annual,Consumptive,Consumptive_Annual_Unassociated SW_Surface Water,DRBCag_O1,,,,,wadeID2,Surface Water,4326,Tribs To Middle Schuylkill River Lower,DB-102,,Subbasin,,DE,,42.79847,Unassociated SW,,,,,,,,,,,,,,2017,,12/31/2017,01/01/2017


In [33]:
# Export the output dataframe
outdf.to_csv('Pag_drbcMain.zip', compression=dict(method='zip', archive_name='Pag_drbcMain.csv'), index=False)   # The output, save as a zip
outshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.