# Pre-processing Colorado Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Display settings: show up to 999 columns of a DataFrame and render floats with
# five fixed decimals (keeps pandas from switching to scientific notation).
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:.5f}'.format)

In [2]:
# ---- working directory ----
# Every relative path used later in the notebook (e.g. "RawInputData/...") is resolved
# against this folder, so it has to be set before any file is read.
workingDirString = "G:/Shared drives/WaDE Data/Colorado/WaterAllocation_WaterUse"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Colorado/WaterAllocation_WaterUse


## Data Input 1 - DWR_Water_Right_-_Net_Amounts.zip
- water right and site info

In [3]:
# Input File - DWR_Water_Right_-_Net_Amounts.zip (water right and site info)
fileInput = "RawInputData/DWR_Water_Right_-_Net_Amounts.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only when the column is not there yet: tag each row with "in1<row index>" and write
# the tagged table back over the input zip so the same IDs are found on later runs.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    zipOptions = dict(method='zip', archive_name="DWR_Water_Right_-_Net_Amounts.csv")
    dfin1.to_csv(fileInput, compression=zipOptions, index=False)

print(len(dfin1))
dfin1.head(1)

  dfin1 = pd.read_csv(fileInput).replace(np.nan, "")


171214


Unnamed: 0,WDID,Structure Name,Structure Type,Water Source,GNIS ID,Stream Mile,DIV,WD,County,Q10,Q40,Q160,Section,Township,Range,PM,CoordsEW,CoordsEW Dir,CoordsNS,CoordsNS Dir,UTM x,UTM y,Latitude,Longitude,Location Accuracy,Adjudication Date,Previous Adj Date,Appropriation Date,Admin No,Order No,Priority No,Associated Case Numbers,Decreed Uses,Net Absolute,Net Conditional,Net APEX Absolute,Net APEX Conditional,Decreed Units,Seasonal Limits,Comments,Modified,More Information,Location,WaDEUUID
0,103860,MILLAGE RES,Reservoir,SOUTH PLATTE RIVER,201759.0,167.82,1,1,WELD,,NW,SW,3.0,5.0 N,64.0 W,S,,,,,538747.0,4475216.0,40.42668,-104.54323,GPS,12/31/1977,12/31/1976,03/31/1904,46386.19813,0,,W8623,1,150.0,0.0,0.0,0.0,A,No,,02/28/1992 12:00:00 AM,https://dwr.state.co.us/Tools/WaterRights/NetA...,"(40.426683, -104.543229)",coD0


In [4]:
# Creating Beneficial Use.
# Colorado stores decreed uses as a string of one-character codes (e.g. "19").
# Split that string and translate every code into its beneficial use name.

BenUseDict = {
    "0": "Storage",
    "1": "Irrigation",
    "2": "Municipal",
    "3": "Commercial",
    "4": "Industrial",
    "5": "Recreation",
    "6": "Fishery",
    "7": "Fire",
    "8": "Domestic",
    "9": "Stock",
    "A": "Augmentation",
    "B": "Export from Basin",
    "C": "Cumulative Accretion to River",
    "D": "Cumulative Depletion from River",
    "E": "Evaporative",
    "F": "Federal Reserved",
    "G": "Geothermal",
    "H": "Household Use Only",
    "K": "Snow Making",
    "M": "Minimum Streamflow",
    "N": "Net Effect on River",
    "P": "Power Generation",
    "Q": "Other",
    "R": "Recharge",
    "S": "Export from State",
    "T": "Transmountain Export",
    "W": "Wildlife",
    "X": "All Beneficial Uses",
}


def retrieveBenUse(colrowValue):
    """Translate a string of decreed-use codes into comma separated use names.

    Parameters
    ----------
    colrowValue : scalar
        One 'Decreed Uses' entry.  It is converted with ``str()`` and stripped, and
        every remaining character is looked up in ``BenUseDict``.

    Returns
    -------
    str
        The use names joined by "," in code order, or "" when the entry is blank,
        null, or contains any character that is not a key of ``BenUseDict``.
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""

    codes = str(colrowValue).strip()
    try:
        # One character == one code.  A whitespace character strips to "", which is
        # not a key, so such entries take the KeyError branch as well.
        return ",".join(BenUseDict[code.strip()] for code in codes)
    except KeyError:
        # An unrecognised code invalidates the whole entry instead of yielding a
        # partial list.  Only the failed lookup is caught; other errors surface.
        return ""

# Translate each row's 'Decreed Uses' code string into readable beneficial use names.
dfin1['in_BeneficialUseCategory'] = dfin1['Decreed Uses'].map(retrieveBenUse)
dfin1['in_BeneficialUseCategory'].unique()

array(['Irrigation', 'Irrigation,Stock',
       'Irrigation,Recreation,Fishery,Augmentation,Recharge,Wildlife',
       ..., 'Storage,Recreation,Fire,Wildlife',
       'Fire,Augmentation,Evaporative,Wildlife',
       'Storage,Recreation,Fire,Augmentation,Wildlife'], dtype=object)

In [5]:
# Determining WaterSourceTypeCV
# Lookup from the Colorado 'Structure Type' to the WaDE water source type.

WSTypeDict = {
    "Aquifer NNT/NT Reservation": "Surface Water",
    "Ditch": "Surface Water",
    "Ditch System": "Surface Water",
    "Exchange Plan": "Surface Water",
    "Measuring Point": "Surface Water",
    "Mine": "Surface Water",
    "Minimum Flow": "Surface Water",
    "Other": "Surface Water",
    "Pipeline": "Surface Water",
    "Power Plant": "Surface Water",
    "Pump": "Surface Water",
    "Reach": "Surface Water",
    "Reach (Aggregating)": "Surface Water",
    "Recharge Area": "Surface Water",
    "Recharge Area Group": "Surface Water",
    "Reservoir": "Surface Water",
    "Reservoir System": "Surface Water",
    "Seep": "Surface Water",
    "Spring": "Surface Water",
    "Stream Gage": "Surface Water",
    "Well": "Groundwater",
    "Well Field": "Groundwater",
    "Well Group": "Groundwater",
    "Augmentation/Replacement Plan": "Groundwater",
}


def retrieveWaterSourceTypeCV(colrowValue):
    """Map a structure type to its water source type.

    Parameters
    ----------
    colrowValue : scalar
        One 'Structure Type' entry; compared after ``str()`` and ``strip()``.

    Returns
    -------
    str
        The ``WSTypeDict`` value for the structure type, or "WaDE Blank" when the
        entry is blank, null, or not a key of ``WSTypeDict``.
    """
    if (colrowValue == "") or (pd.isnull(colrowValue)):
        return "WaDE Blank"
    # dict.get supplies the fallback for unknown structure types, so no exception
    # handling is needed for the lookup.
    return WSTypeDict.get(str(colrowValue).strip(), "WaDE Blank")

# Derive the water source type of every row from its 'Structure Type'.
dfin1['in_WaterSourceTypeCV'] = dfin1['Structure Type'].map(retrieveWaterSourceTypeCV)
dfin1['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'WaDE Blank'], dtype=object)

In [6]:
# Allocation_CFS
# If Net Absolute != 0 and Net Conditional != 0, return 0
# Elif Decreed Units = "C" and Net Absolute != 0, return Net Absolute
# Elif Decreed Units = "C" and Net Conditional != 0, return Net Conditional
# Else return 0
# NOTE(review): rows carrying both an absolute and a conditional amount get 0 here;
# confirm that is the intended handling.

# For creating Allocation_CFS
def assignAllocation_CFS(valA, valB, valC):
    """Return the flow amount of a right decreed in units "C", otherwise 0.

    valA is the 'Decreed Units' entry (stripped before comparing), valB the
    'Net Absolute' amount and valC the 'Net Conditional' amount.
    """
    units = str(valA).strip()
    hasAbsolute = valB != 0
    hasConditional = valC != 0

    if hasAbsolute and hasConditional:
        return 0
    if units != "C":
        return 0
    if hasAbsolute:
        return valB
    if hasConditional:
        return valC
    return 0

# Row-wise flow amount from the decreed units and the two net amounts.
dfin1['in_AllocationFlow_CFS'] = [
    assignAllocation_CFS(units, netAbsolute, netConditional)
    for units, netAbsolute, netConditional in zip(
        dfin1["Decreed Units"], dfin1["Net Absolute"], dfin1["Net Conditional"]
    )
]
dfin1['in_AllocationFlow_CFS'].unique()

array([0.000e+00, 5.330e-01, 4.450e-01, ..., 1.114e+03, 7.810e-01,
       2.825e+01])

In [7]:
# AllocationVolume_AF
# If Net Absolute != 0 and Net Conditional != 0, return 0
# Elif Decreed Units = "A" and Net Absolute != 0, return Net Absolute
# Elif Decreed Units = "A" and Net Conditional != 0, return Net Conditional
# Else return 0
# NOTE(review): rows carrying both an absolute and a conditional amount get 0 here;
# confirm that is the intended handling.

# For creating AllocationVolume_AF
def assignAllocationVolume_AF(valA, valB, valC):
    """Return the volume amount of a right decreed in units "A", otherwise 0.

    valA is the 'Decreed Units' entry (stripped before comparing), valB the
    'Net Absolute' amount and valC the 'Net Conditional' amount.
    """
    units = str(valA).strip()
    hasAbsolute = valB != 0
    hasConditional = valC != 0

    if hasAbsolute and hasConditional:
        return 0
    if units != "A":
        return 0
    if hasAbsolute:
        return valB
    if hasConditional:
        return valC
    return 0

# Row-wise volume amount from the decreed units and the two net amounts.
dfin1['in_AllocationVolume_AF'] = [
    assignAllocationVolume_AF(units, netAbsolute, netConditional)
    for units, netAbsolute, netConditional in zip(
        dfin1["Decreed Units"], dfin1["Net Absolute"], dfin1["Net Conditional"]
    )
]
dfin1['in_AllocationVolume_AF'].unique()

array([1.50e+02, 8.25e+01, 0.00e+00, ..., 8.77e+00, 2.03e+04, 7.91e+02])

In [8]:
# For creating AllocationLegalStatusCV
# If Net Absolute = 0 and Net Conditional = 0, then Conditional Absolute
# Elif Net Absolute = 0 and Net Conditional != 0, then Conditional
# Else, Absolute

def assignAllocationLegalStatusCV(valA, valB):
    """Classify a right from its Net Absolute (valA) and Net Conditional (valB) amounts."""
    if valA != 0:
        return "Absolute"
    return "Conditional Absolute" if valB == 0 else "Conditional"

# Row-wise legal status from the two net amounts.
dfin1['in_AllocationLegalStatusCV'] = [
    assignAllocationLegalStatusCV(netAbsolute, netConditional)
    for netAbsolute, netConditional in zip(dfin1['Net Absolute'], dfin1['Net Conditional'])
]
dfin1['in_AllocationLegalStatusCV'].unique()

array(['Absolute', 'Conditional Absolute', 'Conditional'], dtype=object)

In [9]:
# Need a unique identifier for WaDE AllocationNativeID.  Combine **Admin No**, **Order No**, **Decreed Units**, & **WDID** into a single string entry.

# For creating AllocationNativeID
def assignAllocationNativeID(colrowValueA, colrowValueB, colrowValueC, colrowValueD):
    """Build the allocation native ID "<A>-<B>-<C>-<D>".

    Each part is converted with ``str()`` and stripped of surrounding whitespace
    before joining.  The third part is stripped like the others so that padded
    values cannot produce a different ID for the same right.

    Returns
    -------
    str
        The four cleaned parts joined by "-".
    """
    parts = (colrowValueA, colrowValueB, colrowValueC, colrowValueD)
    return "-".join(str(part).strip() for part in parts)

# Row-wise native ID from Admin No, Order No, Decreed Units and WDID.
dfin1['in_AllocationNativeID'] = [
    assignAllocationNativeID(adminNo, orderNo, units, wdid)
    for adminNo, orderNo, units, wdid in zip(
        dfin1['Admin No'], dfin1['Order No'], dfin1['Decreed Units'], dfin1['WDID']
    )
]
dfin1['in_AllocationNativeID'].unique()

array(['46386.19813-0-A-103860', '46020.20608-0-A-103894',
       '39446.0-0-C-105485', ..., '62912.0-0-C-7100733',
       '62912.0-0-A-7105073', '62912.0-0-A-7105071'], dtype=object)

In [10]:
# Use list of WDIDs (from Division data) as inputs, retrieve time series data.
# Split the list into batches of 100; the CO API times out when a request runs too long.
# keep=False drops every WDID that occurs on more than one row, so only sites with a
# single water right row make the list.  NOTE(review): confirm that is intended.
dfin1_s = dfin1.drop_duplicates(subset=["WDID"], keep=False).reset_index()
wdidList = dfin1_s['WDID'].tolist()
batchSize = 100
wdidListB = [
    wdidList[start:start + batchSize]
    for start in range(0, len(wdidList), batchSize)
]

## Data Input 2 - time series water use info.zip
- retrieved via API, saved to a local zip for easy future access


In [11]:
# already done
# (kept for reference: this is how RawInputData/TimeSeriesInfo.zip was produced)
# SECURITY: the API key that was hard-coded here has been removed from the source.
# Supply it through an environment variable (name below is a placeholder) and
# treat the previously committed key as exposed.
# %%time
# # Time Series Dataframe
# dfts = pd.DataFrame()

# str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecmonth/?format=csv&wdid="
# str3 = "%2C&apiKey=" + os.environ["CO_DWR_API_KEY"]

# for i in range(len(wdidListB)):
#     lstC = wdidListB[i]
#     lstCa = '%2C'.join([str(n) for n in lstC]) 
#     urlInput = str2 + lstCa + str3
#     print(urlInput)
#     try:
#         tempdf = pd.read_csv(urlInput, skiprows=2).replace(np.nan, "")
#         dfts = pd.concat([dfts, tempdf])
#     except:
#         print("bad reponse")

# print(len(dfts))
# dfts.head()

In [12]:
# already done
# dfts.to_csv('RawInputData/TimeSeriesInfo.zip', compression=dict(method='zip', archive_name='TimeSeriesInfo.csv'), index=False)  # The output, save as a zip

In [13]:
# Input File - TimeSeriesInfo.zip (monthly time series water use records)
fileInput = "RawInputData/TimeSeriesInfo.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only when the column is not there yet: tag each row with "in2<row index>" and write
# the tagged table back over the input zip so the same IDs are found on later runs.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    zipOptions = dict(method='zip', archive_name="TimeSeriesInfo.csv")
    dfin2.to_csv(fileInput, compression=zipOptions, index=False)

print(len(dfin2))
dfin2.head(1)

984739


Unnamed: 0,wdid,waterClassNum,wcIdentifier,measInterval,measCount,dataMeasDate,dataValue,measUnits,obsCode,approvalStatus,modified,WaDEUUID
0,1100974,94455,1100974 S:1 F: U:1 T: G: To:,Daily,1,1990-09,0.0,ACFT,*,Approved,2015-02-24T14:29:00.0000000,in20


In [14]:
# Left merge sites to water use: every water right row is kept and the time series
# records whose 'wdid' equals the right's 'WDID' are attached to it.
# Both inputs carry a 'WaDEUUID' column, so the result has WaDEUUID_x (water rights)
# and WaDEUUID_y (time series).
# NOTE: this overwrites dfin1, so re-running the cell merges a second time.
dfin1 = pd.merge(dfin1, dfin2, how='left', left_on='WDID', right_on='wdid')
print(len(dfin1))
dfin1.head(1)

1152850


Unnamed: 0,WDID,Structure Name,Structure Type,Water Source,GNIS ID,Stream Mile,DIV,WD,County,Q10,Q40,Q160,Section,Township,Range,PM,CoordsEW,CoordsEW Dir,CoordsNS,CoordsNS Dir,UTM x,UTM y,Latitude,Longitude,Location Accuracy,Adjudication Date,Previous Adj Date,Appropriation Date,Admin No,Order No,Priority No,Associated Case Numbers,Decreed Uses,Net Absolute,Net Conditional,Net APEX Absolute,Net APEX Conditional,Decreed Units,Seasonal Limits,Comments,Modified,More Information,Location,WaDEUUID_x,in_BeneficialUseCategory,in_WaterSourceTypeCV,in_AllocationFlow_CFS,in_AllocationVolume_AF,in_AllocationLegalStatusCV,in_AllocationNativeID,wdid,waterClassNum,wcIdentifier,measInterval,measCount,dataMeasDate,dataValue,measUnits,obsCode,approvalStatus,modified,WaDEUUID_y
0,103860,MILLAGE RES,Reservoir,SOUTH PLATTE RIVER,201759.0,167.82,1,1,WELD,,NW,SW,3.0,5.0 N,64.0 W,S,,,,,538747.0,4475216.0,40.42668,-104.54323,GPS,12/31/1977,12/31/1976,03/31/1904,46386.19813,0,,W8623,1,150.0,0.0,0.0,0.0,A,No,,02/28/1992 12:00:00 AM,https://dwr.state.co.us/Tools/WaterRights/NetA...,"(40.426683, -104.543229)",coD0,Irrigation,Surface Water,0.0,150.0,Absolute,46386.19813-0-A-103860,,,,,,,,,,,,


In [15]:
# create output POD dataframe
# Columns are added one at a time below; the order of these assignments is the
# column order of the resulting frame, so keep the statement order intact.
df = pd.DataFrame()

# Data Assessment UUID
# 'WaDEUUID_x' is the UUID of the water right input (the left side of the merge).
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "COwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "COwr_V1" # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Discharge Flow"

# Organization Info
df['in_OrganizationUUID'] = "COwr_O1"

# WaterSource Info
df['in_Geometry'] = ""  # assigned a second time (same value) in the Site Info section
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = dfin1['Water Source']
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfin1['in_WaterSourceTypeCV']

# Site Info
# Fields set to "" have no source column in the Colorado inputs used here.
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfin1['Location Accuracy']
df['in_County'] = dfin1['County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # repeat of the WaterSource Info assignment; the column already exists, so its position is unchanged
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['Latitude']
df['in_Longitude'] = dfin1['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfin1['Structure Name']
# WDID as an integer-formatted string; blank or missing WDIDs become "0".
df['in_SiteNativeID'] = dfin1['WDID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1['Structure Type'].astype(str)
df['in_StateCV'] = "CO"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
# Fields set to "" have no source column in the Colorado inputs used here.
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['in_AllocationFlow_CFS'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['in_AllocationLegalStatusCV']
df['in_AllocationNativeID'] =  dfin1['in_AllocationNativeID'].astype(str)
df['in_AllocationOwner'] = ""
df['in_AllocationPriorityDate'] = dfin1['Appropriation Date']
df['in_AllocationSDWISIdentifierCV'] = ""
# Every right is given the full calendar year as its timeframe.
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0 # 1 or 0, if we want this data exempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfin1['More Information']

# Site VariableAmounts Info
df['in_Amount'] = dfin1['dataValue']
df['in_AssociatedNativeAllocationIDs'] = dfin1['in_AllocationNativeID'].astype(str)
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = ""
# The report year and both timeframe columns all start out as the raw 'dataMeasDate'
# value; each one is converted separately in the "Clean Data / data types" cells.
df['in_ReportYearCV'] = dfin1['dataMeasDate']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['dataMeasDate']
df['in_TimeframeStart'] = dfin1['dataMeasDate']
# The fields listed below are shared with the sections above and already set there.
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

# Drop rows that are identical across every output column and renumber the index.
# drop_duplicates returns a new frame, so df itself is left untouched.
outdf1 = df.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

665262


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,coD0,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,SOUTH PLATTE RIVER,,Surface Water,,GPS,WELD,4326,,,,40.42668,-104.54323,,,POD,MILLAGE RES,103860,,Reservoir,CO,,,,,,,,,,0.0,Absolute,46386.19813-0-A-103860,,03/31/1904,,12/31,01/01,,150.0,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,46386.19813-0-A-103860,,,,,,
1,coD1,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,WILLOW CREEK,,Surface Water,,Digitized,WELD,4326,,,,40.53346,-104.56242,,,POD,MASON RES,103894,,Reservoir,CO,,,,,,,,,,0.0,Absolute,46020.20608-0-A-103894,,06/04/1906,,12/31,01/01,,82.5,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,46020.20608-0-A-103894,,,,,,
2,coD2,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,GROUNDWATER: SOUTH PLATTE RIVER,,Groundwater,,GPS,WELD,4326,,,,40.61786,-104.66634,,,POD,ANDERSEN WELL 2-12608R,105485,,Well,CO,,,,,,,,,,0.533,Absolute,39446.0-0-C-105485,,12/31/1957,,12/31,01/01,,0.0,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,39446.0-0-C-105485,,,,,,
3,coD3,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,GROUNDWATER: SOUTH PLATTE RIVER,,Groundwater,,GPS,WELD,4326,,,,40.62515,-104.67519,,,POD,ANDERSEN WELL 7057,105489,,Well,CO,,,,,,,,,,0.445,Absolute,31529.0-0-C-105489,,04/28/1936,,12/31,01/01,,0.0,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,31529.0-0-C-105489,,,,,,
4,coD4,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,LOST CREEK,,Surface Water,,Digitized,WELD,4326,,,,40.12613,-104.37217,,,POD,SHOENEMAN HOLDING POND 2,103884,,Reservoir,CO,,,,,,,,,,0.0,Absolute,55882.3068-0-A-103884,,12/31/1933,,12/31,01/01,,6.0,"Irrigation,Stock",,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,55882.3068-0-A-103884,,,,,,


## Concatenate POD and POU Data.  Make needed changes

In [16]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [17]:
# Concatenate dataframes, drop exact duplicate rows, renumber, and blank out NaN.
frames = [outdf1]  # list all out dataframes here
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

665262


## Clean Data / data types

In [18]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return ``Val`` as a title-cased string without special characters.

    The characters ``$ @ & . ; / ) ( -`` are deleted (not replaced by a space, so
    "A/B" becomes "Ab"), the result is title-cased, each double space is replaced
    by a single space once, and surrounding whitespace is stripped.

    Parameters
    ----------
    Val : scalar
        Value to clean; converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    Val = str(Val)
    # Raw string: the pattern contains "\)", which is an invalid escape sequence in a
    # normal string literal and triggers a warning on current Python versions.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [19]:
# Strip special characters from / title-case the water source names.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

array(['South Platte River', 'Willow Creek',
       'Groundwater: South Platte River', ...,
       'South Branch Boxelder Creek', 'Groundwater: East Beaver Creek',
       'Groundwater: Fourteenmile Creek'], dtype=object)

In [20]:
# Strip special characters from / title-case the water source types.
# NOTE(review): title-casing turns the "WaDE Blank" placeholder into "Wade Blank"
# (see the output below) - confirm downstream steps expect that spelling.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Wade Blank'], dtype=object)

In [21]:
# Strip special characters from / title-case the county names.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

array(['Weld', 'Morgan', 'Adams', 'Elbert', 'Arapahoe', 'El Paso',
       'Washington', 'Douglas', 'Denver', 'Jefferson', 'Broomfield',
       'Lincoln', 'Larimer', 'Unknown', 'Boulder', 'Grand', 'Gilpin',
       'Clear Creek', 'Teller', 'Conejos', 'Park', 'Lake', 'Chaffee',
       'Huerfano', 'Otero', 'Las Animas', '', 'Pueblo', 'Fremont',
       'Custer', 'Saguache', 'Pitkin', 'Crowley', 'Bent', 'Kiowa',
       'Costilla', 'Mineral', 'Rio Grande', 'Alamosa', 'Hinsdale',
       'San Juan', 'Archuleta', 'Phillips', 'Gunnison', 'La Plata',
       'Montezuma', 'Dolores', 'San Miguel', 'Summit', 'Eagle',
       'Garfield', 'Delta', 'Rio Blanco', 'Mesa', 'Routt', 'Montrose',
       'Moffat', 'Ouray', 'Jackson', 'Kit Carson', 'Yuma', 'Cheyenne',
       'Logan', 'Sedgwick', 'Prowers', 'Baca'], dtype=object)

In [22]:
# Strip special characters from / title-case the site names.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

array(['Millage Res', 'Mason Res', 'Andersen Well 212608R', ...,
       'Homestake Well No 3', 'Garcia Well', 'Homestake Pond Diversion'],
      dtype=object)

In [23]:
# Strip special characters from / title-case the site types.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(removeSpecialCharsFunc)
outdf['in_SiteTypeCV'].unique()

array(['Reservoir', 'Well', 'Ditch', 'Spring', 'Recharge Area', 'Other',
       'Pipeline', 'Well Field', 'Pump', 'Seep', 'Reservoir System',
       'Reach', 'Well Group', 'Minimum Flow', 'Mine', 'Power Plant',
       'Measuring Point', 'Augmentationreplacement Plan',
       'Recharge Area Group', 'Stream Gage', 'Exchange Plan',
       'Ditch System', 'Reach Aggregating', 'Aquifer Nntnt Reservation',
       'Entity', 'Stream Confluence', 'Livestock Water Tank'],
      dtype=object)

In [24]:
# Strip special characters from / title-case the allocation owners.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [25]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as a stripped string, with missing values mapped to "".

    Parameters
    ----------
    val : scalar
        Value to normalise.

    Returns
    -------
    str
        "" when ``val`` is a null scalar (None, NaN, NaT, pd.NA), or when its
        stripped text is empty or the literal "nan"; otherwise the stripped text.
    """
    # The null test has to run on the raw value: once converted with str(), a
    # missing value is ordinary text ("None", "NaT", ...) and can no longer be detected.
    # is_scalar guards against list-likes, for which pd.isnull returns an array.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text == "nan":
        return ""
    return text

In [26]:
# Normalise blank / "nan" water source names to empty strings.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['South Platte River', 'Willow Creek',
       'Groundwater: South Platte River', ...,
       'South Branch Boxelder Creek', 'Groundwater: East Beaver Creek',
       'Groundwater: Fourteenmile Creek'], dtype=object)

In [27]:
# Normalise blank / "nan" water source types to empty strings.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater', 'Wade Blank'], dtype=object)

In [28]:
# Normalise blank / "nan" site types to empty strings.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

array(['Reservoir', 'Well', 'Ditch', 'Spring', 'Recharge Area', 'Other',
       'Pipeline', 'Well Field', 'Pump', 'Seep', 'Reservoir System',
       'Reach', 'Well Group', 'Minimum Flow', 'Mine', 'Power Plant',
       'Measuring Point', 'Augmentationreplacement Plan',
       'Recharge Area Group', 'Stream Gage', 'Exchange Plan',
       'Ditch System', 'Reach Aggregating', 'Aquifer Nntnt Reservation',
       'Entity', 'Stream Confluence', 'Livestock Water Tank'],
      dtype=object)

In [29]:
# Normalise blank / "nan" site names to empty strings.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

array(['Millage Res', 'Mason Res', 'Andersen Well 212608R', ...,
       'Homestake Well No 3', 'Garcia Well', 'Homestake Pond Diversion'],
      dtype=object)

In [30]:
# Normalise blank / "nan" county names to empty strings.
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

array(['Weld', 'Morgan', 'Adams', 'Elbert', 'Arapahoe', 'El Paso',
       'Washington', 'Douglas', 'Denver', 'Jefferson', 'Broomfield',
       'Lincoln', 'Larimer', 'Unknown', 'Boulder', 'Grand', 'Gilpin',
       'Clear Creek', 'Teller', 'Conejos', 'Park', 'Lake', 'Chaffee',
       'Huerfano', 'Otero', 'Las Animas', '', 'Pueblo', 'Fremont',
       'Custer', 'Saguache', 'Pitkin', 'Crowley', 'Bent', 'Kiowa',
       'Costilla', 'Mineral', 'Rio Grande', 'Alamosa', 'Hinsdale',
       'San Juan', 'Archuleta', 'Phillips', 'Gunnison', 'La Plata',
       'Montezuma', 'Dolores', 'San Miguel', 'Summit', 'Eagle',
       'Garfield', 'Delta', 'Rio Blanco', 'Mesa', 'Routt', 'Montrose',
       'Moffat', 'Ouray', 'Jackson', 'Kit Carson', 'Yuma', 'Cheyenne',
       'Logan', 'Sedgwick', 'Prowers', 'Baca'], dtype=object)

In [31]:
# Normalise blank / "nan" allocation owners to empty strings.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [32]:
# Normalise blank / "nan" beneficial use entries to empty strings.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
# Distinct individual uses across all comma separated entries, sorted for review.
allUses = ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')
uniqueList = sorted({use.strip() for use in allUses})
uniqueList

['All Beneficial Uses',
 'Augmentation',
 'Commercial',
 'Cumulative Accretion to River',
 'Domestic',
 'Evaporative',
 'Export from Basin',
 'Export from State',
 'Federal Reserved',
 'Fire',
 'Fishery',
 'Geothermal',
 'Household Use Only',
 'Industrial',
 'Irrigation',
 'Minimum Streamflow',
 'Municipal',
 'Other',
 'Power Generation',
 'Recharge',
 'Recreation',
 'Snow Making',
 'Stock',
 'Storage',
 'Transmountain Export',
 'Wildlife']

In [33]:
# Ensure Latitude entry is numeric; zero and unparseable values become "" for later removal
latitudeNumeric = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
outdf['in_Latitude'] = latitudeNumeric.replace(0, "").fillna("")
outdf['in_Latitude'].unique()

array([40.426683, 40.53346, 40.617859, ..., 37.703248, 39.304927,
       37.707755], dtype=object)

In [34]:
# Ensure Longitude entry is numeric; zero and unparseable values become "" for later removal
longitudeNumeric = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
outdf['in_Longitude'] = longitudeNumeric.replace(0, "").fillna("")
outdf['in_Longitude'].unique()

array([-104.543229, -104.562424, -104.666336, ..., -108.031428,
       -106.260375, -108.032757], dtype=object)

In [35]:
# Fixing in_AllocationFlow_CFS datatype: numeric where possible, "" for zero / unparseable values
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNumeric.replace(0, "").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array(['', 0.533, 0.445, ..., 1114.0, 0.781, 28.25], dtype=object)

In [36]:
# Fixing in_AllocationVolume_AF datatype: numeric where possible, "" for zero / unparseable values
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeNumeric.replace(0, "").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([150.0, 82.5, '', ..., 8.77, 20300.0, 791.0], dtype=object)

In [37]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
amountRounded = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2)
outdf['in_Amount'] = amountRounded.replace(0, "").fillna("")
outdf['in_Amount'].unique()

array(['', 3.97, 39.67, ..., 126.84, 174.83, 113.36], dtype=object)

In [38]:
# PopulationServed: coerce to whole numbers, then turn zero / missing entries into ""
# (the column was filled with "" when the frame was built, so every entry ends up blank).
populationWhole = (
    pd.to_numeric(outdf['in_PopulationServed'], errors='coerce')
    .round()
    .replace("", 0)
    .fillna(0)
    .astype(int)
)
outdf['in_PopulationServed'] = populationWhole.replace(0, "").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [39]:
# Update datatype of Priority Date to fit WaDE 2.0 structure:
# parse to datetime, render as MM/DD/YYYY text, then parse that text again.
priorityDates = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDates.dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['1904-03-31 00:00:00', '1906-06-04 00:00:00', '1957-12-31 00:00:00',
 '1936-04-28 00:00:00', '1933-12-31 00:00:00', '1888-10-01 00:00:00',
 '1868-04-20 00:00:00', '1934-06-20 00:00:00', '1933-09-20 00:00:00',
 '1955-04-14 00:00:00',
 ...
 '2004-06-22 00:00:00', '2022-12-22 00:00:00', '1872-06-30 00:00:00',
 '2021-05-13 00:00:00', '2021-11-22 00:00:00', '2016-06-18 00:00:00',
 '2023-05-10 00:00:00', '2023-01-24 00:00:00', '1995-07-14 00:00:00',
 '2022-10-28 00:00:00']
Length: 25486, dtype: datetime64[ns]

In [40]:
from pandas.tseries.offsets import MonthBegin, MonthEnd

# Convert TimeframeEnd to YYYY-MM-DD format.
# The column holds the raw 'dataMeasDate' text (e.g. "1990-09") or "" at this point.
# Step 1: parse; anything unparseable (including "") is coerced to a missing value.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
# Step 2: add MonthEnd(1) so the date moves to a month end
# (the output below shows 1990-09 ending up as 1990-09-30).
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], format="%Y%m") + MonthEnd(1)
# Step 3: round-trip through MM/DD/YYYY text; the displayed result is datetime64[ns].
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
[                'NaT', '1990-09-30 00:00:00', '2005-05-31 00:00:00',
 '2005-06-30 00:00:00', '2005-07-31 00:00:00', '2005-08-31 00:00:00',
 '2005-09-30 00:00:00', '2006-05-31 00:00:00', '2006-06-30 00:00:00',
 '2006-07-31 00:00:00',
 ...
 '1911-11-30 00:00:00', '1910-05-31 00:00:00', '1910-06-30 00:00:00',
 '1910-07-31 00:00:00', '1910-08-31 00:00:00', '1901-08-31 00:00:00',
 '1901-09-30 00:00:00', '1901-10-31 00:00:00', '1900-04-30 00:00:00',
 '1900-11-30 00:00:00']
Length: 1402, dtype: datetime64[ns]

In [41]:
# Convert TimeframeStart to YYYY-MM-DD format.
# The column holds the raw 'dataMeasDate' text (e.g. "1990-09") or "" at this point.
# Parse it (unparseable values, including "", are coerced to missing), then round-trip
# through MM/DD/YYYY text; the output below shows the first day of each month.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
[                'NaT', '1990-09-01 00:00:00', '2005-05-01 00:00:00',
 '2005-06-01 00:00:00', '2005-07-01 00:00:00', '2005-08-01 00:00:00',
 '2005-09-01 00:00:00', '2006-05-01 00:00:00', '2006-06-01 00:00:00',
 '2006-07-01 00:00:00',
 ...
 '1911-11-01 00:00:00', '1910-05-01 00:00:00', '1910-06-01 00:00:00',
 '1910-07-01 00:00:00', '1910-08-01 00:00:00', '1901-08-01 00:00:00',
 '1901-09-01 00:00:00', '1901-10-01 00:00:00', '1900-04-01 00:00:00',
 '1900-11-01 00:00:00']
Length: 1402, dtype: datetime64[ns]

In [42]:
# extract year out
# The column holds the raw 'dataMeasDate' text at this point; parse it, round-trip
# through MM/DD/YYYY text, and keep only the year.
outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True)
outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
# Rows without a date get year 0, so the final text value for them is '0'.
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

array(['0', '1990', '2005', '2006', '2007', '2008', '2009', '2010',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2022', '2001', '2002', '1998', '1999', '2000', '2003',
       '2011', '2021', '2023', '2004', '2024', '1995', '1996', '1992',
       '1993', '1994', '1997', '1991', '1989', '1969', '1970', '1971',
       '1972', '1946', '1947', '1948', '1956', '1957', '1973', '1982',
       '1980', '1983', '1912', '1913', '1954', '1974', '1975', '1976',
       '1977', '1978', '1985', '1987', '1988', '1979', '1981', '1984',
       '1986', '1927', '1955', '1926', '1933', '1934', '1935', '1936',
       '1937', '1938', '1939', '1940', '1943', '1944', '1965', '1966',
       '1967', '1968', '1941', '1942', '1945', '1949', '1963', '1964',
       '1922', '1923', '1924', '1925', '1928', '1929', '1920', '1930',
       '1931', '1932', '1961', '1962', '1950', '1951', '1952', '1959',
       '1960', '1953', '1958', '1899', '1911', '1915', '1916', '1917',
       '1

In [43]:
# Assign Primary Use Category

import sys
# NOTE(review): absolute, user-specific path - the notebook only runs on a machine
# where this folder exists; consider making it configurable.
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

# Each row's beneficial use list is handed to the custom helper.
outdf['in_PrimaryUseCategory'] = outdf['in_BeneficialUseCategory'].map(
    AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory
)
outdf['in_PrimaryUseCategory'].unique()

array(['Agriculture Irrigation', 'Commercial/Industrial',
       'In-stream Flow', 'Reservoir Storage', 'Livestock',
       'Aquifer Recharge', 'Aquaculture', 'Unspecified', 'Domestic',
       'Public Supply', 'Recreation', 'Other', 'Fire', 'Hydroelectric',
       'Snow', 'Geothermal'], dtype=object)

In [44]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Build the WaDE VariableSpecificCV label from its four components.

    Every argument is converted to a string and stripped of surrounding
    whitespace; the primary-use component (inPU) is additionally title-cased.
    The pieces are joined with underscores in the order inV, inAIU, inPU, inWST.
    """
    pieces = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(pieces)

# Combine the four component columns, row by row, into the VariableSpecificCV label.
variableSpecificInputs = zip(outdf['in_VariableCV'],
                             outdf['in_AggregationIntervalUnitCV'],
                             outdf['in_PrimaryUseCategory'],
                             outdf['in_WaterSourceTypeCV'])
outdf['in_VariableSpecificCV'] = [createVariableSpecificCV(*parts) for parts in variableSpecificInputs]
outdf['in_VariableSpecificCV'].unique()

array(['Discharge Flow_Monthly_Agriculture Irrigation_Surface Water',
       'Discharge Flow_Monthly_Agriculture Irrigation_Groundwater',
       'Discharge Flow_Monthly_Commercial/Industrial_Groundwater',
       'Discharge Flow_Monthly_In-Stream Flow_Surface Water',
       'Discharge Flow_Monthly_Reservoir Storage_Surface Water',
       'Discharge Flow_Monthly_Livestock_Groundwater',
       'Discharge Flow_Monthly_Aquifer Recharge_Surface Water',
       'Discharge Flow_Monthly_Livestock_Surface Water',
       'Discharge Flow_Monthly_Aquaculture_Surface Water',
       'Discharge Flow_Monthly_Unspecified_Groundwater',
       'Discharge Flow_Monthly_Domestic_Groundwater',
       'Discharge Flow_Monthly_Aquaculture_Groundwater',
       'Discharge Flow_Monthly_Public Supply_Groundwater',
       'Discharge Flow_Monthly_Domestic_Surface Water',
       'Discharge Flow_Monthly_Commercial/Industrial_Surface Water',
       'Discharge Flow_Monthly_Recreation_Surface Water',
       'Discharge Flow_

In [45]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom ID string: the prefix "wadeId" followed by str(colRowValue)."""
    return "wadeId" + str(colRowValue)

# Unique, whitespace-trimmed (water source name, water source type) pairs found in outdf.
dfTempID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'].astype(str).str.strip(),
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the unique pairs 1..N and turn each number into a "wadeId<N>" identifier.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount['Count'].apply(assignIdValueFunc)

# Lookup table used by retrieveIdValueFunc: concatenated name + type -> generated ID.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing native ID, or the generated one when it is blank.

    checkVal is converted to a string and stripped; if anything remains it is
    returned as-is. Otherwise the stripped string forms of valA and valB are
    concatenated and used as the key into the module-level IdDict
    (a KeyError is raised if the key is absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

# Keep a provided in_WaterSourceNativeID; blank ones get the generated ID for their name + type.
waterSourceRows = zip(outdf['in_WaterSourceNativeID'], outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
outdf['in_WaterSourceNativeID'] = [retrieveIdValueFunc(nativeId, sourceName, sourceType)
                                   for nativeId, sourceName, sourceType in waterSourceRows]
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId3247', 'wadeId3248',
       'wadeId3249'], dtype=object)

In [46]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Prefix the string form of colRowValue with "wadeId" and return the result."""
    return f"wadeId{colRowValue!s}"

# Unique, whitespace-trimmed combinations of latitude, longitude, site name and site type.
siteKeyColumns = ['in_Latitude', 'in_Longitude', 'in_SiteName', 'in_SiteTypeCV']
dfTempID = pd.DataFrame()
for columnName in siteKeyColumns:
    dfTempID[columnName] = outdf[columnName].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the unique combinations 1..N and turn each number into a "wadeId<N>" identifier.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount['Count'].apply(assignIdValueFunc)

# linkKey is the four key columns concatenated in order, with no separator.
siteLinkKey = dfTempID[siteKeyColumns[0]].astype(str)
for columnName in siteKeyColumns[1:]:
    siteLinkKey = siteLinkKey + dfTempID[columnName].astype(str)
dfTempID['linkKey'] = siteLinkKey

# IdDict is rebound here to the site lookup: linkKey -> generated site ID.
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_SiteNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing site native ID, or the generated one when it is blank.

    checkVal is converted to a string and stripped; a non-empty result is
    returned unchanged. Otherwise valA..valD are each stringified and stripped,
    concatenated in order, and looked up in the module-level IdDict
    (a KeyError is raised if the key is absent).
    """
    existingId = str(checkVal).strip()
    if existingId:
        return existingId
    linkKeyVal = "".join(str(part).strip() for part in (valA, valB, valC, valD))
    return IdDict[linkKeyVal]

# Resolve in_SiteNativeID for every row: keep a provided ID, otherwise use the generated "wadeId" value.
def _resolveSiteNativeID(row):
    return retrieveIdValueFunc(row['in_SiteNativeID'],
                               row['in_Latitude'], row['in_Longitude'],
                               row['in_SiteName'], row['in_SiteTypeCV'])

outdf['in_SiteNativeID'] = outdf.apply(_resolveSiteNativeID, axis=1)
outdf['in_SiteNativeID'].unique()

array(['103860', '103894', '105485', ..., '7105073', '1108289', '7100733'],
      dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For Colorado, we don't want water rights whose legal status is: Conditional

In [47]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal status values to remove
dropLegalStatusList = ["Conditional"] # enter string entries here

# keep only the rows whose legal status is not in the list above (exact matches only)
isDroppedStatus = outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf.loc[~isDroppedStatus].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

653553


array(['Absolute', 'Conditional Absolute'], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [48]:
# # PoU Shapefile Data
# shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # zipped folder of the shp file

# dfPoUshapetemp = gpd.read_file(shapefileInput)
# dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [49]:
# # create temp dataframe to hold native ID and geometry from shapefile input
# columnsList = ['in_SiteNativeID', 'geometry']
# dfPoUshape = pd.DataFrame(columns=columnsList)

# # assign values to temp dataframe based on shapefile input
# # for in_SiteNativeID assure ID value is the same as that listed above for POU info.
# dfPoUshape['in_SiteNativeID'] = "POU" + ""
# dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# print(len(dfPoUshape))
# dfPoUshape.head()

## Export Outputs

In [50]:
# Print a summary of outdf (row count, column names, non-null counts, dtypes) as a pre-export check.
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653553 entries, 0 to 653552
Data columns (total 74 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      653553 non-null  object        
 1   in_MethodUUID                                 653553 non-null  object        
 2   in_VariableSpecificUUID                       653553 non-null  object        
 3   in_AggregationIntervalUnitCV                  653553 non-null  object        
 4   in_VariableCV                                 653553 non-null  object        
 5   in_OrganizationUUID                           653553 non-null  object        
 6   in_Geometry                                   653553 non-null  object        
 7   in_GNISFeatureNameCV                          653553 non-null  object        
 8   in_WaterQualityIndicatorCV                    653553 n

In [51]:
# Display outdf; the rendered output is a truncated preview showing the first and last rows.
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,coD0,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,South Platte River,wadeId1,Surface Water,,GPS,Weld,4326,,,,40.42668,-104.54323,,,POD,Millage Res,103860,,Reservoir,CO,,,,,,,,,,,Absolute,46386.19813-0-A-103860,,1904-03-31,,12/31,01/01,,150.00000,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,46386.19813-0-A-103860,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
1,coD1,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Willow Creek,wadeId2,Surface Water,,Digitized,Weld,4326,,,,40.53346,-104.56242,,,POD,Mason Res,103894,,Reservoir,CO,,,,,,,,,,,Absolute,46020.20608-0-A-103894,,1906-06-04,,12/31,01/01,,82.50000,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,46020.20608-0-A-103894,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
2,coD2,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Groundwater: South Platte River,wadeId3,Groundwater,,GPS,Weld,4326,,,,40.61786,-104.66634,,,POD,Andersen Well 212608R,105485,,Well,CO,,,,,,,,,,0.53300,Absolute,39446.0-0-C-105485,,1957-12-31,,12/31,01/01,,,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,39446.0-0-C-105485,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
3,coD3,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Groundwater: South Platte River,wadeId3,Groundwater,,GPS,Weld,4326,,,,40.62515,-104.67519,,,POD,Andersen Well 7057,105489,,Well,CO,,,,,,,,,,0.44500,Absolute,31529.0-0-C-105489,,1936-04-28,,12/31,01/01,,,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,31529.0-0-C-105489,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
4,coD4,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Lost Creek,wadeId4,Surface Water,,Digitized,Weld,4326,,,,40.12613,-104.37217,,,POD,Shoeneman Holding Pond 2,103884,,Reservoir,CO,,,,,,,,,,,Absolute,55882.3068-0-A-103884,,1933-12-31,,12/31,01/01,,6.00000,"Irrigation,Stock",,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,55882.3068-0-A-103884,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653548,coD171201,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Groundwater: South Platte River,wadeId3,Groundwater,,GPS,Logan,4326,,,,40.57040,-103.37314,,,POD,Brunkhardt Well 66513F,6405224,,Well,CO,,,,,,,,,,2.15000,Absolute,42154.0-0-C-6405224,,1965-05-31,,12/31,01/01,,,Irrigation,,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,42154.0-0-C-6405224,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
653549,coD171202,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Groundwater: Arkansas River,wadeId395,Groundwater,,GPS,Pueblo,4326,,,,38.25957,-104.39391,,,POD,Solada Well No 2,1405143,,Well,CO,,,,,,,,,,3.02000,Absolute,31410.0-0-C-1405143,,1935-12-31,,12/31,01/01,,,"Irrigation,Industrial",,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,31410.0-0-C-1405143,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
653550,coD171205,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Groundwater: Arkansas River,wadeId395,Groundwater,,GPS,Pueblo,4326,,,,38.25962,-104.39215,,,POD,Solada Well No 3,1405144,,Well,CO,,,,,,,,,,3.26000,Absolute,31410.0-0-C-1405144,,1935-12-31,,12/31,01/01,,,"Irrigation,Industrial",,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,31410.0-0-C-1405144,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...
653551,coD171208,COwr_M1,COwr_V1,Monthly,Discharge Flow,COwr_O1,,,Fresh,Catamount Creek,wadeId443,Surface Water,,Digitized,Eagle,4326,,,,39.82223,-106.81178,,,POD,Catamount No 1 Ditch,5200536,,Ditch,CO,,,,,,,,,,6.00000,Absolute,43829.40393-0-C-5200536,,1960-08-04,,12/31,01/01,,,"Irrigation,Domestic,All Beneficial Uses",,,,,,0,,,,,,,,,,https://dwr.state.co.us/Tools/WaterRights/NetA...,,43829.40393-0-C-5200536,,Agriculture Irrigation,0,,NaT,NaT,Discharge Flow_Monthly_Agriculture Irrigation_...


In [52]:
# Export the output dataframe as a zipped CSV (Pwr_wu_Main.csv inside Pwr_wu_Main.zip), without the index column.
mainOutputCompression = dict(method='zip', archive_name='Pwr_wu_Main.csv')
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=mainOutputCompression, index=False)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.