# Pre-processing Wyoming Water Right data for WaDEQA upload.
Purpose:  To pre-process the Wyoming data into one master file for simple DataFrame creation and extraction

Notes:
- Date Updated: 05/06/2022
- Merging GW and SW sources into one workable input.

In [3]:
import os
import sys
# Report the active conda environment and interpreter version for reproducibility.
# CONDA_DEFAULT_ENV is looked up with .get() so a missing variable does not raise KeyError.
print(os.environ.get('CONDA_DEFAULT_ENV', 'CONDA_DEFAULT_ENV not set'))
print(sys.version)

base
3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]


In [4]:
# Needed Libraries / Modules

# ---- working with data ----
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.options.display.max_columns = 999  # show every column of a DataFrame in notebook output
pd.options.display.float_format = lambda value: '%.5f' % value  # fixed 5-decimal floats instead of scientific notation

In [5]:
# Working Directory
# The input/output file names used by the cells below are relative, so they resolve against this folder.
workingDir = "G:/Shared drives/WaDE Data/Wyoming/WaterAllocation/RawInputData"
os.chdir(workingDir)  # raises if the path does not exist on this machine

# POD Data

### POD Groundwater

In [None]:
# Input File: groundwater POD records, blanks normalised to empty strings
GW_Input = "POD_GW_DepthI_FC_input.zip"
dfwyg = pd.read_csv(GW_Input)
dfwyg = dfwyg.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only created (and written back to the input archive) the first time the file is processed.
if 'WaDEUUID' not in dfwyg.columns:
    dfwyg['WaDEUUID'] = "wyDG" + dfwyg.index.astype(str)
    dfwyg.to_csv(
        GW_Input,
        compression={'method': 'zip', 'archive_name': 'POD_GW_DepthI_FC_input.csv'},
        index=False,
    )

print(len(dfwyg))
dfwyg.head()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure.
# Parse, format as a date-only '%m/%d/%Y' string, then parse again (drops any time-of-day part).
dfwyg['PriorityDate'] = pd.to_datetime(
    pd.to_datetime(dfwyg['PriorityDate']).dt.strftime('%m/%d/%Y')
)
dfwyg.head(1)

In [None]:
# Creating WaDE Owner Field.
# Create from Company field. If empty, use LastName + FirstName fields.

def retrieveOwner(Com, FN, LN):
    """Return the owner name for one record.

    Com: company name; used as-is (stripped) when it is not blank.
    FN, LN: first and last name; used as "LastName, FirstName" when Com is blank.
    Returns "" when all three inputs are blank or null.
    """
    def _clean(value):
        # Null check must come before str(): str(nan) is the non-empty text "nan".
        return "" if pd.isnull(value) else str(value).strip()

    company, first, last = _clean(Com), _clean(FN), _clean(LN)
    if company:
        return company
    # Join only the parts that exist so a missing name never leaves a stray ", ".
    return ", ".join(part for part in (last, first) if part)
dfwyg['WaDEOwner'] = dfwyg.apply(lambda row: retrieveOwner(row['Company'], row['FirstName'], row['LastName']), axis=1)
dfwyg.head(1)

In [None]:
# Creating Beneficial Use.
# Need to translate WY abbreviations to a workable format.

# BenUseDict: lookup table from Wyoming beneficial-use codes to descriptive names.
# Keys are the codes looked up by retrieveBenUse (one code per ';'-separated entry of the
# 'Uses' column); values are the text that ends up in the WaDEBenUse column.
# Several codes share one name (e.g. "S&D", "SDU" and "STKNDMS" all map to "Stock and Domestic").
BenUseDict = {
"AESCNG" : "Coal Bed Natural Gas",
"AESFIS" : "Fish Propagation (Aesthetics)",
"AESGWR" : "Ground Water Recharge (Aesthetics)",
"AESREC" : "Recreation (Aesthetics)",
"AESSTK" : "Stock (Aesthetics)",
"AESWET" : "Wetlands (Aesthetics)",
"AESWIL" : "Wildlife (Aesthetics)",
"AQU" : "Aquaculture",
"BOT" : "Bottling Water",
"CAG" : "Commercial Agriculture",
"CBM" : "Coal Bed Methane - Ground Water",
"CHE" : "Chemical",
"CIS" : "Consumptive Instream Flow",
"CMU" : "Combined Uses",
"CNG_SW" : "Coal Bed Natural Gas",
"COM" : "Commercial",
"CUL" : "Culinary",
"DAI" : "Dairy",
"DEW" : "Mine Dewatering",
"DOM_GW" : "Domestic - Ground Water",
"DOM_SW" : "Domestic - Surface Water",
"DPA" : "Domestic (Phase 2 Award)",
"DRI" : "Drilling",
"DSP" : "Domestic Supply",
"DTA" : "Dust Abatement",
"ECAP" : "Existing Capacity",
"ERO" : "Erosion Control",
"FIR" : "Fire Protection",
"FIS" : "Fish Propagation",
"FLO" : "Flood Control",
"FTH" : "Flow Through",
"GWR" : "Ground Water Recharge",
"HEX" : "Heat Extraction",
"HWY" : "Highway Construction",
"HYD" : "Hydropower",
"HYT" : "Hydrostatic Testing",
"ICE" : "Ice Cutting",
"IFA" : "Instream Flow (Phase 2 Award)",
"IND_GW" : "Industrial - Ground Water",
"IND_SW" : "Industrial - Surface Water",
"IRR_GW" : "Irrigation - Ground Water",
"IRR_SW" : "Irrigation - Surface Water",
"ISF" : "Instream Flow",
"LAK" : "Maintain Natural Lake Level (Phase 2 Award)",
"LAW" : "Large Scale Landscape",
"MAI" : "Maintenance (Equipment Washing)",
"MAN" : "Manufacturing",
"MEC" : "Mechanical",
"MED" : "Medicinal",
"MEM" : "Municipal (Emergency)",
"MIL" : "Milling",
"MIN" : "Mining",
"MIS" : "Miscellaneous - Ground Water",
"MON" : "Monitor",
"MUN_GW" : "Municipal - Ground Water",
"MUN_SW" : "Municipal - Surface Water",
"NAT" : "Natural Flow (Phase 2 Award)",
"O&G" : "Oil and Gas Well Drilling",
"OIL" : "Oil",
"OTH" : "Other",
"OTH_CM" : "Other - Commercial",
"OTH_IN" : "Other - Industrial",
"OTH_TM" : "Other - Temporary",
"P&S" : "Potable and Sanitary Supply",
"PCT" : "Pollution Control",
"POW" : "Power",
"RAI" : "Railroad",
"RDC" : "Road Construction",
"REC" : "Recreation",
"REF" : "Refining",
"RES" : "Reservoir Supply",
"REW" : "Reclamation Watering",
"S&D" : "Stock and Domestic",
"SDG" : "Gpm For Domestic or Stock",
"SDU" : "Stock and Domestic",
"SED" : "Sediment Control",
"SNO" : "Snow Making",
"STE" : "Stream",
"STK" : "Stock Watering",
"STKNDMS" : "Stock and Domestic",
"STO" : "Stock",
"STS" : "Stock",
"STW" : "Stock Watering",
"SWD" : "Subdivision",
"SWP" : "Stock Water Pipeline",
"TEM" : "Temporary",
"TENL" : "Total Enlargement",
"TRA" : "Transportation",
"TST" : "Test Well",
"TWR" : "Tree Watering",
"UTL" : "Utilities",
"W&S" : "Wild and Scenic",
"WDR" : "Well Drilling",
"WET" : "Wetlands",
"WHL" : "Water Hauls",
"WL" : "Wildlife"}

def retrieveBenUse(colrowValue, benUseLookup=None):
    """Translate a ';'-separated list of beneficial-use codes into names.

    colrowValue: raw code text, e.g. "IRR_SW; STK". Blank or null returns "".
    benUseLookup: optional code -> name mapping; defaults to the module-level BenUseDict.
    Returns the translated names joined with ", ". Blank entries (e.g. from a
    trailing ';') and codes missing from the lookup are skipped, so one
    unrecognised code no longer blanks out the whole value.
    """
    if benUseLookup is None:
        # Resolved at call time so the BenUseDict defined in the notebook is used.
        benUseLookup = BenUseDict
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    codes = (code.strip() for code in str(colrowValue).split(';'))
    names = [benUseLookup[code] for code in codes if code in benUseLookup]
    return ", ".join(str(name) for name in names)
dfwyg['WaDEBenUse'] = dfwyg.apply(lambda row: retrieveBenUse(row['Uses']), axis=1)

# ----------------------------------------------------------------------------------------------------
# Remove special characters from ben use that will cause issues within our system
def removeBUWSGSSpecialCharsFunc(Val):
    """Strip the characters $ @ < & > . ; / - from Val, title-case it and tidy spaces.

    Val is converted with str() first. Runs of spaces left behind by removed
    characters are collapsed to a single space and the ends are trimmed.
    """
    Val = str(Val)
    # Raw string: "\-" in a normal literal is an invalid escape sequence.
    Val = re.sub(r"[$@<&>.;/\-]", "", Val).title()
    # Collapse any run of spaces, not just one pair per pass.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val
dfwyg['WaDEBenUse'] = dfwyg.apply(lambda row: removeBUWSGSSpecialCharsFunc(row['WaDEBenUse']), axis=1)

# ----------------------------------------------------------------------------------------------------
for x in dfwyg['WaDEBenUse'].sort_values().unique():
    print( x )                                                        

In [None]:
# For Wild and Scenic River benuse
# suffix metadata, and anything with a Z in it also has a beneficial use = Wild and Scenic River benuse

def splitWord(word):
    """Return the characters of word as a list, in order."""
    return list(word)

def retrieveWildCenicRiver(word):
    """Return "Wild and Scenic River" when str(word) starts with "Z", otherwise "".

    NOTE(review): only the first character is tested; confirm whether a "Z"
    elsewhere in the value should also qualify.
    """
    # startswith() is safe on empty text, where indexing [0] raises IndexError.
    if str(word).startswith("Z"):
        return "Wild and Scenic River"
    return ""
    
dfwyg['PreffixWaDEBenUse'] = dfwyg.apply(lambda row: retrieveWildCenicRiver(row['Survey_Type_Survey_Number_Survey_Suffix']), axis=1)
dfwyg['PreffixWaDEBenUse'].unique()

In [None]:
# For Wild and Scenic River benuse
# combine WaDEBenUse & PreffixWaDEBenUse columns if not blank
def combineWaDEBenUses(valA, valB):
    """Join valB and valA as "valB, valA", leaving out whichever is blank or null.

    valA: the extra (prefix-derived) beneficial use.
    valB: the translated beneficial-use text.
    Returns "" when both are blank.
    """
    # Keep only real values so a blank valB cannot produce a leading ", ".
    parts = [str(val) for val in (valB, valA) if not (pd.isnull(val) or val == "")]
    return ", ".join(parts)

dfwyg['WaDEBenUse'] = dfwyg.apply(lambda row: combineWaDEBenUses(row['PreffixWaDEBenUse'], row['WaDEBenUse']), axis=1)
dfwyg['WaDEBenUse'].unique()

In [None]:
# create output POD dataframe (groundwater records from dfwyg)
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfwyg['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WYwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WYwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WYwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # no-op: column already created under WaterSource Info
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfwyg['Latitude_Double']
df['in_Longitude'] = dfwyg['Longitude_Double']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
# .str.replace swaps commas inside each name; Series.replace only matches whole cell values.
df['in_SiteName'] = dfwyg['FacilityName'].str.title().str.replace(",", " ", regex=False)
df['in_SiteNativeID'] = "" #auto fill in below
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfwyg['Facility_type']
df['in_StateCV'] = "WY"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
# Blanks are "" after loading, which astype(float) cannot parse; coerce them to NaN instead.
# NOTE(review): the source column name mentions both CFS and GPM - confirm the unit for groundwater rows.
df['in_AllocationFlow_CFS'] = pd.to_numeric(dfwyg['Total_Flow_CFS___Appropriation_GPM_'], errors='coerce')
df['in_AllocationLegalStatusCV'] = dfwyg['SummaryWRStatus']
df['in_AllocationNativeID'] =  dfwyg['WR_Number'].replace("", 0).fillna(0).astype(str)  # blank IDs become "0"
df['in_AllocationOwner'] = dfwyg['WaDEOwner']
df['in_AllocationPriorityDate'] = dfwyg['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfwyg['WaDEBenUse']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "http://seoweb.wyo.gov/e-Permit/Common/Login.aspx"

# Snapshot the result; NaN (including coerced blank flows) is written out as "".
dfground = df.copy()
dfground = dfground.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(dfground))
dfground.head()

### POD Surface Water

In [None]:
# Input File: surface water POD records, blanks normalised to empty strings
SW_Input = "POD_SW_DepthI_FC_input.zip"
dfwys = pd.read_csv(SW_Input)
dfwys = dfwys.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only created (and written back to the input archive) the first time the file is processed.
if 'WaDEUUID' not in dfwys.columns:
    dfwys['WaDEUUID'] = "wyDS" + dfwys.index.astype(str)
    dfwys.to_csv(
        SW_Input,
        compression={'method': 'zip', 'archive_name': 'POD_SW_DepthI_FC_input.csv'},
        index=False,
    )

print(len(dfwys))
dfwys.head()

In [None]:
# Creating WaDE Owner Field.
# Create from Company field. If empty, use LastName + FirstName fields.

def retrieveOwner(Com, FN, LN):
    """Return the owner name for one record.

    Com: company name; used as-is (stripped) when it is not blank.
    FN, LN: first and last name; used as "LastName, FirstName" when Com is blank.
    Returns "" when all three inputs are blank or null.
    """
    def _clean(value):
        # Null check must come before str(): str(nan) is the non-empty text "nan".
        return "" if pd.isnull(value) else str(value).strip()

    company, first, last = _clean(Com), _clean(FN), _clean(LN)
    if company:
        return company
    # Join only the parts that exist so a missing name never leaves a stray ", ".
    return ", ".join(part for part in (last, first) if part)
dfwys['WaDEOwner'] = dfwys.apply(lambda row: retrieveOwner(row['Company'], row['FirstName'], row['LastName']), axis=1)
dfwys['WaDEOwner'].unique()

In [None]:
# Creating Beneficial Use.
# Need to translate WY abbreviations to a workable format.

# BenUseDict: lookup table from Wyoming beneficial-use codes to descriptive names.
# Keys are the codes looked up by retrieveBenUse (one code per ';'-separated entry of the
# 'Uses' column); values are the text that ends up in the WaDEBenUse column.
# Several codes share one name (e.g. "STO" and "STS" both map to "Stock").
BenUseDict = {
"AESCNG" : "Coal Bed Natural Gas",
"AESFIS" : "Fish Propagation (Aesthetics)",
"AESGWR" : "Ground Water Recharge (Aesthetics)",
"AESREC" : "Recreation (Aesthetics)",
"AESSTK" : "Stock (Aesthetics)",
"AESWET" : "Wetlands (Aesthetics)",
"AESWIL" : "Wildlife (Aesthetics)",
"AQU" : "Aquaculture",
"BOT" : "Bottling Water",
"CAG" : "Commercial Agriculture",
"CBM" : "Coal Bed Methane - Ground Water",
"CHE" : "Chemical",
"CIS" : "Consumptive Instream Flow",
"CMU" : "Combined Uses",
"CNG_SW" : "Coal Bed Natural Gas",
"COM" : "Commercial",
"CUL" : "Culinary",
"DAI" : "Dairy",
"DEW" : "Mine Dewatering",
"DOM_GW" : "Domestic - Ground Water",
"DOM_SW" : "Domestic - Surface Water",
"DPA" : "Domestic (Phase 2 Award)",
"DRI" : "Drilling",
"DSP" : "Domestic Supply",
"DTA" : "Dust Abatement",
"ECAP" : "Existing Capacity",
"ERO" : "Erosion Control",
"FIR" : "Fire Protection",
"FIS" : "Fish Propagation",
"FLO" : "Flood Control",
"FTH" : "Flow Through",
"GWR" : "Ground Water Recharge",
"HEX" : "Heat Extraction",
"HWY" : "Highway Construction",
"HYD" : "Hydropower",
"HYT" : "Hydrostatic Testing",
"ICE" : "Ice Cutting",
"IFA" : "Instream Flow (Phase 2 Award)",
"IND_GW" : "Industrial - Ground Water",
"IND_SW" : "Industrial - Surface Water",
"IRR_GW" : "Irrigation - Ground Water",
"IRR_SW" : "Irrigation - Surface Water",
"ISF" : "Instream Flow",
"LAK" : "Maintain Natural Lake Level (Phase 2 Award)",
"LAW" : "Large Scale Landscape",
"MAI" : "Maintenance (Equipment Washing)",
"MAN" : "Manufacturing",
"MEC" : "Mechanical",
"MED" : "Medicinal",
"MEM" : "Municipal (Emergency)",
"MIL" : "Milling",
"MIN" : "Mining",
"MIS" : "Miscellaneous - Ground Water",
"MON" : "Monitor",
"MUN_GW" : "Municipal - Ground Water",
"MUN_SW" : "Municipal - Surface Water",
"NAT" : "Natural Flow (Phase 2 Award)",
"O&G" : "Oil and Gas Well Drilling",
"OIL" : "Oil",
"OTH" : "Other",
"OTH_CM" : "Other - Commercial",
"OTH_IN" : "Other - Industrial",
"OTH_TM" : "Other - Temporary",
"P&S" : "Potable and Sanitary Supply",
"PCT" : "Pollution Control",
"POW" : "Power",
"RAI" : "Railroad",
"RDC" : "Road Construction",
"REC" : "Recreation",
"REF" : "Refining",
"RES" : "Reservoir Supply",
"REW" : "Reclamation Watering",
"S&D" : "Stock and Domestic",
"SDG" : "Gpm For Domestic or Stock",
"SDU" : "Stock and Domestic",
"SED" : "Sediment Control",
"SNO" : "Snow Making",
"STE" : "Stream",
"STK" : "Stock Watering",
"STKNDMS" : "Stock and Domestic",
"STO" : "Stock",
"STS" : "Stock",
"STW" : "Stock Watering",
"SWD" : "Subdivision",
"SWP" : "Stock Water Pipeline",
"TEM" : "Temporary",
"TENL" : "Total Enlargement",
"TRA" : "Transportation",
"TST" : "Test Well",
"TWR" : "Tree Watering",
"UTL" : "Utilities",
"W&S" : "Wild and Scenic",
"WDR" : "Well Drilling",
"WET" : "Wetlands",
"WHL" : "Water Hauls",
"WL" : "Wildlife"}

def retrieveBenUse(colrowValue, benUseLookup=None):
    """Translate a ';'-separated list of beneficial-use codes into names.

    colrowValue: raw code text, e.g. "IRR_SW; STK". Blank or null returns "".
    benUseLookup: optional code -> name mapping; defaults to the module-level BenUseDict.
    Returns the translated names joined with ", ". Blank entries (e.g. from a
    trailing ';') and codes missing from the lookup are skipped, so one
    unrecognised code no longer blanks out the whole value.
    """
    if benUseLookup is None:
        # Resolved at call time so the BenUseDict defined in the notebook is used.
        benUseLookup = BenUseDict
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    codes = (code.strip() for code in str(colrowValue).split(';'))
    names = [benUseLookup[code] for code in codes if code in benUseLookup]
    return ", ".join(str(name) for name in names)
dfwys['WaDEBenUse'] = dfwys.apply(lambda row: retrieveBenUse(row['Uses']), axis=1)

# ----------------------------------------------------------------------------------------------------
# Remove special characters from ben use that will cause issues within our system
def removeBUWSGSSpecialCharsFunc(Val):
    """Strip the characters $ @ < & > . ; / - from Val, title-case it and tidy spaces.

    Val is converted with str() first. Runs of spaces left behind by removed
    characters are collapsed to a single space and the ends are trimmed.
    """
    Val = str(Val)
    # Raw string: "\-" in a normal literal is an invalid escape sequence.
    Val = re.sub(r"[$@<&>.;/\-]", "", Val).title()
    # Collapse any run of spaces, not just one pair per pass.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val
dfwys['WaDEBenUse'] = dfwys.apply(lambda row: removeBUWSGSSpecialCharsFunc(row['WaDEBenUse']), axis=1)

# ----------------------------------------------------------------------------------------------------
for x in dfwys['WaDEBenUse'].sort_values().unique():
    print( x )                                                        

In [None]:
# For Wild and Scenic River benuse
# suffix metadata, and anything with a Z in it also has a beneficial use = Wild and Scenic River benuse

def splitWord(word):
    """Return the characters of word as a list, in order."""
    return list(word)

def retrieveWildCenicRiver(word):
    """Return "Wild and Scenic River" when str(word) starts with "Z", otherwise "".

    NOTE(review): only the first character is tested; confirm whether a "Z"
    elsewhere in the value should also qualify.
    """
    # startswith() handles empty text directly, so no catch-all except is needed.
    if str(word).startswith("Z"):
        return "Wild and Scenic River"
    return ""
    
dfwys['PreffixWaDEBenUse'] = dfwys.apply(lambda row: retrieveWildCenicRiver(row['Survey_Type_Survey_Number_Survey_Suffix']), axis=1)
dfwys['PreffixWaDEBenUse'].unique()

In [None]:
# For Wild and Scenic River benuse
# combine WaDEBenUse & PreffixWaDEBenUse columns if not blank
def combineWaDEBenUses(valA, valB):
    """Join valB and valA as "valB, valA", leaving out whichever is blank or null.

    valA: the extra (prefix-derived) beneficial use.
    valB: the translated beneficial-use text.
    Returns "" when both are blank.
    """
    # Keep only real values so a blank valB cannot produce a leading ", ".
    parts = [str(val) for val in (valB, valA) if not (pd.isnull(val) or val == "")]
    return ", ".join(parts)

dfwys['WaDEBenUse'] = dfwys.apply(lambda row: combineWaDEBenUses(row['PreffixWaDEBenUse'], row['WaDEBenUse']), axis=1)
dfwys['WaDEBenUse'].unique()

In [None]:
# create output POD dataframe (surface water records from dfwys)
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfwys['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WYwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WYwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WYwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
# .str.replace swaps commas inside each name; Series.replace only matches whole cell values.
df['in_WaterSourceName'] = dfwys['Stream_Source'].str.title().str.replace(",", " ", regex=False)
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # no-op: column already created under WaterSource Info
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfwys['Latitude_Double']
df['in_Longitude'] = dfwys['Longitude_Double']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfwys['FacilityName'].str.title().str.replace(",", " ", regex=False)
df['in_SiteNativeID'] = "" #auto fill in below
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfwys['Facility_type']
df['in_StateCV'] = "WY"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
# Blanks are "" after loading, which astype(float) cannot parse; coerce them to NaN instead.
df['in_AllocationFlow_CFS'] = pd.to_numeric(dfwys['Total_Flow_CFS___Appropriation_GPM_'], errors='coerce')
df['in_AllocationLegalStatusCV'] = dfwys['SummaryWRStatus']
df['in_AllocationNativeID'] =  dfwys['WR_Number'].replace("", 0).fillna(0).astype(str)  # blank IDs become "0"
df['in_AllocationOwner'] = dfwys['WaDEOwner']
df['in_AllocationPriorityDate'] = dfwys['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfwys['WaDEBenUse']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "http://seoweb.wyo.gov/e-Permit/Common/Login.aspx"

# Snapshot the result; NaN (including coerced blank flows) is written out as "".
df_Surface = df.copy()
df_Surface = df_Surface.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(df_Surface))
df_Surface.head()

In [None]:
# Stack the groundwater and surface water POD frames into one POD table.
# NaN and the literal text "nan" are both blanked, de-duplicating after each pass.

frames = [dfground, df_Surface]
outPOD = (
    pd.concat(frames)
    .replace(np.nan, "")
    .drop_duplicates()
    .replace("nan", "")
    .drop_duplicates()
)

print(len(outPOD))
outPOD.head()

# POU Data

In [None]:
# Input File: POU records, with NaN and the text "nan,nan" normalised to empty strings
POU_Input = "POU_input.zip"
dfinPOU = pd.read_csv(POU_Input)
dfinPOU = dfinPOU.replace(np.nan, "")
dfinPOU = dfinPOU.replace("nan,nan", "")

# WaDE UUID tracker for data assessment.
# Only created (and written back to the input archive) the first time the file is processed.
if 'WaDEUUID' not in dfinPOU.columns:
    dfinPOU['WaDEUUID'] = "wyU" + dfinPOU.index.astype(str)
    dfinPOU.to_csv(
        POU_Input,
        compression={'method': 'zip', 'archive_name': 'POU_input.csv'},
        index=False,
    )

# Drop exact duplicate rows and renumber the index.
dfinPOU = dfinPOU.drop_duplicates()
dfinPOU = dfinPOU.reset_index(drop=True)
dfinPOU = dfinPOU.replace(np.nan, "")
print(len(dfinPOU))
dfinPOU.head()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure.
# Parse, format as a date-only '%m/%d/%Y' string, then parse again (drops any time-of-day part).
dfinPOU['PriorityDate'] = pd.to_datetime(
    pd.to_datetime(dfinPOU['PriorityDate']).dt.strftime('%m/%d/%Y')
)
dfinPOU['PriorityDate'].unique()

In [None]:
# Creating WaDE Owner Field.
# Create from Company field. If empty, use LastName + FirstName fields.

def retrieveOwner(Com, FN, LN):
    """Return the owner name for one record.

    Com: company name; used as-is (stripped) when it is not blank.
    FN, LN: first and last name; used as "LastName, FirstName" when Com is blank.
    Returns "" when all three inputs are blank or null.
    """
    def _clean(value):
        # Null check must come before str(): str(nan) is the non-empty text "nan".
        return "" if pd.isnull(value) else str(value).strip()

    company, first, last = _clean(Com), _clean(FN), _clean(LN)
    if company:
        return company
    # Join only the parts that exist so a missing name never leaves a stray ", ".
    return ", ".join(part for part in (last, first) if part)
dfinPOU['WaDEOwner'] = dfinPOU.apply(lambda row: retrieveOwner(row['Company'], row['FirstName'], row['LastName']), axis=1)
dfinPOU['WaDEOwner'].unique()

In [None]:
# Creating Beneficial Use.
# Need to translate WY abbreviations to a workable format.

# BenUseDict: lookup table from Wyoming beneficial-use codes to descriptive names.
# Keys are the codes looked up by retrieveBenUse (one code per ';'-separated entry of the
# 'Uses' column); values are the text that ends up in the WaDEBenUse column.
# Several codes share one name (e.g. "STK" and "STW" both map to "Stock Watering").
BenUseDict = {
"AESCNG" : "Coal Bed Natural Gas",
"AESFIS" : "Fish Propagation (Aesthetics)",
"AESGWR" : "Ground Water Recharge (Aesthetics)",
"AESREC" : "Recreation (Aesthetics)",
"AESSTK" : "Stock (Aesthetics)",
"AESWET" : "Wetlands (Aesthetics)",
"AESWIL" : "Wildlife (Aesthetics)",
"AQU" : "Aquaculture",
"BOT" : "Bottling Water",
"CAG" : "Commercial Agriculture",
"CBM" : "Coal Bed Methane - Ground Water",
"CHE" : "Chemical",
"CIS" : "Consumptive Instream Flow",
"CMU" : "Combined Uses",
"CNG_SW" : "Coal Bed Natural Gas",
"COM" : "Commercial",
"CUL" : "Culinary",
"DAI" : "Dairy",
"DEW" : "Mine Dewatering",
"DOM_GW" : "Domestic - Ground Water",
"DOM_SW" : "Domestic - Surface Water",
"DPA" : "Domestic (Phase 2 Award)",
"DRI" : "Drilling",
"DSP" : "Domestic Supply",
"DTA" : "Dust Abatement",
"ECAP" : "Existing Capacity",
"ERO" : "Erosion Control",
"FIR" : "Fire Protection",
"FIS" : "Fish Propagation",
"FLO" : "Flood Control",
"FTH" : "Flow Through",
"GWR" : "Ground Water Recharge",
"HEX" : "Heat Extraction",
"HWY" : "Highway Construction",
"HYD" : "Hydropower",
"HYT" : "Hydrostatic Testing",
"ICE" : "Ice Cutting",
"IFA" : "Instream Flow (Phase 2 Award)",
"IND_GW" : "Industrial - Ground Water",
"IND_SW" : "Industrial - Surface Water",
"IRR_GW" : "Irrigation - Ground Water",
"IRR_SW" : "Irrigation - Surface Water",
"ISF" : "Instream Flow",
"LAK" : "Maintain Natural Lake Level (Phase 2 Award)",
"LAW" : "Large Scale Landscape",
"MAI" : "Maintenance (Equipment Washing)",
"MAN" : "Manufacturing",
"MEC" : "Mechanical",
"MED" : "Medicinal",
"MEM" : "Municipal (Emergency)",
"MIL" : "Milling",
"MIN" : "Mining",
"MIS" : "Miscellaneous - Ground Water",
"MON" : "Monitor",
"MUN_GW" : "Municipal - Ground Water",
"MUN_SW" : "Municipal - Surface Water",
"NAT" : "Natural Flow (Phase 2 Award)",
"O&G" : "Oil and Gas Well Drilling",
"OIL" : "Oil",
"OTH" : "Other",
"OTH_CM" : "Other - Commercial",
"OTH_IN" : "Other - Industrial",
"OTH_TM" : "Other - Temporary",
"P&S" : "Potable and Sanitary Supply",
"PCT" : "Pollution Control",
"POW" : "Power",
"RAI" : "Railroad",
"RDC" : "Road Construction",
"REC" : "Recreation",
"REF" : "Refining",
"RES" : "Reservoir Supply",
"REW" : "Reclamation Watering",
"S&D" : "Stock and Domestic",
"SDG" : "Gpm For Domestic or Stock",
"SDU" : "Stock and Domestic",
"SED" : "Sediment Control",
"SNO" : "Snow Making",
"STE" : "Stream",
"STK" : "Stock Watering",
"STKNDMS" : "Stock and Domestic",
"STO" : "Stock",
"STS" : "Stock",
"STW" : "Stock Watering",
"SWD" : "Subdivision",
"SWP" : "Stock Water Pipeline",
"TEM" : "Temporary",
"TENL" : "Total Enlargement",
"TRA" : "Transportation",
"TST" : "Test Well",
"TWR" : "Tree Watering",
"UTL" : "Utilities",
"W&S" : "Wild and Scenic",
"WDR" : "Well Drilling",
"WET" : "Wetlands",
"WHL" : "Water Hauls",
"WL" : "Wildlife"}

def retrieveBenUse(colrowValue, benUseLookup=None):
    """Translate a ';'-separated list of beneficial-use codes into names.

    colrowValue: raw code text, e.g. "IRR_SW; STK". Blank or null returns "".
    benUseLookup: optional code -> name mapping; defaults to the module-level BenUseDict.
    Returns the translated names joined with ", ". Blank entries (e.g. from a
    trailing ';') and codes missing from the lookup are skipped, so one
    unrecognised code no longer blanks out the whole value.
    """
    if benUseLookup is None:
        # Resolved at call time so the BenUseDict defined in the notebook is used.
        benUseLookup = BenUseDict
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    codes = (code.strip() for code in str(colrowValue).split(';'))
    names = [benUseLookup[code] for code in codes if code in benUseLookup]
    return ", ".join(str(name) for name in names)
dfinPOU['WaDEBenUse'] = dfinPOU.apply(lambda row: retrieveBenUse(row['Uses']), axis=1)

# ----------------------------------------------------------------------------------------------------
# Remove special characters from ben use that will cause issues within our system
def removeBUWSGSSpecialCharsFunc(Val):
    """Strip the characters $ @ < & > . ; / - from Val, title-case it and tidy spaces.

    Val is converted with str() first. Runs of spaces left behind by removed
    characters are collapsed to a single space and the ends are trimmed.
    """
    Val = str(Val)
    # Raw string: "\-" in a normal literal is an invalid escape sequence.
    Val = re.sub(r"[$@<&>.;/\-]", "", Val).title()
    # Collapse any run of spaces, not just one pair per pass.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val
dfinPOU['WaDEBenUse'] = dfinPOU.apply(lambda row: removeBUWSGSSpecialCharsFunc(row['WaDEBenUse']), axis=1)

# ----------------------------------------------------------------------------------------------------
for x in dfinPOU['WaDEBenUse'].sort_values().unique():
    print( x )                                                        

In [None]:
# For Wild and Scenic River benuse
# suffix metadata, and anything with a Z in it also has a beneficial use = Wild and Scenic River benuse

dfinPOU['Survey'] = dfinPOU['Survey'].replace("", 0).fillna(0).astype(str)

def splitWord(word):
    """Return the characters of word as a list, in order."""
    return list(word)

def retrieveWildCenicRiver(word):
    """Return "Wild and Scenic River" when str(word) starts with "Z", otherwise "".

    NOTE(review): only the first character is tested; confirm whether a "Z"
    elsewhere in the value should also qualify.
    """
    # startswith() is safe on empty text, where indexing [0] raises IndexError.
    if str(word).startswith("Z"):
        return "Wild and Scenic River"
    return ""
    
dfinPOU['PreffixWaDEBenUse'] = dfinPOU.apply(lambda row: retrieveWildCenicRiver(row['Survey']), axis=1)
dfinPOU['PreffixWaDEBenUse'] .unique()

In [None]:
# For Wild and Scenic River benuse
# combine WaDEBenUse & PreffixWaDEBenUse columns if not blank
def combineWaDEBenUses(valA, valB):
    """Join valB and valA as "valB, valA", leaving out whichever is blank or null.

    valA: the extra (prefix-derived) beneficial use.
    valB: the translated beneficial-use text.
    Returns "" when both are blank.
    """
    # Keep only real values so a blank valB cannot produce a leading ", ".
    parts = [str(val) for val in (valB, valA) if not (pd.isnull(val) or val == "")]
    return ", ".join(parts)

dfinPOU['WaDEBenUse'] = dfinPOU.apply(lambda row: combineWaDEBenUses(row['PreffixWaDEBenUse'], row['WaDEBenUse']), axis=1)
dfinPOU['WaDEBenUse'].unique()

In [None]:
# # check watersource type
# # making assumption that we can guess watersourcetypeCV via benuse

# def guessWaterSourceTypeCV(valString):
#     valString = valString.strip()
#     outString = ""
#     if "Surface Water" in valString:
#         outString = "Surface Water"
#     if "Ground Water" in valString:
#         outString = "Groundwater"
#     return outString

# dfinPOU['in_WaterSourceTypeCV'] = dfinPOU.apply(lambda row: guessWaterSourceTypeCV(row['WaDEBenUse']), axis=1)
# dfinPOU['in_WaterSourceTypeCV'].unique()

In [None]:
# tie POD data to POU data for correct watersource info
# The POD lookup is de-duplicated first: outPOD may contain several rows with the same
# in_AllocationNativeID, and each identical copy would otherwise repeat the matching POU row.
# NOTE(review): rights whose POD rows differ in water source name/type still match more than once.

dfinPOU = pd.merge(
    dfinPOU,
    outPOD[['in_AllocationNativeID', 'in_WaterSourceTypeCV', 'in_WaterSourceName']].drop_duplicates(),
    left_on='WRNumber',
    right_on='in_AllocationNativeID',
    how='left',
)
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
dfinPOU['in_WaterSourceTypeCV'].value_counts()

In [None]:
# create output POU dataframe (place-of-use records from dfinPOU)
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOU['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WYwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WYwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WYwr_O1"  # same organization UUID as the POD frames

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
#df['in_WaterSourceName'] = dfinPOU['SupplySource'].str.title().replace(",", " ")
df['in_WaterSourceName'] = dfinPOU['in_WaterSourceName']  # brought in by the merge with outPOD
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOU['in_WaterSourceTypeCV']  # brought in by the merge with outPOD

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # no-op: column already created under WaterSource Info
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOU['Latitude']
df['in_Longitude'] = dfinPOU['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POU"
# .str.replace swaps commas inside each name; Series.replace only matches whole cell values.
df['in_SiteName'] = dfinPOU['FacilityName'].str.title().str.replace(",", " ", regex=False)
df['in_SiteNativeID'] = "" #auto fill in below
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOU['FacilityType']
df['in_StateCV'] = "WY"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = dfinPOU['SummaryWRStatus']
df['in_AllocationNativeID'] =  dfinPOU['WRNumber'].replace("", 0).fillna(0).astype(str)  # blank IDs become "0"
df['in_AllocationOwner'] = dfinPOU['WaDEOwner']
df['in_AllocationPriorityDate'] = dfinPOU['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOU['WaDEBenUse']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOU['Acres']  # acreage belongs here, not in the irrigation-method field
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "http://seoweb.wyo.gov/e-Permit/Common/Login.aspx"

# Snapshot the result; NaN (e.g. POU rows with no matching POD) is written out as "".
outPOU = df.copy()
outPOU = outPOU.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outPOU))
outPOU.head()

# Concatenate POD & POU

In [None]:
# Concatenate the groundwater POD, surface water POD and POU frames into one table.
frames = [dfground, df_Surface, outPOU]
# ignore_index: each input frame is numbered from 0, so plain concat would repeat index labels.
outdf = pd.concat(frames, ignore_index=True)
outdf = outdf.replace(np.nan, "").drop_duplicates()
# Renumber after the final de-duplication so every row has a unique, gap-free label.
outdf = outdf.replace("nan", "").drop_duplicates().reset_index(drop=True)

print(len(outdf))
outdf

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Strip the characters $ @ & . ; / ) ( - from Val, title-case it and tidy spaces.

    Val is converted with str() first. Runs of spaces left behind by removed
    characters are collapsed to a single space and the ends are trimmed.
    """
    Val = str(Val)
    # Raw string: "\)" in a normal literal is an invalid escape sequence.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # Collapse any run of spaces, not just one pair per pass.
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
def ensureEmptyString(val):
    """Return str(val) with surrounding whitespace removed, or "" for blank / "nan" text."""
    text = str(val).strip()
    # After str() and strip() the only placeholder spellings left are "" and "nan".
    return "" if text in ("", "nan") else text

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# in_Longitude
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure.
# Inner call: parse the raw values, turning anything unparseable into NaT.
# Then format as MM/DD/YYYY text (dropping any time-of-day part) and parse
# that text back into datetimes.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(
    pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
    .dt.strftime('%m/%d/%Y')
)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Result column holds numbers where a non-zero flow parsed, "" otherwise.
outdf['in_AllocationFlow_CFS'] = (
    pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')  # unparseable entries -> NaN
    .replace(0, "")  # zeros are blanked out
    .fillna("")  # NaN is blanked out as well
)
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Result column holds numbers where a non-zero volume parsed, "" otherwise.
outdf['in_AllocationVolume_AF'] = (
    pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')  # unparseable entries -> NaN
    .replace(0, "")  # zeros are blanked out
    .fillna("")  # NaN is blanked out as well
)
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Fixing in_IrrigatedAcreage datatype
# Result column holds numbers where a non-zero acreage parsed, "" otherwise.
outdf['in_IrrigatedAcreage'] = (
    pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce')  # unparseable entries -> NaN
    .replace(0, "")  # zeros are blanked out
    .fillna("")  # NaN is blanked out as well
)
outdf['in_IrrigatedAcreage'].unique()

## WaDE Custom Elements (due to missing state site info)

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the custom site ID: the text "wadeID" followed by str(colrowValue)."""
    return "wadeID" + str(colrowValue)

# One row per unique site, where a site is the combination of latitude,
# longitude, site name and POD/POU flag.
dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = outdf['in_Latitude']
dfSiteNativeID['in_Longitude'] = outdf['in_Longitude']
dfSiteNativeID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfSiteNativeID['in_PODorPOUSite'] = outdf['in_PODorPOUSite'].astype(str).str.strip()
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

# Number the unique sites 1..N in the order they are listed and turn each
# number into an ID. The counter column is mapped directly; a row-wise
# DataFrame.apply over a one-column frame adds nothing.
dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp['Count'].apply(assignSiteUUID)

# Lookup key = the four site fields as text, concatenated in the same order
# retrieveSiteNativeID uses. The two text columns are already strings.
# NOTE(review): there is no separator between the parts, so two different
# sites could in principle produce the same key - confirm this is acceptable.
dfSiteNativeID['linkKey'] = (
    dfSiteNativeID['in_Latitude'].astype(str)
    + dfSiteNativeID['in_Longitude'].astype(str)
    + dfSiteNativeID['in_SiteName']
    + dfSiteNativeID['in_PODorPOUSite']
)
# # ----------------------------------------------------------------------------------------------------

# linkKey -> in_SiteNativeID (for a repeated key the last entry wins).
SiteNativeIDdict = dict(zip(dfSiteNativeID['linkKey'], dfSiteNativeID['in_SiteNativeID']))
def retrieveSiteNativeID(A, B, C, D):
    """Look up the custom site ID for one record in SiteNativeIDdict.

    A, B, C and D are each converted to text, stripped of surrounding
    whitespace and concatenated in that order to form the lookup key.

    Raises:
        KeyError: if the resulting key is not present in SiteNativeIDdict.
    """
    linkKey = "".join(str(part).strip() for part in (A, B, C, D))
    return SiteNativeIDdict[linkKey]


# Tag every record with its custom site ID. The four key columns are walked
# in lockstep instead of building a full row Series per record with
# DataFrame.apply(axis=1); retrieveSiteNativeID receives the same values.
outdf['in_SiteNativeID'] = [
    retrieveSiteNativeID(lat, lon, siteName, podOrPou)
    for lat, lon, siteName, podOrPou in zip(
        outdf['in_Latitude'],
        outdf['in_Longitude'],
        outdf['in_SiteName'],
        outdf['in_PODorPOUSite'],
    )
]
outdf['in_SiteNativeID'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: "wadeID" followed by str(colrowValue)."""
    return "wadeID" + str(colrowValue)

# One row per unique water source, where a water source is the combination of
# its name and its type.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the unique water sources 1..N in the order they are listed and turn
# each number into an ID. The counter column is mapped directly; a row-wise
# DataFrame.apply over a one-column frame adds nothing.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].apply(assignWaterSourceNativeID)

# Lookup key = name + type, concatenated in the same order
# retrieveWaterSourceNativeID uses. Both columns are already strings.
dfWaterSourceNativeID['linkKey'] = (
    dfWaterSourceNativeID['in_WaterSourceName']
    + dfWaterSourceNativeID['in_WaterSourceTypeCV']
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# linkKey -> in_WaterSourceNativeID (for a repeated key the last entry wins).
WaterSourceNativeIDdict = dict(zip(dfWaterSourceNativeID['linkKey'], dfWaterSourceNativeID['in_WaterSourceNativeID']))
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for one record in WaterSourceNativeIDdict.

    A and B are each converted to text, stripped of surrounding whitespace and
    concatenated in that order to form the lookup key.

    Raises:
        KeyError: if the resulting key is not present in WaterSourceNativeIDdict.
    """
    linkKey = "".join(str(part).strip() for part in (A, B))
    return WaterSourceNativeIDdict[linkKey]

# Tag every record with its custom water source ID. The two key columns are
# walked in lockstep instead of building a full row Series per record with
# DataFrame.apply(axis=1); retrieveWaterSourceNativeID receives the same values.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(
        outdf['in_WaterSourceName'],
        outdf['in_WaterSourceTypeCV'],
    )
]
outdf['in_WaterSourceNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For Wyoming, we don't want water rights whose legal status appears in `dropLegalStatusList` (defined in the code cell below).

Note:
- none identified for WY, at this time.

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal status values to exclude
dropLegalStatusList = ["Incomplete", "Unadjudicated", "Suspended", "Rejected"] # enter string entries here

# keep only the rows whose legal status is NOT in the list, then renumber the index
outdf = outdf[~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)].reset_index(drop=True)

# remaining row count and the legal status values that are left
print(len(outdf))
outdf['in_AllocationLegalStatusCV'].value_counts()

## Export Outputs

In [None]:
# Print a summary of the final dataframe (columns, non-null counts, dtypes) as a last check before export.
outdf.info()

In [None]:
# Display the final dataframe for a visual check before export.
outdf

In [None]:
# Export the output dataframe
# Written as a zip archive that contains a single CSV; the index is not written.
outdf.to_csv(
    'Pwr_wyMain.zip',
    compression={'method': 'zip', 'archive_name': 'Pwr_wyMain.csv'},
    index=False,
)  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.