# Pre-processing Wyoming State Geological Survey Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory

# set working directory, if need be
workingDir = "G:\Shared drives\WaDE Data\Wyoming\WaterAllocation_WSGS" # file location
os.chdir(workingDir)
print(os.getcwd())

G:\Shared drives\WaDE Data\Wyoming\WaterAllocation_WSGS


## Point of Diversion Data

In [3]:
# Input File
seo_Wells = "RawInputData/SEOWells09012023.csv"
wells = pd.read_csv(seo_Wells, dtype=str, encoding = "ISO-8859-1").replace(np.nan, "")

seo_Springs = "RawInputData/SEOSprings09012023.csv"
springs = pd.read_csv(seo_Springs, dtype=str, encoding = "ISO-8859-1").replace(np.nan, "")

dfinPOD = pd.concat([wells, springs], axis = 0).reset_index(drop=True)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "wyD" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('RawInputData/CombinedFiles.zip', compression=dict(method='zip', archive_name='CombinedFiles.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head()

223796


Unnamed: 0,PermitNumb,OrderRecor,PermitPref,PermitSuff,IsActive,SummaryCRS,WR_Number,PriorityDa,PriorityTe,SummaryWRS,Company,FirstName,LastName,FacilityNa,Uses,Twn,Rng,Sec,Qtr_Qtr,Survey_Typ,Total_Flow,Total_dept,StaticWate,Well_Log_Y,DepthOfPum,PrincipalW,Principa_1,Stream_Sou,Total_Capa,Diversion_,Active_Cap,Inactive_C,Size_of_Re,Facility_t,SupplyType,Longitude,Latitude,Chemical_A,CreatedBy,DDLat,DDLon,ORIG_OID,WaDEUUID
0,1.0,,A,G,A,,A1.0G,01/14/1978,01/14/1978,Fully Adjudicated,,JOSEPHINE,WASHAKIE,JOSEPHINE,DOM_GW,001N,003E,22,SE1/4SW1/4,A,15,,,,,,,,,,0,0,0,Well,,-108.51321,43.0416,,External,43.04160000N,108.51321000W,1,wyD0
1,10.0,,A,G,A,,A10.0G,12/20/1978,12/20/1978,Fully Adjudicated,,SALLY I,AMBOH,SALLY,DOM_GW,002N,002W,28,NE1/4SW1/4,A,20,,,,,,,,,,0,0,0,Well,,-109.0062,43.11927,,External,43.11927000N,109.00620000W,2,wyD1
2,100.0,,A,G,A,,A100.0G,05/30/1979,05/30/1979,Fully Adjudicated,,FRANCES,BROWN,FRANCES,DOM_GW,001S,004E,2,NE1/4SW1/4,Z,30,,,,,,,,,,0,0,0,Well,,-108.3777,43.00243,,External,43.00243000N,108.37770000W,3,wyD2
3,101.0,,A,G,A,,A101.0G,07/01/1964,07/01/1964,Fully Adjudicated,,ANNA,BROWN,ANNA,DOM_GW,001S,004E,8,SW1/4NE1/4,Z,24,,,,,,,,,,0,0,0,Well,,-108.42714,42.99207,,External,42.99207000N,108.42714000W,4,wyD3
4,102.0,,A,G,A,,A102.0G,08/03/1982,08/03/1982,Fully Adjudicated,,ANNETTE VALARDE,LAJEUNESSE,ANNETTE,DOM_GW,001N,001E,28,SW1/4NE1/4,A,20,,,,,,,,,,0,0,0,Well,,-108.76136,43.03456,,External,43.03456000N,108.76136000W,5,wyD4


In [4]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WSGSwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WSGSwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WSGSwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provdied
df['in_WaterSourceTypeCV'] = ""

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "Digitized"
df['in_County'] = ""
df['in_EPSGCodeCV'] = "4326"
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['Latitude']
df['in_Longitude'] = dfinPOD['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = dfinPOD['Stream_Sou']
df['in_SiteNativeID'] = "POD" + dfinPOD['WR_Number']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD['Facility_t']
df['in_StateCV'] = "WY"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['Total_Flow']
df['in_AllocationLegalStatusCV'] = dfinPOD['SummaryWRS']
df['in_AllocationNativeID'] =  dfinPOD['WR_Number']
df['in_AllocationOwner'] = "" # set below
df['in_AllocationPriorityDate'] = dfinPOD['PriorityDa']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOD['Uses'].astype(str)
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = "9/15/2023"
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "0" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://www.wsgs.wyo.gov/"

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

223796


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,wyD0,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,,,Digitized,,4326,,,,43.0416,-108.51321,,,POD,,PODA1.0G,,Well,WY,,,,,,,,,,15,Fully Adjudicated,A1.0G,,01/14/1978,,,,,,DOM_GW,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
1,wyD1,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,,,Digitized,,4326,,,,43.11927,-109.0062,,,POD,,PODA10.0G,,Well,WY,,,,,,,,,,20,Fully Adjudicated,A10.0G,,12/20/1978,,,,,,DOM_GW,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
2,wyD2,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,,,Digitized,,4326,,,,43.00243,-108.3777,,,POD,,PODA100.0G,,Well,WY,,,,,,,,,,30,Fully Adjudicated,A100.0G,,05/30/1979,,,,,,DOM_GW,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
3,wyD3,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,,,Digitized,,4326,,,,42.99207,-108.42714,,,POD,,PODA101.0G,,Well,WY,,,,,,,,,,24,Fully Adjudicated,A101.0G,,07/01/1964,,,,,,DOM_GW,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
4,wyD4,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,,,Digitized,,4326,,,,43.03456,-108.76136,,,POD,,PODA102.0G,,Well,WY,,,,,,,,,,20,Fully Adjudicated,A102.0G,,08/03/1982,,,,,,DOM_GW,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/


# Setting Groundwater / Surfacewater

In [5]:
conditions = ["DOM_SW","IND_SW","IRR_SW","MUN_SW"]
outPOD['in_WaterSourceTypeCV'] = df['in_BeneficialUseCategory'].apply(
lambda x: 'Surface Water' if any(substring in x for substring in conditions) else 'Groundwater')
for x in outPOD['in_WaterSourceTypeCV'].sort_values().unique():
    print(x)

Groundwater
Surface Water


# Setting up Benificial Use

In [6]:
#outPOD= outPOD.assign(in_BeneficialUseCategory=outPOD['in_BeneficialUseCategory'].str.split(';')).explode('in_BeneficialUseCategory').reset_index(drop=True)
#outPOD['in_BeneficialUseCategory'] = outPOD['in_BeneficialUseCategory'].str.strip() # remove leading and trailing white space
                                                         
# dictionary for matching BeneficialUseCategory
benUseDict = {
"AESCNG" : "Coal Bed Natural Gas (Aesthetics)",
"AESFIS" : "Fish Propagation (Aesthetics)",
"AESGWR" : "Ground Water Recharge (Aesthetics)",
"AESREC" : "Recreation (Aesthetics)",
"AESSTK" : "Stock (Aesthetics)",
"AESWET" : "Wetlands (Aesthetics)",
"AESWIL" : "Wildlife (Aesthetics)",
"AQU" : "Aquaculture",
"BOT" : "Bottling Water",
"CAG" : "Commercial Agriculture (i.e. feedlots confined swine facilities dairies)",
"CBM" : "Coal Bed Methane-- Ground Water",
"CHE" : "Chemical",
"CIS" : "Consumptive Instream Flow",
"CMU" : "Combined Uses",
"CNG_SW" : "Coal Bed Natural Gas",
"COM" : "Commercial",
"CON" : "Construction",
"CUL" : "Culinary",
"DAI" : "Dairy",
"DEW" : "Dewatering",
"DEW" : "Mine Dewatering",
"DOM_GW" : "Domestic-- Ground Water",
"DOM_SW" : "Domestic-- Surface water",
"DPA" : "Domestic (Phase II Award)",
"DRI" : "Drilling",
"DSP" : "Domestic Supply",
"DTA" : "Dust Abatement",
"ECAP" : "Existing Capacity",
"ERO" : "Erosion Control",
"FIR" : "Fire Protection",
"FIS" : "Fish Propagation",
"FLO" : "Flood Control",
"FTH" : "Flow Through",
"GWR" : "Ground Water Recharge",
"HEX" : "Heat Extraction",
"HWY" : "Highway Construction [temporary]",
"HYD" : "Hydropower",
"HYT" : "Hydrostatic Testing",
"ICE" : "Ice Cutting",
"IFA" : "Instream Flow (Phase II Award)",
"IND_GW" : "Industrial-- Ground Water",
"IND_SW" : "Industrial-- Surface water",
"IRR_GW" : "Irrigation-- Ground Water",
"IRR_SW" : "Irrigation-- Surface water",
"ISF" : "Instream Flow-only State of Wyo can apply",
"LAK" : "Maintain Natural Lake Level (Phase II Award)",
"LAW" : "Large scale landscape/lawn watering greater than 1 acre (golf courses cemeteries recreation areas [parks] etc.)",
"MAI" : "Maintenance (Equipment Washing)",
"MAN" : "Manufacturing",
"MEC" : "Mechanical",
"MED" : "Medicinal",
"MEM" : "Municipal (Emergency)",
"MIL" : "Milling",
"MIN" : "Mining",
"MIS" : "Miscellaneous-- Ground Water",
"MIS_SW" : "Miscellaneous Surface Water",
"MON" : "Monitor Observation",
"MUN_GW" : "Municipal-- Ground Water",
"MUN_SW" : "Municipal-- Surface water",
"NAT" : "Natural Flow (Phase II Award)",
"O&G" : "Oil & Gas Well Drilling",
"OTH" : "Other",
"OTH_CM" : "Other – Commercial",
"OTH_IN" : "Other -- Industrial",
"OTH_TM" : "Other -- Temporary",
"P&S" : "Potable & Sanitary Supply for churches schools mobile home parks campgrounds motels businesses etc.",
"PCT" : "Pollution Control",
"POW" : "Power",
"RAI" : "Railroad",
"RDC" : "Road Construction",
"REC" : "Recreation",
"REF" : "Refining",
"RES" : "Reservoir Supply",
"REW" : "Reclamation Watering",
"SD" : "Stock and Domestic",
"S&D" : "Stock and Domestic",
"SDG" : "25 gpm for Domestic &/or Stock",
"SDU" : "Stock and/or Domestic",
"SED" : "Sediment Control",
"SNO" : "Snow Making",
"STE" : "Steam",
"STK" : "Stock Watering",
"STKNDMS" : "Stock and Domestic",
"STO" : "Stock",
"STW" : "Stock watering",
"SWD" : "Subdivisions Water Districts etc.",
"SWP" : "Stock Water Pipelines",
"TEM" : "Temporary",
"TENL" : "Total Enlargement for this application",
"TRA" : "Transportation",
"TST" : "Test Well",
"TWR" : "Tree watering (non-commercial)",
"UTL" : "Utilities",
"W&S" : "Wild & Scenic-only State of Wyo can apply",
"WDR" : "Well Drilling",
"WET" : "Wetlands",
"WHL" : "Water Hauls",
"WL" : "Wildlife"
}

def assignBenUseCategory(row):
    elements = [e.strip().replace(" ", "") for e in row.split(';')]
    return ','.join([benUseDict.get(element, element) for element in elements])
outPOD['in_BeneficialUseCategory'] = outPOD['in_BeneficialUseCategory'].apply(assignBenUseCategory)

# ----------------------------------------------------------------------------------------------------
# Remove special characters from ben use that will cause issues within our system
def removeBUWSGSSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@<&>.;/\-]", "", Val).title().replace("  ", " ").strip()
    return Val
outPOD['in_BeneficialUseCategory'] = outPOD.apply(lambda row: removeBUWSGSSpecialCharsFunc(row['in_BeneficialUseCategory']), axis=1)

# ----------------------------------------------------------------------------------------------------

for x in outPOD['in_BeneficialUseCategory'].sort_values().unique():
    print( x )                                                        


Bat,Domestic Surface Water,Miscellaneous Surface Water
Coal Bed Methane Ground Water
Coal Bed Methane Ground Water,Domestic Ground Water
Coal Bed Methane Ground Water,Industrial Ground Water
Coal Bed Methane Ground Water,Irrigation Ground Water
Coal Bed Methane Ground Water,Irrigation Ground Water,Miscellaneous Ground Water
Coal Bed Methane Ground Water,Irrigation Ground Water,Miscellaneous Ground Water,Stock Watering
Coal Bed Methane Ground Water,Miscellaneous Ground Water
Coal Bed Methane Ground Water,Miscellaneous Ground Water,Monitor Observation,Stock Watering
Coal Bed Methane Ground Water,Miscellaneous Ground Water,Stock Watering
Coal Bed Methane Ground Water,Monitor Observation
Coal Bed Methane Ground Water,Monitor Observation,Stock Watering
Coal Bed Methane Ground Water,Stock Watering
Coal Bed Methane Ground Water,Stock Watering,Miscellaneous Ground Water
Combined Uses
Commercial
Commercial,Culinary
Commercial,Domestic Supply,Fire Protection,Stock
Commercial,Domestic Supply,Sto

In [7]:
outdf = outPOD
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))
outdf.head(1)

223796


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,wyD0,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,43.0416,-108.51321,,,POD,,PODA1.0G,,Well,WY,,,,,,,,,,15,Fully Adjudicated,A1.0G,,01/14/1978,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/


 # Setting AllocationOwner
 

In [8]:
def combiningNames(df, names):
    new_df = df[names].copy()
    new_df['combined'] = new_df.apply(lambda row: ' '.join(row),axis =1)
    return new_df
result_df = combiningNames(dfinPOD,['FirstName','LastName'])
outdf['in_AllocationOwner']= result_df['combined']
outdf['in_AllocationOwner'].unique()

array(['JOSEPHINE WASHAKIE', 'SALLY I AMBOH', 'FRANCES BROWN', ...,
       'J STOTTS', 'JOHN E CRATON', 'GARY MEHLING'], dtype=object)

# Creating missing SiteNativeID's

In [9]:
# Creating WaDE Custom site native ID for easy site source identification
# ----------------------------------------------------------------------------------------------------

# Create temp site dataframe of unique water source.
def assignSiteNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = outdf['in_Latitude']
dfSiteNativeID['in_Longitude'] = outdf['in_Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteNativeID(row['Count']), axis=1)
dfSiteNativeID['linkKey'] = dfSiteNativeID['in_Latitude'].astype(str) + dfSiteNativeID['in_Longitude'].astype(str)

# ----------------------------------------------------------------------------------------------------

#Retreive WaDE Custom site  native ID
SiteNativeIDdict = pd.Series(dfSiteNativeID.in_SiteNativeID.values, index=dfSiteNativeID.linkKey.astype(str)).to_dict()
# def retrieveSiteNativeID(A, B):
#     if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
#         outList = ''
#     else:
#         colrowValue = str(A).strip() + str(B).strip()
#         try:
#             outList = SiteNativeIDdict[colrowValue]
#         except:
#             outList = ''
#     return outList

def retrieveSiteNativeID(A, B):
    colrowValue = str(A).strip() + str(B).strip()
    outList = SiteNativeIDdict[colrowValue]
    return outList

for index, row in outdf.iterrows():
    if pd.isnull(row['in_SiteNativeID']) or row['in_SiteNativeID'] == '':
        temp_ID = retrieveSiteNativeID(row['in_Latitude'],row['in_Longitude'])
        outdf.at[index, 'in_SiteNativeID'] = temp_ID 

## Clean Data / data types

In [10]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [11]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [12]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'Bright No 1 Well', 'Greenwald No 1 Well', ...,
       'Stateline Spring', 'Mckay Spring No 2', 'Springs 104660'],
      dtype=object)

In [13]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Josephine Washakie', 'Sally I Amboh', 'Frances Brown', ...,
       'J Stotts', 'John E Craton', 'Gary Mehling'], dtype=object)

In [14]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [15]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [16]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [17]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Well', 'Spring'], dtype=object)

In [18]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'Bright No 1 Well', 'Greenwald No 1 Well', ...,
       'Stateline Spring', 'Mckay Spring No 2', 'Springs 104660'],
      dtype=object)

In [19]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Josephine Washakie', 'Sally I Amboh', 'Frances Brown', ...,
       'J Stotts', 'John E Craton', 'Gary Mehling'], dtype=object)

In [20]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
for x in outdf['in_BeneficialUseCategory'].sort_values().unique():
    print(f'"' + x + '",')

"",
"Bat,Domestic Surface Water,Miscellaneous Surface Water",
"Coal Bed Methane Ground Water",
"Coal Bed Methane Ground Water,Domestic Ground Water",
"Coal Bed Methane Ground Water,Industrial Ground Water",
"Coal Bed Methane Ground Water,Irrigation Ground Water",
"Coal Bed Methane Ground Water,Irrigation Ground Water,Miscellaneous Ground Water",
"Coal Bed Methane Ground Water,Irrigation Ground Water,Miscellaneous Ground Water,Stock Watering",
"Coal Bed Methane Ground Water,Miscellaneous Ground Water",
"Coal Bed Methane Ground Water,Miscellaneous Ground Water,Monitor Observation,Stock Watering",
"Coal Bed Methane Ground Water,Miscellaneous Ground Water,Stock Watering",
"Coal Bed Methane Ground Water,Monitor Observation",
"Coal Bed Methane Ground Water,Monitor Observation,Stock Watering",
"Coal Bed Methane Ground Water,Stock Watering",
"Coal Bed Methane Ground Water,Stock Watering,Miscellaneous Ground Water",
"Combined Uses",
"Commercial",
"Commercial,Culinary",
"Commercial,Domestic Supp

In [21]:
# Ensure Latitude entry is either numireic or blank, no 0 entries
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([43.0416  , 43.11927 , 43.00243 , ..., 44.312778, 44.0286  ,
       41.840092])

In [22]:
# Ensure Longitude entry is either numireic or blank, no 0 entries
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-108.51321 , -109.0062  , -108.3777  , ..., -104.05616 ,
       -104.055061, -104.054881])

In [23]:
# Changing datatype of Priority Date to date fields entry
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array(['1978-01-14T00:00:00.000000000', '1978-12-20T00:00:00.000000000',
       '1979-05-30T00:00:00.000000000', ...,
       '1895-10-10T00:00:00.000000000', '1897-10-22T00:00:00.000000000',
       '1893-02-13T00:00:00.000000000'], dtype='datetime64[ns]')

In [24]:
# Ensure Flow entry is either numireic or blank, no 0 entries
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([15.0, 20.0, 30.0, ..., 0.091, 2.56, 37.13], dtype=object)

In [25]:
# Ensure Volume entry is either numireic or blank, no 0 entries
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([''], dtype=object)

In [26]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    colrowValue = str(A).strip() + str(B).strip()
    outList = WaterSourceNativeIDdict[colrowValue]
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: {enter string entries here}

In [27]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Cancelled","Abandoned","Incomplete","Expired","Suspended","Rejected"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

140058


array(['Fully Adjudicated', 'Complete', '', 'Unadjudicated',
       'Partially Adjudicated'], dtype=object)

## Export Data

In [28]:
outdf.info

<bound method DataFrame.info of          WaDEUUID in_MethodUUID in_VariableSpecificUUID in_OrganizationUUID  \
0            wyD0     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
1            wyD1     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
2            wyD2     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
3            wyD3     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
4            wyD4     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
...           ...           ...                     ...                 ...   
140053  wyD223790     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
140054  wyD223792     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
140055  wyD223793     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
140056  wyD223794     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   
140057  wyD223795     WSGSwr_M1               WSGSwr_V1           WSGSwr_O1   

       in_Geometry 

In [29]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,wyD0,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,43.04160,-108.51321,,,POD,,PODA1.0G,,Well,WY,,,,,,,,,,15.00000,Fully Adjudicated,A1.0G,Josephine Washakie,1978-01-14,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
1,wyD1,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,43.11927,-109.00620,,,POD,,PODA10.0G,,Well,WY,,,,,,,,,,20.00000,Fully Adjudicated,A10.0G,Sally I Amboh,1978-12-20,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
2,wyD2,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,43.00243,-108.37770,,,POD,,PODA100.0G,,Well,WY,,,,,,,,,,30.00000,Fully Adjudicated,A100.0G,Frances Brown,1979-05-30,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
3,wyD3,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,42.99207,-108.42714,,,POD,,PODA101.0G,,Well,WY,,,,,,,,,,24.00000,Fully Adjudicated,A101.0G,Anna Brown,1964-07-01,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
4,wyD4,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,43.03456,-108.76136,,,POD,,PODA102.0G,,Well,WY,,,,,,,,,,20.00000,Fully Adjudicated,A102.0G,Annette Valarde Lajeunesse,1982-08-03,,,,,,Domestic Ground Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140053,wyD223790,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID2,Surface Water,,Digitized,,4326,,,,43.97895,-104.05712,,,POD,Springs 104660,PODCR CA02/060,,Spring,WY,,,,,,,,,,0.34000,Fully Adjudicated,CR CA02/060,W Keyes,1897-10-22,,,,,,Irrigation Surface Water,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
140054,wyD223792,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,44.31278,-104.05694,,,POD,,PODP190491.0W,,Spring,WY,,,,,,,,,,2.00000,Complete,P190491.0W,,2009-03-09,,,,,,Stock Watering,,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
140055,wyD223793,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID2,Surface Water,,Digitized,,4326,,,,41.67513,-104.05616,,,POD,State Line Spring Creek,PODCR CC56/110,,Spring,WY,,,,,,,,,,0.70000,Fully Adjudicated,CR CC56/110,John E Craton,1935-03-07,,,,,,"Irrigation Surface Water,Stock And Domestic",,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/
140056,wyD223794,WSGSwr_M1,WSGSwr_V1,WSGSwr_O1,,,Fresh,,wadeID2,Surface Water,,Digitized,,4326,,,,44.02860,-104.05506,,,POD,Maris Spring Bear Canyon Spring,PODCR CC58/464,,Spring,WY,,,,,,,,,,0.03000,Fully Adjudicated,CR CC58/464,,1911-08-21,,,,,,"Municipal Surface Water,Other,Railroad,Steam",,,,9/15/2023,,0,,,,,,,,,,https://www.wsgs.wyo.gov/


In [30]:
# Export the output dataframe
# change output name / abbreviation to match native state provdier and wade data type 
outdf.to_csv('RawInputData/Pwr_wsgsMain.zip', compression=dict(method='zip', archive_name='Pwr_wsgsMain.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.