# Pre-processing Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [None]:
# ---- working directory ----
# Every relative path used further down is resolved against this folder.
workingDirString = "G:/Shared drives/WaDE Data/Oklahoma/WaterAllocation_WaterUse" # set working directory folder string here
os.chdir(workingDirString)
# Interpolate the path in the f-string itself (the original f-string had no placeholder).
print(f'The working Directory is: {workingDirString}')

## Water Right Data
- POD Division data (groundwater wells, surface water division)
- POU data (Permitted_Areas_of_Use, Permitted_Dedicated_Lands)

In [None]:
# Input File - groundwater POD
PGW_Input = "RawInputData/water_right/Permitted_GW_Wells.zip"
df_PGW = pd.read_csv(PGW_Input)
df_PGW = df_PGW.replace(np.nan, "").replace("nan,nan", "")  # blank out missing values and "nan,nan" entries

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with an index-based ID and write the
# tagged table back to the same archive that was just read.
if 'WaDEUUID' not in df_PGW.columns:
    df_PGW['WaDEUUID'] = "okGD" + df_PGW.index.astype(str)
    df_PGW.to_csv(PGW_Input, compression={'method': 'zip', 'archive_name': 'Permitted_GW_Wells.csv'}, index=False)

print(len(df_PGW))
df_PGW.head(1)

In [None]:
# Input File - surface water POD
PSWDP_Input = "RawInputData/water_right/Permitted_SW_Diversions.zip"
df_PSWDP = pd.read_csv(PSWDP_Input)
df_PSWDP = df_PSWDP.replace(np.nan, "").replace("nan,nan", "")  # blank out missing values and "nan,nan" entries

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with an index-based ID and write the
# tagged table back to the same archive that was just read.
if 'WaDEUUID' not in df_PSWDP.columns:
    df_PSWDP['WaDEUUID'] = "okSD" + df_PSWDP.index.astype(str)
    df_PSWDP.to_csv(PSWDP_Input, compression={'method': 'zip', 'archive_name': 'Permitted_SW_Diversions.csv'}, index=False)

print(len(df_PSWDP))
df_PSWDP.head(1)

In [None]:
# Stack the groundwater and surface water PODs - both datasets share the same columns.
dfPOD = pd.concat([df_PGW, df_PSWDP], ignore_index=True)
dfPOD = dfPOD.reset_index(drop=True)

print(len(dfPOD))
dfPOD.head(1)

In [None]:
# POD specific: coordinates are copied straight from the source columns
dfPOD['in_Latitude'] = dfPOD['LATITUDE']
dfPOD['in_Longitude'] = dfPOD['LONGITUDE']
dfPOD['in_PODorPOUSite'] = "POD"

# native site ID = "pod" + RECORD_ID cast float -> int64 -> text
podRecordId = dfPOD['RECORD_ID'].astype(float).astype("int64")
dfPOD['in_SiteNativeID'] = "pod" + podRecordId.astype(str).str.strip()
dfPOD['in_SiteNativeID'].unique()

In [None]:
# Input File - Permitted_Areas_of_Use POU
PAU_Input = "RawInputData/water_right/shapefiles/Permitted_Areas_of_Use.zip"
df_PAU = gpd.read_file(PAU_Input)
df_PAU = df_PAU.replace(np.nan, "").replace("nan,nan", "")  # blank out missing values and "nan,nan" entries

# WaDE UUID tracker for data assessment.
# NOTE(review): the tagged copy is written as a CSV archive outside the shapefiles/ folder,
# i.e. not to the file read above - confirm whether the UUID column is meant to persist between runs.
if 'WaDEUUID' not in df_PAU.columns:
    df_PAU['WaDEUUID'] = "okPAU" + df_PAU.index.astype(str)
    df_PAU.to_csv('RawInputData/water_right/Permitted_Areas_of_Use.zip', compression={'method': 'zip', 'archive_name': 'Permitted_Areas_of_Use.csv'}, index=False)

print(len(df_PAU))
df_PAU.head(1)

In [None]:
# Input File - Permitted_Dedicated_Lands POU
PDL_Input = "RawInputData/water_right/shapefiles/Permitted_Dedicated_Lands.zip"
df_PDL = gpd.read_file(PDL_Input)
df_PDL = df_PDL.replace(np.nan, "").replace("nan,nan", "")  # blank out missing values and "nan,nan" entries

# WaDE UUID tracker for data assessment.
# NOTE(review): the tagged copy is written as a CSV archive outside the shapefiles/ folder,
# i.e. not to the file read above - confirm whether the UUID column is meant to persist between runs.
if 'WaDEUUID' not in df_PDL.columns:
    df_PDL['WaDEUUID'] = "okPDL" + df_PDL.index.astype(str)
    df_PDL.to_csv('RawInputData/water_right/Permitted_Dedicated_Lands.zip', compression={'method': 'zip', 'archive_name': 'Permitted_Dedicated_Lands.csv'}, index=False)

print(len(df_PDL))
df_PDL.head(1)

In [None]:
# Stack the two POU layers - both datasets share the same columns.
dfPOU = pd.concat([df_PAU, df_PDL], ignore_index=True)
dfPOU = dfPOU.reset_index(drop=True)

print(len(dfPOU))
dfPOU.head(1)

In [None]:
# POU specific: coordinates are copied from the wadeLat / wadeLong columns
dfPOU['in_Latitude'] = dfPOU['wadeLat']
dfPOU['in_Longitude'] = dfPOU['wadeLong']
dfPOU['in_PODorPOUSite'] = "POU"

# native site ID = "pou" + RECORD_ID cast float -> int64 -> text
pouRecordId = dfPOU['RECORD_ID'].astype(float).astype("int64")
dfPOU['in_SiteNativeID'] = "pou" + pouRecordId.astype(str).str.strip()
dfPOU['in_SiteNativeID'].unique()

In [None]:
# Concatenate POD and POU together

dfPOD['geometry'] = "" # as filler, so the POD rows carry a geometry column too
df_wr = pd.concat([dfPOD, dfPOU], ignore_index=True)
df_wr = df_wr.reset_index(drop=True).replace(np.nan, '')  # blank out values missing from either side

print(len(df_wr))
df_wr.head(1)

In [None]:
# Fixing Beneficial Uses PRIMARY_PURPOSE
def fixRecFishWild(colrowValue):
    """Return colrowValue as trimmed text, with one beneficial-use label rewritten.

    The exact text 'Recreation, Fish, Wildlife' becomes
    'Recreation Fish & Wildlife'; any other value is returned as its
    stripped string form.
    """
    cleaned = str(colrowValue).strip()
    return 'Recreation Fish & Wildlife' if cleaned == 'Recreation, Fish, Wildlife' else cleaned

# rewrite each PRIMARY_PU value through fixRecFishWild
df_wr['PRIMARY_PU'] = df_wr['PRIMARY_PU'].map(fixRecFishWild)
df_wr['PRIMARY_PU'].value_counts()

## Water Use Data
- Groundwater permit use
- Surface water permit use

In [None]:
# Input File - Groundwater Use 2000-2020
InputFile = "RawInputData/water_use/Groundwater Use 2000-2020.zip"
df_uGW = pd.read_csv(InputFile)
df_uGW = df_uGW.replace(np.nan, "")  # blank out missing values

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with an index-based ID and write the
# tagged table back to the same archive that was just read.
if 'WaDEUUID' not in df_uGW.columns:
    df_uGW['WaDEUUID'] = "okuGW" + df_uGW.index.astype(str)
    df_uGW.to_csv(InputFile, compression={'method': 'zip', 'archive_name': 'Groundwater Use 2000-2020.csv'}, index=False)

print(len(df_uGW))
df_uGW.head(1)

In [None]:
# Input File - Surface Water Use 2000-2020
InputFile = "RawInputData/water_use/Surface Water Use 2000-2020.zip"
df_uSW = pd.read_csv(InputFile)
df_uSW = df_uSW.replace(np.nan, "")  # blank out missing values

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with an index-based ID and write the
# tagged table back to the same archive that was just read.
if 'WaDEUUID' not in df_uSW.columns:
    df_uSW['WaDEUUID'] = "okuSW" + df_uSW.index.astype(str)
    df_uSW.to_csv(InputFile, compression={'method': 'zip', 'archive_name': 'Surface Water Use 2000-2020.csv'}, index=False)

print(len(df_uSW))
df_uSW.head(1)

In [None]:
# Concatenate groundwater with surface water use
# datasets do differ by PERMIT_NUM

df_u = pd.concat([df_uGW, df_uSW], ignore_index=True)
df_u = df_u.reset_index(drop=True).replace(np.nan, '')  # blank out missing values

print(len(df_u))
df_u.head(1)

In [None]:
# Pivot data, set each recorded ben use amount to in_Amount input
# Result: one row per (PERMIT_NUM, Year, beneficial use), with that use's
# column value from df_u stored in in_Amount.

# temp base dataframe
df_u2 = df_u[['PERMIT_NUM', 'Year']]

# ben use list for amount value & column name
benUseList = ['Irrigation', 'Public Supply', 'Industrial', 'Power', 'Mining', 'Commercial', 'Recreation Fish & Wildlife', 'Agriculture', 'Other']

# output DataFrame
# Build one slice per beneficial use, then concatenate a single time.
# Concatenating inside the loop re-copies everything accumulated so far on each
# pass, and starting from an empty DataFrame is deprecated in recent pandas.
benUseFrames = [
    df_u2.assign(benuseListValue=str(benUse), in_Amount=df_u[benUse])
    for benUse in benUseList
]
df_u3 = pd.concat(benUseFrames, ignore_index=True).replace(np.nan, '')

print(len(df_u3))
df_u3.head(1)

In [None]:
# left-join merge wr sites to water use data by permit number
# (every water right site row is kept; sites with no matching use rows get blanks)

dfin = pd.merge(df_wr, df_u3, left_on='PERMIT_NUM', right_on='PERMIT_NUM', how='left')
dfin = dfin.replace(np.nan, "")
print(len(dfin))
dfin.head(1)

## WaDE Input

In [None]:
# create output POD dataframe
# Each column below is either copied from dfin or set to a constant placeholder.
df = pd.DataFrame()

# Data Assessment UUID
# NOTE(review): presumably this dfin-backed column must be assigned before the
# constant columns so that df already has rows when they are set - confirm.
df['WaDEUUID'] = dfin['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "OKwrwu_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "OKwrwu_V1" # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "OKwrwu_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfin['WATER']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # same column as under WaterSource Info above; this re-assigns the same blank value
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
# HUC8: blank / single-space / missing -> 0, cast float -> int64 -> text, then the text "0" back to blank
df['in_HUC8'] = dfin['HYDRO_UNIT'].replace("", 0).replace(" ", 0).fillna(0).astype(float).astype("int64").astype(str).replace("0", "")
df['in_Latitude'] = dfin['in_Latitude'] # see above
df['in_Longitude'] = dfin['in_Longitude'] # see above
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfin['in_PODorPOUSite'] # see above
df['in_SiteName'] = ""
# blank or missing native IDs become the text "0" here, not an empty string
df['in_SiteNativeID'] = dfin['in_SiteNativeID'].replace("", 0).fillna(0).astype(str) # see above
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "OK"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
# Columns set to "" have no source column mapped in this cell; the rest are copied from dfin.
df['in_AllocationApplicationDate'] = dfin['DATE_FILED']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = dfin['STATUS']
# blank or missing permit numbers become the text "0"
df['in_AllocationNativeID'] =  dfin['PERMIT_NUM'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin['ENTITY_NAM']
df['in_AllocationPriorityDate'] = dfin['DATE_ISSUE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfin['RECORD_TYP']
df['in_AllocationVolume_AF'] = dfin['TOTAL_PERM']
df['in_BeneficialUseCategory'] = dfin['PRIMARY_PU']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin['in_Amount']
# same permit number value as in_AllocationNativeID (blank / missing -> "0")
df['in_AssociatedNativeAllocationIDs'] = dfin['PERMIT_NUM'].replace("", 0).fillna(0).astype(str)
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = dfin['benuseListValue']
# blank / missing Year becomes the text "0", so the two timeframe strings below then read "12/31/0" and "01/01/0"
df['in_ReportYearCV'] = dfin['Year'].replace("", 0).fillna(0).astype(float).astype(int).astype(str)
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = "12/31/" + df['in_ReportYearCV'].astype(str)
df['in_TimeframeStart'] = "01/01/" + df['in_ReportYearCV'].astype(str)
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

# Work on a copy, drop fully identical rows, and renumber the index from 0.
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

## Concatenate POD and POU Data.  Make needed changes

In [None]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()  # remove fully identical rows
outdf = outdf.reset_index(drop=True).replace(np.nan, "")  # renumber from 0 and blank out missing values
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as title-cased text with selected punctuation removed.

    Val is converted with str(); the characters $ @ & . ; / ) ( - are deleted,
    the text is title-cased, each double space is replaced by a single space,
    surrounding whitespace is stripped and trailing commas are removed.
    """
    Val = str(Val)
    # Raw string: "\)" in a normal string literal is an invalid escape sequence
    # and triggers a SyntaxWarning on newer Python versions.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

In [None]:
# strip special characters from every in_WaterSourceName value
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# strip special characters from every in_County value
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# strip special characters from every in_SiteName value
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# strip special characters from every in_AllocationOwner value
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as trimmed text, or "" when val carries no value.

    "No value" means a null scalar (for example None or NaN), blank or
    whitespace-only text, or the literal text "nan".
    """
    # Test for null BEFORE converting to text: str() turns None into the
    # non-null string "None", which a null test on the text can never catch.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    return "" if text in ("", "nan") else text

In [None]:
# normalise empty / "nan" entries of in_WaterSourceName
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# normalise empty / "nan" entries of in_WaterSourceTypeCV
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# normalise empty / "nan" entries of in_SiteTypeCV
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# normalise empty / "nan" entries of in_SiteName
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# normalise empty / "nan" entries of in_County
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

In [None]:
# normalise empty / "nan" entries of in_AllocationOwner
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# normalise empty / "nan" entries of in_BeneficialUseCategory
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# sorted list of the distinct comma-separated entries found across all rows
joinedBenUse = ','.join(outdf['in_BeneficialUseCategory'].astype(str))
uniqueList = sorted({i.strip() for i in joinedBenUse.split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric; unparseable, missing and 0 values become blank
latitudeNumeric = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
outdf['in_Latitude'] = latitudeNumeric.replace(0, "").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric; unparseable, missing and 0 values become blank
longitudeNumeric = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
outdf['in_Longitude'] = longitudeNumeric.replace(0, "").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype: numeric, with unparseable, missing and 0 values blank
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNumeric.replace(0, "").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype: numeric rounded to whole numbers, with unparseable, missing and 0 values blank
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round()
outdf['in_AllocationVolume_AF'] = volumeNumeric.replace(0, "").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Ensure Amount entry is either numeric (2 decimals) or blank, no 0 entries
amountNumeric = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2)
outdf['in_Amount'] = amountNumeric.replace(0, "").fillna("")
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is a whole number; unparseable, missing and 0 values end up blank
# (missing values are set to 0 for the integer cast, then 0 is replaced by "")
populationServed = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round()
populationServed = populationServed.replace("", 0).fillna(0).astype(int)
outdf['in_PopulationServed'] = populationServed.replace(0, "").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
# Update datatype of Allocation Application Date to fit WaDE 2.0 structure
applicationDate = pd.to_datetime(outdf['in_AllocationApplicationDate'], errors = 'coerce')  # unparseable -> NaT
applicationDateText = applicationDate.dt.strftime('%m/%d/%Y')
outdf['in_AllocationApplicationDate'] = pd.to_datetime(applicationDateText).replace("NaT", "").fillna("")
outdf['in_AllocationApplicationDate'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
priorityDate = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')  # unparseable -> NaT
priorityDateText = priorityDate.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDateText).replace("NaT", "").fillna("")
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert TimeframeEnd to YYYY-MM-DD format.
# Keep the parsed column datetime-typed until after .dt.strftime: filling NaT with ""
# first turns the column into plain objects, and the .dt accessor then raises AttributeError.
# Blanks are filled in once, at the end.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce')
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to YYYY-MM-DD format.
# Keep the parsed column datetime-typed until after .dt.strftime: filling NaT with ""
# first turns the column into plain objects, and the .dt accessor then raises AttributeError.
# Blanks are filled in once, at the end.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce')
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_TimeframeStart'].unique()

In [None]:
# report year as whole-number text; blank or missing years become "0"
reportYear = outdf['in_ReportYearCV'].replace("", 0).fillna(0)
outdf['in_ReportYearCV'] = reportYear.astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

In [None]:
# # Assign Primary Use Category

# import sys
# sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
# import AssignPrimaryUseCategoryFile # Use Custom import file

# outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
# outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Join the four inputs into one underscore-separated label.

    Each argument is converted with str() and stripped; the third (inPU) is
    additionally title-cased. Order of the parts: inV, inAIU, inPU, inWST.
    """
    parts = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(parts)

# build the label row by row from the four source columns
variableSpecificInputs = zip(outdf['in_VariableCV'],
                             outdf['in_AggregationIntervalUnitCV'],
                             outdf['in_PrimaryUseCategory'],
                             outdf['in_WaterSourceTypeCV'])
outdf['in_VariableSpecificCV'] = [createVariableSpecificCV(variableCV, intervalUnit, primaryUse, sourceType)
                                  for variableCV, intervalUnit, primaryUse, sourceType in variableSpecificInputs]
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the text "wadeId" followed by the string form of colRowValue."""
    return "wadeId" + str(colRowValue)

# one row per distinct (water source name, water source type) pair found in outdf
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# number the distinct pairs 1..N and label each one "wadeId<N>"
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# lookup: name + type text (joined with no separator) -> generated ID
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID if there is one, otherwise look one up in IdDict.

    checkVal is converted with str() and stripped; when the result is not
    empty it is returned as-is. Otherwise the stripped text of valA and valB
    is concatenated and used as the key into the module-level IdDict
    (a missing key raises KeyError).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    linkKeyVal = str(valA).strip() + str(valB).strip()
    return IdDict[linkKeyVal]

# keep any existing water source ID, otherwise look up the generated one by name + type
waterSourceInputs = zip(outdf['in_WaterSourceNativeID'], outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
outdf['in_WaterSourceNativeID'] = [retrieveIdValueFunc(nativeId, sourceName, sourceType)
                                   for nativeId, sourceName, sourceType in waterSourceInputs]
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return the text "wadeId" followed by the string form of colRowValue."""
    return "wadeId" + str(colRowValue)

# one row per distinct (latitude, longitude, site name, site type) combination found in outdf
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# number the distinct combinations 1..N and label each one "wadeId<N>"
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# lookup: latitude + longitude + name + type text (joined with no separator) -> generated ID
dfTempID['linkKey'] = (dfTempID['in_Latitude'].astype(str)
                       + dfTempID['in_Longitude'].astype(str)
                       + dfTempID['in_SiteName'].astype(str)
                       + dfTempID['in_SiteTypeCV'].astype(str))
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_SiteNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID if there is one, otherwise look one up in IdDict.

    checkVal is converted with str() and stripped; when the result is not
    empty it is returned as-is. Otherwise the stripped text of valA, valB,
    valC and valD is concatenated and used as the key into the module-level
    IdDict (a missing key raises KeyError).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    linkKeyVal = "".join(str(part).strip() for part in (valA, valB, valC, valD))
    return IdDict[linkKeyVal]

# keep any existing site ID, otherwise look up the generated one by coordinates + name + type
siteInputs = zip(outdf['in_SiteNativeID'],
                 outdf['in_Latitude'], outdf['in_Longitude'],
                 outdf['in_SiteName'], outdf['in_SiteTypeCV'])
outdf['in_SiteNativeID'] = [retrieveIdValueFunc(nativeId, latitude, longitude, siteName, siteType)
                            for nativeId, latitude, longitude, siteName, siteType in siteInputs]
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For Oklahoma, the drop list in the cell below currently contains only the empty string, so only records with a blank legal status are removed. Add any other status values to exclude to that list.

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal status values to drop (currently only the empty string)
dropLegalStatusList = [""] # enter string entries here

# keep only the rows whose legal status is NOT in the list above
keepRow = ~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf[keepRow].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
shapefileInput =  "RawInputData/water_right/shapefiles/Permitted_Areas_of_Use.zip" # zipped folder of the shp file

dfPoUshapetemp = gpd.read_file(shapefileInput)
reprojectedGeometry = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
dfPoUshapetemp['geometry'] = reprojectedGeometry
print(len(dfPoUshapetemp))
dfPoUshapetemp.head()

In [None]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assign values to temp dataframe based on shapefile input
# for in_SiteNativeID assure ID value is the same as that listed above for POU info.
# NOTE(review): only the Permitted_Areas_of_Use shapefile feeds this table - confirm whether
# Permitted_Dedicated_Lands geometry is needed here as well.
shapeRecordId = dfPoUshapetemp['RECORD_ID'].replace("", 0).fillna(0).astype(float).astype("int64")
dfPoUshape['in_SiteNativeID'] = "pou" + shapeRecordId.astype(str).str.strip()
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates()  # defaults: all columns compared, first occurrence kept
print(len(dfPoUshape))
dfPoUshape.head()

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframes as zipped CSV files
mainCompression = {'method': 'zip', 'archive_name': 'Pwr_wu_Main.csv'}
geometryCompression = {'method': 'zip', 'archive_name': 'P_Geometry.csv'}
outdf.to_csv('RawInputData/Pwrwu_Main.zip', compression=mainCompression, index=False)  # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=geometryCompression, index=False)  # The output geometry.