# Pre-processing Colorado (CO) Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [None]:
# ---- working directory ----
# Root folder for this dataset; the relative "RawInputData/..." paths used below resolve against it.
workingDirString = "G:/Shared drives/WaDE Data/Colorado/WaterAllocation_WaterUse" # set working directory folder string here
os.chdir(workingDirString)
# Interpolate the path into the f-string (previously an f-string with no placeholder); printed text is unchanged.
print(f'The working Directory is: {workingDirString}')

## Data Input 1 - DWR_Water_Right_-_Net_Amounts.zip
- water right and site info

In [None]:
# Input File - DWR_Water_Right_-_Net_Amounts.zip (water right and site info)
fileInput = "RawInputData/DWR_Water_Right_-_Net_Amounts.zip"
# NaN cells are replaced with empty strings so later string comparisons see "" rather than NaN
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
# First run only: tag each row as "in1<row index>" and write the tagged table back over the input zip,
# so later runs find the WaDEUUID column already present and skip this step.
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/DWR_Water_Right_-_Net_Amounts.zip", compression=dict(method='zip', archive_name="DWR_Water_Right_-_Net_Amounts.csv"), index=False)

# quick look: row count and first record
print(len(dfin1))
dfin1.head(1)

In [None]:
# Creating Beneficial Use.
# Need to split CO abbreviation strings to a workable format:
# each character of the "Decreed Uses" string is its own use code.

# Single-character decreed-use code -> beneficial use name.
BenUseDict = {
    "0": "Storage",
    "1": "Irrigation",
    "2": "Municipal",
    "3": "Commercial",
    "4": "Industrial",
    "5": "Recreation",
    "6": "Fishery",
    "7": "Fire",
    "8": "Domestic",
    "9": "Stock",
    "A": "Augmentation",
    "B": "Export from Basin",
    "C": "Cumulative Accretion to River",
    "D": "Cumulative Depletion from River",
    "E": "Evaporative",
    "F": "Federal Reserved",
    "G": "Geothermal",
    "H": "Household Use Only",
    "K": "Snow Making",
    "M": "Minimum Streamflow",
    "N": "Net Effect on River",
    "P": "Power Generation",
    "Q": "Other",
    "R": "Recharge",
    "S": "Export from State",
    "T": "Transmountain Export",
    "W": "Wildlife",
    "X": "All Beneficial Uses",
}


def retrieveBenUse(colrowValue):
    """Translate a decreed-uses code string into comma-separated use names.

    Every character of the stripped input is looked up in BenUseDict and the
    resulting names are joined with "," in their original order,
    e.g. "18" -> "Irrigation,Domestic".

    Returns "" when the input is blank/null, or when ANY character is not a
    known code (the whole entry is discarded, never partially translated).
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return ""
    codes = str(colrowValue).strip()
    try:
        return ",".join(BenUseDict[code] for code in codes)
    except KeyError:
        # Unknown code (embedded whitespace included): keep the all-or-nothing result.
        # Only KeyError is expected here; anything else should surface instead of being swallowed.
        return ""

dfin1['in_BeneficialUseCategory'] = dfin1.apply(lambda row: retrieveBenUse(row['Decreed Uses']), axis=1)
dfin1['in_BeneficialUseCategory'].unique()

In [None]:
# Determining WaterSourceTypeCV

# "Structure Type" value -> WaDE water source type.
WSTypeDict = {
    "Aquifer NNT/NT Reservation": "Surface Water",
    "Ditch": "Surface Water",
    "Ditch System": "Surface Water",
    "Exchange Plan": "Surface Water",
    "Measuring Point": "Surface Water",
    "Mine": "Surface Water",
    "Minimum Flow": "Surface Water",
    "Other": "Surface Water",
    "Pipeline": "Surface Water",
    "Power Plant": "Surface Water",
    "Pump": "Surface Water",
    "Reach": "Surface Water",
    "Reach (Aggregating)": "Surface Water",
    "Recharge Area": "Surface Water",
    "Recharge Area Group": "Surface Water",
    "Reservoir": "Surface Water",
    "Reservoir System": "Surface Water",
    "Seep": "Surface Water",
    "Spring": "Surface Water",
    "Stream Gage": "Surface Water",
    "Well": "Groundwater",
    "Well Field": "Groundwater",
    "Well Group": "Groundwater",
    "Augmentation/Replacement Plan": "Groundwater",
}


def retrieveWaterSourceTypeCV(colrowValue):
    """Map a structure type to its water source type.

    The input is stringified and stripped before the WSTypeDict lookup.
    Returns "WaDE Blank" for blank/null input and for any structure type
    that is not listed in WSTypeDict.
    """
    if (colrowValue == "") or (pd.isnull(colrowValue)):
        return "WaDE Blank"
    # dict.get supplies the default directly; no exception handling needed for a missing key
    return WSTypeDict.get(str(colrowValue).strip(), "WaDE Blank")

dfin1['in_WaterSourceTypeCV'] = dfin1.apply(lambda row: retrieveWaterSourceTypeCV(row['Structure Type']), axis=1)
dfin1['in_WaterSourceTypeCV'].unique()

In [None]:
# AllocationFlow_CFS
# Only rows with Decreed Units = "C" get a flow amount:
#   Net Absolute != 0 and Net Conditional == 0  -> Net Absolute
#   Net Absolute == 0 and Net Conditional != 0  -> Net Conditional
#   everything else (other units, both zero, both non-zero) -> 0

# For creating in_AllocationFlow_CFS
def assignAllocation_CFS(valA, valB, valC):
    """Pick the flow amount for one water right.

    valA: Decreed Units (stringified and stripped before comparison)
    valB: Net Absolute
    valC: Net Conditional

    Returns valB or valC when the unit is "C" and exactly one of them is
    non-zero; returns 0 otherwise.
    """
    hasAbsolute = valB != 0
    hasConditional = valC != 0
    if hasAbsolute and hasConditional:
        return 0  # both amounts present: no single value is chosen
    if str(valA).strip() != "C":
        return 0
    if hasAbsolute:
        return valB
    if hasConditional:
        return valC
    return 0

dfin1['in_AllocationFlow_CFS'] = dfin1.apply(lambda row: assignAllocation_CFS(row["Decreed Units"], row["Net Absolute"], row["Net Conditional"]), axis=1)
dfin1['in_AllocationFlow_CFS'].unique()

In [None]:
# AllocationVolume_AF
# Only rows with Decreed Units = "A" get a volume amount:
#   Net Absolute != 0 and Net Conditional == 0  -> Net Absolute
#   Net Absolute == 0 and Net Conditional != 0  -> Net Conditional
#   everything else (other units, both zero, both non-zero) -> 0

# For creating in_AllocationVolume_AF
def assignAllocationVolume_AF(valA, valB, valC):
    """Pick the volume amount for one water right.

    valA: Decreed Units (stringified and stripped before comparison)
    valB: Net Absolute
    valC: Net Conditional

    Returns valB or valC when the unit is "A" and exactly one of them is
    non-zero; returns 0 otherwise.
    """
    unitCode = str(valA).strip()
    absoluteSet = (valB != 0)
    conditionalSet = (valC != 0)
    # wrong unit, or both amounts present: nothing to report
    if unitCode != "A" or (absoluteSet and conditionalSet):
        return 0
    if absoluteSet:
        return valB
    return valC if conditionalSet else 0

dfin1['in_AllocationVolume_AF'] = dfin1.apply(lambda row: assignAllocationVolume_AF(row["Decreed Units"], row["Net Absolute"], row["Net Conditional"]), axis=1)
dfin1['in_AllocationVolume_AF'].unique()

In [None]:
# For creating AllocationLegalStatusCV
# If Net Absolute = 0 and Net Conditional = 0, then Conditional Absolute
# Elif Net Absolute = 0 and Net Conditional != 0, then Conditional
# Else, Absolute

def assignAllocationLegalStatusCV(valA, valB):
    """Classify a water right from its Net Absolute (valA) and Net Conditional (valB) amounts."""
    # any non-zero absolute amount wins, whatever the conditional amount is
    if valA != 0:
        return "Absolute"
    return "Conditional Absolute" if valB == 0 else "Conditional"

dfin1['in_AllocationLegalStatusCV'] = dfin1.apply(lambda row: assignAllocationLegalStatusCV(row['Net Absolute'], row['Net Conditional']), axis=1)
dfin1['in_AllocationLegalStatusCV'].unique()

In [None]:
# Need a unique identifier for WaDE AllocationNativeID.  Combine **Admin No**, **Order No**, **Decreed Units**, & **WDID** into single string entry.

# For creating in_AllocationNativeID
def assignAllocationNativeID(colrowValueA, colrowValueB, colrowValueC, colrowValueD):
    """Build the allocation native ID as "A-B-C-D".

    Each component is stringified and stripped of surrounding whitespace
    before joining. The third component used to be joined unstripped, so a
    padded Decreed Units code leaked whitespace into the ID.
    """
    parts = (colrowValueA, colrowValueB, colrowValueC, colrowValueD)
    return "-".join(str(part).strip() for part in parts)

dfin1['in_AllocationNativeID'] = dfin1.apply(lambda row: assignAllocationNativeID(row['Admin No'], row['Order No'], row['Decreed Units'], row['WDID']), axis=1)
dfin1['in_AllocationNativeID'].unique()

In [None]:
# Use list of WDIDs (from Division data) as inputs, retrieve time series data.
# Split list into batches that are 100 long. Issue with CO API timing out after too long.
# keep='first' leaves one row per WDID. keep=False would drop every WDID that appears on
# more than one row, so those WDIDs would never be requested.
dfin1_s = dfin1.drop_duplicates(subset=["WDID"], keep='first').reset_index()
wdidList = dfin1_s['WDID'].tolist()
wdidListB = [wdidList[i:i + 100] for i in range(0, len(wdidList), 100)]

## Data Input 2 - time series water use info.zip
- retrieved via API, saved to a local zip for easy future access


In [None]:
# already done
# %%time
# # Time Series Dataframe
# dfts = pd.DataFrame()

# str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecmonth/?format=csv&wdid="
# str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# for i in range(len(wdidListB)):
#     lstC = wdidListB[i]
#     lstCa = '%2C'.join([str(n) for n in lstC]) 
#     urlInput = str2 + lstCa + str3
#     print(urlInput)
#     try:
#         tempdf = pd.read_csv(urlInput, skiprows=2).replace(np.nan, "")
#         dfts = pd.concat([dfts, tempdf])
#     except:
#         print("bad reponse")

# print(len(dfts))
# dfts.head()

In [None]:
# already done
# dfts.to_csv('RawInputData/TimeSeriesInfo.zip', compression=dict(method='zip', archive_name='TimeSeriesInfo.csv'), index=False)  # The output, save as a zip

In [None]:
# Input File - TimeSeriesInfo.zip
fileInput = "RawInputData/TimeSeriesInfo.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv("RawInputData/TimeSeriesInfo.zip", compression=dict(method='zip', archive_name="TimeSeriesInfo.csv"), index=False)

print(len(dfin2))
dfin2.head(1)

In [None]:
#left merge sites to water use
dfin1 = dfin1.merge(dfin2, left_on='WDID', right_on='wdid', how='left')
print(len(dfin1))
dfin1.head(1)

In [None]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "COwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "COwr_V1" # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Discharge Flow"

# Organization Info
df['in_OrganizationUUID'] = "COwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = dfin1['Water Source']
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = dfin1['in_WaterSourceTypeCV']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfin1['Location Accuracy']
df['in_County'] = dfin1['County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['Latitude']
df['in_Longitude'] = dfin1['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfin1['Structure Name']
df['in_SiteNativeID'] = dfin1['WDID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1['Structure Type'].astype(str)
df['in_StateCV'] = "CO"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['in_AllocationFlow_CFS'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['in_AllocationLegalStatusCV']
df['in_AllocationNativeID'] =  dfin1['in_AllocationNativeID'].astype(str)
df['in_AllocationOwner'] = ""
df['in_AllocationPriorityDate'] = dfin1['Appropriation Date']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['in_BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0 # 1 or 0, if we want this data excempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfin1['More Information']

# Site VariableAmounts Info
df['in_Amount'] = dfin1['dataValue']
df['in_AssociatedNativeAllocationIDs'] = dfin1['in_AllocationNativeID'].astype(str)
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin1['dataMeasDate']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['dataMeasDate']
df['in_TimeframeStart'] = dfin1['dataMeasDate']
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

## Concatenate POD and POU Data.  Make needed changes

In [None]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as a cleaned, title-cased string.

    Removes the characters $ @ & . ; / ) ( and -, title-cases the result,
    replaces double spaces with single spaces (one pass) and strips the ends.
    Non-string input is stringified first.
    """
    Val = str(Val)
    # raw string: "\)" in a normal string literal is an invalid escape sequence; the pattern itself is unchanged
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as a stripped string, with null-like values turned into "".

    "" is returned for null scalars (None, NaN), for whitespace-only input
    and for the literal text "nan"; anything else comes back stringified
    and stripped.
    """
    # The null check must run on the raw value: once stringified, None becomes "None"
    # and is no longer recognised as null.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    outString = str(val).strip()
    if outString == "nan":
        return ""
    return outString

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

In [None]:
# Ensure Latitude entry is numeric, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Ensure Amount entry is either numeric or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

In [None]:
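# Ensure PopulationServed entry is numeric WITH 0 entries (no blank strings)
# NOTE(review): the trailing .replace(0,"") below turns those 0 entries back into blanks - confirm which behavior is intended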
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert TimeframeEnd to a date-only datetime (displays as YYYY-MM-DD).
# Blank / unparseable entries become NaT. There is deliberately no .fillna("") here: a datetime
# column cannot hold "", so that call either does nothing or turns the column into object dtype,
# which makes the .dt accessor on the next line fail.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce')
# round trip through a date-only string drops the time-of-day and the UTC marker
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to a date-only datetime (displays as YYYY-MM-DD).
# Blank / unparseable entries become NaT. There is deliberately no .fillna("") here: a datetime
# column cannot hold "", so that call either does nothing or turns the column into object dtype,
# which makes the .dt accessor on the next line fail.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce')
# round trip through a date-only string drops the time-of-day and the UTC marker
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
# in_ReportYearCV was filled from the raw measurement-date column and is not changed before this cell;
# keep only its calendar year. Entries with no parseable date become <NA> (nullable Int64),
# which keeps the years as whole numbers instead of floats.
outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True, errors='coerce').dt.year.astype('Int64')
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Join the four inputs into "<inV>_<inAIU>_<inPU>_<inWST>".

    Every input is stringified and stripped; inPU is additionally title-cased.
    """
    pieces = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(pieces)

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the generated native ID: "wadeId" followed by str(colRowValue)."""
    return f"wadeId{colRowValue!s}"

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID if there is one, otherwise look up the generated one.

    checkVal: current native ID; kept (stripped) when it is not blank
    valA, valB: stripped and concatenated to form the IdDict link key
    Raises KeyError when the link key is not present in the module-level IdDict.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return the generated native ID: "wadeId" followed by str(colRowValue)."""
    return f"wadeId{colRowValue!s}"

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID if there is one, otherwise look up the generated one.

    checkVal: current native ID; kept (stripped) when it is not blank
    valA..valD: stripped and concatenated, in order, to form the IdDict link key
    Raises KeyError when the link key is not present in the module-level IdDict.
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For Colorado, we don't want water rights that are considered: Conditional

In [None]:
# Remove water rights whose AllocationLegalStatusCV is in the drop list below.

# legal status values to exclude
dropLegalStatusList = ["Conditional"] # enter string entries here

# keep only the rows whose status is NOT in the drop list, then renumber the index
isDroppedStatus = outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf[~isDroppedStatus].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# # PoU Shapefile Data
# shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # ziped folder of the shp file

# dfPoUshapetemp = gpd.read_file(shapefileInput)
# dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [None]:
# # create temp dataframe to hold native ID and geometry from shapefile input
# columnsList = ['in_SiteNativeID', 'geometry']
# dfPoUshape = pd.DataFrame(columns=columnsList)

# # assing values to temp dataframe based on shapefile input
# # for in_SiteNativeID assure ID value is the same as that listed above for POU info.
# dfPoUshape['in_SiteNativeID'] = "POU" + ""
# dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# print(len(dfPoUshape))
# dfPoUshape.head()

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=dict(method='zip', archive_name='Pwr_wu_Main.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.