# Pre-processing California State Water Resource Control Board Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
def _fixed_point_float(value):
    """Render a number with five decimal places instead of scientific notation."""
    return '%.5f' % value


# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', 999)
# Print floats in fixed-point form.
pd.set_option('display.float_format', _fixed_point_float)

In [None]:
# ---- working directory ----
# Set the working directory folder string here; the relative RawInputData paths used later
# resolve against it once os.chdir has run.
workingDirString = "G:/Shared drives/WaDE Data/California/WaterAllocation_WaterUse_CSWRCB"
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

## Data Input

In [None]:
# Input File #1 - ewrims_flat_file_pod
fileInput = "RawInputData/water_right/ewrims_flat_file_pod.zip"
dfin1 = pd.read_csv(fileInput, encoding="ISO-8859-1")
dfin1 = dfin1.replace(np.nan, "")  # blank out missing values

# WaDE UUID tracker for data assessment: tag each row with "in1" + its row index.
# NOTE(review): the tagged copy is saved directly under RawInputData/, not in the water_right/
# subfolder that fileInput points at, so a re-run reading fileInput will not see the saved
# WaDEUUID column - confirm this is intended.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    zipOptions = {'method': 'zip', 'archive_name': 'ewrims_flat_file_pod.csv'}
    dfin1.to_csv("RawInputData/ewrims_flat_file_pod.zip", compression=zipOptions, index=False)

print(len(dfin1))
dfin1.head()

In [None]:
# Input File #2 - water-rights-water-use-reported-short
fileInput = "RawInputData/water_use/water-rights-water-use-reported-short.zip"
dfin2 = pd.read_csv(fileInput, encoding="ISO-8859-1")
dfin2 = dfin2.replace(np.nan, "")  # blank out missing values

# WaDE UUID tracker for data assessment: tag each row with "in2" + its row index.
# NOTE(review): the tagged copy is saved directly under RawInputData/, not in the water_use/
# subfolder that fileInput points at, so a re-run reading fileInput will not see the saved
# WaDEUUID column - confirm this is intended.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    zipOptions = {'method': 'zip', 'archive_name': 'water-rights-water-use-reported-short.csv'}
    dfin2.to_csv("RawInputData/water-rights-water-use-reported-short.zip", compression=zipOptions, index=False)

print(len(dfin2))
dfin2.head()

In [None]:
#POD_TYPE fix. 
# we only want the first listed POD_TYPE value

def firstPOD_TYPEFunc(val):
    """Return only the first entry of a comma-separated POD_TYPE value.

    Args:
        val: Raw POD_TYPE cell value; converted to text and stripped first.

    Returns:
        The first non-blank comma-separated piece with surrounding whitespace
        removed, or "" when the value holds no text at all. A value without
        a comma is returned unchanged apart from the stripping.
    """
    val = str(val).strip()
    # The previous version split on "," and then joined every piece back together,
    # so all listed types were kept (minus the commas) instead of just the first.
    for piece in val.split(","):
        piece = piece.strip()
        if piece:
            return piece
    return ""

# Run every POD_TYPE cell through firstPOD_TYPEFunc, then list the distinct results for review.
dfin1['POD_TYPE'] = dfin1.apply(
    lambda rec: firstPOD_TYPEFunc(rec['POD_TYPE']),
    axis=1,
)
dfin1['POD_TYPE'].unique()

In [None]:
# Clean data a little
dfin1 = dfin1.fillna("")  # remove nan values
# Make sure the diversion rate is numeric: anything that cannot be parsed becomes 0.
diversionRate = pd.to_numeric(dfin1['USE_DIRECT_DIVERSION_RATE'], errors='coerce')
dfin1['USE_DIRECT_DIVERSION_RATE'] = diversionRate.fillna(0)

In [None]:
# Create VariableSpecificCv value
def createVariableSpecificUUID(unit):
    """Pick the VariableSpecificUUID that matches a diversion-rate unit.

    Args:
        unit: Value of USE_DIRECT_DIV_RATE_UNITS for one row.

    Returns:
        "CSWRCBwr_V2" for the volume units 'Acre-feet' and 'Gallons';
        "CSWRCBwr_V1" for every other value (the flow-rate units as well as
        blank or unrecognised units).
    """
    # The original used independent `if` statements, so its trailing `else` was paired
    # only with the 'Gallons' test and reset 'Acre-feet' rows from V2 back to V1.
    volumeUnits = ('Acre-feet', 'Gallons')
    if unit in volumeUnits:
        return "CSWRCBwr_V2"
    return "CSWRCBwr_V1"

# Derive the variable UUID of each row from its diversion-rate unit, then list the distinct values.
dfin1['in_VariableSpecificUUID'] = dfin1.apply(
    lambda rec: createVariableSpecificUUID(rec['USE_DIRECT_DIV_RATE_UNITS']),
    axis=1,
)
dfin1['in_VariableSpecificUUID'].unique()

In [None]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    """Convert a flow value to cubic feet per second (CFS).

    Args:
        val: Numeric amount expressed in `unit`.
        unit: Unit label of `val`.

    Returns:
        `val` unchanged for "Cubic Feet per Second", `val` divided by the
        matching factor for the other recognised flow units, and None for any
        other unit.
    """
    if unit == "Cubic Feet per Second":
        return val

    # (unit label, how many of that unit equal 1 CFS)
    unitsPerCfs = (
        ("Gallons per Day", 646316.883),
        ("Acre-feet per Year", 723.968),
        ("Gallons per Minute", 448.83117),
    )
    for label, divisor in unitsPerCfs:
        if unit == label:
            return val / divisor
    return None

# Flow in CFS for rows whose unit is a flow-rate unit (None otherwise), then list the distinct values.
dfin1['in_AllocationFlow_CFS'] = dfin1.apply(
    lambda rec: convertFlowFunc(rec['USE_DIRECT_DIVERSION_RATE'], rec['USE_DIRECT_DIV_RATE_UNITS']),
    axis=1,
)
dfin1['in_AllocationFlow_CFS'].unique()

In [None]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    """Convert a volume value to acre-feet (AF).

    Args:
        val: Numeric amount expressed in `unit`.
        unit: Unit label of `val`.

    Returns:
        `val` unchanged for 'Acre-feet', `val / 325850.943` for 'Gallons',
        and None for any other unit.
    """
    if unit == 'Acre-feet':
        return val
    if unit == 'Gallons':
        return val / 325850.943
    return None

# Volume in AF for rows whose unit is a volume unit (None otherwise), then list the distinct values.
dfin1['in_AllocationVolume_AF'] = dfin1.apply(
    lambda rec: convertVolumeFunc(rec['USE_DIRECT_DIVERSION_RATE'], rec['USE_DIRECT_DIV_RATE_UNITS']),
    axis=1,
)
dfin1['in_AllocationVolume_AF'].unique()

In [None]:
# remove special characters from SUB_TYPE
def cleanupSubTypeFunc(val):
    """Tidy one SUB_TYPE value.

    Surrounding whitespace is stripped, then trailing underscores, then
    trailing commas. Blank text and the literal "nan" produce ""; otherwise
    every remaining comma is replaced with a space.
    """
    text = str(val).strip().rstrip('_').rstrip(',')
    # str() turns a missing value into the text "nan", so treat it like a blank.
    if text in ("", "nan"):
        return ""
    return text.replace(",", " ")

# Store the tidied SUB_TYPE in its own column, then list the distinct values.
dfin1['wade_SUB_TYPE'] = dfin1.apply(
    lambda rec: cleanupSubTypeFunc(rec['SUB_TYPE']),
    axis=1,
)
dfin1['wade_SUB_TYPE'].unique()

In [None]:
# Left merge sites to water use: every row of dfin1 is kept and the dfin2 rows whose APPL_ID
# equals its APPLICATION_NUMBER are attached to it.
dfin1 = pd.merge(dfin1, dfin2, how='left', left_on='APPLICATION_NUMBER', right_on='APPL_ID')
print(len(dfin1))
dfin1.head()

In [None]:
# # convert units to WaDE appropriate values (CFS or AF)
# def convertAmountToUnitFunc(val, unit):
#     outValue = None
#     if unit == "Cubic Feet per Second":
#         outValue = val
#     if unit == "Gallons per Day":
#         outValue = val / (646316.883)
#     if unit == "Acre-feet per Year":
#         outValue = val / (723.968)
#     if unit == "Gallons per Minute":
#         outValue = val / (448.83117)
#     if unit == 'Acre-feet':
#         outValue = val
#     if unit == 'Gallons':
#         outValue = val / (325850.943)
#     return(outValue)

# dfin1['in_Amount'] = dfin1.apply(lambda row: convertAmountToUnitFunc(row['USE_DIRECT_DIVERSION_RATE'], row['USE_DIRECT_DIV_RATE_UNITS']), axis=1)
# dfin1['in_Amount'].unique()

In [None]:
# create output POD dataframe
# Each in_* column below is a constant, a blank placeholder, or a value copied / reformatted
# from the merged dfin1 frame. The result is de-duplicated into outdf1 at the end of the cell.
df = pd.DataFrame()

# Data Assessment UUID
# (the "_x" suffix is what the earlier merge gives the left-hand frame's WaDEUUID column)
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "CSWRCBwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = dfin1['in_VariableSpecificUUID'] # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "CSWRCBwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfin1['SOURCE_NAME'].str.title()
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = dfin1['SOURCE_TYPE'].str.title()

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfin1['LOCATION_METHOD']
df['in_County'] = dfin1['COUNTY'].str.title()
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = "" # already assigned under WaterSource Info above; repeated with the same blank value
df['in_GNISCodeCV'] = ""
# blank / missing HUC codes become 0 before the int -> str conversion, so they end up as the text "0"
df['in_HUC12'] = dfin1['HUC_12_NUMBER'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_HUC8'] = dfin1['HUC_8_NUMBER'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_Latitude'] = dfin1['LATITUDE']
df['in_Longitude'] = dfin1['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfin1['POD_NAME'].str.title()
# NOTE(review): the leading characters in this column name look like a UTF-8 byte-order mark
# that was decoded as ISO-8859-1 when the file was read - confirm against the raw header.
df['in_SiteNativeID'] = dfin1['ï»¿POD_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1['POD_TYPE'].astype(str).str.title()
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = dfin1['APPLICATION_RECD_DATE']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['in_AllocationFlow_CFS'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['WATER_RIGHT_STATUS'].str.title()
df['in_AllocationNativeID'] =  dfin1['APPLICATION_NUMBER'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin1['PRIMARY_OWNER_NAME']
df['in_AllocationPriorityDate'] = dfin1['PRIORITY_DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfin1['DIRECT_DIV_SEASON_END']
df['in_AllocationTimeframeStart'] = dfin1['DIRECT_DIV_SEASON_START']
# allocation type = water right type + cleaned sub type, stripped in case the sub type is blank
df['in_AllocationTypeCV'] = dfin1['WATER_RIGHT_TYPE'].astype(str) + " " + dfin1['wade_SUB_TYPE'].astype(str)
df['in_AllocationTypeCV'] = df['in_AllocationTypeCV'].astype(str).str.strip()
df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['USE_CODE'].str.title()
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # we want this data to be exempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = "" # temp fix, leave blank for now
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = "" # temp fix, leave blank for now
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# public eWRIMS summary URL, built by appending the row's WR_WATER_RIGHT_ID
df['in_WaterAllocationNativeURL'] = "https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/EWServlet?Redirect_Page=EWPublicAppSummary.jsp&Purpose=getEwrimsPublicSummary&wrWaterRightID=" + dfin1['WR_WATER_RIGHT_ID'].replace("", 0).fillna(0).astype(int).astype(str)


# Site VariableAmounts Info
df['in_Amount'] = dfin1['AMOUNT']
df['in_AssociatedNativeAllocationIDs'] = dfin1['APPLICATION_NUMBER']
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = "" # see below
df['in_ReportYearCV'] = dfin1['YEAR'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SDWISIdentifier'] = ""
# start and end of the timeframe are both taken from the same MONTH FORMATTED column
df['in_TimeframeEnd'] = dfin1['MONTH FORMATTED']
df['in_TimeframeStart'] = dfin1['MONTH FORMATTED']
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

# keep one copy of each distinct row and renumber the index from 0
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

## Concatenate POD and POU Data.  Make needed changes

In [None]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
# Drop exact duplicate rows, renumber the index from 0, then blank out any NaN values.
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    """Map a raw water source type to the WaDE water source type.

    Returns "Unspecified" for blank input, "Groundwater" when the stripped
    text is exactly "Subsurfacer", and "Surface Water" for anything else.
    """
    sourceType = str(inWST).strip()
    if not sourceType:
        return "Unspecified"
    # NOTE(review): "Subsurfacer" may be a misspelling of "Subsurface"; as written, a value of
    # "Subsurface" is classed as "Surface Water" - confirm against the SOURCE_TYPE values.
    return "Groundwater" if sourceType == "Subsurfacer" else "Surface Water"

# Replace the raw source type with its WaDE category, then list the distinct values.
outdf['in_WaterSourceTypeCV'] = outdf.apply(
    lambda rec: createWaterSourceTypeCV(rec['in_WaterSourceTypeCV']),
    axis=1,
)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Fill empty ben use values

def fillEmptyBenUseFunc(val):
    """Return "Unspecified" for a blank use value, otherwise the stripped text.

    Blank means empty / whitespace-only text or the literal "nan" that str()
    produces for a missing value.
    """
    text = str(val).strip()
    return "Unspecified" if text in ("", "nan") else text
    
# Fill blanks with "Unspecified" in both use-category columns, then list the beneficial-use values.
for useColumn in ('in_BeneficialUseCategory', 'in_PrimaryUseCategory'):
    outdf[useColumn] = outdf.apply(lambda rec: fillEmptyBenUseFunc(rec[useColumn]), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
def formatDateString(inString1):
    """Reformat a date-like value as an mm/dd/YYYY string.

    Args:
        inString1: Raw date value; converted to text and stripped first.

    Returns:
        The date formatted with '%m/%d/%Y', or "" when the input is blank,
        cannot be parsed, or parses to a missing date (NaT).
    """
    inString = str(inString1).strip()
    if inString == "":
        return ""
    try:
        valD = pd.to_datetime(inString)
        # Text such as "nan" parses to NaT, which has no calendar date to format.
        if pd.isnull(valD):
            return ""
        return valD.date().strftime('%m/%d/%Y')
    except Exception:
        # Best effort: any value that cannot be parsed or formatted becomes blank.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt / SystemExit through.
        return ""

# Rewrite each priority date through formatDateString, then list the distinct values.
outdf['in_AllocationPriorityDate'] = outdf.apply(
    lambda rec: formatDateString(rec['in_AllocationPriorityDate']),
    axis=1,
)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Strip punctuation from a name and normalise its casing and spacing.

    Removes every occurrence of $ @ & . ; / ) ( and -, title-cases the result,
    replaces double spaces with single spaces (one pass), and strips the ends.

    Args:
        Val: Raw name value; converted to text first.

    Returns:
        The cleaned name.
    """
    Val = str(Val)
    # Raw string: "\)" is an invalid escape sequence in a normal string literal and
    # raises a SyntaxWarning on recent Python versions. The pattern itself is unchanged.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
# Clean special characters out of the water source names, then list the distinct values.
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
# Same clean-up for the county names.
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
# Same clean-up for the site names.
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
# Same clean-up for the allocation owner names.
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return "" for a blank value, otherwise the stripped text.

    Blank means empty / whitespace-only text or the literal "nan" that str()
    produces for a missing value.
    """
    text = str(val).strip()
    if text in ("", "nan"):
        return ""
    return text

In [None]:
# Run each descriptive text column through ensureEmptyString (blank / "nan" text becomes ""),
# listing the distinct values after every column for review.
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
# Join every row's value with commas, split the result on commas, and keep the distinct
# stripped pieces as a sorted list for review.
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

In [None]:
# Ensure Latitude entry is numeric, replace '0' values for removal
# (values that cannot be parsed, and zeros, end up as blank strings)
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# (numeric where parseable; zeros and unparseable values become blank strings)
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# (numeric where parseable; zeros and unparseable values become blank strings)
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

In [None]:
# Coerce PopulationServed to whole numbers; the trailing replace(0,"") turns 0 / missing
# entries into blank strings.
# NOTE(review): this cell was previously described as keeping 0 entries with no blank strings,
# which is the opposite of what the line produces - confirm which is wanted.
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure:
# parse to datetime, render as mm/dd/YYYY text, then parse that text back to datetime.
priorityDates = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDates.dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert TimeframeEnd to a date-only datetime column.
# Values that cannot be parsed are coerced to NaT and left that way: filling a datetime column
# with "" is at best a no-op and at worst upcasts it to object dtype, which breaks the .dt
# accessor used on the next line.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce')
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to a date-only datetime column.
# Values that cannot be parsed are coerced to NaT and left that way: filling a datetime column
# with "" is at best a no-op and at worst upcasts it to object dtype, which breaks the .dt
# accessor used on the next line.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce')
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

In [None]:
# review the report-year values (in_ReportYearCV was filled from the YEAR column when the output dataframe was built)
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category

import sys
# Folder that holds the custom AssignPrimaryUseCategoryFile module (machine-specific path).
customFunctionDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory"
sys.path.append(customFunctionDir)
import AssignPrimaryUseCategoryFile # Use Custom import file

# Look up the primary use category from each row's beneficial use text, then list the distinct values.
outdf['in_PrimaryUseCategory'] = outdf.apply(
    lambda rec: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(rec['in_BeneficialUseCategory']),
    axis=1,
)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Build the underscore-joined VariableSpecificCV label.

    Each argument is converted to text and stripped; the third one (inPU) is
    additionally title-cased. The four pieces are joined with "_" in the
    order inV, inAIU, inPU, inWST.
    """
    pieces = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(pieces)

# Build the label of each row from its variable, aggregation interval, primary use and source type.
outdf['in_VariableSpecificCV'] = outdf.apply(
    lambda rec: createVariableSpecificCV(
        rec['in_VariableCV'],
        rec['in_AggregationIntervalUnitCV'],
        rec['in_PrimaryUseCategory'],
        rec['in_WaterSourceTypeCV'],
    ),
    axis=1,
)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return "wadeId" followed by the text form of `colRowValue`."""
    return "wadeId" + str(colRowValue)

# One row per distinct (water source name, water source type) pair found in outdf.
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct pairs 1..N on the same index as dfTempID so the generated IDs line up row for row.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Lookup key = name + type joined with no separator; IdDict maps that key to the generated ID.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID when there is one, otherwise look one up in IdDict.

    Args:
        checkVal: Current ID value; used as-is (stripped) when it is not blank.
        valA, valB: Values whose stripped text forms are concatenated into the
            IdDict key when `checkVal` is blank.

    Raises:
        KeyError: If `checkVal` is blank and the key is not present in IdDict.
    """
    existing = str(checkVal).strip()
    if existing:
        return existing
    return IdDict[str(valA).strip() + str(valB).strip()]

# Fill in_WaterSourceNativeID from the name + type lookup wherever it is blank, then list the values.
outdf['in_WaterSourceNativeID'] = outdf.apply(
    lambda rec: retrieveIdValueFunc(
        rec['in_WaterSourceNativeID'],
        rec['in_WaterSourceName'],
        rec['in_WaterSourceTypeCV'],
    ),
    axis=1,
)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return "wadeId" followed by the text form of `colRowValue`."""
    return "wadeId" + str(colRowValue)

# One row per distinct (latitude, longitude, site name, site type) combination found in outdf.
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct combinations 1..N on the same index as dfTempID so the generated IDs line up row for row.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Lookup key = the four values joined with no separator; IdDict maps that key to the generated ID.
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID when there is one, otherwise look one up in IdDict.

    Args:
        checkVal: Current ID value; used as-is (stripped) when it is not blank.
        valA, valB, valC, valD: Values whose stripped text forms are
            concatenated, in that order, into the IdDict key when `checkVal`
            is blank.

    Raises:
        KeyError: If `checkVal` is blank and the key is not present in IdDict.
    """
    existing = str(checkVal).strip()
    if existing:
        return existing
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

# Fill in_SiteNativeID from the lat + lon + name + type lookup wherever it is blank, then list the values.
outdf['in_SiteNativeID'] = outdf.apply(
    lambda rec: retrieveIdValueFunc(
        rec['in_SiteNativeID'],
        rec['in_Latitude'],
        rec['in_Longitude'],
        rec['in_SiteName'],
        rec['in_SiteTypeCV'],
    ),
    axis=1,
)
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For CA, we don't want water rights that are considered: "Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

print('length of df before removing non-active rights: ', len(outdf))

# drop list
dropLegalStatusList = ["Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"]

# keep only the rows whose legal status is not in the drop list
isNonActive = outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf[~isNonActive].reset_index(drop=True)

print('length of df after removing non-active rights: ', len(outdf))
# print the remaining statuses as a quoted, comma-terminated list
for status in outdf['in_AllocationLegalStatusCV'].sort_values().unique():
    print('"' + status + '",')

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# N/A

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
# The output, save as a zip (one CSV named Pwr_wu_Main.csv inside the archive)
exportCompression = {'method': 'zip', 'archive_name': 'Pwr_wu_Main.csv'}
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=exportCompression, index=False)