# Pre-processing Arizona (ADWR) Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Notebook display settings: show every column of a DataFrame, and print floats
# with five fixed decimals instead of scientific notation.
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# ---- working directory ----
# The relative "RawInputData/..." paths used in later cells resolve against this folder.
workingDirString = "G:/Shared drives/WaDE Data/Arizona/WaterAllocation"  # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

## Input Data

In [None]:
# Input File - all SW QUERY BY SURFACE WATERSHEDS csv files
# The glob matches *.zip and every match is handed to pd.read_csv, so the inputs
# are presumably single-member zipped csv exports - TODO confirm.

# use glob to get all the zipped csv files in the folder
import glob
path = "RawInputData/SW QUERY BY SURFACE WATERSHEDS/"
# sorted so the row order (and therefore the WaDEUUID numbering) is the same on every run
csv_files = sorted(glob.glob(os.path.join(path, "*.zip")))

# read every file first and combine once, instead of re-copying the growing
# frame on each loop pass
frames = [pd.read_csv(f).replace(np.nan, "") for f in csv_files]

# ignore_index gives one continuous row index across all files.  Each file's own
# index restarts at 0, so without it the WaDEUUID built below would repeat.
dfin1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

dfin1['WaDEUUID'] = "azSW" + dfin1.index.astype(str)

print(len(dfin1))
dfin1.head(1)

In [None]:
# Input File - Filing_POD shp file, for sw
inputFile = "RawInputData/Filing_POD.zip"
df_FPOD = gpd.read_file(inputFile)
df_FPOD = df_FPOD.replace(np.nan, "")  # blanks instead of NaN

# Realign Geometry Projection to EPSG:4326, then store each feature's centroid
# as latitude (y) / longitude (x), rounded to 5 decimals.
df_FPOD['geometry'] = df_FPOD['geometry'].to_crs(epsg=4326)
podCentroid = df_FPOD.centroid
df_FPOD["wade_lattitude"] = podCentroid.y.round(5)
df_FPOD["wade_longitude"] = podCentroid.x.round(5)
print(len(df_FPOD))
df_FPOD.head(1)

In [None]:
# Input File - Filing_POU shp file, for sw
inputFile = "RawInputData/Filing_POU.zip"
df_FPOU = gpd.read_file(inputFile)
df_FPOU = df_FPOU.replace(np.nan, "")  # blanks instead of NaN

# keep only the records whose X_UTMNAD83 value is not 0
hasXCoord = df_FPOU['X_UTMNAD83'] != 0.00000
df_FPOU = df_FPOU[hasXCoord].reset_index(drop=True)

# Realign Geometry Projection to EPSG:4326, then store each feature's centroid
# as latitude (y) / longitude (x), rounded to 5 decimals.
df_FPOU['geometry'] = df_FPOU['geometry'].to_crs(epsg=4326)
pouCentroid = df_FPOU.centroid
df_FPOU["wade_lattitude"] = pouCentroid.y.round(5)
df_FPOU["wade_longitude"] = pouCentroid.x.round(5)
print(len(df_FPOU))
df_FPOU.head(1)

In [None]:
# Stack the POD and POU filing layers into a single lookup dataframe,
# drop exact duplicate rows, renumber, and blank out any NaN.
df_fill = pd.concat([df_FPOD, df_FPOU]).drop_duplicates()
df_fill = df_fill.reset_index(drop=True)
df_fill = df_fill.replace(np.nan, "")
print(len(df_fill))
df_fill.head(1)

In [None]:
# Input File - Well_Registry, for gw
inputFile = "RawInputData/Well_Registry.zip"
dfin2 = gpd.read_file(inputFile)
dfin2 = dfin2.replace(np.nan, "")  # blanks instead of NaN

# Realign Geometry Projection to EPSG:4326, then store each feature's centroid
# as latitude (y) / longitude (x), rounded to 5 decimals.
dfin2['geometry'] = dfin2['geometry'].to_crs(epsg=4326)
wellCentroid = dfin2.centroid
dfin2["wade_lattitude"] = wellCentroid.y.round(5)
dfin2["wade_longitude"] = wellCentroid.x.round(5)

# record identifier: "azGW" followed by the row index
dfin2['WaDEUUID'] = "azGW" + dfin2.index.astype(str)

print(len(dfin2))
dfin2.head(1)

## Surface Water Data (POD & POU)

In [None]:
# Attach the filing attributes (cadastral, lat / long, POD-or-POU flag) to the
# Surface Water Query by Watershed records.  Left join: unmatched records are kept.
# NOTE(review): this join uses the raw 'REG. NO' text; the cell that reformats
# 'REG. NO' to the FILENO layout comes after this one - confirm the intended order.
fillCols = ['FILENO', 'CADASTRAL', 'wade_lattitude', 'wade_longitude', 'POU_POD']
dfin1 = dfin1.merge(df_fill[fillCols], how='left', left_on='REG. NO', right_on='FILENO')
print(len(dfin1))
dfin1.head(1)

In [None]:
# fixing 'REG. NO' format to match 'FILENO' in the FILINGS shp file.

def fixREGNO(val):
    """Zero-pad a 'REG. NO' value of the form ``<program>-<number>.<suffix>``.

    Two independent fixes are applied:

    1. If ``<number>`` (the text between the first '-' and the next '.') is
       2 to 5 characters long, zeros are inserted after every '-' so that the
       number becomes 6 characters long.
    2. If ``<suffix>`` (the text after the first '.') is 1 or 2 characters
       long, every '.' is replaced with '.00<suffix>' or '.0<suffix>'.

    A value without a '-' or without a '.' (for example a blank string) skips
    the corresponding fix instead of raising ``IndexError``.  Non-string
    values are returned untouched.

    Parameters
    ----------
    val : str
        Raw registration number.

    Returns
    -------
    str
        The padded registration number, or ``val`` unchanged when no fix applies.
    """
    if not isinstance(val, str):
        return val

    ### first fix
    # pad the number part; lengths outside 2-5 are left as they are
    _, dash, afterDash = val.strip().partition('-')
    if dash:
        numberPart = afterDash.split('.', 1)[0]
        zeros = {2: "0000", 3: "000", 4: "00", 5: "0"}.get(len(numberPart))
        if zeros is not None:
            val = val.replace("-", "-" + zeros)

    ### second fix
    # NOTE(review): the inserted text already contains the suffix and the
    # original suffix stays in place, so "x.1" becomes "x.0011" - confirm this
    # is the layout FILENO actually uses.
    _, dot, suffix = val.strip().partition('.')
    if dot:
        if len(suffix) == 1:
            val = val.replace(".", ".00" + suffix)
        elif len(suffix) == 2:
            val = val.replace(".", ".0" + suffix)

    return val

# Reformat every registration number.  Series.map calls fixREGNO once per value
# without building a Series for each row, and also works on an empty frame.
dfin1['REG. NO'] = dfin1['REG. NO'].map(fixREGNO)

# print the distinct results in sorted order for a visual check
exList = sorted(dfin1['REG. NO'].unique().tolist())
for x in exList:
    print(x)

In [None]:
# Split 'QUANTITY' into 'Amount' and 'Unit' on the double space between them.
# n=1 splits on the first double space only, so the result never has more than
# two columns even when a unit description itself contains a double space.
dfin1[['Amount', 'Unit']] = dfin1['QUANTITY'].str.split("  ", n=1, expand=True)
# make sure this is numeric; anything that cannot be parsed becomes 0
dfin1['Amount'] = pd.to_numeric(dfin1['Amount'], errors='coerce').fillna(0).astype(float)
dfin1.head(1)

In [None]:
# temp fix - remove records with these 'Unit' values
# AZ is not providing us with metadata for these.
dropList = [
    'ACRES',
    'Amount Required for Maintenance',
    'Feet',
    'MIT - Miners Inches Total',
    'Miners Inches Per Annum',
    'XX - Unknown Code at Load time',
    'None',
    '',
    " ",
]

# keep only the rows whose unit is not in the list above
unitIsDropped = dfin1['Unit'].isin(dropList)
dfin1 = dfin1[~unitIsDropped]
print(len(dfin1))
dfin1.head(1)

In [None]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    """Convert a flow amount to cubic feet per second (CFS).

    Parameters
    ----------
    val : float
        Amount expressed in ``unit``.
    unit : str
        Unit label.  Recognised labels are "Cubic Feet Per Second",
        "Acre-Feet Per Annum" and "Gallons Per Annum".

    Returns
    -------
    float
        The amount in CFS, or 0.0 when ``unit`` is not a recognised flow unit.
    """
    # Each unit returns directly, so the 0.0 fallback is reached only when none
    # of the flow units matched.
    if unit == "Cubic Feet Per Second":
        return val
    if unit == "Acre-Feet Per Annum":
        return val / 723.968
    if unit == "Gallons Per Annum":
        return val / 235905662.34
    return 0.0

# Row-wise conversion: each record's Amount is converted according to its own Unit.
dfin1['CFS_Value'] = dfin1[['Amount', 'Unit']].apply(
    lambda pair: convertFlowFunc(pair['Amount'], pair['Unit']), axis=1)
dfin1['CFS_Value'].unique()

In [None]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    """Convert a volume amount to acre-feet (AF).

    Parameters
    ----------
    val : float
        Amount expressed in ``unit``.
    unit : str
        Unit label.  Recognised labels are 'Acre-Feet', 'Acre-Feet Total',
        "CFT - Cubic Feet Total" and 'Gallons'.

    Returns
    -------
    float
        The amount in AF, or 0.0 when ``unit`` is not a recognised volume unit.
    """
    # Each unit returns directly, so the 0.0 fallback is reached only when none
    # of the volume units matched.
    if unit in ('Acre-Feet', 'Acre-Feet Total'):
        return val
    if unit == "CFT - Cubic Feet Total":
        return val / 43559.9
    if unit == 'Gallons':
        return val / 325850.943
    return 0.0

# Row-wise conversion: each record's Amount is converted according to its own Unit.
dfin1['AF_Value'] = dfin1[['Amount', 'Unit']].apply(
    lambda pair: convertVolumeFunc(pair['Amount'], pair['Unit']), axis=1)
dfin1['AF_Value'].unique()

In [None]:
# Preview the first surface water record.
dfin1.head(1)

In [None]:
# create output surface water dataframe
# One entry per WaDE input column, in output order.  Series values come from
# dfin1 (aligned on its index); scalar values are repeated on every row.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfin1['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "AZwr_M2",  # for surface water

    # Variable Info
    'in_VariableSpecificUUID': "AZwr_V1",

    # Organization Info
    'in_OrganizationUUID': "AZwr_O1",

    # WaterSource Info
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "",
    'in_WaterSourceNativeID': "",  # create customID for temp solution
    'in_WaterSourceTypeCV': "Surface Water",

    # Site Info
    'in_CoordinateAccuracy': "",
    'in_CoordinateMethodCV': "WaDE Blank",
    'in_County': dfin1['COUNTY'],
    'in_EPSGCodeCV': 4326,
    'in_GNISCodeCV': "",
    'in_HUC12': "",
    'in_HUC8': "",
    'in_Latitude': dfin1['wade_lattitude'],
    'in_Longitude': dfin1['wade_longitude'],
    'in_NHDNetworkStatusCV': "",
    'in_NHDProductCV': "",
    'in_PODorPOUSite': dfin1['POU_POD'],
    'in_SiteName': "",
    # POD / POU flag followed by the cadastral text (0 when the cadastral is blank)
    'in_SiteNativeID': dfin1['POU_POD'].str.strip() + dfin1['CADASTRAL'].replace("", 0).fillna(0).astype(str),
    'in_SitePoint': "",
    'in_SiteTypeCV': "",
    'in_StateCV': "AZ",
    'in_USGSSiteID': "",

    # AllocationAmount Info
    'in_AllocationApplicationDate': "",
    'in_AllocationAssociatedConsumptiveUseSiteIDs': "",
    'in_AllocationAssociatedWithdrawalSiteIDs': "",
    'in_AllocationBasisCV': "",
    'in_AllocationChangeApplicationIndicator': "",
    'in_AllocationCommunityWaterSupplySystem': "",
    'in_AllocationCropDutyAmount': "",
    'in_AllocationExpirationDate': "",
    'in_AllocationFlow_CFS': dfin1['CFS_Value'].astype(float),  # see above for conversion
    'in_AllocationLegalStatusCV': dfin1['STATUS'],
    'in_AllocationNativeID': dfin1['REG. NO'].replace("", 0).fillna(0).astype(str),
    'in_AllocationOwner': dfin1['NAME'],
    'in_AllocationPriorityDate': dfin1['PRIOR DATE'],
    'in_AllocationSDWISIdentifierCV': "",
    'in_AllocationTimeframeEnd': "12/31",
    'in_AllocationTimeframeStart': "01/01",
    'in_AllocationTypeCV': "",
    'in_AllocationVolume_AF': dfin1['AF_Value'].astype(float),  # see above for conversion
    'in_BeneficialUseCategory': dfin1['WATER USE'].str.title(),
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_ExemptOfVolumeFlowPriority': 0,  # we want these sw records to be as normal
    'in_GeneratedPowerCapacityMW': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_LegacyAllocationIDs': "",
    'in_OwnerClassificationCV': "",
    'in_PopulationServed': "",
    'in_PowerType': "",
    'in_PrimaryBeneficialUseCategory': "",
    'in_SDWISIdentifierCV': "",
    'in_WaterAllocationNativeURL': "",
}, index=dfin1.index)

# de-duplicated, renumbered surface water output
dfswOut = df.drop_duplicates().reset_index(drop=True)
print(len(dfswOut))
dfswOut.head()

## Groundwater Data (POD)

In [None]:
# create WaDE Registration Number as "<PROGRAM>-<REGISTRY_I>"
programText = dfin2['PROGRAM'].astype(str)
registryText = dfin2['REGISTRY_I'].astype(str)
dfin2['wade_RegistrationN'] = programText + "-" + registryText

# print the distinct registration numbers in sorted order for a visual check
exList = sorted(dfin2['wade_RegistrationN'].unique().tolist())
for x in exList:
    print(x)

In [None]:
# AZwr Groundwater PUMPRATE is in GPM, need to convert to CFS
# 1 CFS = 448.8 GPM

def ConvertGPMToCFSFunc(Val):
    """Convert a pump rate from gallons per minute (GPM) to cubic feet per second (CFS).

    Parameters
    ----------
    Val : number or str
        Pump rate in GPM.  Numeric text is accepted.

    Returns
    -------
    float
        ``Val / 448.8``.  NaN when ``Val`` is blank, None, or otherwise not
        numeric; the well registry is loaded with NaN replaced by "", so blank
        pump rates arrive here as empty strings.
    """
    try:
        return Val / 448.8
    except TypeError:
        pass  # not a number; try to parse it as numeric text below
    try:
        return float(Val) / 448.8
    except (TypeError, ValueError):
        # NaN, not "": the column is cast with astype(float) further down
        return float("nan")

# Convert every pump rate.  Series.map calls the function once per value
# without building a Series for each row, and also works on an empty frame.
dfin2['PUMPRATE'] = dfin2['PUMPRATE'].map(ConvertGPMToCFSFunc)
dfin2['PUMPRATE'].unique()

In [None]:
# Preview the first groundwater (well registry) record.
dfin2.head(1)

In [None]:
# create output groundwater (POD) dataframe
# One entry per WaDE input column, in output order.  Series values come from
# dfin2 (aligned on its index); scalar values are repeated on every row.
df = pd.DataFrame({
    # Data Assessment UUID
    'WaDEUUID': dfin2['WaDEUUID'],

    # Method Info
    'in_MethodUUID': "AZwr_M1",  # for groundwater

    # Variable Info
    'in_VariableSpecificUUID': "AZwr_V1",

    # Organization Info
    'in_OrganizationUUID': "AZwr_O1",

    # WaterSource Info
    'in_Geometry': "",
    'in_GNISFeatureNameCV': "",
    'in_WaterQualityIndicatorCV': "",
    'in_WaterSourceName': "",
    'in_WaterSourceNativeID': "",  # create customID for temp solution
    'in_WaterSourceTypeCV': "Groundwater",

    # Site Info
    'in_CoordinateAccuracy': "",
    'in_CoordinateMethodCV': "WaDE Blank",
    'in_County': dfin2['COUNTY'],
    'in_EPSGCodeCV': 4326,
    'in_GNISCodeCV': "",
    'in_HUC12': "",
    'in_HUC8': "",
    'in_Latitude': dfin2['wade_lattitude'],
    'in_Longitude': dfin2['wade_longitude'],
    'in_NHDNetworkStatusCV': "",
    'in_NHDProductCV': "",
    'in_PODorPOUSite': "POD",
    'in_SiteName': "",
    # "POD" followed by the cadastral text (0 when the cadastral is blank)
    'in_SiteNativeID': "POD" + dfin2['CADASTRAL'].replace("", 0).fillna(0).astype(str),
    'in_SitePoint': "",
    'in_SiteTypeCV': "Well",  # these should all be well records
    'in_StateCV': "AZ",
    'in_USGSSiteID': "",

    # AllocationAmount Info
    'in_AllocationApplicationDate': "",
    'in_AllocationAssociatedConsumptiveUseSiteIDs': "",
    'in_AllocationAssociatedWithdrawalSiteIDs': "",
    'in_AllocationBasisCV': "",
    'in_AllocationChangeApplicationIndicator': "",
    'in_AllocationCommunityWaterSupplySystem': "",
    'in_AllocationCropDutyAmount': "",
    'in_AllocationExpirationDate': "",
    'in_AllocationFlow_CFS': dfin2['PUMPRATE'].astype(float),
    'in_AllocationLegalStatusCV': "",
    'in_AllocationNativeID': dfin2['wade_RegistrationN'],  # see above for creation
    'in_AllocationOwner': dfin2['OWNER_NAME'],
    'in_AllocationPriorityDate': "",
    'in_AllocationSDWISIdentifierCV': "",
    'in_AllocationTimeframeEnd': "12/31",
    'in_AllocationTimeframeStart': "01/01",
    'in_AllocationTypeCV': "",
    'in_AllocationVolume_AF': "",
    'in_BeneficialUseCategory': dfin2['WATER_USE'],
    'in_CommunityWaterSupplySystem': "",
    'in_CropTypeCV': "",
    'in_CustomerTypeCV': "",
    'in_DataPublicationDate': "",
    'in_DataPublicationDOI': "",
    'in_ExemptOfVolumeFlowPriority': 1,  # all these gw records should be considered exempt for us.
    'in_GeneratedPowerCapacityMW': "",
    'in_IrrigatedAcreage': "",
    'in_IrrigationMethodCV': "",
    'in_LegacyAllocationIDs': "",
    'in_OwnerClassificationCV': "",
    'in_PopulationServed': "",
    'in_PowerType': "",
    'in_PrimaryBeneficialUseCategory': "",
    'in_SDWISIdentifierCV': "",
    # NOTE(review): the link text ends in "RegID=?" before the registry number -
    # confirm the second "?" is what the site expects.
    'in_WaterAllocationNativeURL': "https://app.azwater.gov/WellRegistry/Detail.aspx?RegID=?" + dfin2['REGISTRY_I'].replace("", 0).fillna(0).astype(int).astype(str),
}, index=dfin2.index)

# de-duplicated, renumbered groundwater output
dfgwOut = df.drop_duplicates().reset_index(drop=True)
print(len(dfgwOut))
dfgwOut.head()

## Concatenate Surface Water and Groundwater Data.  Make needed changes

In [None]:
# Stack the surface water and groundwater outputs, drop exact duplicate rows,
# renumber, and blank out any NaN.
frames = [dfswOut, dfgwOut]  # list all out dataframes here
outdf = pd.concat(frames).drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return ``Val`` as title-cased text with special characters removed.

    The value is converted to ``str``; the characters ``$ @ & . ; / ) ( -``
    are deleted; the text is title-cased; each double space is replaced with a
    single space (one pass); surrounding whitespace is stripped; and trailing
    commas are removed.

    Parameters
    ----------
    Val : object
        Value to clean; non-strings are converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    Val = str(Val)
    # raw string: the backslash belongs to the regex, not to Python's own
    # string escapes
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

In [None]:
# Clean the water source names.  Series.map calls the function once per value
# without building a Series for each row, and also works on an empty frame.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Clean the county names.  Series.map calls the function once per value
# without building a Series for each row, and also works on an empty frame.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Clean the site names.  Series.map calls the function once per value
# without building a Series for each row, and also works on an empty frame.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Clean the owner names.  Series.map calls the function once per value
# without building a Series for each row, and also works on an empty frame.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as stripped text, mapping missing values to "".

    Parameters
    ----------
    val : object
        Value to normalise.

    Returns
    -------
    str
        "" when ``val`` is null (None, NaN, NaT, pd.NA), blank, whitespace
        only, or the text "nan"; otherwise ``str(val)`` with surrounding
        whitespace removed.
    """
    # The null test has to run before str(): once converted, None and NaN are
    # ordinary text and pd.isnull can no longer see them.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text == "nan":
        return ""
    return text

In [None]:
# Normalise blanks in the water source name.  Series.map calls the function once
# per value without building a Series for each row, and works on an empty frame.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalise blanks in the water source type.  Series.map calls the function once
# per value without building a Series for each row, and works on an empty frame.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalise blanks in the site type.  Series.map calls the function once per
# value without building a Series for each row, and works on an empty frame.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Normalise blanks in the site name.  Series.map calls the function once per
# value without building a Series for each row, and works on an empty frame.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Normalise blanks in the owner name.  Series.map calls the function once per
# value without building a Series for each row, and works on an empty frame.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# Normalise blanks in the beneficial use.  Series.map calls the function once
# per value without building a Series for each row, and works on an empty frame.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# distinct comma-separated beneficial-use terms, sorted, for a visual check
uniqueList = sorted({i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric; non-numeric entries and 0 become ""
latNumeric = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
outdf['in_Latitude'] = latNumeric.replace(0, "").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric; non-numeric entries and 0 become ""
lonNumeric = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
outdf['in_Longitude'] = lonNumeric.replace(0, "").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Changing datatype of Priority Date to a date field.
# Unparseable entries become NaT; the values are then written out as
# month/day/year text and parsed again.
priorityParsed = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce')
priorityText = priorityParsed.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityText)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Ensure Flow entry is numeric (2 decimals); non-numeric entries and 0 become ""
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').round(2)
outdf['in_AllocationFlow_CFS'] = flowNumeric.replace(0, "").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Ensure Volume entry is numeric (2 decimals); non-numeric entries and 0 become ""
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round(2)
outdf['in_AllocationVolume_AF'] = volumeNumeric.replace(0, "").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

def assignIdValueFunc(colRowValue):
    """Return the custom ID text for a running number: "wId" followed by the value."""
    return "wId" + str(colRowValue)

# One row per distinct (name, type) pair, in order of first appearance.
dfTempID = pd.DataFrame()
for srcCol in ['in_WaterSourceName', 'in_WaterSourceTypeCV']:
    dfTempID[srcCol] = outdf[srcCol].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct pairs 1..N and turn each number into an ID.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)

# Lookup: name text + type text (joined with no separator) -> generated ID.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID'].values))
# ----------------------------------------------------------------------------------------------------

def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing native ID, or the generated one when it is blank.

    ``checkVal`` is the current ID; ``valA`` and ``valB`` are the water source
    name and type used to build the key into the module-level ``IdDict``.
    """
    existing = str(checkVal).strip()
    if existing != "":
        return existing
    return IdDict[str(valA).strip() + str(valB).strip()]

outdf['in_WaterSourceNativeID'] = outdf.apply(
    lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'],
                                    row['in_WaterSourceName'],
                                    row['in_WaterSourceTypeCV']),
    axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

def assignIdValueFunc(colRowValue):
    """Return the custom ID text for a running number: "wId" followed by the value."""
    return "wId" + str(colRowValue)

# One row per distinct (latitude, longitude, name, type) combination, in order
# of first appearance.
dfTempID = pd.DataFrame()
for siteCol in ['in_Latitude', 'in_Longitude', 'in_SiteName', 'in_SiteTypeCV']:
    dfTempID[siteCol] = outdf[siteCol].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct combinations 1..N and turn each number into an ID.
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)

# Lookup: the four key texts (joined with no separator) -> generated ID.
dfTempID['linkKey'] = (dfTempID['in_Latitude'].astype(str)
                       + dfTempID['in_Longitude'].astype(str)
                       + dfTempID['in_SiteName'].astype(str)
                       + dfTempID['in_SiteTypeCV'].astype(str))
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_SiteNativeID'].values))
# ----------------------------------------------------------------------------------------------------

def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing native ID, or the generated one when it is blank.

    ``checkVal`` is the current ID; ``valA`` to ``valD`` are the latitude,
    longitude, site name and site type used to build the key into the
    module-level ``IdDict``.
    """
    existing = str(checkVal).strip()
    if existing != "":
        return existing
    return IdDict[str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()]

outdf['in_SiteNativeID'] = outdf.apply(
    lambda row: retrieveIdValueFunc(row['in_SiteNativeID'],
                                    row['in_Latitude'], row['in_Longitude'],
                                    row['in_SiteName'], row['in_SiteTypeCV']),
    axis=1)
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this ADWR data, we drop water rights whose legal status is any of: "INACTIVE - WITHDRAWN",
                       "INACTIVE - CONSOLIDATED",
                       "INACTIVE - AMENDED",
                       "INACTIVE - CANCELLED",
                       "INACTIVE - REJECTED",
                       "INACTIVE - PARTIAL T&S",
                       "INACTIVE - RELINQUISHED",
                       "INACTIVE - FULL T&S",
                       "INACTIVE - INACTIVE",
                       "INACTIVE - FULL ASSIGNMENT",
                       "INACTIVE - PARTIAL ASSIGNMENT"

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal statuses to remove
dropLegalStatusList = [
    "INACTIVE - WITHDRAWN",
    "INACTIVE - CONSOLIDATED",
    "INACTIVE - AMENDED",
    "INACTIVE - CANCELLED",
    "INACTIVE - REJECTED",
    "INACTIVE - PARTIAL T&S",
    "INACTIVE - RELINQUISHED",
    "INACTIVE - FULL T&S",
    "INACTIVE - INACTIVE",
    "INACTIVE - FULL ASSIGNMENT",
    "INACTIVE - PARTIAL ASSIGNMENT",
]  # enter string entries here

# keep only the rows whose status is not in the list, then renumber
statusIsDropped = outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf[~statusIsDropped].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# N/A, all data in POU are considered points for AZwr

## Export Data

In [None]:
# Column names, non-null counts and dtypes of the final output, for review before export.
outdf.info()

In [None]:
# Display the final output dataframe.
outdf

In [None]:
# Export the output dataframe as a zip archive holding a single csv (no index column).
zipOptions = {'method': 'zip', 'archive_name': 'Pwr_azMain.csv'}
outdf.to_csv('RawInputData/Pwr_azMain.zip', compression=zipOptions, index=False)
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.

In [None]:
# Distinct comma-separated county values in the output, sorted, for a final visual check.
countyText = ','.join(outdf['in_County'].astype(str))
uniqueList = sorted({i.strip() for i in countyText.split(',')})
uniqueList