# Preprocessing Washington Allocation data for WaDEQA upload.
- Purpose:  To preprocess the Washington data into one master file for simple DataFrame creation and extraction

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Washington/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data

In [3]:
# Input File, contains PoD info
d_pointFile = "D_PointTable.zip"
df_1 = pd.read_csv(d_pointFile, compression='zip', encoding = "ISO-8859-1")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_1:
    df_1['WaDEUUID'] = "waD" + df_1.index.astype(str)
    df_1.to_csv('D_PointTable.zip', compression=dict(method='zip', archive_name='D_PointTable.csv'), index=False)

print(len(df_1))
df_1.head(1)

155880


  df_1 = pd.read_csv(d_pointFile, compression='zip', encoding = "ISO-8859-1")


Unnamed: 0,OBJECTID,D_Point_ID,D_Point_Ty,Location_C,Assoc_FL,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,POINT_X,POINT_Y,WaDEUUID
0,1,200801.0,WL,U,N,,S,,,3/28/2013 9:58:00,"ECY\DKRO461""""",,,,1665873.05897,454923.23627,waD0


In [4]:
# Input File, Bridge table
D_Point_WR_DocFile = "D_Point_WR_Doc.zip"
df_2 = pd.read_csv(D_Point_WR_DocFile, compression='zip', encoding = "ISO-8859-1")
print(len(df_2))
df_2.head(1)

175196


  df_2 = pd.read_csv(D_Point_WR_DocFile, compression='zip', encoding = "ISO-8859-1")


Unnamed: 0,OBJECTID,D_Point_WR_Doc_ID,D_Point_ID,WR_Doc_NR,WR_Doc_ID,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Created_TD,Created_User_ID
0,1566,,204903.0,G4-26991C,,,,,,,


In [5]:
# Input File, Contains water use and owner info
Person_Plus_EXTRACT_FromWRTSnotGWISFile = "Person_Plus_EXTRACT_FromWRTSnotGWIS.zip"
df_3 = pd.read_csv(Person_Plus_EXTRACT_FromWRTSnotGWISFile, compression='zip', encoding = "ISO-8859-1")
print(len(df_3))
df_3.head(1)

301470


Unnamed: 0,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WR_Doc_ID,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,260791,2144735,2144735,2144735,-1,Primary,Bengen Farms Dba Juniper Road Farms,,,591 Bengen Lane,,,Pasco,WA,99301,groundwater,ERO,G3-28565,11/2/1988 0:00,Active,,,,,,,,


In [6]:
# Merging dataframes into one, using left-join.
df = pd.DataFrame()
df = pd.merge(df_1, df_2, left_on='D_Point_ID', right_on='D_Point_ID', how='left') # Joinning PoD data
df = pd.merge(df,   df_3, left_on='WR_Doc_ID', right_on='WR_Doc_ID', how='left') # Joinning PoD data

df = df.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(df))
df.head(1)

198075


Unnamed: 0,OBJECTID_x,D_Point_ID,D_Point_Ty,Location_C,Assoc_FL,Misc_CD,Position_W,Active_DT_x,Inactive_D,Update_TD_x,Update_Use,Comment_DS,Created_TD_x,Created_Us,POINT_X,POINT_Y,WaDEUUID,OBJECTID_y,D_Point_WR_Doc_ID,WR_Doc_NR,WR_Doc_ID,Active_DT_y,Inactive_DT,Update_TD_y,Update_User_ID,Created_TD_y,Created_User_ID,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,200801.0,WL,U,N,,S,,,3/28/2013 9:58:00,"ECY\DKRO461""""",,,,1665873.05897,454923.23627,waD0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# For converting projection latitude & longitude

from pyproj import Transformer, transform
transformer = Transformer.from_proj(2927, 4326)  # A trick to drastically optimize the Transformer of pyproj.
# Washignton projection = EPSG:2927. WGS84 projection used by WaDE 2.0 = epsg:4326.

def assignLat(colrowValueLat, colrowValueLong):
    if colrowValueLat == '' or pd.isnull(colrowValueLat):
        lat = ""
    else:
        lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return lat

# For converting projection longitude.
def assignLong(colrowValueLat, colrowValueLong):
    if colrowValueLong == '' or pd.isnull(colrowValueLong):
        long = ""
    else:
        lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return long

df['in_Latitude'] = df.apply(lambda row: assignLat(row['POINT_X'], row['POINT_Y']), axis=1)
df['in_Longitude'] = df.apply(lambda row: assignLong(row['POINT_X'], row['POINT_Y']), axis=1)

In [8]:
import re

def assignOwner(valueFirst, valueMid, valueLast):
    #--- First Name ---
    if valueFirst == "" or pd.isnull(valueFirst):
        FirstName = ""
    else:
        FirstName = str(valueFirst).strip()
        
    #--- Midile Initial ---
    if valueMid == "" or pd.isnull(valueMid):
        MidName = ""
    else:
        MidName = str(valueMid).strip()
    
    #--- Last Name ---
    if valueLast == "" or pd.isnull(valueLast):
        LastName = ""
    else:
        LastName = str(valueLast).strip()

    if LastName == "":
        outlist = LastName + FirstName + MidName
    else:
        outlist = LastName + ", " + FirstName + " "+ MidName
        
    outlist = re.sub("[$@&.;,/\)(-]", "", outlist).strip()
    
    return outlist


df['Owner'] = df.apply(lambda row: assignOwner(row['PersonFirstNM'],
                                               row['PersonMINM'],
                                               row['PersonLastOrOrganizationNM']), axis=1)
df['Owner'].unique()

array(['', 'Mackie Sandy', 'Ste Michelle Wine Estates Ltd Paterson', ...,
       'Larkspur SR LLC', 'Taggares Fruit Company',
       'Western Farmland LLC'], dtype=object)

In [9]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    if colrowValueIQ == '' or pd.isnull(colrowValueIQ):
        outVal = 0
    elif colrowValueIQ <= 0 or pd.isnull(colrowValueIQ):
        outVal = 0
    else:
        MultiFactor = 1.0
        gpmcfsUnit = colrowValueUC.strip()
        if gpmcfsUnit == 'GPM':
            MultiFactor = 0.00222800926
        elif gpmcfsUnit == 'GPD':
            MultiFactor = 1.0 / 646317.0
        try:
            outVal = MultiFactor * colrowValueIQ
        except:
            outVal = colrowValueIQ
    return outVal

df['in_AllocationFlow_CFS'] =  df.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
df['in_AllocationFlow_CFS'].unique()

array([ 0.        ,  3.34201389,  4.79021991, ..., 13.81      ,
       86.94      ,  8.62      ])

In [10]:
# #############################################################################################
# # Data Assessment UUID
# dfPOD['WaDEUUID'] = df['WaDEUUID']

# #WaterSource
# dfPOD['in_WaterSourceTypeCV'] = df['WaRecRCWClassTypeCode']
                                    
# #Site
# dfPOD['in_CoordinateAccuracy'] = df['Location_C']
# dfPOD['in_Latitude'] = df['in_Latitude']
# dfPOD['in_Longitude'] = df['in_Longitude']
# dfPOD['in_SiteNativeID'] = "POD" + df['D_Point_ID'].astype(str)
# dfPOD['in_SiteTypeCV'] = df['D_Point_Ty']
# dfPOD['in_PODorPOUSite'] = "POD"

# #AllocationAmount_fact
# dfPOD['in_AllocationFlow_CFS'] = df['in_AllocationFlow_CFS']
# dfPOD['in_AllocationLegalStatusCV'] = df['WaRecProcessStatusTypeCode'].astype(str)
# dfPOD['in_AllocationNativeID'] = df['WR_Doc_ID'].astype(str)
# dfPOD['in_AllocationOwner'] = df['Owner'].astype(str)
# dfPOD['in_AllocationPriorityDate'] = df['PriorityDate']
# dfPOD['in_AllocationTypeCV'] = df['WaRecPhaseTypeCode']
# dfPOD['in_AllocationVolume_AF'] = df['AnnualVolumeQuantity']
# dfPOD['in_BeneficialUseCategory'] = df['PurposeOfUseTypeCodes'].astype(str)
# dfPOD['in_IrrigatedAcreage'] = df['IrrigatedAreaQuantity']

# dfPOD = dfPOD.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
# print(len(dfPOD))
# dfPOD

In [11]:
# create output POD dataframe
dfPOD = pd.DataFrame()

# Data Assessment UUID
dfPOD['WaDEUUID'] = df['WaDEUUID']

# Variable
dfPOD["in_VariableSpecificUUID"] = "WAwr_V1"

# WaterSource Info
dfPOD['in_Geometry'] = ""
dfPOD['in_GNISFeatureNameCV'] = ""
dfPOD['in_WaterQualityIndicatorCV'] = ""
dfPOD['in_WaterSourceName'] = "WaDE Unspecified"
dfPOD['in_WaterSourceNativeID'] = "" # create customID for temp solution
dfPOD['in_WaterSourceTypeCV'] = df['WaRecRCWClassTypeCode']

# Site Info
dfPOD['in_RegulatoryOverlayUUIDs'] = ""
dfPOD['in_WaterSourceUUID'] = "" # ???
dfPOD['in_CoordinateAccuracy'] = "WaDE Unspecified"
dfPOD['in_CoordinateMethodCV'] = df['Location_C']
dfPOD['in_County'] = "WaDE Unspecified"
dfPOD['in_EPSGCodeCV'] = 4326
dfPOD['in_Geometry'] = ""
dfPOD['in_GNISCodeCV'] = ""
dfPOD['in_HUC12'] = ""
dfPOD['in_HUC8'] = ""
dfPOD['in_Latitude'] = df['in_Latitude']
dfPOD['in_Longitude'] = df['in_Longitude']
dfPOD['in_NHDNetworkStatusCV'] = ""
dfPOD['in_NHDProductCV'] = ""
dfPOD['in_PODorPOUSite'] = "POD"
dfPOD['in_SiteName'] = "WaDE Unspecified"
dfPOD['in_SiteNativeID'] = "POD" + df['D_Point_ID'].astype(str)
dfPOD['in_SitePoint'] = ""
dfPOD['in_SiteTypeCV'] = df['D_Point_Ty'].astype(str)
dfPOD['in_StateCV'] = "WA"
dfPOD['in_USGSSiteID'] = ""

# AllocationAmount Info
dfPOD['in_VariableSpecificUUID'] =  ""
dfPOD['in_AllocationApplicationDate'] = ""
dfPOD['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
dfPOD['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
dfPOD['in_AllocationBasisCV'] = ""
dfPOD['in_AllocationChangeApplicationIndicator'] = ""
dfPOD['in_AllocationCommunityWaterSupplySystem'] = ""
dfPOD['in_AllocationCropDutyAmount'] = ""
dfPOD['in_AllocationExpirationDate'] = ""
dfPOD['in_AllocationFlow_CFS'] = df['in_AllocationFlow_CFS'].astype(float) # see above for conversion
dfPOD['in_AllocationLegalStatusCV'] = df['WaRecProcessStatusTypeCode']
dfPOD['in_AllocationNativeID'] =  df['WR_Doc_ID']
dfPOD['in_AllocationOwner'] = df['Owner']
dfPOD['in_AllocationPriorityDate'] = df['PriorityDate']
dfPOD['in_AllocationTimeframeEnd'] = ""
dfPOD['in_AllocationTimeframeStart'] = ""
dfPOD['in_AllocationTypeCV'] = df['WaRecPhaseTypeCode'].astype(str)
dfPOD['in_AllocationVolume_AF'] = df['AnnualVolumeQuantity']
dfPOD['in_BeneficialUseCategory'] = df['PurposeOfUseTypeCodes']
dfPOD['in_CommunityWaterSupplySystem'] = ""
dfPOD['in_CropTypeCV'] = ""
dfPOD['in_CustomerTypeCV'] = ""
dfPOD['in_DataPublicationDate'] = ""
dfPOD['in_DataPublicationDOI'] = ""
dfPOD['in_ExemptOfVolumeFlowPriority'] = 0
dfPOD['in_GeneratedPowerCapacityMW'] = ""
dfPOD['in_IrrigatedAcreage'] = df['IrrigatedAreaQuantity']
dfPOD['in_IrrigationMethodCV'] = ""
dfPOD['in_LegacyAllocationIDs'] = ""
dfPOD['in_OwnerClassificationCV'] = ""
dfPOD['in_PopulationServed'] = ""
dfPOD['in_PowerType'] = ""
dfPOD['in_PrimaryBeneficialUseCategory'] = ""
dfPOD['in_SDWISIdentifierCV'] = ""
dfPOD['in_WaterAllocationNativeURL'] = ""

dfPOD = dfPOD.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(dfPOD))
dfPOD.head()

191813


Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,waD0,,,,WaDE Unspecified,,,,,WaDE Unspecified,U,WaDE Unspecified,4326,,,,46.58081,-120.39878,,,POD,WaDE Unspecified,POD200801.0,,WL,WA,,,,,,,,,,,0.0,,,,,,,,,,,,,,,0,,,,,,,,,,
1,waD1,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,G,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889.0,,MW,WA,,,,,,,,,,,3.34201,Active,4271597.0,Mackie Sandy,4/27/2006 0:00,,,ChangeROE,345.26,IR,,,,,,0,,283.0,,,,,,,,
2,waD1,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,G,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889.0,,MW,WA,,,,,,,,,,,3.34201,Active,4271597.0,Ste Michelle Wine Estates Ltd Paterson,4/27/2006 0:00,,,ChangeROE,345.26,IR,,,,,,0,,283.0,,,,,,,,
3,waD1,,,,WaDE Unspecified,,,,,WaDE Unspecified,G,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889.0,,MW,WA,,,,,,,,,,,0.0,,,,,,,,,,,,,,,0,,,,,,,,,,
4,waD1,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,G,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889.0,,MW,WA,,,,,,,,,,,4.79022,Active,2085743.0,Stimson Lane Limited,6/6/1990 0:00,,,NewApp,,IR,,,,,,0,,175.0,,,,,,,,


## POU Data

In [12]:
# Input File, contains POU info
pouInput = "WA_POU_Input.zip" 
df_1u = pd.read_csv(pouInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_1u:
    df_1u['WaDEUUID'] = "waU" + df_1u.index.astype(str)
    df_1u.to_csv('WA_POU_Input.zip', compression=dict(method='zip', archive_name='WA_POU_Input.csv'), index=False)

print(len(df_1u))
df_1u.head(1)

156369


  df_1u = pd.read_csv(pouInput)


Unnamed: 0,OID_,WR_DOC_ID,WR_Doc_POU_ID,Fill_CD,WR_Doc_NR,WR_Doc_Type_CD,Quality_CD,Misc_CD,Position_With_CD,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Comment_DS,Created_TD,Created_User_ID,Shape_Length,Shape_Area,Latitude,Longitude,WaDEUUID
0,1,2084118.0,,7.0,GWC01066-D,CE,G,RECHECKED\WWT,S,,,1/23/2009 11:18:38,"ECY\DKRO461""""",,,,32011.93492,35404849.05028,46.59121,-119.73537,waU0


In [13]:
# Merging dataframes into one, using left-join.
# df_u1 and df3
dfu = pd.DataFrame()
dfu = pd.merge(df_1u, df_3, left_on='WR_DOC_ID', right_on='WR_Doc_ID', how='left')

dfu = dfu.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(dfu))
dfu.head(1)

177340


Unnamed: 0,OID_,WR_DOC_ID,WR_Doc_POU_ID,Fill_CD,WR_Doc_NR,WR_Doc_Type_CD,Quality_CD,Misc_CD,Position_With_CD,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Comment_DS,Created_TD,Created_User_ID,Shape_Length,Shape_Area,Latitude,Longitude,WaDEUUID,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WR_Doc_ID,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,2084118.0,,7.0,GWC01066-D,CE,G,RECHECKED\WWT,S,,,1/23/2009 11:18:38,"ECY\DKRO461""""",,,,32011.93492,35404849.05028,46.59121,-119.73537,waU0,58522.0,84235.0,2084118.0,2084118.0,2084118.0,59585.0,Primary,USARMY Corps Engineers,,,,,,,,,groundwater,CRO,G4-*01105SWRIS,4/1/1927 0:00,Active,,Certificate,,1375.0,800.0,200.0,GPM,DG IR


In [14]:
def assignOwner(valueFirst, valueMid, valueLast):
    #--- First Name ---
    if valueFirst == "" or pd.isnull(valueFirst):
        FirstName = ""
    else:
        FirstName = str(valueFirst).strip()
        
    #--- Midile Initial ---
    if valueMid == "" or pd.isnull(valueMid):
        MidName = ""
    else:
        MidName = str(valueMid).strip()
    
    #--- Last Name ---
    if valueLast == "" or pd.isnull(valueLast):
        LastName = ""
    else:
        LastName = str(valueLast).strip()

    if LastName == "":
        outlist = LastName + FirstName + MidName
    else:
        outlist = LastName + ", " + FirstName + " "+ MidName
        
    outlist = re.sub("[$@&.;,/\)(-]", "", outlist).strip()
    
    return outlist


dfu['Owner'] = dfu.apply(lambda row: assignOwner(row['PersonFirstNM'], row['PersonMINM'], row['PersonLastOrOrganizationNM']), axis=1)
dfu['Owner'].unique()

array(['USARMY Corps Engineers', 'S Martinez Livestock Inc',
       'TRAINOR RUSSELL C', ..., 'Aspect Consulting  Price',
       'Richardson Monica', 'Richardson Thomas'], dtype=object)

In [15]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    if colrowValueIQ == '' or pd.isnull(colrowValueIQ):
        outVal = ""
    elif colrowValueIQ <= 0 or pd.isnull(colrowValueIQ):
        outVal = 0
    else:
        MultiFactor = 1.0
        gpmcfsUnit = colrowValueUC.strip()
        if gpmcfsUnit == 'GPM':
            MultiFactor = 0.00222800926
        elif gpmcfsUnit == 'GPD':
            MultiFactor = 1.0 / 646317.0
        try:
            outVal = MultiFactor * colrowValueIQ
        except:
            outVal = colrowValueIQ
    return outVal

dfu['in_AllocationFlow_CFS'] =  dfu.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
dfu['in_AllocationFlow_CFS'].unique()

array([3.0635127324999996, 3.787615742, 0.33420138899999996, ..., 0.124,
       0.329, 0.5715], dtype=object)

In [16]:
%%time
# Creating WaDE Custom site native ID for easy site identificaiion
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = dfu['Latitude']
dfSiteNativeID['in_Longitude'] = dfu['Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfSiteNativeID.loc[(dfSiteNativeID['in_Latitude'] == A) & 
                                (dfSiteNativeID['in_Longitude'] == B), 'in_SiteNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfu['in_SiteNativeID'] = dfu.apply(lambda row: retrieveSiteNativeID(row['Latitude'], row['Longitude']), axis=1)
dfu['in_SiteNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID136197',
       'wadeID136198', 'wadeID136199'], dtype=object)

In [17]:
# #############################################################################################
# # Data Assessment UUID
# dfPOU['WaDEUUID'] = dfu['WaDEUUID']

# #WaterSource
# dfPOU['in_WaterSourceTypeCV'] = dfu['WaRecRCWClassTypeCode']
                                    
# #Site
# dfPOU['in_CoordinateAccuracy'] = ""
# dfPOU['in_Latitude'] = dfu['Latitude']
# dfPOU['in_Longitude'] = dfu['Longitude']
# dfPOU['in_SiteNativeID'] = "POU" + dfu['in_SiteNativeID'].astype(str)
# dfPOU['in_SiteTypeCV'] = "Unspecified"
# dfPOU['in_PODorPOUSite'] = "POU"

# #AllocationAmount_fact
# dfPOU['in_AllocationFlow_CFS'] = dfu['in_AllocationFlow_CFS']
# dfPOU['in_AllocationLegalStatusCV'] = dfu['WaRecProcessStatusTypeCode'].astype(str)
# dfPOU['in_AllocationNativeID'] = dfu['WR_Doc_ID'].astype(str)
# dfPOU['in_AllocationOwner'] = dfu['Owner'].astype(str)
# dfPOU['in_AllocationPriorityDate'] = df['PriorityDate']
# dfPOU['in_AllocationTypeCV'] = dfu['WaRecPhaseTypeCode']
# dfPOU['in_AllocationVolume_AF'] = dfu['AnnualVolumeQuantity']
# dfPOU['in_BeneficialUseCategory'] = dfu['PurposeOfUseTypeCodes'].astype(str)
# dfPOU['in_IrrigatedAcreage'] = dfu['IrrigatedAreaQuantity']

# dfPOU = dfPOU.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
# print(len(dfPOU))
# dfPOU

In [19]:
# create output POD dataframe
dfPOU = pd.DataFrame()

# Data Assessment UUID
dfPOU['WaDEUUID'] = dfu['WaDEUUID']

# Variable
dfPOU["in_VariableSpecificUUID"] = "WAwr_V1"

# WaterSource Info
dfPOU['in_Geometry'] = ""
dfPOU['in_GNISFeatureNameCV'] = ""
dfPOU['in_WaterQualityIndicatorCV'] = ""
dfPOU['in_WaterSourceName'] = "WaDE Unspecified"
dfPOU['in_WaterSourceNativeID'] = "" # create customID for temp solution
dfPOU['in_WaterSourceTypeCV'] = dfu['WaRecRCWClassTypeCode']

# Site Info
dfPOU['in_RegulatoryOverlayUUIDs'] = ""
dfPOU['in_WaterSourceUUID'] = "" # ???
dfPOU['in_CoordinateAccuracy'] = "WaDE Unspecified"
dfPOU['in_CoordinateMethodCV'] = "WaDE Unspecified"
dfPOU['in_County'] = "WaDE Unspecified"
dfPOU['in_EPSGCodeCV'] = 4326
dfPOU['in_Geometry'] = ""
dfPOU['in_GNISCodeCV'] = ""
dfPOU['in_HUC12'] = ""
dfPOU['in_HUC8'] = ""
dfPOU['in_Latitude'] = dfu['Latitude']
dfPOU['in_Longitude'] = dfu['Longitude']
dfPOU['in_NHDNetworkStatusCV'] = ""
dfPOU['in_NHDProductCV'] = ""
dfPOU['in_PODorPOUSite'] = "POU"
dfPOU['in_SiteName'] = "WaDE Unspecified"
dfPOU['in_SiteNativeID'] =  "POU" + dfu['in_SiteNativeID'].astype(str)
dfPOU['in_SitePoint'] = ""
dfPOU['in_SiteTypeCV'] = "WaDE Unspecified"
dfPOU['in_StateCV'] = "WA"
dfPOU['in_USGSSiteID'] = ""

# AllocationAmount Info
dfPOU['in_VariableSpecificUUID'] = ""
dfPOU['in_AllocationApplicationDate'] = ""
dfPOU['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
dfPOU['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
dfPOU['in_AllocationBasisCV'] = ""
dfPOU['in_AllocationChangeApplicationIndicator'] = ""
dfPOU['in_AllocationCommunityWaterSupplySystem'] = ""
dfPOU['in_AllocationCropDutyAmount'] = ""
dfPOU['in_AllocationExpirationDate'] = ""
dfPOU['in_AllocationFlow_CFS'] = dfu['in_AllocationFlow_CFS']
dfPOU['in_AllocationLegalStatusCV'] = dfu['WaRecProcessStatusTypeCode']
dfPOU['in_AllocationNativeID'] =  dfu['WR_Doc_ID'].astype(str)
dfPOU['in_AllocationOwner'] = dfu['Owner']
dfPOU['in_AllocationPriorityDate'] = dfu['PriorityDate']
dfPOU['in_AllocationTimeframeEnd'] = ""
dfPOU['in_AllocationTimeframeStart'] = ""
dfPOU['in_AllocationTypeCV'] = dfu['WaRecPhaseTypeCode']
dfPOU['in_AllocationVolume_AF'] = dfu['AnnualVolumeQuantity']
dfPOU['in_BeneficialUseCategory'] = dfu['PurposeOfUseTypeCodes']
dfPOU['in_CommunityWaterSupplySystem'] = ""
dfPOU['in_CropTypeCV'] = ""
dfPOU['in_CustomerTypeCV'] = ""
dfPOU['in_DataPublicationDate'] = ""
dfPOU['in_DataPublicationDOI'] = ""
dfPOU['in_ExemptOfVolumeFlowPriority'] = 0
dfPOU['in_GeneratedPowerCapacityMW'] = ""
dfPOU['in_IrrigatedAcreage'] = dfu['IrrigatedAreaQuantity']
dfPOU['in_IrrigationMethodCV'] = ""
dfPOU['in_LegacyAllocationIDs'] = ""
dfPOU['in_OwnerClassificationCV'] = ""
dfPOU['in_PopulationServed'] = ""
dfPOU['in_PowerType'] = ""
dfPOU['in_PrimaryBeneficialUseCategory'] = ""
dfPOU['in_SDWISIdentifierCV'] = ""
dfPOU['in_WaterAllocationNativeURL'] = ""

dfPOU = dfPOU.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(dfPOU))
dfPOU.head()

176894


Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,waU0,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.59121,-119.73537,,,POU,WaDE Unspecified,POUwadeID1,,WaDE Unspecified,WA,,,,,,,,,,,3.06351,Active,2084118.0,USARMY Corps Engineers,4/1/1927 0:00,,,Certificate,800.0,DG IR,,,,,,0,,200.0,,,,,,,,
1,waU1,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58282,-119.76184,,,POU,WaDE Unspecified,POUwadeID2,,WaDE Unspecified,WA,,,,,,,,,,,3.78762,Active,2084120.0,USARMY Corps Engineers,4/1/1927 0:00,,,Certificate,1760.0,DG IR,,,,,,0,,440.0,,,,,,,,
2,waU2,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58628,-119.77845,,,POU,WaDE Unspecified,POUwadeID3,,WaDE Unspecified,WA,,,,,,,,,,,0.3342,Active,2084124.0,USARMY Corps Engineers,4/1/1927 0:00,,,Certificate,160.0,DG IR,,,,,,0,,40.0,,,,,,,,
3,waU3,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58077,-119.78103,,,POU,WaDE Unspecified,POUwadeID4,,WaDE Unspecified,WA,,,,,,,,,,,1.78241,Active,2084121.0,USARMY Corps Engineers,4/1/1927 0:00,,,Certificate,320.0,DG IR,,,,,,0,,80.0,,,,,,,,
4,waU4,,,,WaDE Unspecified,,groundwater,,,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.57352,-119.76516,,,POU,WaDE Unspecified,POUwadeID5,,WaDE Unspecified,WA,,,,,,,,,,,0.8912,Active,2084122.0,USARMY Corps Engineers,4/1/1927 0:00,,,Certificate,320.0,DG IR,,,,,,0,,80.0,,,,,,,,


## Concatenate POD & POU

In [23]:
# Concatenate
frames = [dfPOD, dfPOU]
outdf = pd.concat(frames)
outdf = outdf.replace(np.nan, "").replace("nan", "")
outdf = outdf.drop_duplicates().replace(np.nan, "").reset_index(drop=True)

print(len(outdf))
outdf.head(1)

368707


Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,waD0,,,,WaDE Unspecified,,,,,WaDE Unspecified,U,WaDE Unspecified,4326,,,,46.58081,-120.39878,,,POD,WaDE Unspecified,POD200801.0,,WL,WA,,,,,,,,,,,0.0,,,,,,,,,,,,,,,0,,,,,,,,,,


## Clean output dataframes

In [24]:
# Fixing empty string names

def fixEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or pd.isnull(val):
        outString = "WaDE Unspecified"
    else:
        outString = val
    return outString

In [26]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['WaDE Unspecified', 'groundwater', 'surfaceWater', 'reservoir'],
      dtype=object)

In [27]:
outdf['in_CoordinateMethodCV'] = outdf.apply(lambda row: fixEmptyString(row['in_CoordinateMethodCV']), axis=1)
outdf['in_CoordinateMethodCV'].unique()

array(['U', 'G', 'P', 'C', 'UD', 'PX', 'W', 'UA', 'UX', 'WX', 'PM', 'WD',
       'PD', 'PA', 'UM', 'WA', 'WaDE Unspecified'], dtype=object)

In [28]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['WL', 'MW', 'GC', 'HW', 'PM', 'RD', 'ID', 'WaDE Unspecified'],
      dtype=object)

In [29]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

array(['WaDE Unspecified', 'Active', 'Inactive', 'ChangeInProgress',
       'InTrustTemp'], dtype=object)

In [30]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['WaDE Unspecified', 'Mackie Sandy',
       'Ste Michelle Wine Estates Ltd Paterson', ...,
       'Aspect Consulting  Price', 'Richardson Monica',
       'Richardson Thomas'], dtype=object)

In [31]:
outdf['in_AllocationTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationTypeCV']), axis=1)
outdf['in_AllocationTypeCV'].unique()

array(['WaDE Unspecified', 'ChangeROE', 'NewApp', 'ChangeApplication',
       'Certificate', 'Claim', 'SupersedingCertificate',
       'TemporaryDonation', 'ClaimAmendment', 'Permit',
       'SupersedingPermit', 'ROE', 'AdjudicatedCertificate',
       'CertificateOfChange', 'QuincyBasinPermit',
       'SupersedingQuincyBasinPermit', 'TemporaryUse', 'MitigatedPermit',
       'ShortTerm', 'SupersedingAdjudicatedCert',
       'SupersedingCertificateOfChange', 'ConditionalFinalOrder',
       'DroughtAuthorization'], dtype=object)

In [32]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: fixEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['WaDE Unspecified', 'IR', 'DY DM', ..., 'IR FR IFlow',
       'IR IFlow MT', 'DC OT'], dtype=object)

In [33]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.        , 3.34201389, 4.79021991, ..., 0.124     , 0.329     ,
       0.5715    ])

In [34]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([  0.  , 345.26, 420.9 , ...,  16.09, 127.35,  15.88])

In [35]:
# Convert in_IrrigatedAcreage to numeric
outdf['in_IrrigatedAcreage'] = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce').fillna(0).astype(float)
outdf['in_IrrigatedAcreage'].unique()

array([   0. ,  283. ,  175. , ...,  860. , 1094.6,   26.3])

In [36]:
#Changing datatype of used date fields
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array([                          'NaT', '2006-04-27T00:00:00.000000000',
       '1990-06-06T00:00:00.000000000', ...,
       '2020-06-15T00:00:00.000000000', '2018-11-20T00:00:00.000000000',
       '2020-06-05T00:00:00.000000000'], dtype='datetime64[ns]')

In [38]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

Wall time: 1min 11s


array(['', 'wadeID2', 'wadeID3', 'wadeID4'], dtype=object)

## Shapefile Data
- For attaching gemetry to csv inputs.

In [39]:
# PoU Shapefile Data
# Shapefile input
dfPoUshapetemp = gpd.read_file('shapefile/WA_PoU2.shp')
dfPoUshapetemp.head(3)

Unnamed: 0,WR_DOC_ID,WR_Doc_POU,Fill_CD,WR_Doc_NR,WR_Doc_Typ,Quality_CD,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,Shape_Leng,Shape_Area,Latitude,Longitude,geometry
0,2084118,0,7,GWC01066-D,CE,G,RECHECKED\WWT,S,,,2009-01-23,"""ECY\DKRO461""",,,,32011.93492,35404849.0503,46.59121,-119.73537,"POLYGON ((-119.74933 46.58447, -119.74899 46.5..."
1,2084120,0,14,GWC01067-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,21354.7979,19626630.8507,46.58282,-119.76184,"POLYGON ((-119.74933 46.58447, -119.74966 46.5..."
2,2084124,0,39,GWC01070-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,5267.28459,1733864.72901,46.58628,-119.77845,"POLYGON ((-119.77580 46.58451, -119.78105 46.5..."


In [40]:
# Creating WaDE Custom site native ID for easy site identificaiion
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = dfPoUshapetemp['Latitude']
dfSiteNativeID['in_Longitude'] = dfPoUshapetemp['Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfSiteNativeID.loc[(dfSiteNativeID['in_Latitude'] == A) & 
                                (dfSiteNativeID['in_Longitude'] == B), 'in_SiteNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfPoUshapetemp['in_SiteNativeID'] = dfPoUshapetemp.apply(lambda row: retrieveSiteNativeID( row['Latitude'], row['Longitude']), axis=1)
dfPoUshapetemp.head(2)

Unnamed: 0,WR_DOC_ID,WR_Doc_POU,Fill_CD,WR_Doc_NR,WR_Doc_Typ,Quality_CD,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,Shape_Leng,Shape_Area,Latitude,Longitude,geometry,in_SiteNativeID
0,2084118,0,7,GWC01066-D,CE,G,RECHECKED\WWT,S,,,2009-01-23,"""ECY\DKRO461""",,,,32011.93492,35404849.0503,46.59121,-119.73537,"POLYGON ((-119.74933 46.58447, -119.74899 46.5...",wadeID1
1,2084120,0,14,GWC01067-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,21354.7979,19626630.8507,46.58282,-119.76184,"POLYGON ((-119.74933 46.58447, -119.74966 46.5...",wadeID2


In [41]:
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['in_SiteNativeID'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

Unnamed: 0,in_SiteNativeID,geometry
0,POUwadeID1,"POLYGON ((-119.74933 46.58447, -119.74899 46.5..."
1,POUwadeID2,"POLYGON ((-119.74933 46.58447, -119.74966 46.5..."
2,POUwadeID3,"POLYGON ((-119.77580 46.58451, -119.78105 46.5..."


## Export Outputs

In [42]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368707 entries, 0 to 368706
Data columns (total 62 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      368707 non-null  object        
 1   in_Geometry                                   368707 non-null  object        
 2   in_GNISFeatureNameCV                          368707 non-null  object        
 3   in_WaterQualityIndicatorCV                    368707 non-null  object        
 4   in_WaterSourceName                            368707 non-null  object        
 5   in_WaterSourceNativeID                        368707 non-null  object        
 6   in_WaterSourceTypeCV                          368707 non-null  object        
 7   in_RegulatoryOverlayUUIDs                     368707 non-null  object        
 8   in_WaterSourceUUID                            368707 n

In [43]:
outdf.to_csv('Pwr_waMain.zip', compression=dict(method='zip', archive_name='Pwr_waMain.csv'), index=False)  # The output
dfPoUshape.to_csv('P_WashingtonGeometry.zip', compression=dict(method='zip', archive_name='P_WashingtonGeometry.csv'), index=False)  # The output geometry.