# Preprocessing Washington Allocation data for WaDE upload.
- Purpose:  To preprocess the Washington data into one master file for simple DataFrame creation and extraction

Useful Links to Data:
- The Data - Geographic Water Information System (GWIS)Data from the WA stat: https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/
- Data dictionary - https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/GWIS_Data_Dictionary/
- Public website   - https://ecology.wa.gov/Water-Shorelines/Water-supply/Water-rights

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Washington/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data

In [3]:
# # Dataframe creation
# df_1 = pd.read_csv(d_pointFile, encoding = "ISO-8859-1") #Input
# df_2 = pd.read_csv(D_Point_WR_DocFile, encoding = "ISO-8859-1") #Input
# df_3 = pd.read_csv(Person_Plus_EXTRACT_FromWRTSnotGWISFile, encoding = "ISO-8859-1") #Input

In [4]:
# Input File, contains PoD info
d_pointFile = "D_PointTable.zip"
df_1 = pd.read_csv(d_pointFile, encoding = "ISO-8859-1")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_1:
    df_1['WaDEUUID'] = "waD" + df_1.index.astype(str)
    df_1.to_csv('D_PointTable.zip', compression=dict(method='zip', archive_name='D_PointTable.csv'), index=False)

print(len(df_1))
df_1.head()

155880


  df_1 = pd.read_csv(d_pointFile, encoding = "ISO-8859-1")


Unnamed: 0,OBJECTID,D_Point_ID,D_Point_Ty,Location_C,Assoc_FL,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,POINT_X,POINT_Y,WaDEUUID
0,1,200801.0,WL,U,N,,S,,,3/28/2013 9:58:00,"ECY\DKRO461""""",,,,1665873.05897,454923.23627,waD0
1,2,200889.0,MW,G,Y,,S,,,,,,,,1816741.2031,456744.05513,waD1
2,3,200890.0,MW,G,Y,,S,,,,,,,,1813847.79419,456733.00627,waD2
3,4,201092.0,WL,U,Y,,S,,,,,,,,1571475.12802,457350.13069,waD3
4,5,201191.0,MW,G,Y,,S,,,6/11/2010 11:28:19,"ECY\DKRO461""""",,,,1715391.54721,402037.7843,waD4


In [5]:
# Input File, Bridge table
D_Point_WR_DocFile = "D_Point_WR_Doc.zip"
df_2 = pd.read_csv(D_Point_WR_DocFile, encoding = "ISO-8859-1")
print(len(df_2))
df_2.head()

175196


  df_2 = pd.read_csv(D_Point_WR_DocFile, encoding = "ISO-8859-1")


Unnamed: 0,OBJECTID,D_Point_WR_Doc_ID,D_Point_ID,WR_Doc_NR,WR_Doc_ID,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Created_TD,Created_User_ID
0,1566,,204903.0,G4-26991C,,,,,,,
1,1567,,204904.0,G4-26830C,,,,,,,
2,1568,,204905.0,G4-29018C,,,,,,,
3,1569,,204906.0,G4-29018C,,,,,,,
4,1570,,204907.0,G4-25116C,,,,,,,


In [6]:
# Input File, Contains water use and owner info
Person_Plus_EXTRACT_FromWRTSnotGWISFile = "Person_Plus_EXTRACT_FromWRTSnotGWIS.zip"
df_3 = pd.read_csv(Person_Plus_EXTRACT_FromWRTSnotGWISFile, encoding = "ISO-8859-1")
print(len(df_3))
df_3.head()

301470


Unnamed: 0,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WR_Doc_ID,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,260791,2144735,2144735,2144735,-1,Primary,Bengen Farms Dba Juniper Road Farms,,,591 Bengen Lane,,,Pasco,WA,99301,groundwater,ERO,G3-28565,11/2/1988 0:00,Active,,,,,,,,
1,2,267032,2144672,2144672,2144672,-1,Primary,Bengen Farms Dba Juniper Road Farms,,,591 Bengen Lane,,,Pasco,WA,99301,groundwater,ERO,G3-24977,7/24/1976 0:00,Active,,,,,,,,
2,3,177316,2142777,2142777,2142777,-1,Primary,Creston Town,,,,,,,,,groundwater,ERO,G3-*00582S,9/1/1942 0:00,Inactive,,,,,,,,
3,4,260780,2145783,2145783,2145783,-1,Primary,Bengen Farms Dba Juniper Road Farms,,,591 Bengen Lane,,,Pasco,WA,99301,groundwater,ERO,G3-25099(A),11/4/1976 0:00,Active,,,,,,,,
4,5,275652,6500899,6500899,6500899,-1,Primary,Farmland Reserve Inc,,,6716 W Rio Grande Ave,PO Box 2308,,Pasco,WA,99302-2308,surfaceWater,CRO,CS4-16571(A1)@1,2/17/2015 0:00,Inactive,,,,,,,,


In [7]:
# Merging dataframes into one, using left-join.
dfinPOD = pd.DataFrame()
dfinPOD = pd.merge(df_1, df_2, left_on='D_Point_ID', right_on='D_Point_ID', how='left') # Joinning PoD data
dfinPOD = pd.merge(dfinPOD, df_3, left_on='WR_Doc_ID', right_on='WR_Doc_ID', how='left') # Joinning PoD data

dfinPOD = dfinPOD.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(dfinPOD))
dfinPOD.head(1)

198075


Unnamed: 0,OBJECTID_x,D_Point_ID,D_Point_Ty,Location_C,Assoc_FL,Misc_CD,Position_W,Active_DT_x,Inactive_D,Update_TD_x,Update_Use,Comment_DS,Created_TD_x,Created_Us,POINT_X,POINT_Y,WaDEUUID,OBJECTID_y,D_Point_WR_Doc_ID,WR_Doc_NR,WR_Doc_ID,Active_DT_y,Inactive_DT,Update_TD_y,Update_User_ID,Created_TD_y,Created_User_ID,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,200801.0,WL,U,N,,S,,,3/28/2013 9:58:00,"ECY\DKRO461""""",,,,1665873.05897,454923.23627,waD0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# For converting projection latitude.
from pyproj import Transformer, transform
transformer = Transformer.from_proj(2927, 4326)  # A trick to drastically optimize the Transformer of pyproj.
# Washignton projection = EPSG:2927. WGS84 projection used by WaDE 2.0 = epsg:4326.

def assignLat(colrowValueLat, colrowValueLong):
    if colrowValueLat == '' or pd.isnull(colrowValueLat):
        lat = ""
    else:
        lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return lat

# For converting projection longitude.
def assignLong(colrowValueLat, colrowValueLong):
    if colrowValueLong == '' or pd.isnull(colrowValueLong):
        long = ""
    else:
        lat, long = transformer.transform(colrowValueLat, colrowValueLong)
    return long

dfinPOD['in_Latitude'] = dfinPOD.apply(lambda row: assignLat(row['POINT_X'], row['POINT_Y']), axis=1)
dfinPOD['in_Longitude'] = dfinPOD.apply(lambda row: assignLong(row['POINT_X'], row['POINT_Y']), axis=1)

In [9]:
# fix owner name

def assignOwner(valueFirst, valueMid, valueLast):
    #--- First Name ---
    if valueFirst == "" or pd.isnull(valueFirst):
        FirstName = ""
    else:
        FirstName = str(valueFirst).strip()
        
    #--- Midile Initial ---
    if valueMid == "" or pd.isnull(valueMid):
        MidName = ""
    else:
        MidName = str(valueMid).strip()
    
    #--- Last Name ---
    if valueLast == "" or pd.isnull(valueLast):
        LastName = ""
    else:
        LastName = str(valueLast).strip()

    if LastName == "":
        outlist = LastName + FirstName + MidName
    else:
        outlist = LastName + ", " + FirstName + " "+ MidName
        
    outlist = re.sub("[$@&.;,/\)(-]", "", outlist).title().strip()
    
    return outlist


dfinPOD['Owner'] = dfinPOD.apply(lambda row: assignOwner(row['PersonFirstNM'],
                                               row['PersonMINM'],
                                               row['PersonLastOrOrganizationNM']), axis=1)
dfinPOD['Owner'].unique()

array(['', 'Mackie Sandy', 'Ste Michelle Wine Estates Ltd Paterson', ...,
       'Larkspur Sr Llc', 'Taggares Fruit Company',
       'Western Farmland Llc'], dtype=object)

In [10]:
#Manually filling in empty class code with ‘Unspecified’ value.
def assignWaRecRCWClassTypeCode(colValue):
    if colValue == "" or pd.isnull(colValue):
        outlist = "WaDE Unspecified"
    else:
        outlist = colValue.strip()
    return outlist


dfinPOD['WaRecRCWClassTypeCode'] = dfinPOD.apply(lambda row: assignWaRecRCWClassTypeCode(row['WaRecRCWClassTypeCode']), axis=1)
dfinPOD['WaRecRCWClassTypeCode'].unique()

array(['WaDE Unspecified', 'groundwater', 'surfaceWater', 'reservoir'],
      dtype=object)

In [11]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    if colrowValueIQ == '' or pd.isnull(colrowValueIQ):
        outVal = ""
    elif colrowValueIQ <= 0 or pd.isnull(colrowValueIQ):
        outVal = 0
    else:
        MultiFactor = 1.0
        gpmcfsUnit = colrowValueUC.strip()
        if gpmcfsUnit == 'GPM':
            MultiFactor = 0.00222800926
        elif gpmcfsUnit == 'GPD':
            MultiFactor = 1.0 / 646317.0
        try:
            outVal = MultiFactor * colrowValueIQ
        except:
            outVal = colrowValueIQ
    return outVal

dfinPOD['in_AllocationFlow_CFS'] = dfinPOD.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
dfinPOD['in_AllocationFlow_CFS'].unique()

array(['', 3.3420138899999996, 4.790219908999999, ..., 13.81, 86.94, 8.62],
      dtype=object)

In [12]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WAwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WAwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WAwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Unspecified"
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOD['WaRecRCWClassTypeCode']

# Site Info
df['in_CoordinateAccuracy'] = dfinPOD['Location_C']
df['in_CoordinateMethodCV'] = "WaDE Unspecified"
df['in_County'] = "WaDE Unspecified"
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['in_Latitude']
df['in_Longitude'] = dfinPOD['in_Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = "WaDE Unspecified"
df['in_SiteNativeID'] = "POD" + dfinPOD['D_Point_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD['D_Point_Ty']
df['in_StateCV'] = "WA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = dfinPOD['WaRecProcessStatusTypeCode']
df['in_AllocationNativeID'] =  dfinPOD['WR_Doc_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOD['Owner']
df['in_AllocationPriorityDate'] = dfinPOD['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOD['WaRecPhaseTypeCode']
df['in_AllocationVolume_AF'] = dfinPOD['AnnualVolumeQuantity']
df['in_BeneficialUseCategory'] = dfinPOD['PurposeOfUseTypeCodes'].astype(str)
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['IrrigatedAreaQuantity']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://appswr.ecology.wa.gov/waterrighttrackingsystem/WaterRights/WaterRightRecord.aspx?waRecId=" + dfinPOD['WaRecId'].replace("", 0).fillna(0).astype(int).astype(str)

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

191810


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,waD0,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,WaDE Unspecified,U,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58081,-120.39878,,,POD,WaDE Unspecified,POD200801,,WL,WA,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
1,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,G,WaDE Unspecified,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,3.34201,Active,4271597,Mackie Sandy,4/27/2006 0:00,,,,ChangeROE,345.26,IR,,,,,,0,,283.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
2,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,G,WaDE Unspecified,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,3.34201,Active,4271597,Ste Michelle Wine Estates Ltd Paterson,4/27/2006 0:00,,,,ChangeROE,345.26,IR,,,,,,0,,283.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
3,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,WaDE Unspecified,G,WaDE Unspecified,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,,,0,,,,,,,,,,,,,,0,,,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
4,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,G,WaDE Unspecified,WaDE Unspecified,4326,,,,46.5837,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,4.79022,Active,2085743,Stimson Lane Limited,6/6/1990 0:00,,,,NewApp,,IR,,,,,,0,,175.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...


## POU Data

In [13]:
# Input File, contains POU info
pouInput = "WA_POU_Input.zip" 
df_1u = pd.read_csv(pouInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_1u:
    df_1u['WaDEUUID'] = "waU" + df_1u.index.astype(str)
    df_1u.to_csv('WA_POU_Input.zip', compression=dict(method='zip', archive_name='WA_POU_Input.csv'), index=False)

print(len(df_1u))
df_1u.head(1)

156369


  df_1u = pd.read_csv(pouInput)


Unnamed: 0,OID_,WR_DOC_ID,WR_Doc_POU_ID,Fill_CD,WR_Doc_NR,WR_Doc_Type_CD,Quality_CD,Misc_CD,Position_With_CD,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Comment_DS,Created_TD,Created_User_ID,Shape_Length,Shape_Area,Latitude,Longitude,WaDEUUID
0,1,2084118.0,,7.0,GWC01066-D,CE,G,RECHECKED\WWT,S,,,1/23/2009 11:18:38,"ECY\DKRO461""""",,,,32011.93492,35404849.05028,46.59121,-119.73537,waU0


In [14]:
# Merging dataframes into one, using left-join.
# df_u1 and df3
dfinPOU = pd.DataFrame()
dfinPOU = pd.merge(df_1u, df_3, left_on='WR_DOC_ID', right_on='WR_Doc_ID', how='left')

dfinPOU = dfinPOU.drop_duplicates().replace(np.nan, "").reset_index(drop=True)
print(len(dfinPOU))
dfinPOU.head(1)

177340


Unnamed: 0,OID_,WR_DOC_ID,WR_Doc_POU_ID,Fill_CD,WR_Doc_NR,WR_Doc_Type_CD,Quality_CD,Misc_CD,Position_With_CD,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Comment_DS,Created_TD,Created_User_ID,Shape_Length,Shape_Area,Latitude,Longitude,WaDEUUID,ÃÂ¯ÃÂ»ÃÂ¿OID_,OBJECTID_1,WaRecId,WaRecId_1,WR_Doc_ID,WaRecPhaseId,PartyRoleTypeCode,PersonLastOrOrganizationNM,PersonFirstNM,PersonMINM,PersonAddressLine1AD,PersonAddressLine2AD,PersonAddressLine3AD,PersonAddressCityAD,PersonAddressStateCode,PersonAddressZipCodeAD,WaRecRCWClassTypeCode,EcologyRegionCode,WaRecPrimaryNumber,PriorityDate,WaRecProcessStatusTypeCode,WaRecClaimTypeCode,WaRecPhaseTypeCode,WaRecPhaseStageTypeCode,InstantaneousQuantity,AnnualVolumeQuantity,IrrigatedAreaQuantity,InstantaneousUnitCode,PurposeOfUseTypeCodes
0,1,2084118.0,,7.0,GWC01066-D,CE,G,RECHECKED\WWT,S,,,1/23/2009 11:18:38,"ECY\DKRO461""""",,,,32011.93492,35404849.05028,46.59121,-119.73537,waU0,58522.0,84235.0,2084118.0,2084118.0,2084118.0,59585.0,Primary,USARMY Corps Engineers,,,,,,,,,groundwater,CRO,G4-*01105SWRIS,4/1/1927 0:00,Active,,Certificate,,1375.0,800.0,200.0,GPM,DG IR


In [15]:
def assignOwner(valueFirst, valueMid, valueLast):
    #--- First Name ---
    if valueFirst == "" or pd.isnull(valueFirst):
        FirstName = ""
    else:
        FirstName = str(valueFirst).strip()
        
    #--- Midile Initial ---
    if valueMid == "" or pd.isnull(valueMid):
        MidName = ""
    else:
        MidName = str(valueMid).strip()
    
    #--- Last Name ---
    if valueLast == "" or pd.isnull(valueLast):
        LastName = ""
    else:
        LastName = str(valueLast).strip()

    if LastName == "":
        outlist = LastName + FirstName + MidName
    else:
        outlist = LastName + ", " + FirstName + " "+ MidName
        
    outlist = re.sub("[$@&.;,/\)(-]", "", outlist).title().strip()
    
    return outlist


dfinPOU['Owner'] = dfinPOU.apply(lambda row: assignOwner(row['PersonFirstNM'], row['PersonMINM'], row['PersonLastOrOrganizationNM']), axis=1)
dfinPOU['Owner'].unique()

array(['Usarmy Corps Engineers', 'S Martinez Livestock Inc',
       'Trainor Russell C', ..., 'Aspect Consulting  Price',
       'Richardson Monica', 'Richardson Thomas'], dtype=object)

In [16]:
#Manually filling in empty class code with ‘unknown’ value.
def assignWaRecRCWClassTypeCode(colValue):
    if colValue == "" or pd.isnull(colValue):
        outlist = "WaDE Unspecified"
    else:
        outlist = colValue.strip()
    return outlist


dfinPOU['WaRecRCWClassTypeCode'] = dfinPOU.apply(lambda row: assignWaRecRCWClassTypeCode(row['WaRecRCWClassTypeCode']), axis=1)
dfinPOU['WaRecRCWClassTypeCode'].unique()

array(['groundwater', 'surfaceWater', 'reservoir', 'WaDE Unspecified'],
      dtype=object)

In [17]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    if colrowValueIQ == '' or pd.isnull(colrowValueIQ):
        outVal = ""
    elif colrowValueIQ <= 0 or pd.isnull(colrowValueIQ):
        outVal = 0
    else:
        MultiFactor = 1.0
        gpmcfsUnit = colrowValueUC.strip()
        if gpmcfsUnit == 'GPM':
            MultiFactor = 0.00222800926
        elif gpmcfsUnit == 'GPD':
            MultiFactor = 1.0 / 646317.0
        try:
            outVal = MultiFactor * colrowValueIQ
        except:
            outVal = colrowValueIQ
    return outVal

dfinPOU['in_AllocationFlow_CFS'] =  dfinPOU.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
dfinPOU['in_AllocationFlow_CFS'].unique()

array([3.0635127324999996, 3.787615742, 0.33420138899999996, ..., 0.124,
       0.329, 0.5715], dtype=object)

In [18]:
# Creating WaDE Custom site native ID for easy site identificaiion
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = dfinPOU['Latitude']
dfSiteNativeID['in_Longitude'] = dfinPOU['Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfSiteNativeID.loc[(dfSiteNativeID['in_Latitude'] == A) & 
                                (dfSiteNativeID['in_Longitude'] == B), 'in_SiteNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfinPOU['in_SiteNativeID'] = dfinPOU.apply(lambda row: retrieveSiteNativeID(row['Latitude'], row['Longitude']), axis=1)
dfinPOU['in_SiteNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID136197',
       'wadeID136198', 'wadeID136199'], dtype=object)

In [19]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOU['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WAwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WAwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WAwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Unspecified"
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOU['WaRecRCWClassTypeCode']

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Unspecified"
df['in_CoordinateMethodCV'] = "WaDE Unspecified"
df['in_County'] = "WaDE Unspecified"
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOU['Latitude']
df['in_Longitude'] = dfinPOU['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POU"
df['in_SiteName'] = "WaDE Unspecified"
df['in_SiteNativeID'] = "POU" + dfinPOU['in_SiteNativeID'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE Unspecified"
df['in_StateCV'] = "WA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOU['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = dfinPOU['WaRecProcessStatusTypeCode']
df['in_AllocationNativeID'] =  dfinPOU['WR_Doc_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOU['Owner']
df['in_AllocationPriorityDate'] = dfinPOU['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOU['WaRecPhaseTypeCode']
df['in_AllocationVolume_AF'] = dfinPOU['AnnualVolumeQuantity']
df['in_BeneficialUseCategory'] = dfinPOU['PurposeOfUseTypeCodes'].astype(str)
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOU['IrrigatedAreaQuantity']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://appswr.ecology.wa.gov/waterrighttrackingsystem/WaterRights/WaterRightRecord.aspx?waRecId=" + dfinPOU['WaRecId'].replace("", 0).fillna(0).astype(int).astype(str)

outPOU = df.copy()
outPOU = outPOU.drop_duplicates().reset_index(drop=True)
print(len(outPOU))
outPOU.head()

176890


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,waU0,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.59121,-119.73537,,,POU,WaDE Unspecified,POUwadeID1,,WaDE Unspecified,WA,,,,,,,,,,3.06351,Active,2084118,Usarmy Corps Engineers,4/1/1927 0:00,,,,Certificate,800.0,DG IR,,,,,,0,,200.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
1,waU1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58282,-119.76184,,,POU,WaDE Unspecified,POUwadeID2,,WaDE Unspecified,WA,,,,,,,,,,3.78762,Active,2084120,Usarmy Corps Engineers,4/1/1927 0:00,,,,Certificate,1760.0,DG IR,,,,,,0,,440.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
2,waU2,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58628,-119.77845,,,POU,WaDE Unspecified,POUwadeID3,,WaDE Unspecified,WA,,,,,,,,,,0.3342,Active,2084124,Usarmy Corps Engineers,4/1/1927 0:00,,,,Certificate,160.0,DG IR,,,,,,0,,40.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
3,waU3,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58077,-119.78103,,,POU,WaDE Unspecified,POUwadeID4,,WaDE Unspecified,WA,,,,,,,,,,1.78241,Active,2084121,Usarmy Corps Engineers,4/1/1927 0:00,,,,Certificate,320.0,DG IR,,,,,,0,,80.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...
4,waU4,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,46.57352,-119.76516,,,POU,WaDE Unspecified,POUwadeID5,,WaDE Unspecified,WA,,,,,,,,,,0.8912,Active,2084122,Usarmy Corps Engineers,4/1/1927 0:00,,,,Certificate,320.0,DG IR,,,,,,0,,80.0,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...


## Concatenate POD & POU

In [20]:
# Concatenate dataframes
frames = [outPOD, outPOU]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

368700


In [21]:
# For creating WaterSourceTypeCV
wsTypeDict = {
    "Unspecified" : "Unspecified",
    "groundwater" : "Groundwater",
    "surfaceWater" : "Surface Water",
    "reservoir" : "reservoir"}
def assignWaterSourceTypeCV(colrowValue):
    if colrowValue == "" or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        String1 = colrowValue.strip()
        try:
            outList = wsTypeDict[String1]
        except:
            outList = "WaDE Unspecified"
    return outList

outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: assignWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)

In [22]:
# For Creating CoordinateAccuracy
coordinateAccuracyDictWA = {
    "C":"field checked (without GPS)",
    "G":"field checked with GPS",
    "P":"proposed (does not exist in real world)",
    "PA":"proposed and All-right (does not exist in real world)",
    "PD":"proposed and Dubious (does not exist in real world)",
    "PM":"proposed and Multiple Dubious (does not exist in real world)",
    "PX":"proposed and Centroid Dubious (does not exist in real world)",
    "U":"unchecked",
    "UA":"unchecked and All-right",
    "UD":"unchecked and Dubious",
    "UM":"unchecked and Multiple Dubious",
    "UX":"unchecked and Centroid Dubious",
    "W":"from well log, unchecked",
    "WA":"from well log, unchecked and All-right",
    "WD":"from well log, unchecked and Dubious",
    "WX":"from well log, unchecked and Centroid Dubious"}
def assignCoordinateAccuracy(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        keyStr = colrowValue.strip()
        try:
            outList = coordinateAccuracyDictWA[keyStr]
        except:
            outList = "WaDE Unspecified"
    return outList

outdf['in_CoordinateAccuracy'] = outdf.apply(lambda row: assignCoordinateAccuracy(row['in_CoordinateAccuracy']), axis=1)

In [23]:
# For creating SiteTypeCV
UnknownSTCVDict = {
    "GC":"ground water collector",
    "HW":"headworks gravity flow (or surface water device unknown)",
    "ID":"irrigation dam",
    "MW":"monitoring well",
    "PM":"surface water pump",
    "RD":"reservoir dam",
    "WL":"well (or ground water device unknown)"}
def assignSiteTypeCV(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        String1 = colrowValue.strip()  # remove whitespace chars
        try:
            outList = UnknownSTCVDict[String1]
        except:
            outList = "WaDE Unspecified"

    return outList

outdf['SiteTypeCV'] = outdf.apply(lambda row: assignSiteTypeCV(row['in_SiteTypeCV']), axis=1)

In [49]:
# For creating BeneficialUseCategory
benUseDict = {
    "508-14":"508-14",
    "AI":"Agricultural Irrigation",
    "CI":"Commercial & indust",
    "CM":"Commercial",
    "CO":"Cooling for indust proces",
    "DC":"Dust Control",
    "DG":"Domestic general",
    "DM":"Domestic multiple",
    "DS":"Domestic single",
    "DY":"Dairy",
    "EN":"Environmental quality",
    "FP":"Frost protection",
    "FR":"Fire protection",
    "FS":"Fish propagation",
    "GP":"Groundwater Preservation",
    "HE":"Heat Exchange",
    "HP":"Heat protection for crops",
    "HW":"Highway",
    "IFlow":"Instream Flow",
    "II":"Individual Irrigation",
    "IR":"Irrigation",
    "IT":"Municipal inter-tie system",
    "IU":"Irrigation Unknown",
    "MI":"Mining",
    "MT":"Mitigation",
    "MU":"Municipal",
    "NR":"No Purpose Identified",
    "OT":"Other",
    "PO":"Power",
    "PR":"Parks and Recreation",
    "RE":"Recreation - beautification",
    "RW":"Railway",
    "SA":"Stream augmentation",
    "SR":"Storage",
    "ST":"Stock water",
    "TS":"Test Well",
    "TW-P":"Trust water, Permanent",
    "TW-T":"Trust water, Temporary",
    "WL":"Wildlife refuge"}
def assignBenUseCategory(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outList = "WaDE Unspecified"
    else:
        keyStr = colrowValue.strip()
        try:
            benUseListStr = keyStr.split()  # Need to split WA csv data
            outList = ", ".join(benUseDict[inx] for inx in benUseListStr)
        except:
            outList = "WaDE Unspecified"
    return outList

outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: assignBenUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['WaDE Unspecified', 'Irrigation', 'Dairy, Domestic multiple', ...,
       'Irrigation, Fire protection, Instream Flow',
       'Irrigation, Instream Flow, Mitigation', 'Dust Control, Other'],
      dtype=object)

In [25]:
# Fixing empty string names

def fixEmptyString(val):
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = "WaDE Unspecified"
    else:
        outString = val
    return outString

In [26]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['WaDE Unspecified'], dtype=object)

In [27]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['WaDE Unspecified', 'Groundwater', 'Surface Water', 'reservoir'],
      dtype=object)

In [28]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['WL', 'MW', 'GC', 'HW', 'PM', 'RD', 'ID', 'WaDE Unspecified'],
      dtype=object)

In [29]:
outdf['in_AllocationTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationTypeCV']), axis=1)
outdf['in_AllocationTypeCV'].unique()

array(['WaDE Unspecified', 'ChangeROE', 'NewApp', 'ChangeApplication',
       'Certificate', 'Claim', 'SupersedingCertificate',
       'TemporaryDonation', 'ClaimAmendment', 'Permit',
       'SupersedingPermit', 'ROE', 'AdjudicatedCertificate',
       'CertificateOfChange', 'QuincyBasinPermit',
       'SupersedingQuincyBasinPermit', 'TemporaryUse', 'MitigatedPermit',
       'ShortTerm', 'SupersedingAdjudicatedCert',
       'SupersedingCertificateOfChange', 'ConditionalFinalOrder',
       'DroughtAuthorization'], dtype=object)

In [30]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

array(['WaDE Unspecified', 'Active', 'Inactive', 'ChangeInProgress',
       'InTrustTemp'], dtype=object)

In [31]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['WaDE Unspecified', 'Mackie Sandy',
       'Ste Michelle Wine Estates Ltd Paterson', ...,
       'Aspect Consulting  Price', 'Richardson Monica',
       'Richardson Thomas'], dtype=object)

In [51]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: fixEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['WaDE Unspecified', 'Irrigation', 'Dairy, Domestic multiple', ...,
       'Irrigation, Fire protection, Instream Flow',
       'Irrigation, Instream Flow, Mitigation', 'Dust Control, Other'],
      dtype=object)

In [33]:
# in_Latitude & in_Longitude
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna(0)
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna(0)
outdf.head(1)

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,SiteTypeCV,BeneficialUseCategory
0,waD0,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,,WaDE Unspecified,unchecked,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58081,-120.39878,,,POD,WaDE Unspecified,POD200801,,WL,WA,,,,,,,,,,,WaDE Unspecified,0,WaDE Unspecified,,,,,WaDE Unspecified,,WaDE Unspecified,,,,,,0,,,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,well (or ground water device unknown),WaDE Unspecified


In [35]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array([                          'NaT', '2006-04-27T00:00:00.000000000',
       '1990-06-06T00:00:00.000000000', ...,
       '2020-06-15T00:00:00.000000000', '2018-11-20T00:00:00.000000000',
       '2020-06-05T00:00:00.000000000'], dtype='datetime64[ns]')

In [36]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.        , 3.34201389, 4.79021991, ..., 0.124     , 0.329     ,
       0.5715    ])

In [37]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([  0.  , 345.26, 420.9 , ...,  16.09, 127.35,  15.88])

In [38]:
# Fixing in_IrrigatedAcreage datatype
outdf['in_IrrigatedAcreage'] = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce').fillna(0)
outdf['in_IrrigatedAcreage'].unique()

array([   0. ,  283. ,  175. , ...,  860. , 1094.6,   26.3])

In [39]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', 'wadeID4'], dtype=object)

## Shapefile Data
- For attaching gemetry to csv inputs.

In [54]:
# PoU Shapefile Data
# Shapefile input
dfPoUshapetemp = gpd.read_file('shapefile/WA_PoU2.shp')
dfPoUshapetemp.head(3)

Unnamed: 0,WR_DOC_ID,WR_Doc_POU,Fill_CD,WR_Doc_NR,WR_Doc_Typ,Quality_CD,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,Shape_Leng,Shape_Area,Latitude,Longitude,geometry
0,2084118,0,7,GWC01066-D,CE,G,RECHECKED\WWT,S,,,2009-01-23,"""ECY\DKRO461""",,,,32011.93492,35404849.0503,46.59121,-119.73537,"POLYGON ((-119.74933 46.58447, -119.74899 46.5..."
1,2084120,0,14,GWC01067-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,21354.7979,19626630.8507,46.58282,-119.76184,"POLYGON ((-119.74933 46.58447, -119.74966 46.5..."
2,2084124,0,39,GWC01070-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,5267.28459,1733864.72901,46.58628,-119.77845,"POLYGON ((-119.77580 46.58451, -119.78105 46.5..."


In [55]:
# Creating WaDE Custom site native ID for easy site identificaiion
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = dfPoUshapetemp['Latitude']
dfSiteNativeID['in_Longitude'] = dfPoUshapetemp['Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfSiteNativeID.loc[(dfSiteNativeID['in_Latitude'] == A) & 
                                (dfSiteNativeID['in_Longitude'] == B), 'in_SiteNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfPoUshapetemp['in_SiteNativeID'] = dfPoUshapetemp.apply(lambda row: retrieveSiteNativeID( row['Latitude'], row['Longitude']), axis=1)
dfPoUshapetemp.head(2)

Unnamed: 0,WR_DOC_ID,WR_Doc_POU,Fill_CD,WR_Doc_NR,WR_Doc_Typ,Quality_CD,Misc_CD,Position_W,Active_DT,Inactive_D,Update_TD,Update_Use,Comment_DS,Created_TD,Created_Us,Shape_Leng,Shape_Area,Latitude,Longitude,geometry,in_SiteNativeID
0,2084118,0,7,GWC01066-D,CE,G,RECHECKED\WWT,S,,,2009-01-23,"""ECY\DKRO461""",,,,32011.93492,35404849.0503,46.59121,-119.73537,"POLYGON ((-119.74933 46.58447, -119.74899 46.5...",wadeID1
1,2084120,0,14,GWC01067-D,CE,G,,S,,,2011-05-27,"""ECY\ATRO461""",,,,21354.7979,19626630.8507,46.58282,-119.76184,"POLYGON ((-119.74933 46.58447, -119.74966 46.5...",wadeID2


In [56]:
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['in_SiteNativeID'].replace("", 0).fillna(0).astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

Unnamed: 0,in_SiteNativeID,geometry
0,POUwadeID1,"POLYGON ((-119.74933 46.58447, -119.74899 46.5..."
1,POUwadeID2,"POLYGON ((-119.74933 46.58447, -119.74966 46.5..."
2,POUwadeID3,"POLYGON ((-119.77580 46.58451, -119.78105 46.5..."


## Export Outputs

In [43]:
outdf.info

<bound method DataFrame.info of          WaDEUUID in_MethodUUID in_VariableSpecificUUID in_OrganizationUUID  \
0            waD0       WAwr_M1                 WAwr_V1             WAwr_O1   
1            waD1       WAwr_M1                 WAwr_V1             WAwr_O1   
2            waD1       WAwr_M1                 WAwr_V1             WAwr_O1   
3            waD1       WAwr_M1                 WAwr_V1             WAwr_O1   
4            waD1       WAwr_M1                 WAwr_V1             WAwr_O1   
...           ...           ...                     ...                 ...   
368695  waU156366       WAwr_M1                 WAwr_V1             WAwr_O1   
368696  waU156367       WAwr_M1                 WAwr_V1             WAwr_O1   
368697  waU156367       WAwr_M1                 WAwr_V1             WAwr_O1   
368698  waU156367       WAwr_M1                 WAwr_V1             WAwr_O1   
368699  waU156368       WAwr_M1                 WAwr_V1             WAwr_O1   

       in_Geometry 

In [53]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,SiteTypeCV,BeneficialUseCategory
0,waD0,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID1,WaDE Unspecified,unchecked,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58081,-120.39878,,,POD,WaDE Unspecified,POD200801,,WL,WA,,,,,,,,,,0.00000,WaDE Unspecified,0,WaDE Unspecified,NaT,,,,WaDE Unspecified,0.00000,WaDE Unspecified,,,,,,0,,0.00000,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,well (or ground water device unknown),WaDE Unspecified
1,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,field checked with GPS,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58370,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,3.34201,Active,4271597,Mackie Sandy,2006-04-27,,,,ChangeROE,345.26000,Irrigation,,,,,,0,,283.00000,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,monitoring well,Irrigation
2,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,field checked with GPS,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58370,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,3.34201,Active,4271597,Ste Michelle Wine Estates Ltd Paterson,2006-04-27,,,,ChangeROE,345.26000,Irrigation,,,,,,0,,283.00000,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,monitoring well,Irrigation
3,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID1,WaDE Unspecified,field checked with GPS,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58370,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,0.00000,WaDE Unspecified,0,WaDE Unspecified,NaT,,,,WaDE Unspecified,0.00000,WaDE Unspecified,,,,,,0,,0.00000,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,monitoring well,WaDE Unspecified
4,waD1,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,field checked with GPS,WaDE Unspecified,WaDE Unspecified,4326,,,,46.58370,-119.79874,,,POD,WaDE Unspecified,POD200889,,MW,WA,,,,,,,,,,4.79022,Active,2085743,Stimson Lane Limited,1990-06-06,,,,NewApp,0.00000,Irrigation,,,,,,0,,175.00000,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,monitoring well,Irrigation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368695,waU156366,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,47.16677,-121.05319,,,POU,WaDE Unspecified,POUwadeID136197,,WaDE Unspecified,WA,,,,,,,,,,0.00000,Active,6803048,Aspect Consulting Price,2020-05-22,,,,NewApp,0.41400,"Domestic single, Irrigation",,,,,,0,,0.01100,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,WaDE Unspecified,"Domestic single, Irrigation"
368696,waU156367,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,47.16921,-121.08017,,,POU,WaDE Unspecified,POUwadeID136198,,WaDE Unspecified,WA,,,,,,,,,,0.00000,Active,6803092,Aspect Consulting Price,2020-06-05,,,,NewApp,0.41400,"Domestic single, Irrigation",,,,,,0,,0.01100,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,WaDE Unspecified,"Domestic single, Irrigation"
368697,waU156367,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,47.16921,-121.08017,,,POU,WaDE Unspecified,POUwadeID136198,,WaDE Unspecified,WA,,,,,,,,,,0.00000,Active,6803092,Richardson Monica,2020-06-05,,,,NewApp,0.41400,"Domestic single, Irrigation",,,,,,0,,0.01100,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,WaDE Unspecified,"Domestic single, Irrigation"
368698,waU156367,WAwr_M1,WAwr_V1,WAwr_O1,,,,WaDE Unspecified,wadeID2,Groundwater,WaDE Unspecified,WaDE Unspecified,WaDE Unspecified,4326,,,,47.16921,-121.08017,,,POU,WaDE Unspecified,POUwadeID136198,,WaDE Unspecified,WA,,,,,,,,,,0.00000,Active,6803092,Richardson Thomas,2020-06-05,,,,NewApp,0.41400,"Domestic single, Irrigation",,,,,,0,,0.01100,,,,,,,,https://appswr.ecology.wa.gov/waterrighttracki...,WaDE Unspecified,"Domestic single, Irrigation"


In [57]:
# Export the output dataframe
outdf.to_csv('Pwr_waMain.zip', index=False, compression="zip")  # The output, save as a zip
dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.