# Preprocessing Washington Allocation data for WaDE upload.
- Purpose:  To preprocess the Washington data into one master file for simple DataFrame creation and extraction

Useful Links to Data:
- The Data - Geographic Water Information System (GWIS) data from the State of Washington: https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/
- Data dictionary - https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/GWIS_Data_Dictionary/
- Public website   - https://ecology.wa.gov/Water-Shorelines/Water-supply/Water-rights

In [None]:
import os
import sys
# Report the active conda environment and the interpreter version.
# os.environ.get avoids a KeyError when the notebook runs outside of conda.
print(os.environ.get('CONDA_DEFAULT_ENV', 'CONDA_DEFAULT_ENV not set'))
print(sys.version)

In [None]:
#Needed Libraries

# working with data
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
# Allow up to 999 columns to be shown when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
# Render floats with five decimal places rather than scientific notation.
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [None]:
# Working Directory
# The process changes into this folder, so every relative path used afterwards
# (inputs and outputs) resolves against it.
workingDir = "G:/Shared drives/WaDE Data/Washington/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data

In [None]:
# Input File, contains PoD info
d_pointFile = "shapefiles/D_Point.zip"
#df_1 = gpd.read_file(d_pointFile, encoding = "ISO-8859-1").replace(np.nan, "")
df_1 = gpd.read_file(d_pointFile)

# WaDE UUID tracker for data assessment
# When the input has no WaDEUUID column, one is built from the row index
# ("waD0", "waD1", ...) and the table is saved as a zipped CSV (D_Point.zip,
# relative to the working directory).
if 'WaDEUUID' not in df_1:
    df_1['WaDEUUID'] = "waD" + df_1.index.astype(str)
    df_1.to_csv('D_Point.zip', compression=dict(method='zip', archive_name='D_Point.csv'), index=False)

# Cast the join key to int; it is merged against the bridge table's D_Point_ID below.
# NOTE(review): astype(int) raises on missing values - confirm D_Point_ID is never blank in the shapefile.
df_1['D_Point_ID'] = df_1['D_Point_ID'].astype(int)
print(len(df_1))
df_1.head()

In [None]:
# Input File, Bridge table
# Read as ISO-8859-1; missing values are replaced with empty strings.
D_Point_WR_DocFile = "D_Point_WR_Doc.zip"
df_2 = pd.read_csv(D_Point_WR_DocFile, encoding = "ISO-8859-1").replace(np.nan, "")

# Normalize the join key: blank / missing values become 0, then cast float -> int.
df_2['D_Point_ID'] = df_2['D_Point_ID'].replace("", 0).fillna(0).astype(float).astype(int)
print(len(df_2))
df_2.head()

In [None]:
# Input File, Contains water use and owner info
# Read as ISO-8859-1; missing values are replaced with empty strings.
Person_Plus_EXTRACT_FromWRTSnotGWISFile = "Person_Plus_EXTRACT_FromWRTSnotGWIS.zip"
df_3 = pd.read_csv(Person_Plus_EXTRACT_FromWRTSnotGWISFile, encoding = "ISO-8859-1").replace(np.nan, "")
print(len(df_3))
df_3.head()

In [None]:
# Merging dataframes into one, using left-join.
dfinPOD = pd.DataFrame()
# Step 1: attach the bridge-table rows to each PoD record through D_Point_ID.
dfinPOD = pd.merge(df_1, df_2, left_on='D_Point_ID', right_on='D_Point_ID', how='left') # Joining PoD data
# Step 2: attach water use / owner info through WR_Doc_ID. Both sides are normalized on the fly
# (blank / missing -> 0, cast to int, then to text) so the keys compare as strings.
# Both inputs carry a WR_Doc_ID column, which is why later cells refer to WR_Doc_ID_x.
# NOTE(review): this uses astype(int) directly, while the PoU merge casts through float first - confirm WR_Doc_ID never holds text like "123.0".
dfinPOD = pd.merge(dfinPOD, df_3, left_on=dfinPOD.WR_Doc_ID.replace("", 0).fillna(0).astype(int).astype(str).str.strip(), right_on=df_3.WR_Doc_ID.replace("", 0).fillna(0).astype(int).astype(str).str.strip(), how='left') # Joining PoD data

# Drop exact duplicate rows, blank out remaining missing values and the literal "nan,nan", and renumber the index.
dfinPOD = dfinPOD.drop_duplicates().replace(np.nan, "").replace("nan,nan", "").reset_index(drop=True)
print(len(dfinPOD))
dfinPOD.head(1)

In [None]:
# fix owner name

def assignOwner(valueFirst, valueMid, valueLast):
    """Build one cleaned owner name from first, middle and last/organization parts.

    Blank or null parts are skipped. The remaining parts are joined with single
    spaces in the order last, first, middle; the characters $ @ & . ; , / ) ( -
    are removed; the text is title-cased; and runs of whitespace are collapsed
    to a single space.

    Args:
        valueFirst: first name (string, blank, or null).
        valueMid: middle name or initial (string, blank, or null).
        valueLast: last name or organization name (string, blank, or null).

    Returns:
        The cleaned owner name, or "" when every part is blank.
    """
    def cleanPart(value):
        # Test for null first so missing values never reach the string comparison.
        if pd.isnull(value) or value == "":
            return ""
        return str(value).strip()

    nameParts = [cleanPart(valueLast), cleanPart(valueFirst), cleanPart(valueMid)]
    # Joining only the non-empty parts keeps first and middle names separated
    # even when there is no last name.
    ownerName = " ".join(part for part in nameParts if part)

    # Raw string so the pattern contains no invalid escape sequences.
    ownerName = re.sub(r"[$@&.;,/)(-]", "", ownerName).title()

    # split() + join collapses every run of whitespace, including gaps left by removed punctuation.
    return " ".join(ownerName.split())


# Build the Owner column row by row from the first, middle and last-or-organization name fields.
dfinPOD['Owner'] = dfinPOD.apply(lambda row: assignOwner(row['PersonFirstNM'],
                                               row['PersonMINM'],
                                               row['PersonLastOrOrganizationNM']), axis=1)
# Displayed for visual inspection of the distinct owner names.
dfinPOD['Owner'].unique()

In [None]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    """Convert an instantaneous flow quantity to cubic feet per second (CFS).

    Args:
        colrowValueIQ: flow quantity (number, numeric text, blank, or null).
        colrowValueUC: unit code. 'GPM' and 'GPD' are converted (case and
            surrounding whitespace ignored); any other value, including blank
            or null, leaves the quantity unscaled.

    Returns:
        "" when the quantity is blank, null or not numeric; 0 when it is less
        than or equal to zero; otherwise the quantity as a float in CFS.
    """
    # Multipliers that turn one unit of the given code into CFS.
    cfsFactors = {
        'GPM': 0.00222800926,
        'GPD': 1.0 / 646317.0,
    }

    # Null is tested first so missing values never reach the comparisons below.
    if pd.isnull(colrowValueIQ) or colrowValueIQ == '':
        return ""

    try:
        quantity = float(colrowValueIQ)
    except (TypeError, ValueError):
        # Not a number: treat it like a blank rather than failing the whole apply().
        return ""

    if quantity <= 0:
        return 0

    # A missing unit code must not crash on .strip(); it falls back to the unscaled factor.
    unitCode = "" if pd.isnull(colrowValueUC) else str(colrowValueUC).strip().upper()
    return cfsFactors.get(unitCode, 1.0) * quantity

# Compute the flow column row by row from the instantaneous quantity and its unit code.
dfinPOD['in_AllocationFlow_CFS'] = dfinPOD.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
# Displayed for visual inspection of the distinct flow values.
dfinPOD['in_AllocationFlow_CFS'].unique()

In [None]:
# Keep only PoD rows flagged as associated (Assoc_FL == 'Y') that also
# carry a non-blank water right document id (WR_Doc_ID_x).
podIsAssociated = dfinPOD['Assoc_FL'] == 'Y'
podHasDocID = dfinPOD['WR_Doc_ID_x'] != ""
dfinPOD = dfinPOD.loc[podIsAssociated & podHasDocID].reset_index(drop=True)
print(len(dfinPOD))
dfinPOD.head(1)

In [None]:
# create output POD dataframe
# Columns are added one at a time; the order of the assignments below is the
# column order of the resulting table. Columns assigned "" are placeholders
# with no value taken from the PoD inputs.
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first, so the new frame takes its row index from dfinPOD.
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WAwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WAwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WAwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOD['WaRecRCWClassTypeCode'].str.title().str.strip()

# Site Info
df['in_CoordinateAccuracy'] = dfinPOD['Location_C']
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
# in_Geometry was already created in the WaterSource section; this re-assigns the same blank value.
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['Latitude']
df['in_Longitude'] = dfinPOD['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
# Site id = "POD" + integer D_Point_ID (blank / missing ids become 0).
df['in_SiteNativeID'] = "POD" + dfinPOD['D_Point_ID'].replace("", 0).fillna(0).astype(float).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD['D_Point_Ty']
df['in_StateCV'] = "WA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = dfinPOD['WaRecProcessStatusTypeCode']
# Allocation id = "wa" + integer WR_Doc_ID_x (blank / missing ids become 0).
df['in_AllocationNativeID'] =  "wa" + dfinPOD['WR_Doc_ID_x'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOD['Owner']
df['in_AllocationPriorityDate'] = dfinPOD['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOD['WaRecPhaseTypeCode']
df['in_AllocationVolume_AF'] = dfinPOD['AnnualVolumeQuantity']
df['in_BeneficialUseCategory'] = dfinPOD['PurposeOfUseTypeCodes'].astype(str)
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['IrrigatedAreaQuantity']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# Record URL = fixed prefix + integer WaRecId (blank / missing ids become 0).
df['in_WaterAllocationNativeURL'] = "https://appswr.ecology.wa.gov/waterrighttrackingsystem/WaterRights/WaterRightRecord.aspx?waRecId=" + dfinPOD['WaRecId'].replace("", 0).fillna(0).astype(int).astype(str)

# Keep an independent, de-duplicated copy as the PoD output; df is reused for the PoU table later.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

## POU Data

In [None]:
# Input File - PoU Shapefile Data
# export dataframe as zipped csv
pouInput = 'shapefiles/WR_Doc_POU1.zip'
df_1u = gpd.read_file(pouInput)
#df_1u = df_1u.drop(['Field', 'geometry'], axis=1) # want to create a csv without the geometry in it, for visual inspection reasons.

# WaDE UUID tracker for data assessment
# When the input has no WaDEUUID column, one is built from the row index
# ("waU0", "waU1", ...) and the table is saved as a zipped CSV (smWR_Doc_POU1.zip).
# The drop() above is commented out, so the saved CSV still includes the geometry column.
if 'WaDEUUID' not in df_1u:
    df_1u['WaDEUUID'] = "waU" + df_1u.index.astype(str)
    df_1u.to_csv('smWR_Doc_POU1.zip', compression=dict(method='zip', archive_name='smWR_Doc_POU1.csv'), index=False)

# Remove exact duplicate rows and renumber the index.
df_1u = df_1u.drop_duplicates().reset_index(drop=True)
print(len(df_1u))
df_1u.head(1)

In [None]:
# Create WaDE Specific SiteNativeID
# temp fix for lack of site id value
# The id is "wadeID" + the current row index, so it depends on the row order of df_1u at this point.

df_1u['in_SiteNativeID'] = "wadeID" + df_1u.index.astype(str)
df_1u.head()

In [None]:
# Merging dataframes into one, using left-join.
# df_1u and df_3
dfinPOU = pd.DataFrame()
# Join PoU polygons to water use / owner info. Both keys are normalized on the fly
# (blank / missing -> 0, cast float -> int -> text) so they compare as strings.
# The shapefile column is WR_DOC_ID while the owner table column is WR_Doc_ID.
dfinPOU = pd.merge(df_1u, df_3, left_on=df_1u.WR_DOC_ID.replace("", 0).fillna(0).astype(float).astype(int).astype(str), 
                   right_on=df_3.WR_Doc_ID.replace("", 0).fillna(0).astype(float).astype(int).astype(str), how='left')

# Remove exact duplicate rows and renumber the index. Missing values from unmatched rows are left as-is here.
dfinPOU = dfinPOU.drop_duplicates().reset_index(drop=True)
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
def assignOwner(valueFirst, valueMid, valueLast):
    """Build one cleaned owner name from first, middle and last/organization parts.

    Blank or null parts are skipped. The remaining parts are joined with single
    spaces in the order last, first, middle; the characters $ @ & . ; , / ) ( -
    are removed; the text is title-cased; and runs of whitespace are collapsed
    to a single space.

    Args:
        valueFirst: first name (string, blank, or null).
        valueMid: middle name or initial (string, blank, or null).
        valueLast: last name or organization name (string, blank, or null).

    Returns:
        The cleaned owner name, or "" when every part is blank.
    """
    def cleanPart(value):
        # Test for null first so missing values never reach the string comparison.
        if pd.isnull(value) or value == "":
            return ""
        return str(value).strip()

    nameParts = [cleanPart(valueLast), cleanPart(valueFirst), cleanPart(valueMid)]
    # Joining only the non-empty parts keeps first and middle names separated
    # even when there is no last name.
    ownerName = " ".join(part for part in nameParts if part)

    # Raw string so the pattern contains no invalid escape sequences.
    ownerName = re.sub(r"[$@&.;,/)(-]", "", ownerName).title()

    # split() + join collapses every run of whitespace, including gaps left by removed punctuation.
    return " ".join(ownerName.split())


# Build the Owner column row by row from the first, middle and last-or-organization name fields.
dfinPOU['Owner'] = dfinPOU.apply(lambda row: assignOwner(row['PersonFirstNM'], row['PersonMINM'], row['PersonLastOrOrganizationNM']), axis=1)
# Displayed for visual inspection of the distinct owner names.
dfinPOU['Owner'].unique()

In [None]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    """Convert an instantaneous flow quantity to cubic feet per second (CFS).

    Args:
        colrowValueIQ: flow quantity (number, numeric text, blank, or null).
        colrowValueUC: unit code. 'GPM' and 'GPD' are converted (case and
            surrounding whitespace ignored); any other value, including blank
            or null, leaves the quantity unscaled.

    Returns:
        "" when the quantity is blank, null or not numeric; 0 when it is less
        than or equal to zero; otherwise the quantity as a float in CFS.
    """
    # Multipliers that turn one unit of the given code into CFS.
    cfsFactors = {
        'GPM': 0.00222800926,
        'GPD': 1.0 / 646317.0,
    }

    # Null is tested first so missing values never reach the comparisons below.
    if pd.isnull(colrowValueIQ) or colrowValueIQ == '':
        return ""

    try:
        quantity = float(colrowValueIQ)
    except (TypeError, ValueError):
        # Not a number: treat it like a blank rather than failing the whole apply().
        return ""

    if quantity <= 0:
        return 0

    # A missing unit code must not crash on .strip(); it falls back to the unscaled factor.
    unitCode = "" if pd.isnull(colrowValueUC) else str(colrowValueUC).strip().upper()
    return cfsFactors.get(unitCode, 1.0) * quantity

# Compute the flow column row by row from the instantaneous quantity and its unit code.
dfinPOU['in_AllocationFlow_CFS'] =  dfinPOU.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)
# Displayed for visual inspection of the distinct flow values.
dfinPOU['in_AllocationFlow_CFS'].unique()

In [None]:
# Inspection only: row count and first row of the merged PoU table before filtering.
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
# Keep only polygons whose WR_Doc_NR is not blank and whose WR_DOC_ID is not
# 99999 (99999 is the source's 'blank' value for that field).
pouHasDocNR = dfinPOU['WR_Doc_NR'] != ""
pouHasDocID = dfinPOU['WR_DOC_ID'] != 99999
dfinPOU = dfinPOU.loc[pouHasDocNR & pouHasDocID].reset_index(drop=True)
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
# create output POU dataframe
# Columns are added one at a time; the order of the assignments below is the
# column order of the resulting table. Columns assigned "" are placeholders
# with no value taken from the PoU inputs.
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first, so the new frame takes its row index from dfinPOU.
df['WaDEUUID'] = dfinPOU['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "WAwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "WAwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "WAwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfinPOU['WaRecRCWClassTypeCode'].str.title().str.strip()

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
# in_Geometry was already created in the WaterSource section; this re-assigns the same blank value.
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOU['Latitude']
df['in_Longitude'] = dfinPOU['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POU"
df['in_SiteName'] = ""
# Uses the "wadeID..." site id generated earlier for each polygon.
df['in_SiteNativeID'] = dfinPOU['in_SiteNativeID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "WA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOU['in_AllocationFlow_CFS']
df['in_AllocationLegalStatusCV'] = dfinPOU['WaRecProcessStatusTypeCode']
# Allocation id = "wa" + integer WR_Doc_ID (blank / missing ids become 0).
df['in_AllocationNativeID'] = "wa" + dfinPOU['WR_Doc_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfinPOU['Owner']
df['in_AllocationPriorityDate'] = dfinPOU['PriorityDate']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfinPOU['WaRecPhaseTypeCode']
df['in_AllocationVolume_AF'] = dfinPOU['AnnualVolumeQuantity']
df['in_BeneficialUseCategory'] = dfinPOU['PurposeOfUseTypeCodes'].astype(str)
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOU['IrrigatedAreaQuantity']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# Record URL = fixed prefix + integer WaRecId (blank / missing ids become 0).
df['in_WaterAllocationNativeURL'] = "https://appswr.ecology.wa.gov/waterrighttrackingsystem/WaterRights/WaterRightRecord.aspx?waRecId=" + dfinPOU['WaRecId'].replace("", 0).fillna(0).astype(int).astype(str)

# Keep an independent, de-duplicated copy as the PoU output.
outPOU = df.copy()
outPOU = outPOU.drop_duplicates().reset_index(drop=True)
print(len(outPOU))
outPOU.head()

## Concatenate POD & POU

In [None]:
# Concatenate dataframes
# Stack the PoD rows on top of the PoU rows (both were built with the same column names),
# then drop exact duplicates, renumber the index and blank out any remaining missing values.
frames = [outPOD, outPOU]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

In [None]:
# For creating WaterSourceTypeCV
# Lookup from the source's water source type text to the output label.
wsTypeDict = {
    "Unspecified": "Unspecified",
    "Groundwater": "Groundwater",
    "Surfacewater": "Surface Water",
    "Reservoir": "Reservoir",
}


def assignWaterSourceTypeCV(colrowValue):
    """Translate a water source type value through wsTypeDict.

    Blank or null input yields "WaDE Unspecified". Otherwise the value is
    stripped of surrounding whitespace and looked up; a value with no entry
    in wsTypeDict is returned stripped but otherwise unchanged.
    """
    if colrowValue == "" or pd.isnull(colrowValue):
        return "WaDE Unspecified"
    cleanedValue = colrowValue.strip()
    return wsTypeDict.get(cleanedValue, cleanedValue)

# Translate every row's water source type in place, then show the distinct results for inspection.
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: assignWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# For Creating CoordinateAccuracy
# Lookup from the source's coordinate accuracy code to its description.
coordinateAccuracyDictWA = {
    "C": "field checked (without GPS)",
    "G": "field checked with GPS",
    "P": "proposed (does not exist in real world)",
    "PA": "proposed and All-right (does not exist in real world)",
    "PD": "proposed and Dubious (does not exist in real world)",
    "PM": "proposed and Multiple Dubious (does not exist in real world)",
    "PX": "proposed and Centroid Dubious (does not exist in real world)",
    "U": "unchecked",
    "UA": "unchecked and All-right",
    "UD": "unchecked and Dubious",
    "UM": "unchecked and Multiple Dubious",
    "UX": "unchecked and Centroid Dubious",
    "W": "from well log unchecked",
    "WA": "from well log unchecked and All-right",
    "WD": "from well log unchecked and Dubious",
    "WX": "from well log unchecked and Centroid Dubious",
}


def assignCoordinateAccuracy(colrowValue):
    """Translate a coordinate accuracy code through coordinateAccuracyDictWA.

    Blank or null input yields "". Otherwise the code is stripped of
    surrounding whitespace and looked up; a code with no entry in the
    dictionary is returned stripped but otherwise unchanged.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ""
    accuracyCode = colrowValue.strip()
    return coordinateAccuracyDictWA.get(accuracyCode, accuracyCode)

# Translate every row's coordinate accuracy code in place, then show the distinct results for inspection.
outdf['in_CoordinateAccuracy'] = outdf.apply(lambda row: assignCoordinateAccuracy(row['in_CoordinateAccuracy']), axis=1)
outdf['in_CoordinateAccuracy'].unique()

In [None]:
# For creating SiteTypeCV
# Lookup from the source's site type code to its description.
UnknownSTCVDict = {
    "GC": "Ground Water Collector",
    "HW": "Headworks Gravity Flow (or surface water device unknown)",
    "ID": "Irrigation Dam",
    "MW": "Monitoring Well",
    "PM": "Surface Water Pump",
    "RD": "Reservoir Dam",
    "WL": "Well (or ground water device unknown)",
}


def assignSiteTypeCV(colrowValue):
    """Translate a site type code through UnknownSTCVDict.

    Blank or null input yields "". Otherwise the code is stripped of
    surrounding whitespace and looked up; a code with no entry in the
    dictionary is returned stripped but otherwise unchanged.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ""
    siteTypeCode = colrowValue.strip()
    return UnknownSTCVDict.get(siteTypeCode, siteTypeCode)

# Translate every row's site type code in place, then show the distinct results for inspection.
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: assignSiteTypeCV(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
# For creating BeneficialUseCategory
# Lookup from the source's purpose-of-use code to its description.
benUseDict = {
    "508-14":"508-14",
    "AI":"Agricultural Irrigation",
    "CI":"Commercial & indust",
    "CM":"Commercial",
    "CO":"Cooling for indust proces",
    "DC":"Dust Control",
    "DG":"Domestic general",
    "DM":"Domestic multiple",
    "DS":"Domestic single",
    "DY":"Dairy",
    "EN":"Environmental quality",
    "FP":"Frost protection",
    "FR":"Fire protection",
    "FS":"Fish propagation",
    "GP":"Groundwater Preservation",
    "HE":"Heat Exchange",
    "HP":"Heat protection for crops",
    "HW":"Highway",
    "IFlow":"Instream Flow",
    "II":"Individual Irrigation",
    "IR":"Irrigation",
    "IT":"Municipal inter-tie system",
    "IU":"Irrigation Unknown",
    "MI":"Mining",
    "MT":"Mitigation",
    "MU":"Municipal",
    "NR":"No Purpose Identified",
    "OT":"Other",
    "PO":"Power",
    "PR":"Parks and Recreation",
    "RE":"Recreation - beautification",
    "RW":"Railway",
    "SA":"Stream augmentation",
    "SR":"Storage",
    "ST":"Stock water",
    "TS":"Test Well",
    "TW-P":"Trust water Permanent",
    "TW-T":"Trust water Temporary",
    "WL":"Wildlife refuge"}
def assignBenUseCategory(colrowValue):
    """Translate a list of purpose-of-use codes into a comma separated description list.

    The input is split on whitespace and/or commas. Each code found in
    benUseDict is replaced by its description; a code with no entry is kept
    as-is, so one unknown code no longer blanks the whole row. The literal
    text "nan" (left behind when a missing value is cast to text) is ignored.

    Args:
        colrowValue: code list as text, blank, or null.

    Returns:
        The descriptions joined by ", ", or "" when there is nothing to translate.
    """
    # Null is tested first so missing values never reach the string comparison.
    if pd.isnull(colrowValue) or colrowValue == '':
        return ""

    codeList = [code for code in re.split(r"[,\s]+", str(colrowValue).strip()) if code]
    return ", ".join(benUseDict.get(code, code) for code in codeList if code.lower() != "nan")

# Translate every row's purpose-of-use codes in place, then show the distinct results for inspection.
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: assignBenUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# Ensure Empty String

def ensureEmptyString(val):
    """Return val as stripped text, mapping every kind of missing value to "".

    Args:
        val: any scalar cell value.

    Returns:
        "" when val is null (None, NaN, NaT, ...), blank, whitespace only, or
        the exact text "nan"; otherwise str(val) without surrounding whitespace.
    """
    # Null must be tested on the raw value: once converted with str() a
    # missing value is ordinary text such as "None" and can no longer be detected.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""

    text = str(val).strip()
    # Only the exact lowercase "nan" is treated as missing, so real values
    # such as the name "Nan" are preserved.
    if text == "" or text == "nan":
        return ""
    return text

In [None]:
# Normalize in_WaterSourceName to stripped text with blanks as "", then show the distinct values.
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalize in_WaterSourceTypeCV to stripped text with blanks as "", then show the distinct values.
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalize in_SiteTypeCV to stripped text with blanks as "", then show the distinct values.
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Normalize in_AllocationTypeCV to stripped text with blanks as "", then show the distinct values.
outdf['in_AllocationTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationTypeCV']), axis=1)
outdf['in_AllocationTypeCV'].unique()

In [None]:
# Normalize in_AllocationLegalStatusCV to stripped text with blanks as "", then show the distinct values.
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
# Normalize in_AllocationOwner to stripped text with blanks as "", then show the distinct values.
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Normalize in_BeneficialUseCategory to stripped text with blanks as "", then show the distinct values.
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude
# Convert to numbers; values that cannot be parsed become NaN and are then written as "".
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna("")
outdf['in_Latitude'].unique()

In [None]:
# in_Longitude
# Convert to numbers; values that cannot be parsed become NaN and are then written as "".
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna("")
outdf['in_Longitude'].unique()

In [None]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
# First pass: parse to datetime; values that cannot be parsed become NaT.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
# Second pass: format as mm/dd/YYYY text (no time of day) and parse that text again,
# so the column ends up datetime-typed rather than text.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Convert to numbers (unparseable -> NaN), then write both zeros and NaN as "".
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Convert to numbers (unparseable -> NaN), then write both zeros and NaN as "".
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Fixing in_IrrigatedAcreage datatype
# Convert to numbers (unparseable -> NaN), then write both zeros and NaN as "".
outdf['in_IrrigatedAcreage'] = pd.to_numeric(outdf['in_IrrigatedAcreage'], errors='coerce').replace(0,"").fillna("")
outdf['in_IrrigatedAcreage'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source id: the text "wadeID" followed by str(colrowValue)."""
    return f"wadeID{colrowValue!s}"

# One row per distinct (water source name, water source type) pair found in outdf.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct pairs 1..N on a helper frame that shares the same index,
# so the generated ids line up with the right rows when assigned back.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
# linkKey = name text immediately followed by type text; used as the lookup key for the id.
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Dictionary from linkKey (name text + type text) to the generated native id.
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()


def retrieveWaterSourceNativeID(A, B):
    """Look up the native id for a (water source name, water source type) pair.

    Returns '' when both inputs are blank or both are null, and also when the
    combined key (stripped text of A followed by stripped text of B) has no
    entry in WaterSourceNativeIDdict.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    return WaterSourceNativeIDdict.get(linkKey, '')

# Attach the generated native id to every row by its (name, type) pair, then show the distinct ids.
outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For WA, we don't want water rights that are considered: Inactive

In [None]:
# Remove rows whose in_AllocationLegalStatusCV is one of the statuses listed below.

# statuses to remove
dropLegalStatusList = ["Inactive"]

# keep every row whose status is NOT in the list, then renumber the index
hasDroppedStatus = outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)
outdf = outdf[~hasDroppedStatus].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# (Disabled alternative: re-read the saved shapefile copy and repeat the PoU filters.)
# dfPoUshapetemp = gpd.read_file('shapefiles/smWR_Doc_POU1.zip')

# # keep only Polygons where WR_Doc_NR != '' (drops records whose id field is blank)
# # keep only Polygons where WR_DOC_ID != 99999 (99999 is their 'blank' value field)
# dfPoUshapetemp = dfPoUshapetemp[dfPoUshapetemp['WR_Doc_NR'] != ""].reset_index(drop=True)
# dfPoUshapetemp = dfPoUshapetemp[dfPoUshapetemp['WR_DOC_ID'] != 99999].reset_index(drop=True)

# Active path: reuse a copy of the merged and filtered PoU table as the geometry source.
dfPoUshapetemp = dfinPOU.copy()

print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Build the two-column geometry table: site id and its geometry.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Blank / missing site ids become "0" after the cast to text.
dfPoUshape['in_SiteNativeID'] = dfPoUshapetemp['in_SiteNativeID'].replace("", 0).fillna(0).astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Keep the first occurrence of each identical (id, geometry) row; the index is left unchanged.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head()

## Export Outputs

In [None]:
# Inspection only: column names, non-null counts and dtypes of the final table.
outdf.info()

In [None]:
# Inspection only: display the final table in the notebook.
outdf

In [None]:
# Export the output dataframe
# Both files are written as zip archives holding one CSV each, without the index column,
# relative to the working directory.
outdf.to_csv('Pwr_waMain.zip', compression=dict(method='zip', archive_name='Pwr_waMain.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.