# Preprocessing Washington Allocation data for WaDEQA upload.
- Date Updated: 04/01/2020
- Purpose:  To preprocess the Washington data into one master file for simple DataFrame creation and extraction

Useful Links to Data:
- The Data - Geographic Water Information System (GWIS) data from the State of Washington: https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/
- Data dictionary - https://fortress.wa.gov/ecy/gispublic/DataDownload/wr/GWIS_Data/GWIS_Data_Dictionary/
- Public website   - https://ecology.wa.gov/Water-Shorelines/Water-supply/Water-rights

In [None]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # Raise the column display limit to 999 so wide DataFrames are not truncated in the notebook.

from pyproj import Transformer, transform
# Build the coordinate transformer once so the row-wise helpers can reuse the
# same object instead of constructing a new one for every row.
transformer = Transformer.from_proj(2927, 4326)
# Washington projection = EPSG:2927. WGS84 projection used by WaDE 2.0 = EPSG:4326.
# NOTE(review): assignLat/assignLong treat the first value returned by
# transformer.transform() as latitude -- confirm the axis order for the installed pyproj version.

# Working Directory
# NOTE(review): absolute, user-specific path. Every bare file name read or written
# later resolves against this directory because of the os.chdir() call.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Washington/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data

In [None]:
# Input Files
# File names are relative; they are opened with pd.read_csv from the current working directory.
d_pointFile = "D_PointTable.csv"  # contains PoD info (joined on D_Point_ID)
D_Point_WR_DocFile = "D_Point_WR_Doc.csv"  # Bridge table (joined to the PoD table on D_Point_ID)
Person_Plus_EXTRACT_FromWRTSnotGWISFile = "Person_Plus_EXTRACT_FromWRTSnotGWIS.csv"  # Contains water use and owner info (joined on WR_Doc_ID)

In [None]:
# Dataframe creation
# Load the three raw POD extracts, all decoded as ISO-8859-1.
# df_1 = PoD table, df_2 = bridge table, df_3 = person / water-use extract.
df_1, df_2, df_3 = (
    pd.read_csv(csvName, encoding="ISO-8859-1")
    for csvName in (
        d_pointFile,
        D_Point_WR_DocFile,
        Person_Plus_EXTRACT_FromWRTSnotGWISFile,
    )
)

In [None]:
# Merging dataframes into one, using left-join.
# Left-join the bridge table onto the PoD rows (by D_Point_ID), then the
# person / water-use extract onto that result (by WR_Doc_ID). Afterwards drop
# exact duplicate rows, blank out missing values and renumber the index.
df = (
    df_1.merge(df_2, left_on='D_Point_ID', right_on='D_Point_ID', how='left')
    .merge(df_3, left_on='WR_Doc_ID', right_on='WR_Doc_ID', how='left')
    .drop_duplicates()
    .replace(np.nan, "")
    .reset_index(drop=True)
)
print(len(df))
df

In [None]:
# For converting projection latitude.
def assignLat(colrowValueLat, colrowValueLong):
    """Return the first value of the projected coordinate pair, or "" if unusable.

    Args:
        colrowValueLat: first coordinate handed to ``transformer.transform``
            (the caller in this notebook passes the POINT_X column).
        colrowValueLong: second coordinate handed to ``transformer.transform``
            (the caller passes the POINT_Y column).

    Returns:
        The first element returned by ``transformer.transform``, or "" when
        either input coordinate is an empty string or null.
    """
    # A coordinate pair is only convertible when BOTH halves are present;
    # previously a blank second coordinate was forwarded to the transformer.
    for coord in (colrowValueLat, colrowValueLong):
        if pd.isnull(coord) or coord == '':
            return ""
    # NOTE(review): assumes the transformer returns (lat, long) in that order -- confirm.
    lat, _ = transformer.transform(colrowValueLat, colrowValueLong)
    return lat

# For converting projection longitude.
def assignLong(colrowValueLat, colrowValueLong):
    """Return the second value of the projected coordinate pair, or "" if unusable.

    Args:
        colrowValueLat: first coordinate handed to ``transformer.transform``
            (the caller in this notebook passes the POINT_X column).
        colrowValueLong: second coordinate handed to ``transformer.transform``
            (the caller passes the POINT_Y column).

    Returns:
        The second element returned by ``transformer.transform``, or "" when
        either input coordinate is an empty string or null.
    """
    # A coordinate pair is only convertible when BOTH halves are present;
    # previously a blank first coordinate was forwarded to the transformer.
    for coord in (colrowValueLat, colrowValueLong):
        if pd.isnull(coord) or coord == '':
            return ""
    # NOTE(review): assumes the transformer returns (lat, long) in that order -- confirm.
    _, long = transformer.transform(colrowValueLat, colrowValueLong)
    return long

# POINT_X is passed as the first argument and POINT_Y as the second argument of both helpers.
# NOTE(review): every row is sent through the transformer twice (once per helper).
df['in_Latitude'] = df.apply(lambda row: assignLat(row['POINT_X'], row['POINT_Y']), axis=1)
df['in_Longitude'] = df.apply(lambda row: assignLong(row['POINT_X'], row['POINT_Y']), axis=1)

In [None]:
#Changing datatype of used date fields. 
# Parse PriorityDate (unparseable values become NaT), then round-trip through
# a month/day/year string so the stored value is rebuilt from the date part only.
parsedPriority = pd.to_datetime(df['PriorityDate'], errors='coerce')
df['PriorityDate'] = pd.to_datetime(parsedPriority.dt.strftime('%m/%d/%Y'))

In [None]:
def assignOwner(valueFirst, valueMid, valueLast):
    """Combine first, middle and last/organization name into one owner string.

    Args:
        valueFirst: first name; empty string or null means missing.
        valueMid: middle initial; empty string or null means missing.
        valueLast: last name or organization name; empty string or null means missing.

    Returns:
        "Last, First Middle" when both a last name and a given name are
        present; otherwise whichever part exists ("" when all are missing).
        No trailing comma is left for organization-only names, and first and
        middle name are always separated by a single space.
    """
    def _clean(value):
        # Nulls and blanks become ""; everything else is stringified and trimmed.
        if pd.isnull(value) or value == "":
            return ""
        return str(value).strip()

    lastName = _clean(valueLast)
    # Join only the given-name parts that exist so no stray spaces appear.
    givenName = " ".join(part for part in (_clean(valueFirst), _clean(valueMid)) if part)

    if lastName and givenName:
        return lastName + ", " + givenName
    return lastName or givenName


# Build one owner string per row from the first, middle-initial and last/organization name columns.
df['Owner'] = df.apply(lambda row: assignOwner(row['PersonFirstNM'],
                                               row['PersonMINM'],
                                               row['PersonLastOrOrganizationNM']), axis=1)

In [None]:
#Manually filling in empty class code with ‘Unspecified’ value.
def assignWaRecRCWClassTypeCode(colValue):
    """Return the trimmed class code, or "Unspecified" when it is missing.

    Args:
        colValue: raw class code; may be a string, null, or a non-string value.

    Returns:
        The value converted to ``str`` and stripped of surrounding whitespace,
        or "Unspecified" if the input is null, empty or whitespace-only.
    """
    if pd.isnull(colValue):
        return "Unspecified"
    # str() guards against non-string cells, which have no .strip().
    cleaned = str(colValue).strip()
    # A whitespace-only code carries no information, so treat it as missing too.
    return cleaned if cleaned else "Unspecified"


# Overwrite the class code column in place with the cleaned / defaulted value for every row.
df['WaRecRCWClassTypeCode'] = df.apply(lambda row: assignWaRecRCWClassTypeCode(row['WaRecRCWClassTypeCode']), axis=1)

In [None]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    """Scale an instantaneous quantity by the factor that belongs to its unit code.

    Args:
        colrowValueIQ: the quantity; a number, a numeric string, "" or null.
        colrowValueUC: the unit code. 'GPM' and 'GPD' (after trimming) select a
            conversion factor; any other or missing unit uses a factor of 1.0.

    Returns:
        "" when the quantity is missing, 0 when it is zero or negative, the
        scaled float otherwise. A quantity that cannot be read as a number is
        returned unchanged.
    """
    # Multipliers applied to the quantity for the recognised unit codes.
    unitFactors = {
        'GPM': 0.00222800926,
        'GPD': 1.0 / 646317.0,
    }

    if pd.isnull(colrowValueIQ) or colrowValueIQ == '':
        return ""

    try:
        quantity = float(colrowValueIQ)
    except (TypeError, ValueError):
        # Not numeric: pass the raw value through rather than crash the row-wise apply.
        return colrowValueIQ

    if quantity <= 0:
        return 0

    # The unit cell may be null or non-string, so normalise before the lookup.
    unit = "" if pd.isnull(colrowValueUC) else str(colrowValueUC).strip()
    return unitFactors.get(unit, 1.0) * quantity

# Convert each row's InstantaneousQuantity using its InstantaneousUnitCode and store the result as in_AllocationFlow_CFS.
df['in_AllocationFlow_CFS'] =  df.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)

In [None]:
# Create the empty output dataframe for the POD records: one row per row of
# df, with the WaDE input columns listed below (all initially empty).
columnslist = list(
    # --- Water Source Info ---
    ("in_WaterSourceTypeCV",)
    # --- Site Info ---
    + (
        "in_CoordinateAccuracy",
        "in_Latitude",
        "in_Longitude",
        "in_SiteTypeCV",
        "in_PODorPOUSite",
    )
    # --- AllocationAmount_fact Info ---
    + (
        "in_AllocationFlow_CFS",
        "in_AllocationLegalStatusCV",
        "in_AllocationNativeID",
        "in_AllocationOwner",
        "in_AllocationPriorityDate",
        "in_AllocationTypeCV",
        "in_AllocationVolume_AF",
        "in_BeneficialUseCategory",
        "in_IrrigatedAcreage",
    )
)

dfPOD = pd.DataFrame(index=df.index, columns=columnslist)

In [None]:
#############################################################################################
# Populate the POD output columns from the merged dataframe. Columns that
# already exist are overwritten in place; in_SiteNativeID is not in the
# column list, so it is appended as the last column.
dfPOD = dfPOD.assign(
    # Water source
    in_WaterSourceTypeCV=df['WaRecRCWClassTypeCode'],
    # Site
    in_CoordinateAccuracy=df['Location_C'],
    in_Latitude=df['in_Latitude'],
    in_Longitude=df['in_Longitude'],
    in_SiteNativeID=df['D_Point_ID'].astype(str),
    in_SiteTypeCV=df['D_Point_Ty'],
    in_PODorPOUSite="POD",
    # AllocationAmount_fact
    in_AllocationFlow_CFS=df['in_AllocationFlow_CFS'],
    in_AllocationLegalStatusCV=df['WaRecProcessStatusTypeCode'].astype(str),
    in_AllocationNativeID=df['WR_Doc_ID'].astype(str),
    in_AllocationOwner=df['Owner'].astype(str),
    in_AllocationPriorityDate=df['PriorityDate'],
    in_AllocationTypeCV=df['WaRecPhaseTypeCode'],
    in_AllocationVolume_AF=df['AnnualVolumeQuantity'],
    in_BeneficialUseCategory=df['PurposeOfUseTypeCodes'].astype(str),
    in_IrrigatedAcreage=df['IrrigatedAreaQuantity'],
)

# Remove exact duplicate rows, blank out missing values, renumber the index.
dfPOD = (
    dfPOD.drop_duplicates()
    .replace(np.nan, "")
    .reset_index(drop=True)
)
print(len(dfPOD))
dfPOD

## POU Data

In [None]:
# Input Files
# File names are relative; they are opened with pd.read_csv from the current working directory.
pouInput = "WA_POU_Input.csv"   # contains POU info (joined on WR_DOC_ID)
Person_Plus_EXTRACT_FromWRTSnotGWISFile = "Person_Plus_EXTRACT_FromWRTSnotGWIS.csv"  # Contains water use and owner info (joined on WR_Doc_ID)

In [None]:
# Dataframe creation
# Load the POU table (df_1) and the person / water-use extract (df_3), both decoded as ISO-8859-1.
df_1, df_3 = (
    pd.read_csv(csvName, encoding="ISO-8859-1")
    for csvName in (pouInput, Person_Plus_EXTRACT_FromWRTSnotGWISFile)
)

In [None]:
# Merging dataframes into one, using left-join.
# Left-join the person / water-use extract onto the POU rows; the key is
# spelled WR_DOC_ID on the POU side and WR_Doc_ID on the extract side.
# Afterwards drop exact duplicates, blank out missing values and renumber the index.
df = (
    df_1.merge(df_3, left_on='WR_DOC_ID', right_on='WR_Doc_ID', how='left')
    .drop_duplicates()
    .replace(np.nan, "")
    .reset_index(drop=True)
)
print(len(df))
df

In [None]:
#Changing datatype of used date fields. 
# Parse PriorityDate (unparseable values become NaT), then round-trip through
# a month/day/year string so the stored value is rebuilt from the date part only.
parsedPriority = pd.to_datetime(df['PriorityDate'], errors='coerce')
df['PriorityDate'] = pd.to_datetime(parsedPriority.dt.strftime('%m/%d/%Y'))

In [None]:
def assignOwner(valueFirst, valueMid, valueLast):
    """Combine first, middle and last/organization name into one owner string.

    Args:
        valueFirst: first name; empty string or null means missing.
        valueMid: middle initial; empty string or null means missing.
        valueLast: last name or organization name; empty string or null means missing.

    Returns:
        "Last, First Middle" when both a last name and a given name are
        present; otherwise whichever part exists ("" when all are missing).
        No trailing comma is left for organization-only names, and first and
        middle name are always separated by a single space.
    """
    def _clean(value):
        # Nulls and blanks become ""; everything else is stringified and trimmed.
        if pd.isnull(value) or value == "":
            return ""
        return str(value).strip()

    lastName = _clean(valueLast)
    # Join only the given-name parts that exist so no stray spaces appear.
    givenName = " ".join(part for part in (_clean(valueFirst), _clean(valueMid)) if part)

    if lastName and givenName:
        return lastName + ", " + givenName
    return lastName or givenName


# Build one owner string per row from the first, middle-initial and last/organization name columns.
df['Owner'] = df.apply(lambda row: assignOwner(row['PersonFirstNM'],
                                               row['PersonMINM'],
                                               row['PersonLastOrOrganizationNM']), axis=1)

In [None]:
#Manually filling in empty class code with ‘Unspecified’ value.
def assignWaRecRCWClassTypeCode(colValue):
    """Return the trimmed class code, or "Unspecified" when it is missing.

    Args:
        colValue: raw class code; may be a string, null, or a non-string value.

    Returns:
        The value converted to ``str`` and stripped of surrounding whitespace,
        or "Unspecified" if the input is null, empty or whitespace-only.
    """
    if pd.isnull(colValue):
        return "Unspecified"
    # str() guards against non-string cells, which have no .strip().
    cleaned = str(colValue).strip()
    # A whitespace-only code carries no information, so treat it as missing too.
    return cleaned if cleaned else "Unspecified"


# Overwrite the class code column in place with the cleaned / defaulted value for every row.
df['WaRecRCWClassTypeCode'] = df.apply(lambda row: assignWaRecRCWClassTypeCode(row['WaRecRCWClassTypeCode']), axis=1)

In [None]:
# For creating AllocationAmount
def assignAllocationAmount(colrowValueIQ, colrowValueUC):
    """Scale an instantaneous quantity by the factor that belongs to its unit code.

    Args:
        colrowValueIQ: the quantity; a number, a numeric string, "" or null.
        colrowValueUC: the unit code. 'GPM' and 'GPD' (after trimming) select a
            conversion factor; any other or missing unit uses a factor of 1.0.

    Returns:
        "" when the quantity is missing, 0 when it is zero or negative, the
        scaled float otherwise. A quantity that cannot be read as a number is
        returned unchanged.
    """
    # Multipliers applied to the quantity for the recognised unit codes.
    unitFactors = {
        'GPM': 0.00222800926,
        'GPD': 1.0 / 646317.0,
    }

    if pd.isnull(colrowValueIQ) or colrowValueIQ == '':
        return ""

    try:
        quantity = float(colrowValueIQ)
    except (TypeError, ValueError):
        # Not numeric: pass the raw value through rather than crash the row-wise apply.
        return colrowValueIQ

    if quantity <= 0:
        return 0

    # The unit cell may be null or non-string, so normalise before the lookup.
    unit = "" if pd.isnull(colrowValueUC) else str(colrowValueUC).strip()
    return unitFactors.get(unit, 1.0) * quantity

# Convert each row's InstantaneousQuantity using its InstantaneousUnitCode and store the result as in_AllocationFlow_CFS.
df['in_AllocationFlow_CFS'] =  df.apply(lambda row: assignAllocationAmount(row['InstantaneousQuantity'], row['InstantaneousUnitCode']), axis=1)

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the custom site native ID: "WaDEWA_S" followed by str(colrowValue)."""
    return f"WaDEWA_S{colrowValue!s}"

# One row per distinct (Latitude, Longitude) pair, keeping the index label of
# the pair's first occurrence in df.
dfSiteNativeID = (
    df[['Latitude', 'Longitude']]
    .rename(columns={'Latitude': 'in_Latitude', 'Longitude': 'in_Longitude'})
    .drop_duplicates()
)

# Number the unique pairs 1..N in row order and turn each number into a native ID.
dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, 1 + len(dftemp.index))
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(
    lambda row: assignSiteUUID(row['Count']),
    axis=1,
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    """Look up the custom site native ID for a latitude / longitude pair.

    Args:
        A: latitude value compared against dfSiteNativeID['in_Latitude'].
        B: longitude value compared against dfSiteNativeID['in_Longitude'].

    Returns:
        The in_SiteNativeID of the first matching row, or '' when both inputs
        are empty strings, both are null, or no row matches.
    """
    bothBlank = A == '' and B == ''
    bothNull = pd.isnull(A) and pd.isnull(B)
    if bothBlank or bothNull:
        return ''

    sameLat = dfSiteNativeID['in_Latitude'] == A
    sameLong = dfSiteNativeID['in_Longitude'] == B
    matches = dfSiteNativeID.loc[sameLat & sameLong, 'in_SiteNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom site ID to every row by looking up its Latitude / Longitude pair.
# NOTE(review): each row triggers a full scan of dfSiteNativeID; a merge on the two columns may be faster.
df['in_SiteNativeID'] = df.apply(lambda row: retrieveSiteNativeID( row['Latitude'], row['Longitude']), axis=1)
df

In [None]:
# Create the empty output dataframe for the POU records: one row per row of
# df, with the WaDE input columns listed below (all initially empty).
columnslist = list(
    # --- Water Source Info ---
    (
        "in_WaterSourceName",
        "in_WaterSourceTypeCV",
    )
    # --- Site Info ---
    + (
        "in_CoordinateAccuracy",
        "in_Latitude",
        "in_Longitude",
        "in_SiteTypeCV",
        "in_PODorPOUSite",
    )
    # --- AllocationAmount_fact Info ---
    + (
        "in_AllocationFlow_CFS",
        "in_AllocationLegalStatusCV",
        "in_AllocationNativeID",
        "in_AllocationOwner",
        "in_AllocationPriorityDate",
        "in_AllocationTypeCV",
        "in_AllocationVolume_AF",
        "in_BeneficialUseCategory",
        "in_IrrigatedAcreage",
    )
)

dfPOU = pd.DataFrame(index=df.index, columns=columnslist)

In [None]:
#############################################################################################
# Populate the POU output columns from the merged dataframe. Columns that
# already exist are overwritten in place; in_SiteNativeID is not in the
# column list, so it is appended as the last column.
# NOTE(review): in_AllocationNativeID uses WR_Doc_ID (the right-hand join key),
# which is blank for POU rows without a match in the person extract -- confirm
# that this is intended rather than WR_DOC_ID.
dfPOU = dfPOU.assign(
    # Water source
    in_WaterSourceTypeCV=df['WaRecRCWClassTypeCode'],
    # Site
    in_CoordinateAccuracy="",
    in_Latitude=df['Latitude'],
    in_Longitude=df['Longitude'],
    in_SiteNativeID=df['in_SiteNativeID'].astype(str),
    in_SiteTypeCV="Unspecified",
    in_PODorPOUSite="POU",
    # AllocationAmount_fact
    in_AllocationFlow_CFS=df['in_AllocationFlow_CFS'],
    in_AllocationLegalStatusCV=df['WaRecProcessStatusTypeCode'].astype(str),
    in_AllocationNativeID=df['WR_Doc_ID'].astype(str),
    in_AllocationOwner=df['Owner'].astype(str),
    in_AllocationPriorityDate=df['PriorityDate'],
    in_AllocationTypeCV=df['WaRecPhaseTypeCode'],
    in_AllocationVolume_AF=df['AnnualVolumeQuantity'],
    in_BeneficialUseCategory=df['PurposeOfUseTypeCodes'].astype(str),
    in_IrrigatedAcreage=df['IrrigatedAreaQuantity'],
)

# Remove exact duplicate rows, blank out missing values, renumber the index.
dfPOU = (
    dfPOU.drop_duplicates()
    .replace(np.nan, "")
    .reset_index(drop=True)
)
print(len(dfPOU))
dfPOU

## Concatenate POD & POU

In [None]:
# Concatenate
# Stack the POD and POU tables into one output table.
frames = [dfPOD, dfPOU]
# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# the POD and POU row labels overlap, and duplicate labels are a hazard for
# any later index-aligned column assignment.
dfout = pd.concat(frames, ignore_index=True)
# Blank out real missing values first, then the literal string "nan" that
# astype(str) leaves behind; de-duplicate after each pass.
dfout = dfout.replace(np.nan, "").drop_duplicates()
dfout = dfout.replace("nan", "").drop_duplicates()

print(len(dfout))
dfout

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID: "WaDEWA_WS" followed by str(colrowValue)."""
    return f"WaDEWA_WS{colrowValue!s}"

# One row per distinct water source type, keeping the index label of its
# first occurrence in dfout.
dfWaterSourceNativeID = dfout[['in_WaterSourceTypeCV']].drop_duplicates()

# Number the unique types 1..N in row order and turn each number into a native ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, 1 + len(dftemp.index))
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(
    lambda row: assignWaterSourceNativeID(row['Count']),
    axis=1,
)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom water source native ID for a water source type.

    Args:
        A: water source type compared against
            dfWaterSourceNativeID['in_WaterSourceTypeCV'].

    Returns:
        The in_WaterSourceNativeID of the first matching row, or '' when the
        input is an empty string, null, or has no matching row.
    """
    # Only A exists here; the earlier check also referenced an undefined name B,
    # which raised NameError whenever A was blank or null.
    if pd.isnull(A) or A == '':
        return ''

    ml = dfWaterSourceNativeID.loc[
        dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A,
        'in_WaterSourceNativeID',
    ]
    # An empty selection means the type was never registered.
    return ml.iloc[0] if not ml.empty else ''

# Attach the custom water source ID to every output row by looking up its in_WaterSourceTypeCV value.
dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceTypeCV']), axis=1)
dfout

## Export Outputs

In [None]:
# Exporting to Finished File
# Writes the combined POD + POU table without the index column. The file name
# is relative, so it is created in the current working directory.
dfout.to_csv('P_WashingtonMaster.csv', index=False)  # The output