# Pre-processing Idaho Allocation data for WaDEQA upload.
Date Updated: 10/22/2020
Purpose:  To pre-process the Idaho data into one master file for simple DataFrame creation and extraction.  Working Idaho data for WaDEQA 2.0 is mostly composed of point of diversion data.

Useful Links to Data:
Point of Diversion (POD): (download spreadsheet)
https://data-idwr.opendata.arcgis.com/datasets/water-right-pods

Place of Use (PoU): Water Right (download the zipped files). See the metadata in the View. Open WaterRightPOUs.dbf in Excel, or load it into pandas in Python.
https://data-idwr.opendata.arcgis.com/pages/gis-data#WaterRights

In [None]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# Every relative file name used by the read/write calls in this notebook is resolved
# against this folder, because the process working directory is switched to it here.
workingDir = "G:/Shared drives/WaDE Data/Idaho/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Clean Owner Data

In [None]:
# Load the extended contact-name workbook; it is the source of the RightID -> owner
# lookup dictionary built further down.
FI_ExtendedContactNames = "IdwrExtendedContactNamesWithFieldNames_input.xlsx"
df_ECN = pd.read_excel(FI_ExtendedContactNames)
# NOTE(review): head() is not the last expression of the cell, so a notebook will not
# render it; only the row count printed below is shown.
df_ECN.head(1)
print(len(df_ECN))

In [None]:
import re

def cleanOwnerDataFunc(Val):
    """Strip punctuation from an owner / organization name.

    Every occurrence of the characters ``$ @ & . ; , / ( ) -`` is removed and
    the result is trimmed of leading and trailing whitespace.

    Parameters
    ----------
    Val : str or any
        The raw name. Values that are not strings (for example the NaN that
        pandas uses for an empty cell) are returned unchanged, because there
        is no text to clean and ``re.sub`` would raise ``TypeError`` on them.

    Returns
    -------
    str or any
        The cleaned name, or ``Val`` itself when it is not a string.
    """
    if not isinstance(Val, str):
        return Val
    # Raw string: the old pattern relied on the invalid escape "\)" in a normal string.
    return re.sub(r"[$@&.;,/()\-]", "", Val).strip()

# Store the cleaned name in its own 'in_AllocationOwner' column (the column inspected on
# the next line) instead of overwriting the raw 'OrganizationName' values in place.
df_ECN['in_AllocationOwner'] = df_ECN['OrganizationName'].apply(cleanOwnerDataFunc)
df_ECN['in_AllocationOwner'].unique()

In [None]:
# NOTE(review): this is the same file name the contact data was read from, so the input
# workbook is overwritten with the current contents of df_ECN each time this cell runs.
df_ECN.to_excel('IdwrExtendedContactNamesWithFieldNames_input.xlsx', index=False)  # The output

In [None]:
# create owner dictionary
# Map each RightID to its cleaned owner name. updateExtndedOwner() looks rights up in this
# dictionary to produce 'in_AllocationOwner', so the values must be owner names rather
# than street addresses.
ExtOwnerNameDict = pd.Series(df_ECN['in_AllocationOwner'].values, index=df_ECN['RightID']).to_dict()

## POD Sites Data

In [None]:
# Input File
FI_POD = "ID_Water_Right_PODs_input.xlsx"
dfinPOD = pd.read_excel(FI_POD)

# WaDE UUID tracker for data assessment
# Only runs when the workbook has no 'WaDEUUID' column yet: each row gets "idD" followed
# by its index label, and the workbook is saved back under the same file name so that
# later runs read these IDs instead of generating new ones.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "idD" + dfinPOD.index.astype(str)
    dfinPOD.to_excel('ID_Water_Right_PODs_input.xlsx', index=False)

dfinPOD.head(1)

In [None]:
# temp
# merge a few POU elements, like water use
FI_POU = "ID_Water_Right_PoUs_input.xlsx"
dfinPOU = pd.read_excel(FI_POU)

# Join on distinct (RightID, WaterUse) pairs only. Repeated identical pairs in the POU
# table would otherwise multiply every matching POD row in the left join, producing
# exact duplicate rows that carry no extra information.
pouWaterUse = dfinPOU[['RightID', 'WaterUse']].drop_duplicates()
dfinPOD = pd.merge(dfinPOD, pouWaterUse, left_on='RightID_POD', right_on='RightID', how='left')
print(len(dfinPOD))
dfinPOD.head(3)

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure

# Parse once, then truncate to midnight. This gives the same date-only datetime64 column
# as formatting to 'mm/dd/YYYY' text and parsing it again, without the string round trip.
dfinPOD['PriorityDate_POD'] = pd.to_datetime(dfinPOD['PriorityDate_POD']).dt.normalize()

In [None]:
# WaterSourceType

# Keyword (title case) found in a source name -> water source type.
WaterSourceTypeDict = {
    "River": "Surface Water",
    "Ground Water": "Groundwater",
    "Spring": "Groundwater",
    "Lake": "Surface Water",
    "Pond": "Surface Water",
    "Canal": "Surface Water",
    "Creek": "Surface Water",
    "Fork": "Surface Water",
    "Waste Water": "Reuse",
    "Drain": "Drain",
    "Gluch": "Surface Water",  # legacy misspelled key, kept for compatibility
    "Gulch": "Surface Water",
    "Reservoir": "Reservoir",
    "Slough": "Surface Water",
    "Ditch": "Surface Water",
    "Channel": "Surface Water",
    "Dry": "Surface Water",
}


def assignWaterSourceType(colrowValue):
    """Classify a water source name into a water source type.

    The name is trimmed, converted to title case and split into words. The
    words are scanned from the end of the name towards the start; at each
    position the two-word phrase ending there is tried first (so keys such as
    "Ground Water" and "Waste Water" can match), then the single word. The
    first keyword found in ``WaterSourceTypeDict`` decides the result, which
    means a keyword in the last position still wins exactly as before, while
    names with trailing non-keyword words are no longer reset to
    'Unspecified'.

    Parameters
    ----------
    colrowValue : str or missing value
        The raw source name.

    Returns
    -------
    str
        The mapped type, or 'Unspecified' for missing, empty, blank-only or
        unrecognised names.
    """
    if pd.isnull(colrowValue) or colrowValue == '':
        return 'Unspecified'
    if colrowValue == "GROUND WATER":
        return "Groundwater"

    words = str(colrowValue).strip().title().split()
    for pos in range(len(words) - 1, -1, -1):
        if pos > 0:
            phrase = words[pos - 1] + " " + words[pos]
            if phrase in WaterSourceTypeDict:
                return WaterSourceTypeDict[phrase]
        if words[pos] in WaterSourceTypeDict:
            return WaterSourceTypeDict[words[pos]]
    # Also reached for blank-only input, where the word list is empty.
    return 'Unspecified'

# Classify every POD source name; only the one column is needed, so map it directly.
dfinPOD['in_WaterSourceTypeCV'] = dfinPOD['Source_POD'].apply(assignWaterSourceType)
dfinPOD.head(3)

In [None]:
# Sitetype

# Keyword (title case) found in a source name -> site type.
SiteTypeDict = {
    "River": "River",
    "Spring": "Spring",
    "Lake": "Lake",
    "Pond": "Pond",
    "Canal": "Canal",
    "Creek": "Creek",
    "Fork": "Fork",
    "Waste Water": "Waste Water",
    "Drain": "Drain",
    "Gluch": "Gluch",  # legacy misspelled entry, kept unchanged for compatibility
    "Gulch": "Gulch",
    "Reservoir": "Reservoir",
    "Slough": "Slough",
    "Ditch": "Ditch",
}


def assignSiteType(colrowValue):
    """Derive a site type from a water source name.

    The name is trimmed, converted to title case and split into words. The
    words are scanned from the end of the name towards the start; at each
    position the two-word phrase ending there is tried first (so the
    "Waste Water" key can match), then the single word. The first keyword
    found in ``SiteTypeDict`` decides the result, so a keyword in the last
    position still wins exactly as before, while names with trailing
    non-keyword words are no longer reset to "Unspecified".

    Parameters
    ----------
    colrowValue : str or missing value
        The raw source name.

    Returns
    -------
    str
        The mapped site type, or "Unspecified" for missing, empty, blank-only
        or unrecognised names.
    """
    if pd.isnull(colrowValue) or colrowValue == '':
        return "Unspecified"

    words = str(colrowValue).strip().title().split()
    for pos in range(len(words) - 1, -1, -1):
        if pos > 0:
            phrase = words[pos - 1] + " " + words[pos]
            if phrase in SiteTypeDict:
                return SiteTypeDict[phrase]
        if words[pos] in SiteTypeDict:
            return SiteTypeDict[words[pos]]
    # Also reached for blank-only input, where the word list is empty.
    return "Unspecified"

# Derive the site type from every POD source name; only the one column is needed.
dfinPOD['in_SiteTypeCV'] = dfinPOD['Source_POD'].apply(assignSiteType)
dfinPOD.head(3)

In [None]:
# Update owner names whose text was cut off. Use the extended contact name input.

def updateExtndedOwner(valRightID, valOwner):
    """Return the full owner name for a water right.

    Parameters
    ----------
    valRightID : hashable
        Right identifier used as the key into ``ExtOwnerNameDict``.
    valOwner : str or missing value
        Owner text from the record itself (possibly truncated).

    Returns
    -------
    str
        '' when ``valOwner`` is missing or empty; otherwise the name stored in
        ``ExtOwnerNameDict`` for this right, or - when the right is absent
        from the dictionary or its stored name is blank - ``valOwner`` with
        the characters ``$ @ & . ; , / ( ) -`` removed and whitespace trimmed.
    """
    if pd.isnull(valOwner) or valOwner == '':
        return ''
    try:
        extendedName = ExtOwnerNameDict[valRightID]
    except (KeyError, TypeError):
        # Only "right not in the dictionary" / unusable key is an expected miss;
        # anything else should surface instead of being silently swallowed.
        extendedName = None
    if extendedName is not None and not pd.isnull(extendedName) and extendedName != '':
        return extendedName
    return re.sub(r"[$@&.;,/()\-]", "", str(valOwner)).strip()

# Resolve the owner of every POD record from its right ID and its own owner text.
dfinPOD['in_AllocationOwner'] = [
    updateExtndedOwner(rightID, owner)
    for rightID, owner in zip(dfinPOD['RightID_POD'], dfinPOD['Owner_POD'])
]
dfinPOD.head(3)

In [None]:
# Assemble the POD output table: each 'in_*' column is either copied from a dfinPOD
# column or filled with a constant placeholder.
dfPOD = pd.DataFrame()

# Data Assessment UUID
# Assigned first, so dfPOD takes its row index from dfinPOD and the constant columns
# below are broadcast to every row.
dfPOD['WaDEUUID'] = dfinPOD['WaDEUUID']

# Water Source
dfPOD['in_WaterSourceName'] = dfinPOD['Source_POD']
dfPOD['in_WaterSourceTypeCV'] = dfinPOD['in_WaterSourceTypeCV']

# Site
dfPOD['in_CoordinateAccuracy'] = "Unspecified"
dfPOD['in_CoordinateMethodCV'] = dfinPOD['DataSource_POD']
dfPOD['in_County'] = "Unspecified"
dfPOD['in_Latitude'] = dfinPOD['Y_POD']
dfPOD['in_Longitude'] = dfinPOD['X_POD']
dfPOD['in_PODorPOUSite'] = "POD"
dfPOD['in_SiteName'] = dfinPOD['DiversionName_POD']
# Prefixed with "POD" (the POU table uses "POU") to build the site native ID.
dfPOD['in_SiteNativeID'] = "POD" + dfinPOD['PointOfDiversionID_POD'].astype(str)
dfPOD['in_SiteTypeCV'] = dfinPOD['in_SiteTypeCV']

# Allocation Fact
dfPOD['in_AllocationNativeID'] = dfinPOD['RightID_POD']
dfPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
dfPOD['in_AllocationPriorityDate'] = dfinPOD['PriorityDate_POD']
dfPOD['in_AllocationFlow_CFS'] = dfinPOD['OverallMaxDiversionRate_POD']
# NOTE(review): empty string here, while the POU table stores a number in this column,
# so the concatenated column will hold mixed types.
dfPOD['in_AllocationVolume_AF'] = ""
dfPOD['in_AllocationBasisCV'] = dfinPOD['Basis_POD']
dfPOD['in_AllocationTypeCV'] = dfinPOD['Status_POD']
dfPOD['in_BeneficialUseCategory'] = dfinPOD['WaterUse']
dfPOD['in_WaterAllocationNativeURL'] = dfinPOD['WRDocs_POD']

print(len(dfPOD))
dfPOD.head(3)

## POU Site Data

In [None]:
# Input File
# Re-read the POU workbook (it was already loaded once for the WaterUse merge above).
FI_POU = "ID_Water_Right_PoUs_input.xlsx"
dfinPOU = pd.read_excel(FI_POU)

# WaDE UUID tracker for data assessment
# Only runs when the workbook has no 'WaDEUUID' column yet: each row gets "idU" followed
# by its index label, and the workbook is saved back under the same file name so that
# later runs read these IDs instead of generating new ones.
if 'WaDEUUID' not in dfinPOU:
    dfinPOU['WaDEUUID'] = "idU" + dfinPOU.index.astype(str)
    dfinPOU.to_excel('ID_Water_Right_PoUs_input.xlsx', index=False)

dfinPOU.head()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure.

# Values are cast to text before parsing (as before), then truncated to midnight. This
# gives the same date-only datetime64 column as formatting to 'mm/dd/YYYY' text and
# parsing it again, without the extra string round trip.
dfinPOU['PriorityDa'] = pd.to_datetime(dfinPOU['PriorityDa'].astype(str)).dt.normalize()

In [None]:
# AllocationVolume_AF

def assignAllocationVolume_AF(A, B):
    """Pick the allocation value: ``A`` when it is positive, otherwise ``B``.

    Both arguments are converted with ``float()`` first, so the result is
    always a float. A NaN ``A`` is not greater than zero and therefore also
    falls through to ``B``.
    """
    primary, fallback = float(A), float(B)
    return primary if primary > 0 else fallback

# Choose AcreLimit when it is positive, otherwise TotalAcres, for every POU record.
dfinPOU['in_AllocationVolume_AF'] = [
    assignAllocationVolume_AF(acreLimit, totalAcres)
    for acreLimit, totalAcres in zip(dfinPOU['AcreLimit'], dfinPOU['TotalAcres'])
]
dfinPOU.head(3)

In [None]:
# WaterSourceType

# Keyword (title case) found in a source name -> water source type.
WaterSourceTypeDict = {
    "River": "Surface Water",
    "Ground Water": "Groundwater",
    "Spring": "Groundwater",
    "Lake": "Surface Water",
    "Pond": "Surface Water",
    "Canal": "Surface Water",
    "Creek": "Surface Water",
    "Fork": "Surface Water",
    "Waste Water": "Reuse",
    "Drain": "Drain",
    "Gluch": "Surface Water",  # legacy misspelled key, kept for compatibility
    "Gulch": "Surface Water",
    "Reservoir": "Reservoir",
    "Slough": "Surface Water",
    "Ditch": "Surface Water",
    "Channel": "Surface Water",
    "Dry": "Surface Water",
}


def assignWaterSourceType(colrowValue):
    """Classify a water source name into a water source type.

    The name is trimmed, converted to title case and split into words. The
    words are scanned from the end of the name towards the start; at each
    position the two-word phrase ending there is tried first (so keys such as
    "Ground Water" and "Waste Water" can match), then the single word. The
    first keyword found in ``WaterSourceTypeDict`` decides the result, which
    means a keyword in the last position still wins exactly as before, while
    names with trailing non-keyword words are no longer reset to
    'Unspecified'.

    Parameters
    ----------
    colrowValue : str or missing value
        The raw source name.

    Returns
    -------
    str
        The mapped type, or 'Unspecified' for missing, empty, blank-only or
        unrecognised names.
    """
    if pd.isnull(colrowValue) or colrowValue == '':
        return 'Unspecified'
    if colrowValue == "GROUND WATER":
        return "Groundwater"

    words = str(colrowValue).strip().title().split()
    for pos in range(len(words) - 1, -1, -1):
        if pos > 0:
            phrase = words[pos - 1] + " " + words[pos]
            if phrase in WaterSourceTypeDict:
                return WaterSourceTypeDict[phrase]
        if words[pos] in WaterSourceTypeDict:
            return WaterSourceTypeDict[words[pos]]
    # Also reached for blank-only input, where the word list is empty.
    return 'Unspecified'

# Classify every POU source name; only the one column is needed, so map it directly.
dfinPOU['in_WaterSourceTypeCV'] = dfinPOU['Source'].apply(assignWaterSourceType)
dfinPOU.head(3)

In [None]:
# Update owner names whose text was cut off. Use the extended contact name input.

def updateExtndedOwner(valRightID, valOwner):
    """Return the full owner name for a water right.

    Parameters
    ----------
    valRightID : hashable
        Right identifier used as the key into ``ExtOwnerNameDict``.
    valOwner : str or missing value
        Owner text from the record itself (possibly truncated).

    Returns
    -------
    str
        '' when ``valOwner`` is missing or empty; otherwise the name stored in
        ``ExtOwnerNameDict`` for this right, or - when the right is absent
        from the dictionary or its stored name is blank - ``valOwner`` with
        the characters ``$ @ & . ; , / ( ) -`` removed and whitespace trimmed.
    """
    if pd.isnull(valOwner) or valOwner == '':
        return ''
    try:
        extendedName = ExtOwnerNameDict[valRightID]
    except (KeyError, TypeError):
        # Only "right not in the dictionary" / unusable key is an expected miss;
        # anything else should surface instead of being silently swallowed.
        extendedName = None
    if extendedName is not None and not pd.isnull(extendedName) and extendedName != '':
        return extendedName
    return re.sub(r"[$@&.;,/()\-]", "", str(valOwner)).strip()

# Resolve the owner of every POU record from its right ID and its own owner text.
dfinPOU['in_AllocationOwner'] = [
    updateExtndedOwner(rightID, owner)
    for rightID, owner in zip(dfinPOU['RightID'], dfinPOU['Owner'])
]
dfinPOU.head(3)

In [None]:
# Assemble the POU output table with the same 'in_*' columns as the POD table: each one
# is either copied from a dfinPOU column or filled with a constant placeholder.
dfPOU = pd.DataFrame()

# Data Assessment UUID
# Assigned first, so dfPOU takes its row index from dfinPOU and the constant columns
# below are broadcast to every row.
dfPOU['WaDEUUID'] = dfinPOU['WaDEUUID']

# Water Source
dfPOU['in_WaterSourceName'] = dfinPOU['Source']
dfPOU['in_WaterSourceTypeCV'] = dfinPOU['in_WaterSourceTypeCV']

# Site
dfPOU['in_CoordinateAccuracy'] = "Unspecified"
dfPOU['in_CoordinateMethodCV'] = "Centroid"
dfPOU['in_County'] = "Unspecified"
dfPOU['in_Latitude'] = dfinPOU['Latitude']
dfPOU['in_Longitude'] = dfinPOU['Longitude']
dfPOU['in_PODorPOUSite'] = "POU"
dfPOU['in_SiteName'] = "Unspecified"
dfPOU['in_SiteTypeCV'] = "Unspecified"
# Prefixed with "POU" (the POD table uses "POD") to build the site native ID.
dfPOU['in_SiteNativeID'] = "POU" + dfinPOU['PlaceOfUse'].astype(str)

# Allocation Fact
dfPOU['in_AllocationNativeID'] = dfinPOU['RightID']
dfPOU['in_AllocationOwner'] = dfinPOU['in_AllocationOwner']
dfPOU['in_AllocationPriorityDate'] = dfinPOU['PriorityDa']
# NOTE(review): flow, basis and type are empty strings here, while the POD table copies
# them from its input, so those concatenated columns will hold mixed content.
dfPOU['in_AllocationFlow_CFS'] = ""
dfPOU['in_AllocationVolume_AF'] = dfinPOU['in_AllocationVolume_AF']
dfPOU['in_AllocationBasisCV'] = ""
dfPOU['in_AllocationTypeCV'] = ""
dfPOU['in_BeneficialUseCategory'] = dfinPOU['WaterUse']
dfPOU['in_WaterAllocationNativeURL'] = dfinPOU['WRDocs']

print(len(dfPOU))
dfPOU.head(3)

## Concatenate and Export

In [None]:
# Concatenate dataframes
# Stack the POD and POU tables. Both frames carry their own 0..n row labels, so the
# labels are renumbered: otherwise the combined frame has duplicate index labels, and any
# later label-based alignment on it becomes ambiguous.
frames = [dfPOD, dfPOU]
outdf = pd.concat(frames, ignore_index=True)
outdf = outdf.drop_duplicates().reset_index(drop=True)
print(len(outdf))

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a custom water source ID: "WaDEID_WS" followed by ``str(colrowValue)``."""
    return "WaDEID_WS" + str(colrowValue)

# One row per distinct (water source name, water source type) pair found in outdf.
dfWaterSourceNativeID = outdf[['in_WaterSourceName', 'in_WaterSourceTypeCV']].drop_duplicates().copy()

# Number the distinct pairs 1..N in row order and turn each number into an ID string.
dftemp = pd.DataFrame({"Count": range(1, len(dfWaterSourceNativeID) + 1)},
                      index=dfWaterSourceNativeID.index)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for a (name, type) pair.

    ``A`` is compared against 'in_WaterSourceName' and ``B`` against
    'in_WaterSourceTypeCV' in ``dfWaterSourceNativeID``. Returns the
    'in_WaterSourceNativeID' of the first matching row, or '' when both
    values are empty strings, both are null, or no row matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    sameName = dfWaterSourceNativeID['in_WaterSourceName'] == A
    sameType = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    matches = dfWaterSourceNativeID.loc[sameName & sameType, 'in_WaterSourceNativeID']
    return '' if matches.empty else matches.iloc[0]

# Attach the custom water source ID to every output row via its (name, type) pair.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf

## Shapefile Data
- For attaching geometry to csv inputs.

In [None]:
# Shapefile input - POU2
# Read the POU shapefile (path is relative to the working directory set earlier); its
# 'PlaceOfUse' and 'geometry' columns are used in the next cell.
dfPoUshapetemp = gpd.read_file('shapefile/ID_PoU2.shp')
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Build a two-column lookup of site native ID -> geometry from the shapefile rows.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same "POU" + PlaceOfUse form as the 'in_SiteNativeID' built for the POU table, so the
# geometry can be matched to those sites.
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['PlaceOfUse'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# All arguments are spelled out at their default values: drop rows that are identical in
# every column, keeping the first occurrence and the original row labels.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

## Review and Export

In [None]:
# Show the dtype of every output column as a last check before exporting.
outdf.dtypes

In [None]:
#Exporting to Finished File
# Written into the current working directory; the row index is not exported.
outdf.to_csv('P_IdahoMaster.csv', index=False)  # The output
# The geometry export below is currently disabled.
#dfPoUshape.to_csv('P_idGeometry.csv', index=False) # The output geometry.