# Preprocessing Nebraska Allocation data for WaDEQA upload.
- Date Updated: 08/26/2020
- Purpose:  To preprocess the Nebraska data into one master file for simple DataFrame creation and extraction.
- Joining API surface water POD data to POU shapefile data via **RightID**.

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [None]:
# Working Directory
# Every relative file path used in the cells below (input zips, shapefile,
# output zips) is resolved against this folder, because the process changes
# into it here.
workingDir = "G:/Shared drives/WaDE Data/Nebraska/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Data: get NEwr POD data via API service
- has nested data, will need to explode lists and transform dictionaries to series and concatenate by row.

In [None]:
# # already done, see nePODAPIData.zip below

# # Get all surface water points from NeDNR API.
# # Note: API has lots of NULL values, have to put a hard stop of what to search.
# dfsw = pd.DataFrame()
# page = 1

# # for i in range(length):
# while page < 50:
#     url = "https://nednr.nebraska.gov/IwipApi/api/v1/WaterRights/AllSurfaceWaterPoints?page=" + str(page)
#     print(url)
#     page = 1 + page
#     try:
#         responseD = json.loads(requests.get(url).text)
#         DLtL = responseD['Results']
#         length = len(DLtL)
#         for i in range(length):
#             row = pd.DataFrame([DLtL[i]])
#             dfsw = dfsw.append(row)
#     except:
#         print("...error with url")

# print(len(dfsw))
# dfsw.head()

In [None]:
# # I think we can drop 'PumpSheets', 'notice', 'SpecialConditions' & 'NoticeExemptions'...
# dfsw2 = dfsw.drop(['PumpSheets', 'Notices', 'SpecialConditions', 'NoticeExemptions'], axis=1)

# dfsw2 = pd.concat([dfsw2.drop(['RightUse'], axis=1), dfsw2['RightUse'].apply(pd.Series)], axis=1)

# dfsw2 = dfsw2.explode('PointOfDiversions')
# dfsw2 = dfsw2.explode('Contacts')

# dfsw2 = pd.concat([dfsw2.drop(['PointOfDiversions'], axis=1), dfsw2['PointOfDiversions'].apply(pd.Series)], axis=1)
# dfsw2 = pd.concat([dfsw2.drop(['Contacts'], axis=1), dfsw2['Contacts'].apply(pd.Series)], axis=1)


# dfsw2 = dfsw2.drop_duplicates().reset_index(drop=True)
# print(len(dfsw2))
# dfsw2.head()

In [None]:
# dfsw2.to_csv('nePODAPIData.zip', compression=dict(method='zip', archive_name='nePODAPIData.csv'), index=False)
# print(len(dfsw2))
# dfsw2.head(1)

In [None]:
# Input File - nePODAPIData.zip
# Missing values are replaced with empty strings immediately after loading.
dfpodin = pd.read_csv('nePODAPIData.zip', compression='zip').replace(np.nan, "")

# WaDE UUID tracker for data assessment
# Only created when the column is absent; the input zip is then overwritten so
# later runs find the column already present and skip this step.
if 'WaDEUUID' not in dfpodin:
    dfpodin['WaDEUUID'] = "nePOD" + dfpodin.index.astype(str)
    dfpodin.to_csv('nePODAPIData.zip', compression=dict(method='zip', archive_name='nePODAPIData.csv'), index=False)

print(len(dfpodin))
dfpodin.head(1)

In [None]:
# Creating Beneficial Use from NeDNR provided code.  See metaData_SurfaceWaterWebSimpleSearch.pdf for details.

# Two-letter use code -> beneficial use description.
NebraskaBenUseCodeDict = {
"CG" : "Conducting Groundwater for Irrigation (Source is a Registered Groundwater Well)",
"CO" : "Cooling",
"CS" : "Conducting Surface Water for Irrigation (Uses Water from an Existing Appropriation)",
"DG" : "Dredge",
"DI" : "Domestic, Irrigation and Manufacturing",
"DO" : "Domestic",
"DS" : "Domestic Storage",
"FC" : "Fish Culture",
"FL" : "Flood Control",
"FW" : "Fish and Wildlife",
"IB" : "Instream Basin-Management",
"IF" : "Instream Flow",
"IG" : "Induced Ground Water Recharge",
"IN" : "Intentional Underground Storage",
"IR" : "Irrigation from Natural Stream",
"IS" : "Irrigation and Storage (an appropriation approved for both uses)",
"IU" : "Irrigation and Incidental Underground Storage",
"MF" : "Manufacturing",
"ML" : "Maintain Level of a Lake",
"MU" : "Municipal",
"NL" : "Irrigation from Natural Lake",
"OU" : "Storage (for irr from res on lands not covered by nat flow appropriation / Incidental UG Storage)",
"PI" : "Power and Incidental Underground Storage",
"PR" : "Power",
"PS" : "Supplemental Power and Incidental Underground Storage",
"PW" : "Public Water Supply",
"RC" : "Groundwater Recharge",
"RD" : "Raise Dam (for increase in head for power production)",
"SC" : "Supplemental Cooling (prior appropriation for cooling)",
"SD" : "Supplemental Domestic",
"SF" : "Supplemental Fish Culture",
"SI" : "Supplemental Irrigation (irr from res on lands also covered by nat flow appropriation)",
"SO" : "Storage (for irr from res on lands not covered by nat flow appropriation)",
"SP" : "Supplemental Power (prior appropriation for power)",
"SS" : "Supplemental Storage (prior appropriation for storage)",
"ST" : "Storage",
"SU" : "Storage and Incidental Underground Storage",
"TI" : "Temporary Transfer to In-Stream Use",
"UI" : "Supplemental Irrigation and Incidental Underground Storage",
"US" : "Incidental Underground Storage",
"WS" : "Waste Storage",
"WT" : "Wetlands"}

def assignRightUse(colrowValue):
    """Translate a use code into its beneficial use description.

    Parameters
    ----------
    colrowValue : the raw use code; surrounding whitespace is ignored.

    Returns
    -------
    str : the description from NebraskaBenUseCodeDict, or "WaDE Unspecified"
    when the value is missing, blank, or not a known code.
    """
    if pd.isnull(colrowValue):
        return "WaDE Unspecified"
    # str() keeps a non-text code (e.g. a number) from crashing on .strip();
    # it simply fails the lookup and falls back to the default below.
    useCode = str(colrowValue).strip()
    # A plain dictionary lookup with a default replaces the former bare
    # "except:", which would also have hidden unrelated errors.
    return NebraskaBenUseCodeDict.get(useCode, "WaDE Unspecified")

# Translate each row's UseCode with assignRightUse; the bare .unique() call
# displays the distinct results for a quick visual check.
dfpodin['BeneficialUseCategory'] = dfpodin.apply(lambda row: assignRightUse(row['UseCode']), axis=1)
dfpodin['BeneficialUseCategory'].unique()

In [None]:
# first & last name function
def assignownerName(fName, lName):
    """Combine a first and last name into a single owner string.

    Asterisks are removed and surrounding whitespace is trimmed from each part.
    Missing values (None / NaN) are treated as blank; they are checked before
    any text conversion so they cannot leak through as "None" or "nan".

    Parameters
    ----------
    fName : first name value (may be blank or missing).
    lName : last name value (may be blank or missing).

    Returns
    -------
    str : "first last", whichever single part is present, or
    "WaDE Unspecified" when both parts are blank.
    """
    def _cleanPart(namePart):
        # Null check comes first: str(None) / str(nan) would be non-empty text.
        if pd.isnull(namePart):
            return ""
        return str(namePart).replace("*", "").strip()

    # Keep only the non-blank parts, then join the cleaned (stripped) values.
    nameParts = [part for part in (_cleanPart(fName), _cleanPart(lName)) if part]
    if not nameParts:
        return "WaDE Unspecified"
    return " ".join(nameParts)

# Build the owner name for every row from its FirstName and LastName columns.
dfpodin['in_AllocationOwner'] = dfpodin.apply(lambda row: assignownerName(row['FirstName'], row['LastName']), axis=1)


import re

# Punctuation removed from owner names: $ @ & . ; , / ) ( -
# Written as a raw string so that "\)" is not treated as an (invalid) string
# escape, and compiled once instead of on every call.
_OWNER_PUNCTUATION = re.compile(r"[$@&.;,/\)(-]")

def cleanOwnerDataFunc(Val):
    """Remove punctuation characters from an owner name.

    Parameters
    ----------
    Val : str, the owner name to clean.

    Returns
    -------
    str : Val without any of the characters $ @ & . ; , / ) ( - and with
    leading/trailing whitespace trimmed. Interior spacing is left as is.
    """
    return _OWNER_PUNCTUATION.sub("", Val).strip()

# Run the punctuation cleanup over the owner names built above; the bare
# .unique() call displays the distinct results for review.
dfpodin['in_AllocationOwner'] = dfpodin.apply(lambda row: cleanOwnerDataFunc(row['in_AllocationOwner']), axis=1)
dfpodin['in_AllocationOwner'].unique()

In [None]:
# AllocationFlow_CFS - based on reported Unit

def assignAllocationFlow_CFS(colvA, colvB):
    """Return the granted amount when its unit is CFS, otherwise 0.

    Parameters
    ----------
    colvA : the granted amount (may be blank or missing).
    colvB : the unit label for colvA; surrounding whitespace is ignored.

    Returns
    -------
    colvA unchanged when the unit is "CFS"; 0 when the amount is blank or
    missing, or when the unit is anything else (including a missing unit).
    """
    if pd.isnull(colvA) or colvA == '':
        return 0
    # A missing unit arrives as NaN/None rather than text; treat it as
    # "not CFS" instead of failing on .strip().
    if isinstance(colvB, str) and colvB.strip() == "CFS":
        return colvA
    return 0

# Flow is taken from ProGrant for rows whose Units value is CFS; the bare
# .unique() call displays the distinct results for review.
dfpodin['AllocationFlow_CFS'] = dfpodin.apply(lambda row: assignAllocationFlow_CFS(row['ProGrant'], row['Units']), axis=1)
dfpodin['AllocationFlow_CFS'].unique()

In [None]:
# AllocationVolume_AF - based on reported Unit

def assignAllocationVolume_AF(colvA, colvB):
    """Return the granted amount when its unit is AF, otherwise 0.

    Parameters
    ----------
    colvA : the granted amount (may be blank or missing).
    colvB : the unit label for colvA; surrounding whitespace is ignored.

    Returns
    -------
    colvA unchanged when the unit is "AF"; 0 when the amount is blank or
    missing, or when the unit is anything else (including a missing unit).
    """
    if pd.isnull(colvA) or colvA == '':
        return 0
    # A missing unit arrives as NaN/None rather than text; treat it as
    # "not AF" instead of failing on .strip().
    if isinstance(colvB, str) and colvB.strip() == "AF":
        return colvA
    return 0

# Volume is taken from ProGrant for rows whose Units value is AF; the bare
# .unique() call displays the distinct results for review.
dfpodin['AllocationVolume_AF'] = dfpodin.apply(lambda row: assignAllocationVolume_AF(row['ProGrant'], row['Units']), axis=1)
dfpodin['AllocationVolume_AF'].unique()

In [None]:
# Create VariableSpecificCv value
def createVariableSpecificCv(unit):
    """Pick the variable-specific identifier that matches a unit label.

    Parameters
    ----------
    unit : the unit label; surrounding whitespace is ignored.

    Returns
    -------
    str : "NEwr_V2" when the unit is "AF"; "NEwr_V1" for "CFS" and for every
    other value, including a blank or missing unit.
    """
    # A missing unit arrives as NaN/None rather than text; treat it as blank
    # instead of failing on .strip().
    unitText = unit.strip() if isinstance(unit, str) else ""
    if unitText == "AF":
        return "NEwr_V2"
    return "NEwr_V1"

# Choose the variable identifier for each row from its Units value; the bare
# .unique() call displays the distinct results for review.
dfpodin['in_VariableSpecificUUID'] = dfpodin.apply(lambda row: createVariableSpecificCv(row['Units']), axis=1)
dfpodin['in_VariableSpecificUUID'].unique()

In [None]:
# Creating the output Dataframe for PODs.

# Start from an empty frame that shares dfpodin's index so every column
# assigned below lines up row-for-row with the input.
# The order of the assignments sets the column order of the output.
dfPOD = pd.DataFrame(index=dfpodin.index)

# Data Assessment UUID
dfPOD['WaDEUUID'] = dfpodin['WaDEUUID']

# Variable
dfPOD["in_VariableSpecificUUID"] = dfpodin['in_VariableSpecificUUID']

# Water Source
dfPOD["in_WaterSourceName"] = dfpodin['SourceName']
dfPOD["in_WaterSourceTypeCV"] = "Surface Water"

# Site
dfPOD["in_CoordinateAccuracy"] = "WaDE Unspecified"
dfPOD["in_CoordinateMethodCV"] = "WaDE Unspecified"
dfPOD['in_HUC12'] = dfpodin['HUC12']
dfPOD['in_HUC8'] = ""
dfPOD['in_County'] = dfpodin['CountyName']
# NOTE(review): the ".1" suffix is presumably pandas' rename of a second
# column with the same header in the CSV - confirm this is the POD coordinate.
dfPOD["in_Latitude"] = dfpodin['LatitudeDecimalDegrees.1']
dfPOD["in_Longitude"] = dfpodin['LongitudeDecimalDegrees.1']
dfPOD["in_PODorPOUSite"] = "POD"
dfPOD["in_SiteName"] = "WaDE Unspecified"
# Site IDs are prefixed with "POD" (the POU frame uses a "POU" prefix).
dfPOD["in_SiteNativeID"] = "POD" + dfpodin['PointOfDiversionID'].astype(str)
dfPOD["in_SiteTypeCV"] = "WaDE Unspecified"
dfPOD["in_StateCV"] = "NE"

# Allocation
dfPOD["in_AllocationApplicationDate"] = ""
dfPOD["in_AllocationExpirationDate"] = ""
dfPOD["in_AllocationFlow_CFS"] = dfpodin['AllocationFlow_CFS'].astype(float)
dfPOD["in_AllocationVolume_AF"] = dfpodin['AllocationVolume_AF'].astype(float)
dfPOD['in_AllocationLegalStatusCV'] = dfpodin['RightStatus'].astype(str)
# Stored as text; this is the key the POU data is later merged on.
dfPOD["in_AllocationNativeID"] = dfpodin['RightID'].astype(str)
dfPOD['in_AllocationOwner'] = dfpodin['in_AllocationOwner']
dfPOD['in_AllocationPriorityDate'] = dfpodin['PriorityDate']
dfPOD['in_AllocationTimeframeEnd'] = ""
dfPOD['in_AllocationTimeframeStart'] = ""
dfPOD['in_AllocationTypeCV'] = ""
dfPOD["in_BeneficialUseCategory"] = dfpodin['BeneficialUseCategory']
dfPOD['in_CommunityWaterSupplySystem'] = ""
dfPOD['in_ExemptOfVolumeFlowPriority'] = "0"
dfPOD["in_IrrigatedAcreage"] = ""
dfPOD["in_IrrigationMethodCV"] = ""
dfPOD["in_WaterAllocationNativeURL"] = 'https://nednr.nebraska.gov/dynamic/WaterRights/WaterRights/SWRDetailPageForPublic?RightId=' + dfpodin['RightID'].astype(str)

# Drop rows that are identical across every column, then renumber the index.
dfPOD = dfPOD.drop_duplicates().reset_index(drop=True)
print(len(dfPOD))
dfPOD.head(2)

## Data: NEwr POU data via shapefile
- should already be transformed to WGS format in a csv.

In [None]:
# Input File - BND_SurfaceWaterRights_DNR.zip
# Unlike the POD input, missing values are left as NaN here.
dfpouin = pd.read_csv('BND_SurfaceWaterRights_DNR.zip', compression='zip')

# WaDE UUID tracker for data assessment
# Only created when the column is absent; the input zip is then overwritten so
# later runs find the column already present and skip this step.
if 'WaDEUUID' not in dfpouin:
    dfpouin['WaDEUUID'] = "nePOU" + dfpouin.index.astype(str)
    dfpouin.to_csv('BND_SurfaceWaterRights_DNR.zip', compression=dict(method='zip', archive_name='BND_SurfaceWaterRights_DNR.csv'), index=False)

print(len(dfpouin))
dfpouin.head(1)

In [None]:
# combine some POD data to POU site info
dfpouin = pd.merge(dfpouin, dfPOD[['in_AllocationNativeID', 'in_County', 'in_AllocationLegalStatusCV', 'in_AllocationOwner', 'in_BeneficialUseCategory']], left_on=dfpouin['RightID'].astype(str), right_on='in_AllocationNativeID', how='left')
print(len(dfpouin))
dfpouin.head(1)

In [None]:
# AllocationFlow_CFS - based on reported Unit

def assignAllocationFlow_CFS(colvA, colvB):
    """Return the granted amount when its unit is CFS, otherwise 0.

    Parameters
    ----------
    colvA : the granted amount (may be blank or missing).
    colvB : the unit label for colvA; surrounding whitespace is ignored.

    Returns
    -------
    colvA unchanged when the unit is "CFS"; 0 when the amount is blank or
    missing, or when the unit is anything else (including a missing unit).
    """
    if pd.isnull(colvA) or colvA == '':
        return 0
    # The POU input keeps missing values as NaN, so the unit may not be text;
    # treat that as "not CFS" instead of failing on .strip().
    if isinstance(colvB, str) and colvB.strip() == "CFS":
        return colvA
    return 0

# Flow is taken from ProGrant for rows whose Units value is CFS; the bare
# .unique() call displays the distinct results for review.
dfpouin['AllocationFlow_CFS'] = dfpouin.apply(lambda row: assignAllocationFlow_CFS(row['ProGrant'], row['Units']), axis=1)
dfpouin['AllocationFlow_CFS'].unique()

In [None]:
# AllocationVolume_AF - based on reported Unit

def assignAllocationVolume_AF(colvA, colvB):
    """Return the granted amount when its unit is AF, otherwise 0.

    Parameters
    ----------
    colvA : the granted amount (may be blank or missing).
    colvB : the unit label for colvA; surrounding whitespace is ignored.

    Returns
    -------
    colvA unchanged when the unit is "AF"; 0 when the amount is blank or
    missing, or when the unit is anything else (including a missing unit).
    """
    if pd.isnull(colvA) or colvA == '':
        return 0
    # The POU input keeps missing values as NaN, so the unit may not be text;
    # treat that as "not AF" instead of failing on .strip().
    if isinstance(colvB, str) and colvB.strip() == "AF":
        return colvA
    return 0

# Volume is taken from ProGrant for rows whose Units value is AF; the bare
# .unique() call displays the distinct results for review.
dfpouin['AllocationVolume_AF'] = dfpouin.apply(lambda row: assignAllocationVolume_AF(row['ProGrant'], row['Units']), axis=1)
dfpouin['AllocationVolume_AF'].unique()

In [None]:
# Create VariableSpecificCv value
def createVariableSpecificCv(unit):
    """Pick the variable-specific identifier that matches a unit label.

    Parameters
    ----------
    unit : the unit label; surrounding whitespace is ignored.

    Returns
    -------
    str : "NEwr_V2" when the unit is "AF"; "NEwr_V1" for "CFS" and for every
    other value, including a blank or missing unit.
    """
    # The POU input keeps missing values as NaN, so the unit may not be text;
    # treat that as blank instead of failing on .strip().
    unitText = unit.strip() if isinstance(unit, str) else ""
    if unitText == "AF":
        return "NEwr_V2"
    return "NEwr_V1"

# Choose the variable identifier for each row from its Units value; the bare
# .unique() call displays the distinct results for review.
dfpouin['in_VariableSpecificUUID'] = dfpouin.apply(lambda row: createVariableSpecificCv(row['Units']), axis=1)
dfpouin['in_VariableSpecificUUID'].unique()

In [None]:
# Creating the output Dataframe for POUs.

# Start from an empty frame that shares dfpouin's index so every column
# assigned below lines up row-for-row with the (merged) POU input.
# The order of the assignments sets the column order of the output.
dfPOU = pd.DataFrame(index=dfpouin.index)

# Data Assessment UUID
dfPOU['WaDEUUID'] = dfpouin['WaDEUUID']

# Variable
dfPOU["in_VariableSpecificUUID"] = dfpouin['in_VariableSpecificUUID']

# Water Source
dfPOU["in_WaterSourceName"] = dfpouin['SourceName']
dfPOU["in_WaterSourceTypeCV"] = "Surface Water"

# Site
dfPOU["in_CoordinateAccuracy"] = "WaDE Unspecified"
dfPOU["in_CoordinateMethodCV"] = "WaDE Unspecified"
dfPOU['in_HUC12'] = ""
dfPOU['in_HUC8'] = ""
dfPOU['in_County'] = dfpouin['in_County']  #from POD data
dfPOU["in_Latitude"] = dfpouin['Latitude']
dfPOU["in_Longitude"] = dfpouin['Longitude']
dfPOU["in_PODorPOUSite"] = "POU"
dfPOU["in_SiteName"] = "WaDE Unspecified"
# Site IDs are prefixed with "POU"; the shapefile geometry table built later
# uses the same "POU" + OBJECTID form.
dfPOU["in_SiteNativeID"] = "POU" + dfpouin['OBJECTID'].astype(str)
dfPOU["in_SiteTypeCV"] = "WaDE Unspecified"
dfPOU["in_StateCV"] = "NE"

# Allocation
dfPOU["in_AllocationApplicationDate"] = ""
dfPOU["in_AllocationExpirationDate"] = ""
dfPOU["in_AllocationFlow_CFS"] = dfpouin['AllocationFlow_CFS'].astype(float)
dfPOU["in_AllocationVolume_AF"] = dfpouin['AllocationVolume_AF'].astype(float)
dfPOU['in_AllocationLegalStatusCV'] = dfpouin['in_AllocationLegalStatusCV']  #from POD data
dfPOU["in_AllocationNativeID"] = dfpouin['RightID'].astype(str)
dfPOU['in_AllocationOwner'] = dfpouin['in_AllocationOwner']  #from POD data
dfPOU['in_AllocationPriorityDate'] = dfpouin['PriorityDa']
dfPOU['in_AllocationTimeframeEnd'] = ""
dfPOU['in_AllocationTimeframeStart'] = ""
dfPOU['in_AllocationTypeCV'] = ""
dfPOU["in_BeneficialUseCategory"] = dfpouin['in_BeneficialUseCategory']  #from POD data
dfPOU['in_CommunityWaterSupplySystem'] = ""
dfPOU['in_ExemptOfVolumeFlowPriority'] = "0"
# Missing acreage becomes 0 after the float conversion.
dfPOU["in_IrrigatedAcreage"] = dfpouin['Acres_Orde'].astype(float).fillna(0)
dfPOU["in_IrrigationMethodCV"] = ""
dfPOU["in_WaterAllocationNativeURL"] = 'https://nednr.nebraska.gov/dynamic/WaterRights/WaterRights/SWRDetailPageForPublic?RightId=' + dfpouin['RightID'].astype(str)

# Drop rows that are identical across every column, then renumber the index.
dfPOU = dfPOU.drop_duplicates().reset_index(drop=True)
print(len(dfPOU))
dfPOU.head(2)

In [None]:
# Unique values for 'WaterSourceTypeCV'
for x in dfPOU['in_BeneficialUseCategory'].sort_values().unique():
    print(f'"' + x + '",')

## Concatenate POD and POU Data.  Clean Data.

In [None]:
# Concatenate
# Stack the POD rows on top of the POU rows.
frames = [dfPOD, dfPOU]
dfout = pd.concat(frames)

#Removing all NaN Values and replacing with blank
# NOTE(review): reset_index() is called without drop=True, so the previous
# row labels are kept as a new 'index' column in dfout - confirm that column
# is wanted in the exported file.
dfout = dfout.replace(np.nan, "", regex=True).reset_index()

print(len(dfout))
dfout.head(1)

In [None]:
# Fixing empty string names

def fixEmptyString(val):
    """Replace a blank or missing value with "WaDE Unspecified".

    Parameters
    ----------
    val : any value; non-text values are converted with str().

    Returns
    -------
    str : the value as text with surrounding whitespace trimmed, or
    "WaDE Unspecified" when it is missing (None / NaN), empty, or only
    whitespace.
    """
    # The null check must come before str(): str(None) and str(nan) are
    # non-empty text and would otherwise pass through as "None" / "nan".
    if pd.isnull(val):
        return "WaDE Unspecified"
    cleaned = str(val).strip()
    return cleaned if cleaned else "WaDE Unspecified"

In [None]:
# Blank water source names become "WaDE Unspecified"; .unique() shows the result.
dfout['in_WaterSourceName'] = dfout.apply(lambda row: fixEmptyString(row['in_WaterSourceName']), axis=1)
dfout['in_WaterSourceName'].unique()

In [None]:
# Blank county names become "WaDE Unspecified"; .unique() shows the result.
dfout['in_County'] = dfout.apply(lambda row: fixEmptyString(row['in_County']), axis=1)
dfout['in_County'].unique()

In [None]:
# Blank legal status values become "WaDE Unspecified"; .unique() shows the result.
dfout['in_AllocationLegalStatusCV'] = dfout.apply(lambda row: fixEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
dfout['in_AllocationLegalStatusCV'].unique()

In [None]:
# Blank beneficial use values become "WaDE Unspecified"; .unique() shows the result.
dfout['in_BeneficialUseCategory'] = dfout.apply(lambda row: fixEmptyString(row['in_BeneficialUseCategory']), axis=1)
dfout['in_BeneficialUseCategory'].unique()

In [None]:
# Fixing null or empty lat and long values
# Values that cannot be read as numbers become NaN (errors='coerce') and are
# then replaced with 0.
dfout['in_Latitude'] = pd.to_numeric(dfout['in_Latitude'], errors='coerce').fillna(0)
dfout['in_Longitude'] = pd.to_numeric(dfout['in_Longitude'], errors='coerce').fillna(0)

In [None]:
# Changing datatype of date fields to fit WaDE.
# Parse the priority dates; values that cannot be parsed become NaT
# (errors='coerce').
dfout['in_AllocationPriorityDate'] = pd.to_datetime(dfout['in_AllocationPriorityDate'], errors = 'coerce')
# Format as '%m/%d/%Y' text and parse again, so only the calendar date
# (month/day/year) survives the round trip.
dfout['in_AllocationPriorityDate'] = pd.to_datetime(dfout["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
dfout['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Non-numeric flow values become NaN (errors='coerce') and are then set to 0.
dfout['in_AllocationFlow_CFS'] = pd.to_numeric(dfout['in_AllocationFlow_CFS'], errors='coerce').fillna(0)
dfout['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Non-numeric volume values become NaN (errors='coerce') and are then set to 0.
dfout['in_AllocationVolume_AF'] = pd.to_numeric(dfout['in_AllocationVolume_AF'], errors='coerce').fillna(0)
dfout['in_AllocationVolume_AF'].unique()

In [None]:
# Making Sure datatype of HUC12 is int.

def assignHUC12(colrowValue):
    """Convert a HUC12 value to int, leaving blank or missing values as ''.

    Parameters
    ----------
    colrowValue : a number, numeric text, a blank string, or a missing value.

    Returns
    -------
    int for a usable value; '' when the value is missing (None / NaN), empty,
    or only whitespace.

    Raises
    ------
    ValueError : if the value is text that is not numeric.
    """
    if pd.isnull(colrowValue):
        return ''
    if isinstance(colrowValue, str):
        colrowValue = colrowValue.strip()
        if colrowValue == '':
            return ''
        try:
            return int(colrowValue)
        except ValueError:
            # Text such as "102500010101.0" is not accepted by int() directly;
            # go through float first.
            return int(float(colrowValue))
    return int(colrowValue)

# Convert every HUC12 value with assignHUC12; .unique() shows the result.
dfout['in_HUC12'] = dfout.apply(lambda row: assignHUC12(row['in_HUC12']), axis=1)
dfout['in_HUC12'].unique()

## WaDE Custom Elements (due to missing state site info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the text form of colrowValue prefixed with "wadeID"."""
    return "wadeID" + str(colrowValue)

# Lookup table: one row per distinct water source name found in dfout.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = dfout['in_WaterSourceName']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# dftemp reuses the lookup table's index so the running count (1, 2, 3, ...)
# and the IDs derived from it line up with the rows kept after de-duplication.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the WaDE water source ID for a water source name.

    Searches dfWaterSourceNativeID for rows whose in_WaterSourceName equals A
    and returns the in_WaterSourceNativeID of the first match. Returns '' when
    A is blank or missing, or when no row matches.
    """
    if A == '' or pd.isnull(A):
        return ''
    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    matchedIDs = dfWaterSourceNativeID.loc[nameMatches, 'in_WaterSourceNativeID']
    # First match wins; an empty selection means the name is unknown.
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the custom water source ID to every row by looking up its water
# source name; .unique() shows the result.
dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceName']), axis=1)
dfout['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# Read the POU shapefile (path is relative to the working directory) and show
# its row count and first record.
dfPoUshapetemp = gpd.read_file('shapefile/BND_SurfaceWaterRights_DNR.shp')
print(len(dfPoUshapetemp))
dfPoUshapetemp.head(1)

In [None]:
# Two-column geometry table keyed by site ID.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same "POU" + OBJECTID form as the in_SiteNativeID built for the POU rows,
# so the geometry can be matched to them.
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['OBJECTID'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Keep the first of any fully identical rows; original row labels are retained.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfPoUshape.head(3)

# Export the Output 

In [None]:
#technique to check datatype of long dataframes.
# Temporarily lift pandas' row/column display limits so the dtype of every
# column in dfout is printed in full.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Exporting output files.
# Both files are written to the working directory as zip-compressed CSVs,
# without the row index.
dfout.to_csv('Pwr_neWaterRightMain.zip', index=False, compression="zip")  # The output, save as a zip.
dfPoUshape.to_csv('Pwr_neGeometry.zip', index=False, compression="zip")  # The output geometry, save as zip.