# Pre-processing North Dakota Allocation data for WaDEQA upload.
Date Updated: 03/28/2023
Purpose:  To pre-process the ND data into one master file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
# Notebook display settings: render floats with five fixed decimals instead of
# scientific notation, and show up to 999 columns of a DataFrame.
pd.set_option('display.float_format', '{:.5f}'.format)
pd.set_option('display.max_columns', 999)

In [None]:
# Working Directory
# The process's current directory is switched to this folder, so the relative
# file names used for reading input and writing output resolve against it.
workingDir = "G:/Shared drives/WaDE Data/NorthDakota/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Data: Permit

In [None]:
# Input File
# Load the raw permit table; missing values are replaced with empty strings.
dfinPOD = pd.read_csv("Permits_input.zip", encoding="ISO-8859-1", compression='zip').replace(np.nan, "")

# WaDE UUID tracker for data assessment
# When the input has no WaDEUUID column yet, tag each row with "ndwr" + its row index.
if 'WaDEUUID' not in dfinPOD:
    # The index must come from dfinPOD itself; `df` (the output frame) is only
    # created further down the notebook and is not defined at this point.
    dfinPOD['WaDEUUID'] = "ndwr" + dfinPOD.index.astype(str)
    # NOTE(review): the tagged copy is saved as a .csv while the read above
    # loads the .zip, so the next run will not see the saved WaDEUUID column
    # unless the archive is refreshed -- confirm this is intended.
    dfinPOD.to_csv('Permits_input.csv', index=False)

print(len(dfinPOD))
dfinPOD.head(1)

In [None]:
# Build the owner name from 'permit_hol' (first two comma-separated parts joined by a space), then strip punctuation
import re

def createOwnerName(val):
    """Return an owner name with its first two comma-separated parts joined by a space.

    An empty string or null value gives "". A value without a comma is
    returned as text with surrounding whitespace removed. When commas are
    present only the first two parts are used; each is stripped before joining.
    """
    if val == "" or pd.isnull(val):
        return ""

    text = str(val).strip()
    if "," not in text:
        return text

    pieces = text.split(",")
    return pieces[0].strip() + " " + pieces[1].strip()
# Derive the owner name directly from the 'permit_hol' column. Series.apply
# passes each cell to the helper, so no per-row Series has to be built just to
# read one field.
dfinPOD['in_AllocationOwner'] = dfinPOD['permit_hol'].apply(createOwnerName)

def cleanOwnerDataFunc(Val):
    """Remove the characters $ @ & . ; , / ) ( - from Val and trim the result.

    Val must be a string. Characters are deleted, not replaced, so text on
    either side of a removed character is joined together; leading and
    trailing whitespace is stripped afterwards.
    """
    # Raw string: the pattern holds a backslash-escaped ")" which is not a
    # valid escape sequence in an ordinary string literal.
    return re.sub(r"[$@&.;,/\)(-]", "", Val).strip()
# Strip punctuation from every owner name. Series.apply maps the column's
# values straight through the helper instead of iterating whole rows.
dfinPOD['in_AllocationOwner'] = dfinPOD['in_AllocationOwner'].apply(cleanOwnerDataFunc)
dfinPOD['in_AllocationOwner'].unique()

In [None]:
# Native URL per permit: the fixed single-permit page address followed by the
# text form of 'permit_ind'.
permitIndText = dfinPOD['permit_ind'].astype(str)
dfinPOD['in_WaterAllocationNativeURL'] = "https://www.swc.nd.gov/info_edu/map_data_resources/waterpermits/single.php?id=" + permitIndText
dfinPOD['in_WaterAllocationNativeURL'].unique()

In [None]:
# create output POD dataframe
# One output row per input permit row, laid out in WaDE "in_*" columns.
# Fields with no value taken from the input are set to "" or a fixed constant.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "NDwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NDwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "NDwr_O1"

# Water Source
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df["in_WaterSourceName"] = dfinPOD['source_nam']
df['in_WaterSourceNativeID'] = ""
df["in_WaterSourceTypeCV"] = dfinPOD['source']

# Site
# (in_Geometry is already set to "" in the Water Source group above.)
df["in_CoordinateAccuracy"] = "WaDE Unspecified"
df["in_CoordinateMethodCV"] = "Centroid of Area"
df['in_County'] = dfinPOD['county']
df['in_EPSGCodeCV'] = 4326
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df["in_Latitude"] = dfinPOD['latitude']
df["in_Longitude"] = dfinPOD['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df["in_PODorPOUSite"] = "POD"
df["in_SiteName"] = "WaDE Unspecified"
# Blank/missing 'pod' values are replaced by 0 before the "POD" prefix is added.
df["in_SiteNativeID"] = "POD" + dfinPOD['pod'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df["in_SiteTypeCV"] = "WaDE Unspecified"
df["in_StateCV"] = "ND"
df['in_USGSSiteID'] = ""

# Allocation
df["in_AllocationApplicationDate"] = dfinPOD['date_issue']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df["in_AllocationExpirationDate"] = dfinPOD['date_cance']
# Missing values in the input were replaced with "" when it was loaded, and a
# plain astype(float) cannot convert ""; coerce unparseable entries to NaN first.
df["in_AllocationFlow_CFS"] = pd.to_numeric(dfinPOD['req_rate'], errors='coerce').astype(float)
df['in_AllocationLegalStatusCV'] = dfinPOD['pod_status'].astype(str)
df["in_AllocationNativeID"] = dfinPOD['permit_num'].astype(str)
df['in_AllocationOwner'] = dfinPOD['in_AllocationOwner']
df['in_AllocationPriorityDate'] = dfinPOD['priority_d']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOD['period_end']
df['in_AllocationTimeframeStart'] = dfinPOD['period_sta']
df['in_AllocationTypeCV'] = ""
# Same blank-safe numeric conversion as for the flow column.
df["in_AllocationVolume_AF"] = pd.to_numeric(dfinPOD['req_acft'], errors='coerce').astype(float)
df["in_BeneficialUseCategory"] = dfinPOD['use_type']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# dfinPOD['in_WaterAllocationNativeURL'] already holds the complete URL (base
# address + permit index); prefixing the base address again would duplicate it.
df['in_WaterAllocationNativeURL'] = dfinPOD['in_WaterAllocationNativeURL']

# Keep an independent, de-duplicated copy with a fresh 0..N-1 index.
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

## Concatenate and Clean Data

In [None]:
# Combine the prepared frames (currently only the POD frame) into one table,
# drop exact duplicate rows, renumber the rows, and blank out missing values.
frames = [outPOD]
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

## WaDE Custom Elements (due to missing state info)

In [None]:
# Fixing empty string names

def fixEmptyString(val):
    """Return "WaDE Unspecified" for a blank or missing value, otherwise val unchanged.

    Treated as blank/missing: "", a single space, the text "nan", and any
    value pandas reports as null.
    """
    if val in ("", " ", "nan") or pd.isnull(val):
        return "WaDE Unspecified"
    return val

In [None]:
# Replace blank/missing water source names with the placeholder. Series.apply
# feeds the column's values to the helper without building a Series per row.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(fixEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Replace blank/missing water source types with the placeholder. Series.apply
# feeds the column's values to the helper without building a Series per row.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(fixEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Replace blank/missing county names with the placeholder. Series.apply feeds
# the column's values to the helper without building a Series per row.
outdf['in_County'] = outdf['in_County'].apply(fixEmptyString)
outdf['in_County'].unique()

In [None]:
# Replace blank/missing site types with the placeholder. Series.apply feeds
# the column's values to the helper without building a Series per row.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].apply(fixEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Replace blank/missing legal status values with the placeholder. Series.apply
# feeds the column's values to the helper without building a Series per row.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].apply(fixEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
# Replace blank/missing beneficial use values with the placeholder. Series.apply
# feeds the column's values to the helper without building a Series per row.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].apply(fixEmptyString)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude & in_Longitude
# Convert both coordinate columns to numbers; entries that cannot be parsed
# become NaN and are then filled with 0.
for coordCol in ('in_Latitude', 'in_Longitude'):
    outdf[coordCol] = pd.to_numeric(outdf[coordCol], errors='coerce').fillna(0)
outdf.head(1)

In [None]:
# Changing datatype of date fields to fit WaDE.
# Each column is parsed to datetimes (unparseable entries become NaT),
# rendered as MM/DD/YYYY text, and parsed back to datetimes.
for dateCol in (
    'in_AllocationApplicationDate',
    'in_AllocationExpirationDate',
    'in_AllocationPriorityDate',
):
    outdf[dateCol] = pd.to_datetime(outdf[dateCol], errors='coerce')
    outdf[dateCol] = pd.to_datetime(outdf[dateCol].dt.strftime('%m/%d/%Y'))
outdf.head()

In [None]:
# Parse the timeframe end (unparseable entries become NaT) and keep only its
# month/day as MM/DD text.
outdf['in_AllocationTimeframeEnd'] = pd.to_datetime(
    outdf['in_AllocationTimeframeEnd'], errors='coerce'
).dt.strftime('%m/%d')
outdf['in_AllocationTimeframeEnd'].unique()

In [None]:
# Parse the timeframe start (unparseable entries become NaT) and keep only its
# month/day as MM/DD text.
outdf['in_AllocationTimeframeStart'] = pd.to_datetime(
    outdf['in_AllocationTimeframeStart'], errors='coerce'
).dt.strftime('%m/%d')
outdf['in_AllocationTimeframeStart'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
# Non-numeric entries are coerced to NaN and then replaced by 0.
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNumeric.fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
# Non-numeric entries are coerced to NaN and then replaced by 0.
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeNumeric.fillna(0)
outdf['in_AllocationVolume_AF'].unique()

In [None]:
%%time

# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return "wadeID" followed by the string form of colrowValue."""
    return f"wadeID{colrowValue!s}"

# Unique (name, type) pairs present in the output table; each surviving row
# keeps the row label it had in outdf.
dfWaterSourceNativeID = outdf[['in_WaterSourceName', 'in_WaterSourceTypeCV']].drop_duplicates()

# Number the unique pairs 1..N in their current order, then turn each number
# into its "wadeID<n>" identifier.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].apply(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Return the custom water source id for name A and type B.

    Gives '' when both arguments are empty strings or both are null, and also
    when no row of dfWaterSourceNativeID matches the pair. If several rows
    match, the first one is used.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    typeMatches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    hits = dfWaterSourceNativeID.loc[nameMatches & typeMatches, 'in_WaterSourceNativeID']
    return '' if hits.empty else hits.iloc[0]

# Look up the custom id for every row's (name, type) pair. Walking the two
# columns in lockstep avoids building a Series per row just to read two fields.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

# Export the Output 

In [None]:
# Technique to check the datatype of every column of a wide dataframe.
# The row/column display limits are lifted only inside this `with` block, so
# the dtype listing is printed without truncation.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(outdf.dtypes)

In [None]:
# Export the output dataframe
# The file name is relative, so it is written to the current working directory;
# the row index is left out of the file.
outdf.to_csv('Pwr_ndMain.zip', index=False, compression="zip")  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.