# Pre-processing Oklahoma Allocation data for WaDEQA upload.
Date Updated: 04/07/2020
Purpose:  To pre-process the Oklahoma data into one master file for simple DataFrame creation and extraction, and to validate datatypes and other data-related information.

In [None]:
# Needed libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [None]:
# Working directory and input files.
# The process working directory is switched to this folder, so every relative
# path used by the read/write calls that follow resolves against it.
workingDir = "G:/Shared drives/WaDE Data/Oklahoma/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Division Data
- groundwater wells
- surface water divisions

In [None]:
# Groundwater wells: load the zipped CSV and blank out missing values.
PGW_Input = "Permitted_Groundwater_Wells_input.zip"
df_PGW = pd.read_csv(PGW_Input)
df_PGW = df_PGW.replace(np.nan, "")
df_PGW = df_PGW.replace("nan,nan", "")

# WaDE UUID tracker for data assessment.
# Only stamped (and written back to the zip) when the column is not there yet.
if 'WaDEUUID' not in df_PGW.columns:
    df_PGW['WaDEUUID'] = "okGD" + df_PGW.index.astype(str)
    df_PGW.to_csv(
        'Permitted_Groundwater_Wells_input.zip',
        compression=dict(method='zip', archive_name='Permitted_Groundwater_Wells_input.csv'),
        index=False,
    )

print(len(df_PGW))
df_PGW.head()

In [None]:
# Surface water diversion points.
# The WaDEUUID-stamped copy of this table is saved as a zip (see to_csv below).
# Prefer that zip when it exists so the stored WaDEUUID column is actually read
# back on later runs; fall back to the raw CSV on the first run.
PSWDP_Output = "Permitted_Surface_Water_Diversion_Points_input.zip"
if os.path.exists(PSWDP_Output):
    PSWDP_Input = PSWDP_Output
else:
    PSWDP_Input = "Permitted_Surface_Water_Diversion_Points_input.csv"
df_PSWDP = pd.read_csv(PSWDP_Input).replace(np.nan, "").replace("nan,nan", "")

# WaDE UUID tracker for data assessment.
# Only stamped (and written to the zip) when the column is not there yet.
if 'WaDEUUID' not in df_PSWDP:
    df_PSWDP['WaDEUUID'] = "okSD" + df_PSWDP.index.astype(str)
    df_PSWDP.to_csv(PSWDP_Output, compression=dict(method='zip', archive_name='Permitted_Surface_Water_Diversion_Points_input.csv'), index=False)

print(len(df_PSWDP))
df_PSWDP.head()

In [None]:
# Stack the groundwater and surface-water tables into one POD table.
# Both datasets share the same columns; the row index is rebuilt from 0.
podFrames = [df_PGW, df_PSWDP]
dfPOD = pd.concat(podFrames, ignore_index=True)
dfPOD = dfPOD.reset_index(drop=True)

print(len(dfPOD))
dfPOD.head(1)

In [None]:
# Convert the date fields that are used later to datetime.
# Unparseable values become NaT; the strftime round trip keeps only the
# month/day/year part of each value.
for podDateCol in ('DATE_FILED', 'DATE_ISSUED'):
    podParsed = pd.to_datetime(dfPOD[podDateCol], errors='coerce')
    dfPOD[podDateCol] = pd.to_datetime(podParsed.dt.strftime('%m/%d/%Y'))

In [None]:
# # Creating WaDE Custom site native ID for easy site identificaiion
# # ----------------------------------------------------------------------------------------------------

# # Create temp SiteNativeID dataframe of unique site.
# def assignSiteUUID(colrowValue):
#     string1 = str(colrowValue)
#     outstring = "wadeID" + string1
#     return outstring

# dfSiteNativeID = pd.DataFrame()
# dfSiteNativeID['in_Latitude'] = dfPOD['LATITUDE']
# dfSiteNativeID['in_Longitude'] = dfPOD['LONGITUDE']
# dfSiteNativeID = dfSiteNativeID.drop_duplicates()

# dftemp = pd.DataFrame(index=dfSiteNativeID.index)
# dftemp["Count"] = range(1, len(dftemp.index) + 1)
# dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)
# dfSiteNativeID['linkKey'] = dfSiteNativeID['in_Latitude'].astype(str) + dfSiteNativeID['in_Longitude'].astype(str)

# # ----------------------------------------------------------------------------------------------------

# # Retreive WaDE Custom site native ID
# SiteNativeIDdict = pd.Series(dfSiteNativeID.in_SiteNativeID.values, index=dfSiteNativeID.linkKey.astype(str)).to_dict()
# def retrieveSiteNativeID(A, B):
#     if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
#         outList = ''
#     else:
#         colrowValue = str(A).strip() + str(B).strip()
#         try:
#             outList = SiteNativeIDdict[colrowValue]
#         except:
#             outList = ''
#     return outList

# dfPOD['in_SiteNativeID'] = dfPOD.apply(lambda row: retrieveSiteNativeID( row['LATITUDE'], row['LONGITUDE']), axis=1)
# dfPOD['in_SiteNativeID'] = "POD" + dfPOD['in_SiteNativeID'].astype(str)
# dfPOD.head(2)

In [None]:
# Tag every row of the POD table with the constant "POD".
dfPOD['in_PODorPOUSite'] = "POD"

In [None]:
# Site native ID = "POD" prefix + RECORD_ID as whitespace-trimmed text.
podRecordText = dfPOD['RECORD_ID'].astype(str).str.strip()
dfPOD['in_SiteNativeID'] = "POD" + podRecordText
dfPOD['in_SiteNativeID'].unique()

## Place of Use Data

In [None]:
# Areas of use: load the zipped CSV and blank out missing values.
AOU_Input = "OK_AreasofUse_input.zip"
dfPOU = pd.read_csv(AOU_Input)
dfPOU = dfPOU.replace(np.nan, "")
dfPOU = dfPOU.replace("nan,nan", "")

# WaDE UUID tracker for data assessment.
# Only stamped (and written back to the zip) when the column is not there yet.
if 'WaDEUUID' not in dfPOU.columns:
    dfPOU['WaDEUUID'] = "okU" + dfPOU.index.astype(str)
    dfPOU.to_csv(
        'OK_AreasofUse_input.zip',
        compression=dict(method='zip', archive_name='OK_AreasofUse_input.csv'),
        index=False,
    )

print(len(dfPOU))
dfPOU.head()

In [None]:
# Convert the date fields that are used later to datetime.
# Unparseable values become NaT; the strftime round trip keeps only the
# month/day/year part of each value.
for pouDateCol in ('DATE_FILED', 'DATE_ISSUED'):
    pouParsed = pd.to_datetime(dfPOU[pouDateCol], errors='coerce')
    dfPOU[pouDateCol] = pd.to_datetime(pouParsed.dt.strftime('%m/%d/%Y'))

In [None]:
# # Creating WaDE Custom site native ID for easy site identificaiion
# # ----------------------------------------------------------------------------------------------------

# # Create temp SiteNativeID dataframe of unique site.
# def assignSiteUUID(colrowValue):
#     string1 = str(colrowValue)
#     outstring = "wadeID" + string1
#     return outstring

# dfSiteNativeID = pd.DataFrame()
# dfSiteNativeID['in_Latitude'] = dfPOU['LATITUDE']
# dfSiteNativeID['in_Longitude'] = dfPOU['LONGITUDE']
# dfSiteNativeID = dfSiteNativeID.drop_duplicates()

# dftemp = pd.DataFrame(index=dfSiteNativeID.index)
# dftemp["Count"] = range(1, len(dftemp.index) + 1)
# dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)
# dfSiteNativeID['linkKey'] = dfSiteNativeID['in_Latitude'].astype(str) + dfSiteNativeID['in_Longitude'].astype(str)

# # ----------------------------------------------------------------------------------------------------

# # Retreive WaDE Custom site native ID
# SiteNativeIDdict = pd.Series(dfSiteNativeID.in_SiteNativeID.values, index=dfSiteNativeID.linkKey.astype(str)).to_dict()
# def retrieveSiteNativeID(A, B):
#     if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
#         outList = ''
#     else:
#         colrowValue = str(A).strip() + str(B).strip()
#         try:
#             outList = SiteNativeIDdict[colrowValue]
#         except:
#             outList = ''
#     return outList

# dfPOU['in_SiteNativeID'] = dfPOU.apply(lambda row: retrieveSiteNativeID( row['LATITUDE'], row['LONGITUDE']), axis=1)
# dfPOU['in_SiteNativeID'] = "POU" + dfPOU['in_SiteNativeID'].astype(str)
# dfPOU.head(2)

In [None]:
# Tag every row of the POU table with the constant "POU".
dfPOU['in_PODorPOUSite'] = "POU"

In [None]:
# Site native ID = "POU" prefix + RECORD_ID as whitespace-trimmed text.
pouRecordText = dfPOU['RECORD_ID'].astype(str).str.strip()
dfPOU['in_SiteNativeID'] = "POU" + pouRecordText
dfPOU['in_SiteNativeID'].unique()

## Concatenate POD and POU

In [None]:
# Stack the POD and POU tables into the single working input table.
# Both datasets share the same columns; the row index is rebuilt from 0 and
# any remaining NaN is replaced with an empty string.
dfin = pd.concat([dfPOD, dfPOU], ignore_index=True)
dfin = dfin.reset_index(drop=True)
dfin = dfin.replace(np.nan, '')

print(len(dfin))
dfin.head()

In [None]:
#Fixing Beneficial Uses PRIMARY_PURPOSE
def fixRecFishWild(colrowValue):
    """Return the trimmed text of ``colrowValue`` with one label rewritten.

    The value is converted with ``str()`` and stripped. The exact label
    'Recreation, Fish, Wildlife' is returned without its commas; every other
    value is returned as the stripped text.
    """
    purpose = str(colrowValue).strip()
    return 'Recreation Fish Wildlife' if purpose == 'Recreation, Fish, Wildlife' else purpose

# Apply the label fix element-wise on the single column. Series.map avoids
# building a full row Series for every record, which row-wise
# DataFrame.apply(axis=1) does.
dfin['PRIMARY_PURPOSE'] = dfin['PRIMARY_PURPOSE'].map(fixRecFishWild)
dfin['PRIMARY_PURPOSE'].unique()

In [None]:
# Create the output dataframe from the combined input table (dfin).
# Columns are added one at a time, so the order of the assignments below is
# the column order of the output. Scalar assignments fill every row with the
# same value.
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "OKwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "OKwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "OKwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfin['WATER']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # already created under WaterSource Info; this re-assigns the same empty value
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
# HYDRO_UNIT -> integer-formatted text: blanks/NaN are set to 0, the value is cast
# float -> int -> str, and a resulting "0" is turned back into "".
# The int cast drops any leading zeros.
df['in_HUC8'] = dfin['HYDRO_UNIT'].replace("", 0).replace(" ", 0).fillna(0).astype(float).astype(int).astype(str).replace("0", "")
df['in_Latitude'] = dfin['LATITUDE']
df['in_Longitude'] = dfin['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfin['in_PODorPOUSite'] # "POD"/"POU" flag set on the source tables before they were combined
df['in_SiteName'] = ""
df['in_SiteNativeID'] = dfin['in_SiteNativeID'].replace("", 0).fillna(0).astype(str) # blank/NaN ids end up as the text "0"
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "OK"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = dfin['DATE_FILED']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = dfin['STATUS']
# blank/NaN permit numbers end up as the text "0"
df['in_AllocationNativeID'] =  dfin['PERMIT_NUMBER'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin['ENTITY_NAME']
df['in_AllocationPriorityDate'] = dfin['DATE_ISSUED']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = dfin['TOTAL_PERMITTED_ACRE_FEET']
df['in_BeneficialUseCategory'] = dfin['PRIMARY_PURPOSE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

# Work on a copy: drop fully identical rows, rebuild the index from 0 and
# replace any remaining NaN with an empty string.
outdf = df.copy()
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, '')
print(len(outdf))
outdf.head()

## Data Fixes

In [None]:
# Clean owner name up
def cleanOwnerDataFunc(Val):
    """Return a cleaned, title-cased owner name.

    Removes the characters ``$ @ & . ; , / ) ( -``, title-cases the text,
    collapses every run of spaces to a single space and trims the ends.

    Parameters:
        Val: the raw owner value; converted with ``str()`` unless it is null.

    Returns:
        The cleaned text, or "" when ``Val`` is null (None / NaN), so a
        missing owner never turns into the literal text "None" or "Nan".
    """
    if pd.isnull(Val):
        return ""
    # Raw string: the previous non-raw pattern contained the invalid escape "\)".
    text = re.sub(r"[$@&.;,/)(-]", "", str(Val)).title()
    # A single replace("  ", " ") leaves runs of three or more spaces behind.
    return re.sub(r" {2,}", " ", text).strip()
# Clean the owner column element-wise. Series.map avoids building a full row
# Series for every record, which row-wise DataFrame.apply(axis=1) does.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(cleanOwnerDataFunc)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String

def ensureEmptyString(val):
    """Return ``val`` as trimmed text, mapping every "missing" form to "".

    Parameters:
        val: any scalar cell value.

    Returns:
        "" when ``val`` is null (None, NaN, NaT, ...), blank after trimming,
        or the literal text "nan"; otherwise the trimmed ``str(val)``.
    """
    # The null check must run before str(): once converted, pd.isnull() can
    # never be true and None / NaT would leak through as "None" / "NaT".
    if pd.isnull(val):
        return ""
    text = str(val).strip()
    return "" if text in ("", "nan") else text

In [None]:
# Normalise blanks element-wise. Series.map avoids building a full row Series
# for every record, which row-wise DataFrame.apply(axis=1) does.
outdf['in_AllocationLegalStatusCV'] = outdf['in_AllocationLegalStatusCV'].map(ensureEmptyString)
outdf['in_AllocationLegalStatusCV'].unique()

In [None]:
# Normalise blanks element-wise. Series.map avoids building a full row Series
# for every record, which row-wise DataFrame.apply(axis=1) does.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# Normalise blanks element-wise. Series.map avoids building a full row Series
# for every record, which row-wise DataFrame.apply(axis=1) does.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# in_Latitude: numeric where parseable, "" where not.
latitudeNumeric = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
outdf['in_Latitude'] = latitudeNumeric.fillna("")
outdf['in_Latitude'].unique()

In [None]:
# in_Longitude: numeric where parseable, "" where not.
longitudeNumeric = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
outdf['in_Longitude'] = longitudeNumeric.fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure.
# Unparseable values become NaT; the strftime round trip keeps only the
# month/day/year part of each value.
priorityParsed = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce')
priorityText = priorityParsed.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityText)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype: numeric where parseable,
# "" for zero or unparseable/missing values.
flowNumeric = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNumeric.replace(0, "").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype: numeric where parseable,
# "" for zero or unparseable/missing values.
volumeNumeric = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeNumeric.replace(0, "").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the text "wadeID" followed by ``str(colrowValue)``."""
    return f"wadeID{colrowValue!s}"

# One row per distinct (water source name, water source type) pair.
# Both parts are converted to text and trimmed BEFORE de-duplicating: the
# lookup function strips both parts when it builds its key, so a key built
# here from untrimmed values would never match a padded value.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct pairs 1..n and turn each number into a "wadeID<n>" id.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = [assignWaterSourceNativeID(count) for count in dftemp["Count"]]
# Lookup key = trimmed name immediately followed by trimmed type.
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'] + dfWaterSourceNativeID['in_WaterSourceTypeCV']

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
# Map each link key (as text) to its generated water source native ID.
WaterSourceNativeIDdict = dict(zip(dfWaterSourceNativeID.linkKey.astype(str), dfWaterSourceNativeID.in_WaterSourceNativeID.values))
def retrieveWaterSourceNativeID(A, B):
    """Look up the generated water source native ID for a (name, type) pair.

    Parameters:
        A: water source name cell value.
        B: water source type cell value.

    Returns:
        The ID stored in ``WaterSourceNativeIDdict`` under the key
        ``str(A).strip() + str(B).strip()``, or "" when both inputs are
        empty strings, both are null, or the key is not in the dictionary.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''
    linkKey = str(A).strip() + str(B).strip()
    # dict.get covers the "unknown key" case only; the previous bare except
    # also hid unrelated errors (e.g. a missing lookup dictionary).
    return WaterSourceNativeIDdict.get(linkKey, '')

# Resolve the ID for every row from its (name, type) pair. Iterating the two
# columns in lockstep avoids building a full row Series for every record,
# which row-wise DataFrame.apply(axis=1) does.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to csv inputs.

In [None]:
# PoU shapefile data.
# Read the shapefile (path is relative to the working directory) with
# geopandas and preview the first three rows.
dfPoUshapetemp = gpd.read_file('shapefile/OK_PoU2.shp')
dfPoUshapetemp.head(3)

In [None]:
# # Creating WaDE Custom site native ID for easy site identificaiion
# # ----------------------------------------------------------------------------------------------------

# # Create temp SiteNativeID dataframe of unique site.
# def assignSiteUUID(colrowValue):
#     string1 = str(colrowValue)
#     outstring = "wadeID" + string1
#     return outstring

# dfSiteNativeID = pd.DataFrame()
# dfSiteNativeID['in_Latitude'] = dfPoUshapetemp['Lattitude']
# dfSiteNativeID['in_Longitude'] = dfPoUshapetemp['Longitude']
# dfSiteNativeID = dfSiteNativeID.drop_duplicates()

# dftemp = pd.DataFrame(index=dfSiteNativeID.index)
# dftemp["Count"] = range(1, len(dftemp.index) + 1)
# dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteUUID(row['Count']), axis=1)
# dfSiteNativeID['linkKey'] = dfSiteNativeID['in_Latitude'].astype(str) + dfSiteNativeID['in_Longitude'].astype(str)


# # ----------------------------------------------------------------------------------------------------

# # Retreive WaDE Custom site native ID
# SiteNativeIDdict = pd.Series(dfSiteNativeID.in_SiteNativeID.values, index=dfSiteNativeID.linkKey.astype(str)).to_dict()
# def retrieveSiteNativeID(A, B):
#     if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
#         outList = ''
#     else:
#         colrowValue = str(A).strip() + str(B).strip()
#         try:
#             outList = SiteNativeIDdict[colrowValue]
#         except:
#             outList = ''
#     return outList

# dfPoUshapetemp['in_SiteNativeID'] = dfPoUshapetemp.apply(lambda row: retrieveSiteNativeID( row['Lattitude'], row['Longitude']), axis=1)
# dfPoUshapetemp.head(2)

In [None]:
# Rebuild the POU site native ID on the CSV table: "POU" + RECORD_ID as trimmed text.
# NOTE(review): this is the same statement as the POU site-id cell in the
# "Place of Use Data" section and looks redundant — confirm before removing.
dfPOU['in_SiteNativeID'] = "POU" + dfPOU['RECORD_ID'].astype(str).str.strip()
dfPOU['in_SiteNativeID'].unique()

In [None]:
# Geometry lookup table: one row per distinct (site native ID, geometry) pair.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Blank/NaN RECORD_ID values are set to 0 before the text conversion. The text
# is trimmed so the key is built the same way as the CSV-side POU key
# ("POU" + trimmed RECORD_ID text); an untrimmed key would not match it.
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['RECORD_ID'].replace("", 0).fillna(0).astype(str).str.strip()
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Keep the first occurrence of each fully identical row; the original index is kept.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head(3)

## Export Data

In [None]:
# Print the column names, non-null counts and dtypes of the final table.
outdf.info()

In [None]:
# Display the final table for a visual check before export.
outdf

In [None]:
# Export both outputs as single-member zip archives, without the index column.
mainCompression = dict(method='zip', archive_name='Pwr_okMain.csv')
geometryCompression = dict(method='zip', archive_name='P_Geometry.csv')
outdf.to_csv('Pwr_okMain.zip', compression=mainCompression, index=False)  # The output, saved as a zip
dfPoUshape.to_csv('P_Geometry.zip', compression=geometryCompression, index=False)  # The output geometry.