# Pre-processing Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Oklahoma/WaterAllocation_WaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Oklahoma/WaterAllocation_WaterUse


## Water Right Data
- POD Division data (groundwater wells, surface water division)
- POU data (Permitted_Areas_of_Use, Permitted_Dedicated_Lands)

In [3]:
# Input File - groundwater POD
PGW_Input = "RawInputData/water_right/Permitted_GW_Wells.zip"
df_PGW = pd.read_csv(PGW_Input).replace(np.nan, "").replace ("nan,nan", "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_PGW:
    df_PGW['WaDEUUID'] = "okGD" + df_PGW.index.astype(str)
    df_PGW.to_csv('RawInputData/water_right/Permitted_GW_Wells.zip', compression=dict(method='zip', archive_name='Permitted_GW_Wells.csv'), index=False)

print(len(df_PGW))
df_PGW.head(1)

21484


Unnamed: 0,RECORD_ID,PERMIT_NUM,LATITUDE,LONGITUDE,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,WaDEUUID
0,59530.0,20090520,36.51447,-101.32117,Q3 Grid,6/16/2009 0:00:00,,Permit,Groundwater,Active,"Brown, Simmons, John",,NW,NW,NE,34,01N,16EC,Texas,Regular,1920.0,Irrigation,4/23/2009 0:00:00,1/12/2010 0:00:00,,,18118,20090520-59530,okGD0


In [4]:
# Input File - surface Water POD
PSWDP_Input = "RawInputData/water_right/Permitted_SW_Diversions.zip"
df_PSWDP = pd.read_csv(PSWDP_Input).replace(np.nan, "").replace ("nan,nan", "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_PSWDP:
    df_PSWDP['WaDEUUID'] = "okSD" + df_PSWDP.index.astype(str)
    df_PSWDP.to_csv('RawInputData/water_right/Permitted_SW_Diversions.zip', compression=dict(method='zip', archive_name='Permitted_SW_Diversions.csv'), index=False)

print(len(df_PSWDP))
df_PSWDP.head(1)

3200


Unnamed: 0,RECORD_ID,PERMIT_NUM,LATITUDE,LONGITUDE,CREATIONME,DATECREATE,MODIFIEDBY,RECORD_TYP,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,WaDEUUID
0,31892.0,19950008,34.59339,-99.23026,USER,6/16/2009 0:00:00,MPSUGHRU,Permit,Surface Water,Active,Nichols Family Partnership Limited,,NE,NE,NE,6,01N,19WI,Jackson,Regular,208.0,Irrigation,2/13/1995 0:00:00,6/13/1995 0:00:00,11120303,1151,10668,19950008-31892,okSD0


In [5]:
# Concatenate - Both datasets share the same columns.
dfPOD = pd.concat([df_PGW, df_PSWDP], ignore_index=True).reset_index(drop=True)

print(len(dfPOD))
dfPOD.head(1)

24684


Unnamed: 0,RECORD_ID,PERMIT_NUM,LATITUDE,LONGITUDE,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,WaDEUUID,MODIFIEDBY
0,59530.0,20090520,36.51447,-101.32117,Q3 Grid,6/16/2009 0:00:00,,Permit,Groundwater,Active,"Brown, Simmons, John",,NW,NW,NE,34,01N,16EC,Texas,Regular,1920.0,Irrigation,4/23/2009 0:00:00,1/12/2010 0:00:00,,,18118,20090520-59530,okGD0,


In [6]:
# POD specific
dfPOD['in_Latitude'] = dfPOD['LATITUDE']
dfPOD['in_Longitude'] = dfPOD['LONGITUDE']
dfPOD['in_PODorPOUSite'] = "POD"

dfPOD['in_SiteNativeID'] = "pod" + dfPOD['RECORD_ID'].astype(float).astype("int64").astype(str).str.strip()
dfPOD['in_SiteNativeID'].unique()

array(['pod59530', 'pod36565', 'pod11674', ..., 'pod76062', 'pod2805',
       'pod2601'], dtype=object)

In [7]:
# Input File - Permitted_Areas_of_Use POU
PAU_Input = "RawInputData/water_right/shapefiles/Permitted_Areas_of_Use.zip"
df_PAU = gpd.read_file(PAU_Input).replace(np.nan, "").replace ("nan,nan", "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_PAU:
    df_PAU['WaDEUUID'] = "okPAU" + df_PAU.index.astype(str)
    df_PAU.to_csv('RawInputData/water_right/Permitted_Areas_of_Use.zip', compression=dict(method='zip', archive_name='Permitted_Areas_of_Use.csv'), index=False)

print(len(df_PAU))
df_PAU.head(1)

4211


Unnamed: 0,RECORD_ID,PERMIT_NUM,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,OWRB_,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,wadeLat,wadeLong,Shape_Leng,Shape_Area,geometry,WaDEUUID
0,52092.0,19580175,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,"Arbuckle Enterprises, LLC",,,,SW,24,01N,01WI,Garvin,Vested,200.0,Irrigation,1958-04-16,1969-08-12,11130303,1081,11530,19580175-52092,34.53944,-97.26043,0.03145,6e-05,"POLYGON ((-97.25616 34.53763, -97.25616 34.535...",okPAU0


In [8]:
# Input File - Permitted_Dedicated_Lands POU
PDL_Input = "RawInputData/water_right/shapefiles/Permitted_Dedicated_Lands.zip"
df_PDL = gpd.read_file(PDL_Input).replace(np.nan, "").replace ("nan,nan", "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_PDL:
    df_PDL['WaDEUUID'] = "okPDL" + df_PDL.index.astype(str)
    df_PDL.to_csv('RawInputData/water_right/Permitted_Dedicated_Lands.zip', compression=dict(method='zip', archive_name='Permitted_Dedicated_Lands.csv'), index=False)

print(len(df_PDL))
df_PDL.head(1)

20777


Unnamed: 0,RECORD_ID,PERMIT_NUM,CREATIONME,DATECREATE,MODIFIEDBY,OWRBGIS_WR,RECORD_TYP,OWRB_,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,Shape_Leng,COMB_ID,wadeLat,wadeLong,Shape_Le_1,Shape_Area,geometry,WaDEUUID
0,5032.0,19700166,Q3 Grid,2009-06-16,,0.0,Permit,,Groundwater,Active,"de Boer, Abel",,,E2,SE,22,01N,01EC,Cimarron,Prior Right,167.0,Irrigation,1970-04-24,1979-01-09,11100103,2094,12408,2416.75296,19700166-5032,36.53426,-102.9325,0.02344,3e-05,"POLYGON ((-102.93026 36.53246, -102.93025 36.5...",okPDL0


In [9]:
# Concatenate - Both datasets share the same columns.
dfPOU = pd.concat([df_PAU, df_PDL], ignore_index=True).reset_index(drop=True)

print(len(dfPOU))
dfPOU.head(1)

24988


Unnamed: 0,RECORD_ID,PERMIT_NUM,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,OWRB_,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,wadeLat,wadeLong,Shape_Leng,Shape_Area,geometry,WaDEUUID,MODIFIEDBY,OWRBGIS_WR,Shape_Le_1
0,52092.0,19580175,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,"Arbuckle Enterprises, LLC",,,,SW,24,01N,01WI,Garvin,Vested,200.0,Irrigation,1958-04-16,1969-08-12,11130303,1081,11530,19580175-52092,34.53944,-97.26043,0.03145,6e-05,"POLYGON ((-97.25616 34.53763, -97.25616 34.535...",okPAU0,,,


In [10]:
# POIU specific
dfPOU['in_Latitude'] = dfPOU['wadeLat']
dfPOU['in_Longitude'] = dfPOU['wadeLong']
dfPOU['in_PODorPOUSite'] = "POU"

dfPOU['in_SiteNativeID'] = "pou" + dfPOU['RECORD_ID'].astype(float).astype("int64").astype(str).str.strip()
dfPOU['in_SiteNativeID'].unique()

array(['pou52092', 'pou33548', 'pou21001', ..., 'pou29452', 'pou29453',
       'pou40718'], dtype=object)

In [11]:
# Concatenate POD and POU together

dfPOD['geometry'] = "" # as filler
df_wr = pd.concat([dfPOD, dfPOU], ignore_index=True).reset_index(drop=True).replace(np.nan, '')

print(len(df_wr))
df_wr.head(1)

49672


Unnamed: 0,RECORD_ID,PERMIT_NUM,LATITUDE,LONGITUDE,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,WaDEUUID,MODIFIEDBY,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteNativeID,geometry,OWRB_,wadeLat,wadeLong,Shape_Leng,Shape_Area,OWRBGIS_WR,Shape_Le_1
0,59530.0,20090520,36.51447,-101.32117,Q3 Grid,6/16/2009 0:00:00,,Permit,Groundwater,Active,"Brown, Simmons, John",,NW,NW,NE,34,01N,16EC,Texas,Regular,1920.0,Irrigation,4/23/2009 0:00:00,1/12/2010 0:00:00,,,18118,20090520-59530,okGD0,,36.51447,-101.32117,POD,pod59530,,,,,,,,


In [12]:
#Fixing Beneficial Uses PRIMARY_PURPOSE
def fixRecFishWild(colrowValue):
    colrowValue = str(colrowValue).strip()
    if colrowValue == 'Recreation, Fish, Wildlife':
        outList = 'Recreation Fish & Wildlife'
    else:
        outList = colrowValue
    return outList

df_wr['PRIMARY_PU'] = df_wr.apply(lambda row: fixRecFishWild(row['PRIMARY_PU']), axis=1)
df_wr['PRIMARY_PU'].value_counts()

PRIMARY_PU
Irrigation                    35101
Public Supply                  8441
Agriculture                    1750
Mining                         1547
Recreation Fish & Wildlife     1241
Industrial                      850
Commercial                      415
Power                           240
Other                            87
Name: count, dtype: int64

## Water Use Data
- Groundwater permit use
- Surface water permit use

In [13]:
# Input File - Groundwater Use 2000-2020
InputFile = "RawInputData/water_use/Groundwater Use 2000-2020.zip"
df_uGW = pd.read_csv(InputFile).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_uGW:
    df_uGW['WaDEUUID'] = "okuGW" + df_uGW.index.astype(str)
    df_uGW.to_csv('RawInputData/water_use/Groundwater Use 2000-2020.zip', compression=dict(method='zip', archive_name='Groundwater Use 2000-2020.csv'), index=False)

print(len(df_uGW))
df_uGW.head(1)

84972


Unnamed: 0,ID,PERMIT_NUM,Permit Holder,Water Type,County,Year,Irrigation,Public Supply,Industrial,Power,Mining,Commercial,Recreation Fish & Wildlife,Agriculture,Other,Total Amount,Unnamed: 16,WaDEUUID
0,1,19040011,Prior Right,GW,G,2000,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,,okuGW0


In [14]:
# Input File - Surface Water Use 2000-2020
InputFile = "RawInputData/water_use/Surface Water Use 2000-2020.zip"
df_uSW = pd.read_csv(InputFile).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in df_uSW:
    df_uSW['WaDEUUID'] = "okuSW" + df_uSW.index.astype(str)
    df_uSW.to_csv('RawInputData/water_use/Surface Water Use 2000-2020.zip', compression=dict(method='zip', archive_name='Surface Water Use 2000-2020.csv'), index=False)

print(len(df_uSW))
df_uSW.head(1)

23599


Unnamed: 0,ID,PERMIT_NUM,Permit Holder,Water Type,County,Year,Irrigation,Public Supply,Industrial,Power,Mining,Commercial,Recreation Fish & Wildlife,Agriculture,Other Total,Total Amount,WaDEUUID
0,1,18990001,Vested,GW,S,2004,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,okSTS0


In [15]:
# Concatenate groundwater with surface water use
# datasets do differ by PERMIT_NUM

df_u = pd.concat([df_uGW, df_uSW], ignore_index=True).reset_index(drop=True).replace(np.nan, '')

print(len(df_u))
df_u.head(1)

108571


Unnamed: 0,ID,PERMIT_NUM,Permit Holder,Water Type,County,Year,Irrigation,Public Supply,Industrial,Power,Mining,Commercial,Recreation Fish & Wildlife,Agriculture,Other,Total Amount,Unnamed: 16,WaDEUUID,Other Total
0,1,19040011,Prior Right,GW,G,2000,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,,okuGW0,


In [16]:
# Pivot data, set each recorded ben use amount to in_Amount input

# temp base dataframe
df_u2 = df_u[['PERMIT_NUM', 'Year']]

# ben use list for amount value & column name
benUseList = ['Irrigation', 'Public Supply', 'Industrial', 'Power', 'Mining', 'Commercial', 'Recreation Fish & Wildlife', 'Agriculture', 'Other']

# output Dataframne
df_u3 = pd.DataFrame()
for x in benUseList:
    x = str(x)
    df_temp = df_u2.copy()
    df_temp['benuseListValue'] = x
    df_temp['in_Amount'] = df_u[x]
    df_u3 = pd.concat([df_u3, df_temp], ignore_index=True).reset_index(drop=True).replace(np.nan, '')
    
print(len(df_u3))
df_u3.head(1)

977139


Unnamed: 0,PERMIT_NUM,Year,benuseListValue,in_Amount
0,19040011,2000,Irrigation,0.0


In [17]:
# left-join merge wr sites to water use data by permit number

dfin = df_wr.merge(df_u3, left_on='PERMIT_NUM', right_on='PERMIT_NUM', how='left').replace(np.nan, "")
print(len(dfin))
dfin.head(1)

3703457


Unnamed: 0,RECORD_ID,PERMIT_NUM,LATITUDE,LONGITUDE,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,WaDEUUID,MODIFIEDBY,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteNativeID,geometry,OWRB_,wadeLat,wadeLong,Shape_Leng,Shape_Area,OWRBGIS_WR,Shape_Le_1,Year,benuseListValue,in_Amount
0,59530.0,20090520,36.51447,-101.32117,Q3 Grid,6/16/2009 0:00:00,,Permit,Groundwater,Active,"Brown, Simmons, John",,NW,NW,NE,34,01N,16EC,Texas,Regular,1920.0,Irrigation,4/23/2009 0:00:00,1/12/2010 0:00:00,,,18118,20090520-59530,okGD0,,36.51447,-101.32117,POD,pod59530,,,,,,,,,2010.0,Irrigation,285.0


## WaDE Input

In [18]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "OKwrwu_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "OKwrwu_V1" # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Annual"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "OKwrwu_OR1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = ""
df['in_WaterSourceTypeCV'] = dfin['WATER']

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = dfin['HYDRO_UNIT'].replace("", 0).replace(" ", 0).fillna(0).astype(float).astype("int64").astype(str).replace("0", "")
df['in_Latitude'] = dfin['in_Latitude'] # see above
df['in_Longitude'] = dfin['in_Longitude'] # see above
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfin['in_PODorPOUSite'] # see above
df['in_SiteName'] = ""
df['in_SiteNativeID'] = dfin['in_SiteNativeID'].replace("", 0).fillna(0).astype(str) # see above
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "OK"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = dfin['DATE_FILED']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = dfin['STATUS']
df['in_AllocationNativeID'] =  dfin['PERMIT_NUM'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin['ENTITY_NAM']
df['in_AllocationPriorityDate'] = dfin['DATE_ISSUE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = dfin['RECORD_TYP']
df['in_AllocationVolume_AF'] = dfin['TOTAL_PERM']
df['in_BeneficialUseCategory'] = dfin['PRIMARY_PU']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin['in_Amount']
df['in_AssociatedNativeAllocationIDs'] = dfin['PERMIT_NUM'].replace("", 0).fillna(0).astype(str)
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = dfin['benuseListValue']
df['in_ReportYearCV'] = dfin['Year'].replace("", 0).fillna(0).astype(float).astype(int).astype(str)
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = "12/31/" + df['in_ReportYearCV'].astype(str)
df['in_TimeframeStart'] = "01/01/" + df['in_ReportYearCV'].astype(str)
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

  df['in_ReportYearCV'] = dfin['Year'].replace("", 0).fillna(0).astype(float).astype(int).astype(str)


3703457


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,4/23/2009 0:00:00,,,,,,,,,Active,20090520,"Brown, Simmons, John",1/12/2010 0:00:00,,,,Permit,1920.0,Irrigation,,,,,,0,,,,,,,,,,,285.0,20090520,,Irrigation,2010,,12/31/2010,01/01/2010
1,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,4/23/2009 0:00:00,,,,,,,,,Active,20090520,"Brown, Simmons, John",1/12/2010 0:00:00,,,,Permit,1920.0,Irrigation,,,,,,0,,,,,,,,,,,460.0,20090520,,Irrigation,2011,,12/31/2011,01/01/2011
2,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,4/23/2009 0:00:00,,,,,,,,,Active,20090520,"Brown, Simmons, John",1/12/2010 0:00:00,,,,Permit,1920.0,Irrigation,,,,,,0,,,,,,,,,,,460.0,20090520,,Irrigation,2012,,12/31/2012,01/01/2012
3,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,4/23/2009 0:00:00,,,,,,,,,Active,20090520,"Brown, Simmons, John",1/12/2010 0:00:00,,,,Permit,1920.0,Irrigation,,,,,,0,,,,,,,,,,,1041.3,20090520,,Irrigation,2013,,12/31/2013,01/01/2013
4,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,4/23/2009 0:00:00,,,,,,,,,Active,20090520,"Brown, Simmons, John",1/12/2010 0:00:00,,,,Permit,1920.0,Irrigation,,,,,,0,,,,,,,,,,,1041.3,20090520,,Irrigation,2014,,12/31/2014,01/01/2014


## Concatenate POD and POU Data.  Make needed changes

In [19]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [20]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

3703457


## Clean Data / data types

In [21]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

  Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')


In [22]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [23]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Texas', 'Tillman', 'Beaver', 'Jackson', 'Major', 'Woodward',
       'Delaware', 'Ellis', 'Garvin', 'Cimarron', 'Grady', 'Marshall',
       'Bryan', 'Choctaw', 'Cleveland', 'Mcclain', 'Caddo', 'Harper',
       'Pawnee', 'Kingfisher', 'Oklahoma', 'Harmon', 'Love', 'Atoka',
       'Murray', 'Cherokee', 'Carter', 'Dewey', 'Wagoner', 'Roger Mills',
       'Kiowa', 'Latimer', 'Beckham', 'Le Flore', 'Alfalfa', 'Washita',
       'Grant', 'Pottawatomie', 'Stephens', 'Comanche', 'Greer', 'Coal',
       'Garfield', 'Kay', 'Tulsa', 'Blaine', 'Jefferson', 'Custer',
       'Woods', 'Canadian', 'Osage', 'Noble', 'Seminole', 'Okfuskee',
       'Pittsburg', 'Hughes', 'Johnston', 'Logan', 'Lincoln', 'Cotton',
       'Muskogee', 'Sequoyah', 'Adair', 'Mcintosh', 'Ottawa', 'Pontotoc',
       'Payne', 'Haskell', 'Mccurtain', 'Creek', 'Craig', 'Mayes',
       'Pushmataha', 'Rogers', '', 'Nowata', 'Washington', 'Okmulgee'],
      dtype=object)

In [24]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [25]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Brown, Simmons, John', 'Miller, Edward Lee', 'Lett, Iletta', ...,
       'Gray, Ray E', 'Slatten, Jack A', 'Holland, Ronald'], dtype=object)

In [26]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [27]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [28]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [29]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array([''], dtype=object)

In [30]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [31]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Texas', 'Tillman', 'Beaver', 'Jackson', 'Major', 'Woodward',
       'Delaware', 'Ellis', 'Garvin', 'Cimarron', 'Grady', 'Marshall',
       'Bryan', 'Choctaw', 'Cleveland', 'Mcclain', 'Caddo', 'Harper',
       'Pawnee', 'Kingfisher', 'Oklahoma', 'Harmon', 'Love', 'Atoka',
       'Murray', 'Cherokee', 'Carter', 'Dewey', 'Wagoner', 'Roger Mills',
       'Kiowa', 'Latimer', 'Beckham', 'Le Flore', 'Alfalfa', 'Washita',
       'Grant', 'Pottawatomie', 'Stephens', 'Comanche', 'Greer', 'Coal',
       'Garfield', 'Kay', 'Tulsa', 'Blaine', 'Jefferson', 'Custer',
       'Woods', 'Canadian', 'Osage', 'Noble', 'Seminole', 'Okfuskee',
       'Pittsburg', 'Hughes', 'Johnston', 'Logan', 'Lincoln', 'Cotton',
       'Muskogee', 'Sequoyah', 'Adair', 'Mcintosh', 'Ottawa', 'Pontotoc',
       'Payne', 'Haskell', 'Mccurtain', 'Creek', 'Craig', 'Mayes',
       'Pushmataha', 'Rogers', '', 'Nowata', 'Washington', 'Okmulgee'],
      dtype=object)

In [32]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Brown, Simmons, John', 'Miller, Edward Lee', 'Lett, Iletta', ...,
       'Gray, Ray E', 'Slatten, Jack A', 'Holland, Ronald'], dtype=object)

In [33]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Agriculture',
 'Commercial',
 'Industrial',
 'Irrigation',
 'Mining',
 'Other',
 'Power',
 'Public Supply',
 'Recreation Fish & Wildlife']

In [34]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([36.51446873, 36.57841806, 34.58644895, ..., 36.93936008,
       36.93938201, 36.99027734])

In [35]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-101.32117073, -101.19902572,  -99.02400764, ..., -101.02216918,
       -101.00865812, -100.91434576])

In [36]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([''], dtype=object)

In [37]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round().replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([1920.0, 500.0, 110.0, ..., 7346.0, 505.0, 1062.0], dtype=object)

In [38]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array([285.0, 460.0, 1041.3, ..., 256.5, 807.5, 230.9], dtype=object)

In [39]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [40]:
#Update datatype of iAllocation Application Date to fit WaDE 2.0 structure
outdf['in_AllocationApplicationDate'] = pd.to_datetime(outdf['in_AllocationApplicationDate'], errors = 'coerce')
outdf['in_AllocationApplicationDate'] = pd.to_datetime(outdf['in_AllocationApplicationDate'].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_AllocationApplicationDate'].unique()

<DatetimeArray>
['2009-04-23 00:00:00', '1965-10-26 00:00:00', '1969-09-25 00:00:00',
 '1983-02-10 00:00:00', '1955-05-17 00:00:00', '2004-09-07 00:00:00',
 '1972-03-14 00:00:00', '1971-01-08 00:00:00', '1969-10-03 00:00:00',
 '1974-09-23 00:00:00',
 ...
 '1954-05-08 00:00:00', '2014-05-12 00:00:00', '2018-04-18 00:00:00',
 '2018-04-20 00:00:00', '2019-08-28 00:00:00', '1948-06-17 00:00:00',
 '2006-11-06 00:00:00', '2007-03-02 00:00:00', '1925-12-29 00:00:00',
 '2021-02-25 00:00:00']
Length: 7443, dtype: datetime64[ns]

In [41]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['2010-01-12 00:00:00', '1978-12-12 00:00:00', '1978-02-14 00:00:00',
 '1983-06-14 00:00:00', '2005-05-10 00:00:00', '1979-02-13 00:00:00',
 '1979-06-12 00:00:00', '1979-05-08 00:00:00', '1979-07-10 00:00:00',
 '1988-08-09 00:00:00',
 ...
 '2003-12-31 00:00:00', '1966-10-11 00:00:00', '2019-08-22 00:00:00',
 '1932-04-08 00:00:00', '2003-09-14 00:00:00', '1991-09-23 00:00:00',
 '1969-07-08 00:00:00', '2019-04-05 00:00:00', '1963-11-12 00:00:00',
 '2003-07-18 00:00:00']
Length: 1145, dtype: datetime64[ns]

In [42]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
['2010-12-31 00:00:00', '2011-12-31 00:00:00', '2012-12-31 00:00:00',
 '2013-12-31 00:00:00', '2014-12-31 00:00:00', '2015-12-31 00:00:00',
 '2016-12-31 00:00:00', '2017-12-31 00:00:00',                 'NaT',
 '2003-12-31 00:00:00', '2004-12-31 00:00:00', '2005-12-31 00:00:00',
 '2006-12-31 00:00:00', '2007-12-31 00:00:00', '2008-12-31 00:00:00',
 '2009-12-31 00:00:00', '2018-12-31 00:00:00', '2019-12-31 00:00:00',
 '2020-12-31 00:00:00', '2000-12-31 00:00:00', '2001-12-31 00:00:00',
 '2002-12-31 00:00:00']
Length: 22, dtype: datetime64[ns]

In [43]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y')).replace("NaT", "").fillna("")
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
['2010-01-01 00:00:00', '2011-01-01 00:00:00', '2012-01-01 00:00:00',
 '2013-01-01 00:00:00', '2014-01-01 00:00:00', '2015-01-01 00:00:00',
 '2016-01-01 00:00:00', '2017-01-01 00:00:00',                 'NaT',
 '2003-01-01 00:00:00', '2004-01-01 00:00:00', '2005-01-01 00:00:00',
 '2006-01-01 00:00:00', '2007-01-01 00:00:00', '2008-01-01 00:00:00',
 '2009-01-01 00:00:00', '2018-01-01 00:00:00', '2019-01-01 00:00:00',
 '2020-01-01 00:00:00', '2000-01-01 00:00:00', '2001-01-01 00:00:00',
 '2002-01-01 00:00:00']
Length: 22, dtype: datetime64[ns]

In [44]:
# extract year out
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

array(['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '0', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2018', '2019', '2020', '2000', '2001', '2002'], dtype=object)

In [45]:
# # Assign Primary Use Category

# import sys
# sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
# import AssignPrimaryUseCategoryFile # Use Custom import file

# outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
# outdf['in_PrimaryUseCategory'].unique()

In [46]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Water Use_Annual_Irrigation_Groundwater',
       'Water Use_Annual_Public Supply_Groundwater',
       'Water Use_Annual_Industrial_Groundwater',
       'Water Use_Annual_Power_Groundwater',
       'Water Use_Annual_Mining_Groundwater',
       'Water Use_Annual_Commercial_Groundwater',
       'Water Use_Annual_Recreation Fish & Wildlife_Groundwater',
       'Water Use_Annual_Agriculture_Groundwater',
       'Water Use_Annual_Other_Groundwater',
       'Water Use_Annual__Groundwater',
       'Water Use_Annual_Irrigation_Surface Water',
       'Water Use_Annual_Public Supply_Surface Water',
       'Water Use_Annual_Industrial_Surface Water',
       'Water Use_Annual_Power_Surface Water',
       'Water Use_Annual_Mining_Surface Water',
       'Water Use_Annual_Commercial_Surface Water',
       'Water Use_Annual_Recreation Fish & Wildlife_Surface Water',
       'Water Use_Annual_Agriculture_Surface Water',
       'Water Use_Annual_Other_Surface Water',
       'Water Use_Annual__Surf

In [47]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2'], dtype=object)

In [48]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['pod59530', 'pod36565', 'pod11674', ..., 'pou29452', 'pou29453',
       'pou40718'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: {enter string entries here}

In [49]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = [""] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

3703457


array(['Active', 'Pending'], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [50]:
# PoU Shapefile Data
shapefileInput =  "RawInputData/water_right/shapefiles/Permitted_Areas_of_Use.zip" # ziped folder of the shp file

dfPoUshapetemp = gpd.read_file(shapefileInput)
dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
print(len(dfPoUshapetemp))
dfPoUshapetemp.head()

4211


Unnamed: 0,RECORD_ID,PERMIT_NUM,CREATIONME,DATECREATE,DATEMODIFI,RECORD_TYP,OWRB_,WATER,STATUS,ENTITY_NAM,QUARTER4,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYP,TOTAL_PERM,PRIMARY_PU,DATE_FILED,DATE_ISSUE,HYDRO_UNIT,STREAM_SYS,USER_ID,COMB_ID,wadeLat,wadeLong,Shape_Leng,Shape_Area,geometry
0,52092.0,19580175,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,"Arbuckle Enterprises, LLC",,,,SW,24,01N,01WI,Garvin,Vested,200.0,Irrigation,1958-04-16,1969-08-12,11130303,1081,11530,19580175-52092,34.53944,-97.26043,0.03145,6e-05,"POLYGON ((-97.25616 34.53763, -97.25616 34.535..."
1,33548.0,19700317,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,"Oliva, Gumaro Ponce",,,NW,NE,2,26N,12EI,Washington,Regular,12.0,Irrigation,1970-09-08,1970-12-08,11070106,2140,18499,19700317-33548,36.76979,-95.99626,0.01654,2e-05,"POLYGON ((-95.99625 36.76792, -95.99853 36.767..."
2,21001.0,19980021,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,"Wildlife Conservation, Dept of",,,E2,NE,27,26N,16EI,Nowata,Regular,220.0,"Recreation, Fish, Wildlife",1998-07-20,1998-12-08,11070103,2152,18011,19980021-21001,36.70996,-95.57785,0.02357,3e-05,"POLYGON ((-95.57786 36.70631, -95.58012 36.706..."
3,32063.0,19760125,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,Circle P Ranch,,,E2,SE,33,21N,16EI,Rogers,Regular,277.0,Irrigation,1976-10-05,1977-03-08,11070105,2151,5922,19760125-32063,36.25293,-95.59926,0.0235,3e-05,"POLYGON ((-95.59928 36.24930, -95.60100 36.249..."
4,34273.0,19760125,Q3 Grid,2009-06-16,,Permit,,Surface Water,Active,Circle P Ranch,,,W2,SW,34,21N,16EI,Rogers,Regular,277.0,Irrigation,1976-10-05,1977-03-08,11070105,2151,5922,19760125-34273,36.25293,-95.59474,0.02357,3e-05,"POLYGON ((-95.59248 36.24931, -95.59429 36.249..."


In [51]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assing values to temp dataframe based on shapefile input
# for in_SiteNativeID assure ID value is the same as that listed above for POU info.
dfPoUshape['in_SiteNativeID'] = "pou" + dfPoUshapetemp['RECORD_ID'].replace("", 0).fillna(0).astype(float).astype("int64").astype(str).str.strip()
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head()

4210


Unnamed: 0,in_SiteNativeID,geometry
0,pou52092,"POLYGON ((-97.25616 34.53763, -97.25616 34.535..."
1,pou33548,"POLYGON ((-95.99625 36.76792, -95.99853 36.767..."
2,pou21001,"POLYGON ((-95.57786 36.70631, -95.58012 36.706..."
3,pou32063,"POLYGON ((-95.59928 36.24930, -95.60100 36.249..."
4,pou34273,"POLYGON ((-95.59248 36.24931, -95.59429 36.249..."


## Export Outputs

In [52]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3703457 entries, 0 to 3703456
Data columns (total 74 columns):
 #   Column                                        Dtype         
---  ------                                        -----         
 0   WaDEUUID                                      object        
 1   in_MethodUUID                                 object        
 2   in_VariableSpecificUUID                       object        
 3   in_AggregationIntervalUnitCV                  object        
 4   in_VariableCV                                 object        
 5   in_OrganizationUUID                           object        
 6   in_Geometry                                   object        
 7   in_GNISFeatureNameCV                          object        
 8   in_WaterQualityIndicatorCV                    object        
 9   in_WaterSourceName                            object        
 10  in_WaterSourceNativeID                        object        
 11  in_WaterSourceTypeCV    

In [53]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,2009-04-23,,,,,,,,,Active,20090520,"Brown, Simmons, John",2010-01-12,,,,Permit,1920.00000,Irrigation,,,,,,0,,,,,,,,,,,285.00000,20090520,,Irrigation,2010,,2010-12-31,2010-01-01,Water Use_Annual_Irrigation_Groundwater
1,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,2009-04-23,,,,,,,,,Active,20090520,"Brown, Simmons, John",2010-01-12,,,,Permit,1920.00000,Irrigation,,,,,,0,,,,,,,,,,,460.00000,20090520,,Irrigation,2011,,2011-12-31,2011-01-01,Water Use_Annual_Irrigation_Groundwater
2,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,2009-04-23,,,,,,,,,Active,20090520,"Brown, Simmons, John",2010-01-12,,,,Permit,1920.00000,Irrigation,,,,,,0,,,,,,,,,,,460.00000,20090520,,Irrigation,2012,,2012-12-31,2012-01-01,Water Use_Annual_Irrigation_Groundwater
3,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,2009-04-23,,,,,,,,,Active,20090520,"Brown, Simmons, John",2010-01-12,,,,Permit,1920.00000,Irrigation,,,,,,0,,,,,,,,,,,1041.30000,20090520,,Irrigation,2013,,2013-12-31,2013-01-01,Water Use_Annual_Irrigation_Groundwater
4,okGD0,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,,36.51447,-101.32117,,,POD,,pod59530,,,OK,,2009-04-23,,,,,,,,,Active,20090520,"Brown, Simmons, John",2010-01-12,,,,Permit,1920.00000,Irrigation,,,,,,0,,,,,,,,,,,1041.30000,20090520,,Irrigation,2014,,2014-12-31,2014-01-01,Water Use_Annual_Irrigation_Groundwater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3703452,okPDL20775,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,11100102,36.93938,-101.00866,,,POU,,pou29453,,,OK,,NaT,,,,,,,,,Active,19880510,Blaser Farms Inc,NaT,,,,Permit,3600.00000,Irrigation,,,,,,0,,,,,,,,,,,,19880510,,Other,2017,,2017-12-31,2017-01-01,Water Use_Annual_Other_Groundwater
3703453,okPDL20775,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,11100102,36.93938,-101.00866,,,POU,,pou29453,,,OK,,NaT,,,,,,,,,Active,19880510,Blaser Farms Inc,NaT,,,,Permit,3600.00000,Irrigation,,,,,,0,,,,,,,,,,,,19880510,,Other,2018,,2018-12-31,2018-01-01,Water Use_Annual_Other_Groundwater
3703454,okPDL20775,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,11100102,36.93938,-101.00866,,,POU,,pou29453,,,OK,,NaT,,,,,,,,,Active,19880510,Blaser Farms Inc,NaT,,,,Permit,3600.00000,Irrigation,,,,,,0,,,,,,,,,,,,19880510,,Other,2019,,2019-12-31,2019-01-01,Water Use_Annual_Other_Groundwater
3703455,okPDL20775,OKwrwu_M1,OKwrwu_V1,Annual,Water Use,OKwrwu_OR1,,,,,wadeId1,Groundwater,,,Texas,4326,,,11100102,36.93938,-101.00866,,,POU,,pou29453,,,OK,,NaT,,,,,,,,,Active,19880510,Blaser Farms Inc,NaT,,,,Permit,3600.00000,Irrigation,,,,,,0,,,,,,,,,,,,19880510,,Other,2020,,2020-12-31,2020-01-01,Water Use_Annual_Other_Groundwater


In [54]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwrwu_Main.zip', compression=dict(method='zip', archive_name='Pwr_wu_Main.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.