# Pre-processing Oklahoma Allocation data for WaDEQA upload.
Date Updated: 04/07/2020
Purpose:  To pre-process the Oklahoma data into one master file for simple DataFrame creation and extraction, and to validate datatypes and other data-related information.

In [1]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
import geopandas as gpd # the library that lets us read in shapefiles
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working directory that holds the raw input files.
# The input and output file names used further down are relative, so they
# resolve against this folder once os.chdir has run.
workingDir = "G:/Shared drives/WaDE Data/Oklahoma/WaterAllocation/RawInputData"
os.chdir(workingDir)

## POD Division Data
- groundwater wells
- surface water divisions

In [3]:
# Groundwater wells (points of diversion)
# Source CSV, read relative to the current working directory.
PGW_Input = "Permitted_Groundwater_Wells_input.csv"
df_PGW = pd.read_csv(PGW_Input)

# WaDE UUID tracker for data assessment.
# Added only when the column is missing: each row gets "okGD" + its row label,
# and the tagged table is written back over the same input file.
if 'WaDEUUID' not in df_PGW.columns:
    df_PGW['WaDEUUID'] = "okGD" + df_PGW.index.astype(str)
    df_PGW.to_csv(PGW_Input, index=False)

df_PGW.head(1)

Unnamed: 0,X,Y,OBJECTID,RECORD_ID,PERMIT_NUMBER,LATITUDE,LONGITUDE,RECORD_TYPE,WATER,STATUS,ENTITY_NAME,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYPE,TOTAL_PERMITTED_ACRE_FEET,PRIMARY_PURPOSE,DATE_FILED,DATE_ISSUED,HYDRO_UNIT,STREAM_SYSTEM,RECORD_ID2,WaDEUUID
0,-101.896349,36.574734,561,9753,19980623,36.574728,-101.89634,Permit,Groundwater,Active,"Prestage Farms of Oklahoma, LLC",SW,SW,SE,5,01N,11EC,Texas,Regular,10.0,Agriculture,1998-11-20T00:00:00.000Z,1999-09-14T00:00:00.000Z,,,9753,okGD0


In [4]:
# Surface water diversion points
# Source CSV, read relative to the current working directory.
PSWDP_Input = "Permitted_Surface_Water_Diversion_Points_input.csv"
df_PSWDP = pd.read_csv(PSWDP_Input)

# WaDE UUID tracker for data assessment.
# Added only when the column is missing: each row gets "okSD" + its row label,
# and the tagged table is written back over the same input file.
if 'WaDEUUID' not in df_PSWDP.columns:
    df_PSWDP['WaDEUUID'] = "okSD" + df_PSWDP.index.astype(str)
    df_PSWDP.to_csv(PSWDP_Input, index=False)

df_PSWDP.head(1)

Unnamed: 0,X,Y,OBJECTID,RECORD_ID,PERMIT_NUMBER,LATITUDE,LONGITUDE,RECORD_TYPE,WATER,STATUS,ENTITY_NAME,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYPE,TOTAL_PERMITTED_ACRE_FEET,PRIMARY_PURPOSE,DATE_FILED,DATE_ISSUED,HYDRO_UNIT,STREAM_SYSTEM,RECORD_ID2,WaDEUUID
0,-95.341673,34.616335,3452,18546,19980032,34.616329,-95.341666,Permit,Surface Water,Active,"Ralston, Leo",SE,SE,NE,30,02N,19EI,Pushmataha,Regular,228.0,Irrigation,1998-08-21T00:00:00.000Z,1998-11-10T00:00:00.000Z,11140105.0,1030.0,18546,okSD0


In [None]:
# Stack the groundwater and surface-water tables into one POD table.
# ignore_index=True already renumbers the rows 0..N-1, so no extra index reset is needed.
pod_sources = [df_PGW, df_PSWDP]
dfPOD = pd.concat(pod_sources, ignore_index=True)

print(len(dfPOD))
dfPOD.head(1)

In [None]:
# PODorPOUSite insert
# Every row of the POD table is tagged with the constant "POD" so the record
# type stays identifiable after the POD and POU tables are concatenated.
dfPOD['in_PODorPOUSite'] = "POD"
dfPOD.head(3)

In [None]:
# Convert the date columns that are used downstream to datetime.
# Pass 1 parses the raw text; values that cannot be parsed become NaT (errors='coerce').
# Pass 2 formats as month/day/year text and parses again, so only the calendar date is kept.
for date_field in ('DATE_FILED', 'DATE_ISSUED'):
    parsed_dates = pd.to_datetime(dfPOD[date_field], errors='coerce')
    dfPOD[date_field] = pd.to_datetime(parsed_dates.dt.strftime('%m/%d/%Y'))

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the site ID text: the prefix "WaDEOK_S" followed by str(colrowValue)."""
    return f"WaDEOK_S{colrowValue!s}"

# One row per distinct (latitude, longitude) pair found in dfPOD.
dfSiteNativeID = pd.DataFrame({
    'in_Latitude': dfPOD['LATITUDE'],
    'in_Longitude': dfPOD['LONGITUDE'],
}).drop_duplicates()

# Running counter 1..N on the same index; each count becomes "WaDEOK_S<count>".
dftemp = pd.DataFrame({"Count": range(1, len(dfSiteNativeID) + 1)}, index=dfSiteNativeID.index)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    """Look up the site ID for latitude A and longitude B in dfSiteNativeID.

    Returns '' when both values are empty strings, when both are null, or when
    no row of dfSiteNativeID matches both coordinates; otherwise returns the
    'in_SiteNativeID' of the first matching row.
    """
    both_blank = A == '' and B == ''
    if both_blank or (pd.isnull(A) and pd.isnull(B)):
        return ''

    same_lat = dfSiteNativeID['in_Latitude'] == A
    same_lon = dfSiteNativeID['in_Longitude'] == B
    matches = dfSiteNativeID.loc[same_lat & same_lon, 'in_SiteNativeID']
    return '' if matches.empty else matches.iloc[0]

# Look up the site ID for every POD row from its coordinates, then prefix it with "POD".
# Rows whose lookup returns '' end up with the bare prefix "POD" as their ID.
pod_site_ids = dfPOD.apply(
    lambda rec: retrieveSiteNativeID(rec['LATITUDE'], rec['LONGITUDE']),
    axis=1,
)
dfPOD['in_SiteNativeID'] = "POD" + pod_site_ids.astype(str)
dfPOD.head(2)

## Place of Use Data

In [5]:
# Areas of use (places of use)
# Source CSV, read relative to the current working directory.
AOU_Input = "OK_AreasofUse_input.csv"
dfPOU = pd.read_csv(AOU_Input)

# WaDE UUID tracker for data assessment.
# Added only when the column is missing: each row gets "okU" + its row label,
# and the tagged table is written back over the same input file.
if 'WaDEUUID' not in dfPOU.columns:
    dfPOU['WaDEUUID'] = "okU" + dfPOU.index.astype(str)
    dfPOU.to_csv(AOU_Input, index=False)

print(len(dfPOU))
dfPOU.head(3)

4349


Unnamed: 0,OID_,OBJECTID,RECORD_ID,PERMIT_NUMBER,RECORD_TYPE,WATER,STATUS,ENTITY_NAME,QUARTER3,QUARTER2,QUARTER1,SECTION,TOWNSHIP,RANGE,COUNTY,PERMIT_TYPE,TOTAL_PERMITTED_ACRE_FEET,PRIMARY_PURPOSE,DATE_FILED,DATE_ISSUED,HYDRO_UNIT,STREAM_SYSTEM,LATITUDE,LONGITUDE,WaDEUUID
0,359,69,15030,19760098,Permit,Surface Water,Active,"Cline, Judy",,W2,E2,32,01N,01EI,Garvin,Regular,30.0,"Recreation, Fish, Wildlife",8/11/1976 0:00,12/14/1976 0:00,11130303,1081,34.514148,-97.219024,okU0
1,358,70,15501,19760098,Permit,Surface Water,Active,"Cline, Judy",,E2,E2,32,01N,01EI,Garvin,Regular,30.0,"Recreation, Fish, Wildlife",8/11/1976 0:00,12/14/1976 0:00,11130303,1081,34.514137,-97.21464,okU1
2,279,80,52093,19580175,Permit,Surface Water,Active,"Arbuckle Enterprises, LLC",,SE,NE,23,01N,01WI,Garvin,Vested,200.0,Irrigation,4/16/1958 0:00,8/12/1969 0:00,11130303,1081,34.544898,-97.266883,okU2


In [None]:
# PODorPOUSite insert
# Every row of the POU table is tagged with the constant "POU" so the record
# type stays identifiable after the POD and POU tables are concatenated.
dfPOU['in_PODorPOUSite'] = "POU"
dfPOU.head(3)

In [None]:
# Convert the date columns that are used downstream to datetime.
# Pass 1 parses the raw text; values that cannot be parsed become NaT (errors='coerce').
# Pass 2 formats as month/day/year text and parses again, so only the calendar date is kept.
for date_field in ('DATE_FILED', 'DATE_ISSUED'):
    parsed_dates = pd.to_datetime(dfPOU[date_field], errors='coerce')
    dfPOU[date_field] = pd.to_datetime(parsed_dates.dt.strftime('%m/%d/%Y'))

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the site ID text: the prefix "WaDEOK_S" followed by str(colrowValue)."""
    return f"WaDEOK_S{colrowValue!s}"

# One row per distinct (latitude, longitude) pair found in dfPOU.
dfSiteNativeID = pd.DataFrame({
    'in_Latitude': dfPOU['LATITUDE'],
    'in_Longitude': dfPOU['LONGITUDE'],
}).drop_duplicates()

# Running counter 1..N on the same index; each count becomes "WaDEOK_S<count>".
dftemp = pd.DataFrame({"Count": range(1, len(dfSiteNativeID) + 1)}, index=dfSiteNativeID.index)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    """Look up the site ID for latitude A and longitude B in dfSiteNativeID.

    Returns '' when both values are empty strings, when both are null, or when
    no row of dfSiteNativeID matches both coordinates; otherwise returns the
    'in_SiteNativeID' of the first matching row.
    """
    both_blank = A == '' and B == ''
    if both_blank or (pd.isnull(A) and pd.isnull(B)):
        return ''

    same_lat = dfSiteNativeID['in_Latitude'] == A
    same_lon = dfSiteNativeID['in_Longitude'] == B
    matches = dfSiteNativeID.loc[same_lat & same_lon, 'in_SiteNativeID']
    return '' if matches.empty else matches.iloc[0]

# Look up the site ID for every POU row from its coordinates, then prefix it with "POU".
# Rows whose lookup returns '' end up with the bare prefix "POU" as their ID.
pou_site_ids = dfPOU.apply(
    lambda rec: retrieveSiteNativeID(rec['LATITUDE'], rec['LONGITUDE']),
    axis=1,
)
dfPOU['in_SiteNativeID'] = "POU" + pou_site_ids.astype(str)
dfPOU.head(2)

## Concatenate POD and POU

In [None]:
# Stack the POD rows on top of the POU rows under a fresh 0..N-1 index.
# NOTE(review): the input previews in this notebook show a few columns that are
# not common to both tables (X / Y on the POD inputs, OID_ on the POU input);
# pd.concat keeps the union of columns and leaves the missing cells as NaN.
all_sites = [dfPOD, dfPOU]
df = pd.concat(all_sites, ignore_index=True)

print(len(df))
df.head()

## Data Fix

In [None]:
#Fixing Beneficial Uses PRIMARY_PURPOSE
def fixRecFishWild(colrowValue):
    """Return 'Recreation Fish Wildlife' for the comma-separated spelling; pass anything else through unchanged."""
    return 'Recreation Fish Wildlife' if colrowValue == 'Recreation, Fish, Wildlife' else colrowValue

# Apply the beneficial-use fix to each value of the column.
df['PRIMARY_PURPOSE'] = df['PRIMARY_PURPOSE'].map(fixRecFishWild)

In [None]:
import re

def cleanOwnerDataFunc(Val):
    """Strip punctuation from an owner / entity name.

    Removes every occurrence of the characters $ @ & . ; , / ) ( - and then
    trims leading and trailing whitespace.

    Parameters:
        Val: the raw owner name. Missing values (None / NaN, which pandas
            produces for blank CSV cells) are accepted; other non-string
            values are converted with str() before cleaning.

    Returns:
        The cleaned name, or '' when Val is missing.
    """
    # Blank cells arrive as NaN (a float); re.sub would raise TypeError on them.
    if pd.isnull(Val):
        return ""
    # Raw string: "\)" inside a normal string literal is an invalid escape sequence.
    return re.sub(r"[$@&.;,/\)(-]", "", str(Val)).strip()

# Cleaned copy of ENTITY_NAME, stored in a new column; the raw column is left as is.
df['in_AllocationOwner'] = df['ENTITY_NAME'].map(cleanOwnerDataFunc)
df.head(3)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the water source ID text: the prefix "WaDEOK_WS" followed by str(colrowValue)."""
    return f"WaDEOK_WS{colrowValue!s}"

# One row per distinct value of the WATER column.
dfWaterSourceNativeID = pd.DataFrame({'in_WaterSourceName': df['WATER']}).drop_duplicates()

# Running counter 1..N on the same index; each count becomes "WaDEOK_WS<count>".
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the water source ID for the name A in dfWaterSourceNativeID.

    Returns the 'in_WaterSourceNativeID' of the first row whose
    'in_WaterSourceName' equals A, or '' when no row matches.
    """
    name_matches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    found = dfWaterSourceNativeID.loc[name_matches, 'in_WaterSourceNativeID']
    return '' if found.empty else found.iloc[0]

# Attach the water source ID that matches each row's WATER value, then list the distinct IDs.
df['in_WaterSourceNativeID'] = df['WATER'].map(retrieveWaterSourceNativeID)
df['in_WaterSourceNativeID'].unique()

## Shapefile Data
- For attaching geometry to csv inputs.

In [None]:
# PoU Shapefile Data
# Shapefile input
# Read with geopandas; the path is relative to the current working directory.
# The cells below use its 'Lattitude', 'Longitude' and 'geometry' columns.
dfPoUshapetemp = gpd.read_file('shapefile/OK_PoU2.shp')
dfPoUshapetemp.head(3)

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Return the site ID text: the prefix "WaDEOK_S" followed by str(colrowValue)."""
    return f"WaDEOK_S{colrowValue!s}"

# One row per distinct (latitude, longitude) pair found in the shapefile table.
# 'Lattitude' is the column name as spelled in the shapefile.
dfSiteNativeID = pd.DataFrame({
    'in_Latitude': dfPoUshapetemp['Lattitude'],
    'in_Longitude': dfPoUshapetemp['Longitude'],
}).drop_duplicates()

# Running counter 1..N on the same index; each count becomes "WaDEOK_S<count>".
dftemp = pd.DataFrame({"Count": range(1, len(dfSiteNativeID) + 1)}, index=dfSiteNativeID.index)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B):
    """Look up the site ID for latitude A and longitude B in dfSiteNativeID.

    Returns '' when both values are empty strings, when both are null, or when
    no row of dfSiteNativeID matches both coordinates; otherwise returns the
    'in_SiteNativeID' of the first matching row.
    """
    both_blank = A == '' and B == ''
    if both_blank or (pd.isnull(A) and pd.isnull(B)):
        return ''

    same_lat = dfSiteNativeID['in_Latitude'] == A
    same_lon = dfSiteNativeID['in_Longitude'] == B
    matches = dfSiteNativeID.loc[same_lat & same_lon, 'in_SiteNativeID']
    return '' if matches.empty else matches.iloc[0]

# Look up the site ID for every shapefile row from its coordinates.
shape_site_ids = dfPoUshapetemp.apply(
    lambda rec: retrieveSiteNativeID(rec['Lattitude'], rec['Longitude']),
    axis=1,
)
dfPoUshapetemp['in_SiteNativeID'] = shape_site_ids
dfPoUshapetemp.head(2)

In [None]:
# Build the two-column geometry table: prefixed site ID plus the shapefile geometry.
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)
# Same "POU" prefix that is applied to the POU site IDs in the master table.
# NOTE(review): these IDs are numbered from the shapefile's own Lattitude/Longitude
# pairs, separately from the numbering built from OK_AreasofUse_input.csv -
# confirm the two numberings line up before joining on in_SiteNativeID.
dfPoUshape['in_SiteNativeID'] = "POU" + dfPoUshapetemp['in_SiteNativeID'].astype(str)
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# Drop rows that repeat an identical (ID, geometry) pair, keeping the first.
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head(3)

In [None]:
# Display the distinct site IDs present in the geometry table (visual check only).
dfPoUshape['in_SiteNativeID'].unique()

## Export Data

In [None]:
#Exporting to Finished File
# Both files are written without the index column, to relative paths that
# resolve against the current working directory.
df.to_csv('P_OklahomaMaster.csv', index=False)  # The output
dfPoUshape.to_csv('P_OklahomaGeometry.csv', index=False) # The output geometry.