## Sites_dim
Code to generate `sites.csv`, the sites input table for the WaDE database, from Oklahoma (OK) water rights data.

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from waterallocationsFunctions import *

In [2]:
# Working directory: every relative file name used below (input CSVs and
# generated outputs) is resolved against this folder.
# The WADE_WORKING_DIR environment variable overrides the location so the
# notebook can run on another machine without editing this cell; when it is
# unset the original path is used.
working_dir = os.environ.get("WADE_WORKING_DIR", "C:/tseg/OKTest")
os.chdir(working_dir)

To run the following cells, make sure the input CSV files are in the working directory. To obtain the data, go to the following links and download the tables:

Permitted Surface Water Diversion Points
http://home-owrb.opendata.arcgis.com/datasets/permitted-surface-water-diversion-points?geometry=-119.379%2C31.373%2C-77.565%2C37.701  

Permitted Groundwater Wells (Point coverage)
http://home-owrb.opendata.arcgis.com/datasets/permitted-groundwater-wells  


In [3]:
# Names of the source tables, resolved relative to the working directory.
fileInput1 = "Permitted_Groundwater_Wells.csv"               # groundwater wells
FileInput2 = "Permitted_Surface_Water_Diversion_Points.csv"  # surface water points of diversion
FileInput3 = "Areas_of_Use.csv"                              # areas of use

# Name of the generated sites table.
out_sitdim = "sites.csv"

In [4]:
# Column layout of the output sites table, in output order.
# 10.24.19: 'WaDESiteUUID' was renamed to 'SiteUUID'.
columns = [
    'SiteUUID',
    'SiteNativeID',
    'SiteName',
    'USGSSiteID',
    'SiteTypeCV',
    'Longitude',
    'Latitude',
    'SitePoint',
    'SiteNativeURL',
    'Geometry',
    'CoordinateMethodCV',
    'CoordinateAccuracy',
    'GNISCodeCV',
    'EPSGCodeCV',
    'NHDNetworkStatusCV',
    'NHDProductCV',
    'NHDUpdateDate',
    'NHDReachCode',
    'NHDMeasureNumber',
    'StateCV',
]

# Type names kept for reference only: nothing below reads this list, and the
# data types are inferred from the inputs instead.
dtypesx = [
    'NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
    'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
    'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)',
]

In [5]:
# Empty target frame that carries only the output column layout; rows are
# filled in by the mapping cells further down.
outdf100 = pd.DataFrame(columns=columns)

In [11]:
print("Reading inputs...")

# Load the two water-right point tables. Both are read as ISO-8859-1
# (encoding="utf-8" is the alternative if the files are re-exported).

# Groundwater wells
df100_l = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(len(df100_l.index))

# Surface water diversion points
df200 = pd.read_csv(FileInput2, encoding="ISO-8859-1")
print(len(df200.index))

# The two tables are stacked row-wise (not joined on a key); the index is
# renumbered so that row labels are unique across both sources.
df100 = pd.concat([df100_l, df200], ignore_index=True)
print(len(df100.index))

# For a quick test run, truncate here, e.g. df100 = df100.head(10000).

df100.head(5)

Reading inputs...
20859
3422
24281


Unnamed: 0,X,Y,OBJECTID,RECORD_ID,PERMIT_NUMBER,LATITUDE,LONGITUDE,RECORD_TYPE,WATER,STATUS,...,RANGE,COUNTY,PERMIT_TYPE,TOTAL_PERMITTED_ACRE_FEET,PRIMARY_PURPOSE,DATE_FILED,DATE_ISSUED,HYDRO_UNIT,STREAM_SYSTEM,RECORD_ID2
0,-101.896349,36.574734,561,9753,19980623,36.574728,-101.89634,Permit,Groundwater,Active,...,11EC,Texas,Regular,10.0,Agriculture,1998-11-20T00:00:00.000Z,1999-09-14T00:00:00.000Z,,,9753
1,-101.57512,36.516345,752,50052,20020591,36.516338,-101.575112,Permit,Groundwater,Active,...,14EC,Texas,Regular,1280.0,Irrigation,2002-09-20T00:00:00.000Z,2003-05-03T00:00:00.000Z,,,50052
2,-99.052511,34.582855,944,53324,20040578,34.582849,-99.052503,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53324
3,-99.050317,34.590121,954,53325,20040578,34.590116,-99.050308,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53325
4,-99.050317,34.586494,945,53326,20040578,34.586489,-99.050308,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53326


In [13]:
# Show the column names available in the combined input table.
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'RECORD_ID',
 'PERMIT_NUMBER',
 'LATITUDE',
 'LONGITUDE',
 'RECORD_TYPE',
 'WATER',
 'STATUS',
 'ENTITY_NAME',
 'QUARTER3',
 'QUARTER2',
 'QUARTER1',
 'SECTION',
 'TOWNSHIP',
 'RANGE',
 'COUNTY',
 'PERMIT_TYPE',
 'TOTAL_PERMITTED_ACRE_FEET',
 'PRIMARY_PURPOSE',
 'DATE_FILED',
 'DATE_ISSUED',
 'HYDRO_UNIT',
 'STREAM_SYSTEM',
 'RECORD_ID2']

In [14]:
print("Direct mapping columns...")
# Copy input columns straight into the output table without transformation.
# NOTE(review): this header used to say "Utah"; the inputs read by this
# notebook are the OK water rights tables.
# destCols and srsCols are paired by position:
#   OBJECTID -> SiteNativeID, WATER -> SiteTypeCV,
#   LONGITUDE -> Longitude,   LATITUDE -> Latitude
destCols=['SiteNativeID', 'SiteTypeCV', 'Longitude', 'Latitude']
srsCols=['OBJECTID', 'WATER', 'LONGITUDE', 'LATITUDE']

# presumably assigned by position because the source and destination column
# names differ - TODO confirm against the pandas version in use
outdf100[destCols] = df100[srsCols]

# replace NaN with blank cells (output columns that were not mapped above
# have no values yet, so they become blank here)
outdf100 = outdf100.replace(np.nan, '') 

Direct mapping columns...


In [15]:
print("Dropping empty lat/lon")
# Drop the sites with no longitude or latitude, after saving them for review.
#
# The "missing" mask is built once and reused for both the report and the
# drop. Blank strings are detected on an object-typed view of the coordinate
# columns: comparing a numeric column directly against '' makes numpy emit an
# elementwise-comparison warning, while the object comparison gives the same
# True/False result without it.
coordCols = outdf100[['Longitude', 'Latitude']]
missingCoords = (coordCols.isnull() | coordCols.astype(object).eq('')).any(axis=1)

outdf100purge = outdf100.loc[missingCoords]
if len(outdf100purge.index) > 0:
    # The row index is written on purpose so the dropped rows can be traced back.
    outdf100purge.to_csv('sites_missing.csv')    #index=False,
    outdf100 = outdf100.drop(outdf100purge.index)
    outdf100 = outdf100.reset_index(drop=True)

Dropping empty lat/lon


  result = method(y)


In [16]:
print("Dropping duplicates...")
# A site is identified by the combination of native ID, name, type and
# coordinates (10.24.19: longitude/latitude added to the key).
siteKeyCols = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

# Row counts before and after, to show how many duplicates were removed.
print(len(outdf100.index))
outdf100 = outdf100.drop_duplicates(subset=siteKeyCols).reset_index(drop=True)
print(len(outdf100.index))

Dropping duplicates...
24281
24281


In [17]:
# Hard-coded columns: the same constant value is written to every row.
print("Hard coded")
# Bracket assignment is used instead of attribute assignment
# (outdf100.EPSGCodeCV = ...): attribute assignment only updates a column that
# already exists and otherwise just sets a Python attribute on the frame,
# whereas bracket assignment always targets the column.
outdf100['EPSGCodeCV'] = 'EPSG:4326'
outdf100['SiteName'] = 'Not Provided'    # site name doesn't exist in the inputs, so use 'Not Provided'

Hard coded


In [18]:
print("Fix empty coordinatemethodCV")

# In this case all rows for CoordinateMethodCV are empty, so the value is hard
# coded for every row. Bracket assignment is used so the column itself is
# always the target (attribute assignment would only set a Python attribute if
# the column were ever missing).
outdf100['CoordinateMethodCV'] = 'Unspecified'
# Alternative that fills only the blank cells:
#outdf100.loc[outdf100["CoordinateMethodCV"] == '', "CoordinateMethodCV"] = 'Unspecified'


Fix empty coordinatemethodCV


In [19]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID repeats an earlier row's value (the first occurrence
# of each ID is not included).
siteNativeIDdup = outdf100[outdf100.duplicated(subset=['SiteNativeID'])]

# Flag read by the SiteUUID cell to choose how the UUIDs are built.
siteNIdDup = len(siteNativeIDdup) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

siteNativeIDdup

Check Site Native IDs are duplicated


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,CoordinateMethodCV,CoordinateAccuracy,GNISCodeCV,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV


In [21]:
print("Adding SiteUUID...")


def _okUuidFromNativeId(row):
    """Return 'OK_<SiteNativeID>' for one row, or '' when the native ID is blank."""
    nativeId = str(row['SiteNativeID'])
    if nativeId == '':
        return ''
    return "OK_" + nativeId


if siteNIdDup:
    # 10.24.19: native IDs repeat, so number the rows 1..N and build the UUID
    # from that running number instead.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['TempUUID'] = range(1, len(outdf100.index) + 1)
    outdf100['SiteUUID'] = outdf100.apply(lambda row: "OK_" + str(row['TempUUID']), axis=1)
    # The running number was only a helper column.
    outdf100 = outdf100.drop(columns=['TempUUID'])
else:
    # Native IDs are unique: prefix each one with 'OK'.
    outdf100['SiteUUID'] = outdf100.apply(_okUuidFromNativeId, axis=1)

Adding SiteUUID...


In [22]:
print("Droping duplicates...")
# Blank out NaN first so that missing cells compare equal to each other.
outdf100 = outdf100.replace(np.nan, '')

# Safety net: rows that are identical to an earlier row in every column.
outdf100Duplicated = outdf100[outdf100.duplicated()]
if len(outdf100Duplicated) > 0:
    # Keep a record of what is removed (row index included).
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

Droping duplicates...


In [23]:
print("Checking required isnot null...")
# Columns that must not be blank in the output.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells so that a single comparison against '' is enough
outdf100 = outdf100.replace(np.nan, '')

# A row is reported when any required column is blank. The mask is derived
# from requiredCols so the list above is the single place to edit when the
# set of mandatory fields changes.
missingRequired = outdf100[requiredCols].eq('').any(axis=1)
outdf100_nullMand = outdf100.loc[missingRequired]

if len(outdf100_nullMand.index) > 0:
    # The row index is written so the offending rows can be located in outdf100.
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# NOTE(review): no cell visible in this notebook populates GNISCodeCV, so every
# row is expected to land in the report - confirm whether it should be required.
# ToDO: purge these rows if any required value is missing? For now they are
# left in the output and only reported for inspection.

Checking required isnot null...


In [24]:
print("Writing out...")

# Save the finished sites table as UTF-8, without the row index column.
outdf100.to_csv(out_sitdim, encoding="utf-8", index=False)

print("Done sites")

Writing out...
Done sites
