## Sites_dim
Code to generate sites.csv as input to the WaDE db for MT (Montana) water rights

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj
from utilityFunctions import *

In [2]:
# working directory
# Every input and output path below is relative, so the notebook works from
# inside the processed-data folder. The guard keeps this cell re-runnable:
# without it a second execution in the same kernel would look for
# ProcessedInputData/ProcessedInputData and fail.
working_dir = "ProcessedInputData"
if os.path.basename(os.getcwd()) != working_dir:
    os.chdir(working_dir)

In [3]:
# File names, resolved relative to the working directory.

# Points-of-diversion extract that is read in below.
fileInput1 = "WaterRights_Diversion.csv"

# Sites table written at the end of the notebook.
out_sitdim = 'sites.csv'

In [4]:
# Column layout of the output sites table, in output order.
# Note dated 10.24.19 in the original: rename 'WaDESiteUUID' to 'SiteUUID'
# (this list still uses 'WaDESiteUUID').
columns = [
    'WaDESiteUUID',
    'SiteNativeID',
    'SiteName',
    'USGSSiteID',
    'SiteTypeCV',
    'Longitude',
    'Latitude',
    'SitePoint',
    'SiteNativeURL',
    'Geometry',
    'CoordinateMethodCV',
    'CoordinateAccuracy',
    'GNISCodeCV',
    'EPSGCodeCV',
    'NHDNetworkStatusCV',
    'NHDProductCV',
    'NHDUpdateDate',
    'NHDReachCode',
    'NHDMeasureNumber',
    'StateCV',
]

# Reference only: not used by the cells shown in this notebook; the data types
# of the output are inferred from the inputs instead.
# Each string is a tab-separated run of type names; the three strings hold
# 20 entries in total, which appears to line up one-to-one with `columns`
# (TODO confirm against the target schema).
dtypesx = ['NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
           'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
           'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)']

In [5]:
# Empty output frame carrying the sites-table column layout.
# No dtypes are declared here; the original note says they are assumed to be
# inferred (from the CO file).
outdf100 = pd.DataFrame(
    columns=columns,
)

In [6]:
print("Reading inputs...")

# Load the points-of-diversion file. It is decoded as ISO-8859-1;
# encoding="utf-8" was noted as the alternative.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

# Row count of the raw input, for the run log.
print(df100.shape[0])

# For a quick test run, truncate here, e.g.: df100 = df100.head(10000)

# Preview of the first rows.
df100.head(5)

Reading inputs...
13797


Unnamed: 0,WRKEY,BOCA_CD,WTR_ID,WR_NUMBER,WRTE_DESCR,WRTE_CD,WRST_DESCR,WRST_CD,VERS_ID_SEQ,VERS_TYPE,...,SCTN,QTR,GOVT_LOT,SPX,SPY,WELL_DPTH,RES,GEOCODES,DTM_CREATED,Unnamed: 46
0,200059-1,39E,113577,39E 113577 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,15.0,SWNESE,,,,,,4208591540101A400,20191212,
1,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,25.0,SWSESW,,,,,,4208582530101A400,20191212,
2,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,25.0,SWSESW,,,,,,4208582530101A400,20191212,
3,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,35.0,NWNWNW,,,,,,,20191212,
4,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,35.0,NWNWNW,,,,,,,20191212,


In [7]:
# Show every input column name (the frame preview above elides the middle ones).
df100.columns.tolist()

['WRKEY',
 'BOCA_CD',
 'WTR_ID',
 'WR_NUMBER',
 'WRTE_DESCR',
 'WRTE_CD',
 'WRST_DESCR',
 'WRST_CD',
 'VERS_ID_SEQ',
 'VERS_TYPE',
 'ENF_PRTY_DT_DATE',
 'MAJOR_TYPE',
 'SOURCE_NAME',
 'MODV_DESCR',
 'PURT_DESCR',
 'PURT_CD',
 'LST_NM_OR_BUSN_NM',
 'FST_NM',
 'MID_INT',
 'SUFX',
 'OWNER',
 'MAX_FLOW',
 'MAX_VOL',
 'MAX_ACRES',
 'SCANNED',
 'DIV_CNT',
 'POU_CNT',
 'IRR_CNT',
 'RESV_CNT',
 'ISSUE_RMK',
 'DITCH',
 'HIST_TYPE',
 'POD_NO',
 'COUNTY',
 'CNTY_CD',
 'COUNTY_ST_CD',
 'TR',
 'SCTN',
 'QTR',
 'GOVT_LOT',
 'SPX',
 'SPY',
 'WELL_DPTH',
 'RES',
 'GEOCODES',
 'DTM_CREATED',
 'Unnamed: 46']

In [8]:
print ("Site names...")

# Blank out missing values so the emptiness test below is a plain string test.
df100 = df100.replace(np.nan, '')

# SiteName is the DITCH value when it has any non-whitespace content, otherwise
# the literal "Not Provided". The original (unstripped) DITCH value is kept.
# Vectorized so it also works on an empty frame; the earlier
# `df100.assign(SiteName='')` discarded its result and has been removed.
ditch_values = df100["DITCH"]
ditch_present = ditch_values.astype(str).str.strip() != ''
df100["SiteName"] = ditch_values.where(ditch_present, "Not Provided")

# Preview only; avoids rendering the whole frame.
df100.head(10)

Site names...


Unnamed: 0,WRKEY,BOCA_CD,WTR_ID,WR_NUMBER,WRTE_DESCR,WRTE_CD,WRST_DESCR,WRST_CD,VERS_ID_SEQ,VERS_TYPE,...,QTR,GOVT_LOT,SPX,SPY,WELL_DPTH,RES,GEOCODES,DTM_CREATED,Unnamed: 46,SiteName
0,200059-1,39E,113577,39E 113577 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,SWNESE,,,,,,4208591540101A400,20191212,,Not Provided
1,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,SWSESW,,,,,,4208582530101A400,20191212,,Not Provided
2,304174-1,39E,115446,39E 115446 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,SWSESW,,,,,,4208582530101A400,20191212,,Not Provided
3,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,NWNWNW,,,,,,,20191212,,Not Provided
4,203253-1,39E,115458,39E 115458 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,NWNWNW,,,,,,,20191212,,Not Provided
5,209421-1,39E,120607,39E 120607 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,E2NENE,,,,,,,20191212,,Not Provided
6,209421-1,39E,120607,39E 120607 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,E2NENE,,,,,,,20191212,,Not Provided
7,209502-1,39E,120702,39E 120702 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,NWSESW,,,,,,,20191212,,Not Provided
8,209502-1,39E,120702,39E 120702 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,NWSESW,,,,,,,20191212,,Not Provided
9,210398-1,39E,121732,39E 121732 00,STATEMENT OF CLAIM,STOC,ACTIVE,ACTV,1,ORIGINAL RIGHT,...,SWSWSW,,,,,,,20191212,,Not Provided


In [None]:
print("Project to longitude/ latitude  ")

# NOTE(review): this cell has no execution count, and the column listing printed
# earlier in this notebook shows SPX / SPY but no X_METERS / Y_METERS. Confirm
# which columns hold the coordinates (and their CRS) before relying on it.
x_col, y_col = 'X_METERS', 'Y_METERS'

# Placeholder columns so Longitude / Latitude exist even if nothing is projected.
df100 = df100.assign(Longitude='', Latitude='')

missing_cols = [col for col in (x_col, y_col) if col not in df100.columns]
if missing_cols:
    # Previously this surfaced as a KeyError that stopped a full run; report it
    # and leave the coordinates blank instead.
    print("Skipping projection; coordinate column(s) not in input: " + ", ".join(missing_cols))
else:
    # use pyproj to project to lat lon
    crs_to = CRS('EPSG:4326')  # CRS("WGS84")
    #TODO: check the following MT falls in both zones 12 and 13
    crs_from = CRS("EPSG:26912")
    transformer = Transformer.from_crs(crs_from, crs_to)

    # Drop rows with no x or y coordinate. The index is always reset so the
    # frame is positionally indexed whether or not rows were dropped.
    df100 = df100.replace(np.nan, '')
    has_xy = (df100[x_col] != '') & (df100[y_col] != '')
    df100 = df100.loc[has_xy].reset_index(drop=True)

    lonList = []
    latList = []
    for x1, y1 in zip(df100[x_col], df100[y_col]):
        try:
            # No always_xy is set, so the result follows the target CRS axis
            # order, which for EPSG:4326 is (latitude, longitude).
            lat, lon = transformer.transform(float(x1), float(y1))
        except (TypeError, ValueError):
            # Non-numeric coordinate text: keep the row, leave both values blank.
            lon, lat = '', ''
        lonList.append(lon)
        latList.append(lat)

    df100['Longitude'] = lonList
    df100['Latitude'] = latList

# Preview only; avoids rendering the whole frame.
df100.head()

In [9]:
print("Direct mapping columns...")

# Columns copied straight from the input to the output, paired by position:
# destCols[i] receives srsCols[i]. Longitude / Latitude are left out for now.
destCols = ['SiteNativeID', 'SiteName', 'SiteTypeCV']
srsCols = ['POD_NO', 'SiteName', 'MODV_DESCR']

for dest_name, src_name in zip(destCols, srsCols):
    outdf100[dest_name] = df100[src_name]

# Output columns with no source come through as NaN; blank them.
outdf100 = outdf100.replace(np.nan, '')

Direct mapping columns...


In [10]:
print("Empty lat/lon")
#TODO there are too many empty location coordinates so we are not dropping them here

# A coordinate counts as missing when it is blank or NaN. NaN never compares
# equal to anything (including itself), so it has to be detected with isna()
# rather than `== np.nan`.
lon_missing = outdf100['Longitude'].isna() | (outdf100['Longitude'] == '')
lat_missing = outdf100['Latitude'].isna() | (outdf100['Latitude'] == '')
outdf100purge = outdf100.loc[lon_missing | lat_missing]

if len(outdf100purge.index) > 0:
    # Report the affected rows for inspection; the index is written as well.
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    # Kept for the disabled drop below.
    dropIndex = outdf100purge.index
    #outdf100 = outdf100.drop(dropIndex) TODO there are too many empty location coordinates so we are not dropping them here
    outdf100 = outdf100.reset_index(drop=True)

Empty lat/lon


In [11]:
print("Dropping duplicates...")

# A site is identified by this combination of fields
# (lat / lon were added to the key on 10.24.19).
site_key = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

# Row count before ...
print(len(outdf100.index))
outdf100 = (
    outdf100
    .drop_duplicates(subset=site_key)
    .reset_index(drop=True)
)
# ... and after de-duplication.
print(len(outdf100.index))

Dropping duplicates...
13797
135


In [12]:
# Columns filled with one constant value for every row.
print("Hard coded")

outdf100['EPSGCodeCV'] = 'EPSG:4326'


Hard coded


In [13]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID already appeared on an earlier row.
repeated_native_id = outdf100.duplicated(subset=['SiteNativeID'])
siteNativeIDdup = outdf100.loc[repeated_native_id]

# Flag read by the UUID cell to choose how identifiers are built.
siteNIdDup = len(siteNativeIDdup.index) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

siteNativeIDdup

Check Site Native IDs are duplicated
Site Native IDs are duplicated


Unnamed: 0,WaDESiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,CoordinateMethodCV,CoordinateAccuracy,GNISCodeCV,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV
1,,1,Not Provided,,PUMP,,,,,,,,,EPSG:4326,,,,,,
2,,1,Not Provided,,DAM,,,,,,,,,EPSG:4326,,,,,,
3,,1,Not Provided,,FLOWING,,,,,,,,,EPSG:4326,,,,,,
4,,1,Not Provided,,OTHER DIVERSION,,,,,,,,,EPSG:4326,,,,,,
5,,1,Not Provided,,DEVELOPED SPRING,,,,,,,,,EPSG:4326,,,,,,
6,,1,Not Provided,,HAND PUMP,,,,,,,,,EPSG:4326,,,,,,
7,,1,Not Provided,,DITCH/GRAVITY FLOW,,,,,,,,,EPSG:4326,,,,,,
9,,1,Not Provided,,WINDMILL,,,,,,,,,EPSG:4326,,,,,,
10,,1,Not Provided,,PIPELINE,,,,,,,,,EPSG:4326,,,,,,
11,,1,Not Provided,,SPRING BOX,,,,,,,,,EPSG:4326,,,,,,


In [14]:
print("Adding SiteUUID...")

if siteNIdDup:
    # 10.24.19 create unique site uuid
    # Native IDs repeat, so they cannot serve as identifiers: number the rows
    # 1..n instead and prefix with 'MT'.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['WaDESiteUUID'] = ["MT_" + str(seq) for seq in range(1, len(outdf100.index) + 1)]
else:
    # Native IDs are unique: prefix them with 'MT', leaving the UUID blank
    # wherever the native ID itself is blank.
    native_ids = outdf100['SiteNativeID'].astype(str)
    outdf100['WaDESiteUUID'] = ("MT_" + native_ids).where(native_ids != '', '')

outdf100

Adding SiteUUID...


Unnamed: 0,WaDESiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,CoordinateMethodCV,CoordinateAccuracy,GNISCodeCV,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV
0,MT_1,1,Not Provided,,WELL,,,,,,,,,EPSG:4326,,,,,,
1,MT_2,1,Not Provided,,PUMP,,,,,,,,,EPSG:4326,,,,,,
2,MT_3,1,Not Provided,,DAM,,,,,,,,,EPSG:4326,,,,,,
3,MT_4,1,Not Provided,,FLOWING,,,,,,,,,EPSG:4326,,,,,,
4,MT_5,1,Not Provided,,OTHER DIVERSION,,,,,,,,,EPSG:4326,,,,,,
5,MT_6,1,Not Provided,,DEVELOPED SPRING,,,,,,,,,EPSG:4326,,,,,,
6,MT_7,1,Not Provided,,HAND PUMP,,,,,,,,,EPSG:4326,,,,,,
7,MT_8,1,Not Provided,,DITCH/GRAVITY FLOW,,,,,,,,,EPSG:4326,,,,,,
8,MT_9,2,Not Provided,,WELL,,,,,,,,,EPSG:4326,,,,,,
9,MT_10,1,Not Provided,,WINDMILL,,,,,,,,,EPSG:4326,,,,,,


In [15]:
print("Droping duplicates...")

# Blank out any NaN first so blank and missing cells compare equal.
outdf100 = outdf100.replace(np.nan, '')

# Safety net: rows identical to an earlier row in every column.
outdf100Duplicated = outdf100[outdf100.duplicated()]
if len(outdf100Duplicated) > 0:
    # Keep a record of what is removed (index column included).
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,WaDESiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,CoordinateMethodCV,CoordinateAccuracy,GNISCodeCV,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV
0,MT_1,1,Not Provided,,WELL,,,,,,,,,EPSG:4326,,,,,,
1,MT_2,1,Not Provided,,PUMP,,,,,,,,,EPSG:4326,,,,,,
2,MT_3,1,Not Provided,,DAM,,,,,,,,,EPSG:4326,,,,,,
3,MT_4,1,Not Provided,,FLOWING,,,,,,,,,EPSG:4326,,,,,,
4,MT_5,1,Not Provided,,OTHER DIVERSION,,,,,,,,,EPSG:4326,,,,,,
5,MT_6,1,Not Provided,,DEVELOPED SPRING,,,,,,,,,EPSG:4326,,,,,,
6,MT_7,1,Not Provided,,HAND PUMP,,,,,,,,,EPSG:4326,,,,,,
7,MT_8,1,Not Provided,,DITCH/GRAVITY FLOW,,,,,,,,,EPSG:4326,,,,,,
8,MT_9,2,Not Provided,,WELL,,,,,,,,,EPSG:4326,,,,,,
9,MT_10,1,Not Provided,,WINDMILL,,,,,,,,,EPSG:4326,,,,,,


In [16]:
print("Checking required isnot null...")
# Columns that must not be blank in any row.
requiredCols = ['WaDESiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells
outdf100 = outdf100.replace(np.nan, '')

# Rows where at least one required column is blank. The mask is derived from
# requiredCols so the list and the check cannot drift apart.
required_blank = (outdf100[requiredCols] == '').any(axis=1)
outdf100_nullMand = outdf100.loc[required_blank]

if len(outdf100_nullMand.index) > 0:
    # Written for inspection only (index column included).
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# ToDO: purge these cells if there is any missing? #For now left to be inspected and reported

Checking required isnot null...


In [17]:
print("Writing out...")

# Save the finished sites table as UTF-8, without the row index.
outdf100.to_csv(
    out_sitdim,
    encoding="utf-8",
    index=False,
)

print("Done sites")

Writing out...
Done sites
