## Sites_dim
Code to generate sites.csv as input to the WaDE db for OR (Oregon) water rights

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj
from utilityFunctions import *

In [2]:
# working directory
# Only descend into the data folder when the kernel is not already inside it,
# so that re-running this cell does not fail on a second chdir.
working_dir = "./ProcessedInputData"
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [3]:
# Name of the points-of-diversion CSV that is read as input
fileInput1 = "wr_v_pod_public_xy.csv"

# Name of the sites CSV that is written as output
out_sitdim = "sites.csv"

In [4]:
# Output column names, in the order used for the sites table.
# 10.24.19: 'WaDESiteUUID' was renamed to 'SiteUUID'.
columns = [
    'SiteUUID',
    'SiteNativeID',
    'SiteName',
    'USGSSiteID',
    'SiteTypeCV',
    'Longitude',
    'Latitude',
    'SitePoint',
    'SiteNativeURL',
    'Geometry',
    'CoordinateMethodCV',
    'CoordinateAccuracy',
    'GNISCodeCV',
    'EPSGCodeCV',
    'NHDNetworkStatusCV',
    'NHDProductCV',
    'NHDUpdateDate',
    'NHDReachCode',
    'NHDMeasureNumber',
    'StateCV',
    'HUC8',
    'HUC12',
    'County',
]

# Not used currently: data types are inferred from the inputs instead.
# Each string is a tab-separated run of type names.
dtypesx = [
    'NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
    'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
    'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)',
]

In [5]:
# Target frame: starts with the output column headers only and no rows.
# No dtypes are declared here; they are left to be inferred.
outdf100 = pd.DataFrame(data=None, columns=columns)

In [6]:
print("Reading inputs...")

# Load the points-of-diversion table.
# The file is decoded as ISO-8859-1; "utf-8" is the alternative encoding.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(df100.shape[0])

# For a quick test run, uncomment to keep only the first rows:
#df100 = df100.head(10000)

df100.head(5)

Reading inputs...


  interactivity=interactivity, compiler=compiler, result=result)


184377


Unnamed: 0,X,Y,OBJECTID,pod_display,pod_display_short,wris_link,snp_id,pod_location_id,pod_use_id,app_char,...,begin_month,begin_day,end_month,end_day,technician_initials,agency,rec_creation_date,last_updt_date,feature_quality_code,remarks
0,539412.5,416705.799869,1,Permit: G 10961 * MI,G 10961,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,21755,6909,26859,G,...,1.0,1.0,12.0,31.0,MIGRT,OWRD,1996-06-01T00:00:00.000,1996-06-01T00:00:00.000,,0 G 10961 1
1,539232.9,416251.991798,2,Permit: G 10961 * MI,G 10961,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,21755,6910,26860,G,...,1.0,1.0,12.0,31.0,MIGRT,OWRD,1996-06-01T00:00:00.000,1996-06-01T00:00:00.000,,0 G 10961 2
2,470932.8,355915.458333,3,Permit: G 12684 * MI,G 12684,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23327,9355,29682,G,...,1.0,1.0,12.0,31.0,MIGRT,OWRD,2001-06-01T00:00:00.000,2001-06-01T00:00:00.000,,0 G 12684 1
3,1010124.0,931993.466535,4,Permit: G 12750 * MI,G 12750,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23390,9480,29835,G,...,1.0,1.0,12.0,31.0,MIGRT,OWRD,2001-05-01T00:00:00.000,2001-05-01T00:00:00.000,,0 G 12750 1
4,1010124.0,931993.466535,5,Permit: G 12750 * MI,G 12750,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23390,9480,29836,G,...,1.0,1.0,12.0,31.0,MIGRT,OWRD,2001-05-01T00:00:00.000,2001-05-01T00:00:00.000,,0 G 12750 1


In [7]:
# Show the names of all input columns as a plain list.
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'pod_display',
 'pod_display_short',
 'wris_link',
 'snp_id',
 'pod_location_id',
 'pod_use_id',
 'app_char',
 'app_nbr',
 'permit_char',
 'permit_nbr',
 'cert_nbr',
 'claim_char',
 'claim_nbr',
 'decree_title',
 'transfer_nbr',
 'wr_type',
 'name_last',
 'name_first',
 'name_company',
 'pod_nbr',
 'pod_char',
 'source_type',
 'use_code',
 'use_category',
 'use_code_description',
 'priority_date',
 'duty',
 'rate_cfs',
 'rate_cfs_est',
 'max_rate_cfs',
 'acre_feet',
 'acre_feet_est',
 'max_rate_acre_feet',
 'source',
 'tributary_to',
 'streamcode',
 'stream_name',
 'supplemental',
 'begin_month',
 'begin_day',
 'end_month',
 'end_day',
 'technician_initials',
 'agency',
 'rec_creation_date',
 'last_updt_date',
 'feature_quality_code',
 'remarks']

In [8]:
print("SiteTypeCV Dictionary and function...")

# Lookup from the source_type code to the SiteTypeCV term.
SiteTypeCVDictOR = {
    "LK":"lake",
    "DR":"drain",
    "SP":"spring",
    "ST":"stream",
    "SL":"slough",
    "WW":"waste water",
    "WE":"well",
    "WR":"winter runoff",
    "SM":"sump",
    "PD":"pond",
    "RS":"reservoir",
    "DT":"ditch",
    "SE":"sewage effluent",
    "CN":"canal"
}

def assignSiteTypeCVOR(colrowValue):
    """Return the SiteTypeCV term for one source_type value.

    Parameters
    ----------
    colrowValue : object
        A single cell value of the source_type field (type of diversion
        code). Surrounding whitespace is ignored. The lookup is
        case-sensitive.

    Returns
    -------
    str
        The term from SiteTypeCVDictOR for that code, or 'Unknown' when the
        value is null, blank, or not a key of the dictionary.
    """
    # Test for null first: comparing pd.NA with '' does not give a plain bool.
    if pd.isnull(colrowValue):
        return 'Unknown'

    # str() lets non-string cells (e.g. numbers) fall through to 'Unknown'
    # instead of failing on .strip().
    keyStr = str(colrowValue).strip()  # remove whitespace chars

    # A blank string is not a key, so it also maps to 'Unknown'.
    return SiteTypeCVDictOR.get(keyStr, 'Unknown')

SiteTypeCV Dictionary and function...


In [9]:
print("SiteTypeCV...")

# Translate every source_type code into its SiteTypeCV term.
# Mapping the single column avoids building a row object per record and also
# works when the frame has no rows. Null cells are passed to the function,
# which returns 'Unknown' for them.
df100['SiteTypeCV'] = df100['source_type'].map(assignSiteTypeCVOR)

SiteTypeCV...


In [10]:
print("Project to longitude/ latitude  ")

# use pyproj to project to lat lon
# Source projection: EPSG:2992 -- NAD83 / Oregon GIC Lambert (ft)
crs_from = CRS("EPSG:2992")
# Target: EPSG:4326 (WGS84)
crs_to = CRS('EPSG:4326')
transformer = Transformer.from_crs(crs_from, crs_to)

# Drop rows with no x or y coordinate. Testing for NaN directly avoids
# comparing the numeric X/Y columns with the string ''.
df100 = df100.dropna(subset=['X', 'Y']).reset_index(drop=True)

# Remaining NaN cells anywhere in the frame are shown as blanks, as before.
df100 = df100.replace(np.nan, '')

# Coordinates that cannot be read as numbers become NaN here and are left
# unprojected instead of stopping the run.
xNum = pd.to_numeric(df100['X'], errors='coerce')
yNum = pd.to_numeric(df100['Y'], errors='coerce')
usable = (xNum.notna() & yNum.notna()).to_numpy()

lonArr = np.full(len(df100.index), np.nan)
latArr = np.full(len(df100.index), np.nan)
if usable.any():
    # One call for all rows. The transformer is built without always_xy, so
    # the result is unpacked as (lat, lon).
    latVals, lonVals = transformer.transform(xNum.to_numpy()[usable],
                                             yNum.to_numpy()[usable])
    lonArr[usable] = lonVals
    latArr[usable] = latVals

df100['Longitude'] = lonArr
df100['Latitude'] = latArr

if not usable.all():
    # Rows that could not be projected get blank cells rather than NaN.
    df100[['Longitude', 'Latitude']] = df100[['Longitude', 'Latitude']].astype(object)
    df100.loc[~usable, ['Longitude', 'Latitude']] = ''

df100

Project to longitude/ latitude  


  result = method(y)


Unnamed: 0,X,Y,OBJECTID,pod_display,pod_display_short,wris_link,snp_id,pod_location_id,pod_use_id,app_char,...,end_day,technician_initials,agency,rec_creation_date,last_updt_date,feature_quality_code,remarks,SiteTypeCV,Longitude,Latitude
0,5.394125e+05,4.167058e+05,1,Permit: G 10961 * MI,G 10961,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,21755,6909,26859,G,...,31,MIGRT,OWRD,1996-06-01T00:00:00.000,1996-06-01T00:00:00.000,,0 G 10961 1,well,-123.382877,42.855813
1,5.392329e+05,4.162520e+05,2,Permit: G 10961 * MI,G 10961,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,21755,6910,26860,G,...,31,MIGRT,OWRD,1996-06-01T00:00:00.000,1996-06-01T00:00:00.000,,0 G 10961 2,well,-123.383487,42.854551
2,4.709328e+05,3.559155e+05,3,Permit: G 12684 * MI,G 12684,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23327,9355,29682,G,...,31,MIGRT,OWRD,2001-06-01T00:00:00.000,2001-06-01T00:00:00.000,,0 G 12684 1,well,-123.629420,42.682269
3,1.010124e+06,9.319935e+05,4,Permit: G 12750 * MI,G 12750,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23390,9480,29835,G,...,31,MIGRT,OWRD,2001-05-01T00:00:00.000,2001-05-01T00:00:00.000,,0 G 12750 1,well,-121.654631,44.301041
4,1.010124e+06,9.319935e+05,5,Permit: G 12750 * MI,G 12750,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23390,9480,29836,G,...,31,MIGRT,OWRD,2001-05-01T00:00:00.000,2001-05-01T00:00:00.000,,0 G 12750 1,well,-121.654631,44.301041
5,2.098075e+06,1.005065e+06,6,Permit: G 12779 * MI,G 12779,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,23418,9515,29873,G,...,31,KLS,OWRD,2007-10-26T12:42:33.000,,10,Automapped as center of the envelope for the T...,well,-117.488914,44.467820
6,2.070442e+06,9.816352e+05,7,Permit: G 13450 * MI,G 13450,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,24080,10663,31263,G,...,31,KLS,OWRD,2007-10-26T12:42:33.000,,10,Automapped as center of the envelope for the T...,well,-117.597891,44.406309
7,2.071930e+06,9.777117e+05,8,Permit: G 13450 * MI,G 13450,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,24080,10664,31264,G,...,31,KLS,OWRD,2007-10-26T12:42:33.000,,10,Automapped as center of the envelope for the T...,well,-117.592731,44.395407
8,2.066681e+06,9.431072e+05,9,Permit: G 13450 * MI,G 13450,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,24080,10665,31265,G,...,31,KLS,OWRD,2007-10-26T12:42:33.000,,10,Automapped as center of the envelope for the T...,well,-117.617453,44.301031
9,2.067942e+06,9.685740e+05,10,Permit: G 13450 * MI,G 13450,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,24080,10666,31266,G,...,31,MIGRT,OWRD,2001-05-01T00:00:00.000,2001-05-01T00:00:00.000,,0 G 13450 4,well,-117.609214,44.370739


In [11]:
print("Direct mapping columns...")

# Columns copied unchanged from the source frame into the target frame:
# destCols[i] receives srsCols[i] (SiteNativeID comes from pod_location_id).
# CoordinateMethodCV and CoordinateAccuracy are not mapped here.
destCols = ['SiteNativeID', 'SiteTypeCV', 'Longitude', 'Latitude']
srsCols = ['pod_location_id', 'SiteTypeCV', 'Longitude', 'Latitude']

for destName, srcName in zip(destCols, srsCols):
    outdf100[destName] = df100[srcName]

# Show missing values as blank cells.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Direct mapping columns...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,6909,,,well,-123.382877,42.855813,,,,...,,,,,,,,,,
1,,6910,,,well,-123.383487,42.854551,,,,...,,,,,,,,,,
2,,9355,,,well,-123.629420,42.682269,,,,...,,,,,,,,,,
3,,9480,,,well,-121.654631,44.301041,,,,...,,,,,,,,,,
4,,9480,,,well,-121.654631,44.301041,,,,...,,,,,,,,,,
5,,9515,,,well,-117.488914,44.467820,,,,...,,,,,,,,,,
6,,10663,,,well,-117.597891,44.406309,,,,...,,,,,,,,,,
7,,10664,,,well,-117.592731,44.395407,,,,...,,,,,,,,,,
8,,10665,,,well,-117.617453,44.301031,,,,...,,,,,,,,,,
9,,10666,,,well,-117.609214,44.370739,,,,...,,,,,,,,,,


In [12]:
print("Empty lat/lon")
# Rows without a longitude or latitude are saved to a side file for inspection
# and then removed from the sites table.
# NOTE(review): the earlier TODO here said these rows are not dropped because
# there are too many of them, but the code does drop them -- confirm which is
# intended.

# A coordinate is missing when it is NaN or a blank string. NaN must be tested
# with isna(): `== np.nan` is False for every value, NaN included.
coordCells = outdf100[['Longitude', 'Latitude']]
missingLatLon = (coordCells.isna() | coordCells.astype(str).eq('')).any(axis=1)

outdf100purge = outdf100.loc[missingLatLon]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    outdf100 = outdf100.drop(outdf100purge.index).reset_index(drop=True)

outdf100

Empty lat/lon


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,6909,,,well,-123.382877,42.855813,,,,...,,,,,,,,,,
1,,6910,,,well,-123.383487,42.854551,,,,...,,,,,,,,,,
2,,9355,,,well,-123.629420,42.682269,,,,...,,,,,,,,,,
3,,9480,,,well,-121.654631,44.301041,,,,...,,,,,,,,,,
4,,9480,,,well,-121.654631,44.301041,,,,...,,,,,,,,,,
5,,9515,,,well,-117.488914,44.467820,,,,...,,,,,,,,,,
6,,10663,,,well,-117.597891,44.406309,,,,...,,,,,,,,,,
7,,10664,,,well,-117.592731,44.395407,,,,...,,,,,,,,,,
8,,10665,,,well,-117.617453,44.301031,,,,...,,,,,,,,,,
9,,10666,,,well,-117.609214,44.370739,,,,...,,,,,,,,,,


In [13]:
print("Dropping duplicates...")
# Keep one row per unique combination of site ID, SiteName and SiteType;
# longitude and latitude were added to the combination on 10.24.19.
siteKeyCols = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

print(len(outdf100.index))   # row count before
outdf100 = outdf100.drop_duplicates(subset=siteKeyCols).reset_index(drop=True)
print(len(outdf100.index))   # row count after

Dropping duplicates...
184377
122471


In [14]:
# hardcoded columns
print("Hard coded")

# Constant value written to every row of the named column.
hardCodedValues = {
    'EPSGCodeCV': 'EPSG:4326',
    'SiteName': "Unspecified",
    'CoordinateMethodCV': "Unspecified",
    'StateCV': 'OR',
}
for colName, constantValue in hardCodedValues.items():
    outdf100[colName] = constantValue

Hard coded


In [15]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID already occurred in an earlier row.
siteNativeIDdup = outdf100.loc[outdf100.duplicated(subset=['SiteNativeID'])]

# True when at least one SiteNativeID occurs more than once.
siteNIdDup = len(siteNativeIDdup.index) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

print(len(siteNativeIDdup))

Check Site Native IDs are duplicated
0


In [16]:
print("Adding SiteUUID...")

if siteNIdDup:
    # 10.24.19: native IDs repeat, so number the rows 1..n and prefix 'OR_'
    # to get a unique site uuid.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['SiteUUID'] = ["_".join(["OR", str(rowNumber)])
                            for rowNumber in range(1, len(outdf100.index) + 1)]
else:
    # Native IDs are unique: prefix each one with 'OR_'.
    # A blank native ID gives a blank SiteUUID.
    nativeIdText = outdf100['SiteNativeID'].map(str)
    outdf100['SiteUUID'] = ("OR_" + nativeIdText).where(nativeIdText != '', '')

outdf100

Adding SiteUUID...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,OR_6909,6909,Unspecified,,well,-123.382877,42.855813,,,,...,EPSG:4326,,,,,,OR,,,
1,OR_6910,6910,Unspecified,,well,-123.383487,42.854551,,,,...,EPSG:4326,,,,,,OR,,,
2,OR_9355,9355,Unspecified,,well,-123.629420,42.682269,,,,...,EPSG:4326,,,,,,OR,,,
3,OR_9480,9480,Unspecified,,well,-121.654631,44.301041,,,,...,EPSG:4326,,,,,,OR,,,
4,OR_9515,9515,Unspecified,,well,-117.488914,44.467820,,,,...,EPSG:4326,,,,,,OR,,,
5,OR_10663,10663,Unspecified,,well,-117.597891,44.406309,,,,...,EPSG:4326,,,,,,OR,,,
6,OR_10664,10664,Unspecified,,well,-117.592731,44.395407,,,,...,EPSG:4326,,,,,,OR,,,
7,OR_10665,10665,Unspecified,,well,-117.617453,44.301031,,,,...,EPSG:4326,,,,,,OR,,,
8,OR_10666,10666,Unspecified,,well,-117.609214,44.370739,,,,...,EPSG:4326,,,,,,OR,,,
9,OR_10776,10776,Unspecified,,well,-123.193846,45.550864,,,,...,EPSG:4326,,,,,,OR,,,


In [17]:
print("Droping duplicates...")

# Show missing values as blank cells before whole rows are compared.
outdf100 = outdf100.replace(np.nan, '')

# Just to make sure: rows that repeat an earlier row in every column are
# saved to a side file and then removed.
repeatedRowMask = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[repeatedRowMask]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,OR_6909,6909,Unspecified,,well,-123.382877,42.855813,,,,...,EPSG:4326,,,,,,OR,,,
1,OR_6910,6910,Unspecified,,well,-123.383487,42.854551,,,,...,EPSG:4326,,,,,,OR,,,
2,OR_9355,9355,Unspecified,,well,-123.629420,42.682269,,,,...,EPSG:4326,,,,,,OR,,,
3,OR_9480,9480,Unspecified,,well,-121.654631,44.301041,,,,...,EPSG:4326,,,,,,OR,,,
4,OR_9515,9515,Unspecified,,well,-117.488914,44.467820,,,,...,EPSG:4326,,,,,,OR,,,
5,OR_10663,10663,Unspecified,,well,-117.597891,44.406309,,,,...,EPSG:4326,,,,,,OR,,,
6,OR_10664,10664,Unspecified,,well,-117.592731,44.395407,,,,...,EPSG:4326,,,,,,OR,,,
7,OR_10665,10665,Unspecified,,well,-117.617453,44.301031,,,,...,EPSG:4326,,,,,,OR,,,
8,OR_10666,10666,Unspecified,,well,-117.609214,44.370739,,,,...,EPSG:4326,,,,,,OR,,,
9,OR_10776,10776,Unspecified,,well,-123.193846,45.550864,,,,...,EPSG:4326,,,,,,OR,,,


In [18]:
print("Checking required isnot null...")
# Columns that must not be blank in any row.
# 'SiteUUID' is the current name of the former 'WaDESiteUUID' column.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells
outdf100 = outdf100.replace(np.nan, '')

# The filter is built from requiredCols so the list and the check cannot
# drift apart: a row is reported when any required column is blank.
# NOTE(review): no visible cell fills GNISCodeCV, so every row is expected to
# be reported -- confirm whether that column should really be required.
outdf100_nullMand = outdf100.loc[outdf100[requiredCols].eq('').any(axis=1)]

if (len(outdf100_nullMand.index) > 0):
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# ToDO: purge these rows if any required value is missing? For now they are left to be inspected and reported

Checking required isnot null...


In [19]:
print("Writing out...")

# Save the finished sites table as a UTF-8 CSV without the row index.
outdf100.to_csv(path_or_buf=out_sitdim, encoding="utf-8", index=False)

print("Done sites")

Writing out...
Done sites
