## Sites_dim
Code to generate sites.csv as input to the WaDE db for AZ water rights

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj

In [2]:
# All relative paths used below are resolved from this folder
working_dir = 'ProcessedInputData'
os.chdir(working_dir)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'ProcessedInputData'

In [3]:
# Names of the output table columns, in output order.
# 10.24.19: 'WaDESiteUUID' was renamed to 'SiteUUID'.
columns = [
    "SiteUUID",
    "SiteNativeID",
    "SiteName",
    "USGSSiteID",
    "SiteTypeCV",
    "Longitude",
    "Latitude",
    "SitePoint",
    "SiteNativeURL",
    "Geometry",
    "CoordinateMethodCV",
    "CoordinateAccuracy",
    "GNISCodeCV",
    "EPSGCodeCV",
    "NHDNetworkStatusCV",
    "NHDProductCV",
    "NHDUpdateDate",
    "NHDReachCode",
    "NHDMeasureNumber",
    "StateCV",
    "HUC8",
    "HUC12",
    "County",
]

# These are not used currently. Data types inferred from the inputs
# Reference list of database column types, kept as three whitespace-separated
# strings rather than one entry per column.
# NOTE(review): the strings hold 20 type names in total while `columns` has 23
# entries, so the two lists do not line up one-to-one -- confirm before using.
dtypesx = ['NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
           'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
           'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)']

In [None]:
# Target table: an empty frame that only carries the output column names.
# No dtypes are declared; they follow from the values assigned later on.
outdf100 = pd.DataFrame(data=None, columns=columns)

In [None]:
# The two source tables that get joined (CSV)
fileInput1, fileInput2 = "Well_Registry_Wells55.csv", "GWSI_Sites.csv"

# File name of the resulting sites table
out_sitdim = "sites.csv"

In [None]:
print("Reading inputs...")

# --- Well registry -----------------------------------------------------------
# Read every column, then remove rows that are exact duplicates of one another.
# The row count is printed before and after so the effect is visible.
df200 = pd.read_csv(fileInput1, encoding="ISO-8859-1")  # alternatively encoding="utf-8"
print(len(df200.index))
df200.drop_duplicates(inplace=True)
print(len(df200))

# --- GWSI sites --------------------------------------------------------------
# Columns available in GWSI_Sites:
# "X", "Y", "OBJECTID", "SITE_ID", "LOCAL_ID", "REG_ID", "WELL_TYPE", "DD_LAT", "DD_LONG",
# "LAT_NAD27", "LONG_NAD27", "WELL_ALT", "WATER_USE", "WELL_DEPTH", "CASE_DIAM", "DRILL_DATE",
# "WL_COUNT", "LASTWLDATE", "WL_DTW", "WL_ELEV", "SOURCE", "IDXBK"
# Only the subset below is loaded.
cols_GWSI = ["SITE_ID", "LOCAL_ID", "REG_ID", "WELL_TYPE", "DD_LAT", "DD_LONG",
             "WATER_USE", "DRILL_DATE", "LASTWLDATE", "WL_DTW", "SOURCE", "IDXBK"]

df300 = pd.read_csv(fileInput2, encoding="ISO-8859-1", usecols=cols_GWSI)  # alternatively encoding="utf-8"
print(len(df300.index))
# De-duplicate the GWSI table itself (not df200 a second time), so that the
# count printed next really is the number of distinct GWSI rows.
df300.drop_duplicates(inplace=True)
print(len(df300))

In [None]:
print("Join tables...")

# Inner join: keep registry rows whose REGISTRY_ID also occurs as REG_ID in the
# GWSI table, then blank out every NaN cell of the result.
df100 = (
    df200
    .merge(df300, how='inner', left_on='REGISTRY_ID', right_on='REG_ID')
    .replace(np.nan, '')
)

# Show the joined table as the cell output
df100

In [None]:
# One row per registry id: a registry entry can match several sites, and only
# the first matching row is kept.
print("Dropping duplicates...")

df100 = df100.drop_duplicates(subset=['REGISTRY_ID']).reset_index(drop=True)

print(len(df100.index))

df100

In [None]:
# Show the column names of the joined table
df100.columns.tolist()

In [None]:
print("Adding SiteUUID...")

# SiteUUID is the native site id prefixed with the state code: 'AZ_<SITE_ID>'.
# Built column-wise instead of with a row-wise apply(): this is much faster on
# large tables and also works when df100 has no rows (a row-wise apply on an
# empty frame returns a DataFrame, which cannot be assigned to one column).
df100['SiteUUID'] = 'AZ_' + df100['SITE_ID'].astype(str)

df100

In [None]:
print("Project to longitude/ latitude  ")

# Source coordinates: UTM metres, NAD83 / UTM zone 12N (EPSG:26912).
# Target coordinates: geographic WGS84 (EPSG:4326).
# (NAD27 would be CRS("EPSG:4267"); it is not used here.)
crs_to = CRS('EPSG:4326')
crs_from = CRS("EPSG:26912")
# No always_xy is passed, so pyproj follows the axis order of each CRS:
# input is (easting, northing) and EPSG:4326 output comes back as (lat, lon).
transformer = Transformer.from_crs(crs_from, crs_to)

# Drop rows that lack an x or a y coordinate, then renumber the index.
df100 = df100.replace(np.nan, '')
dropIndex = df100.loc[(df100['UTM_X_METERS'] == '') | (df100['UTM_Y_METERS'] == '')].index
df100 = df100.drop(dropIndex).reset_index(drop=True)

lonList = []
latList = []
# Iterate over the values directly so the loop does not depend on the index labels.
for x1, y1 in zip(df100['UTM_X_METERS'], df100['UTM_Y_METERS']):
    try:
        easting, northing = float(x1), float(y1)
    except (TypeError, ValueError):
        # Coordinate is not numeric: leave the site without a location.
        lonList.append('')
        latList.append('')
        continue

    lat, lon = transformer.transform(easting, northing)
    # With the default errcheck=False pyproj reports a failed projection by
    # returning inf rather than raising, so non-finite results are treated as
    # missing instead of being written out as coordinates.
    if np.isfinite(lat) and np.isfinite(lon):
        lonList.append(lon)
        latList.append(lat)
    else:
        lonList.append('')
        latList.append('')

df100['Longitude'] = lonList
df100['Latitude'] = latList

df100

In [None]:
print("Direct mapping columns...")

# Columns copied over unchanged, as (output column, input column) pairs
directMap = [
    ('SiteNativeID', 'SITE_ID'),
    ('SiteUUID', 'SiteUUID'),
    ('SiteTypeCV', 'WELL_TYPE'),
    ('Longitude', 'Longitude'),
    ('Latitude', 'Latitude'),
]
destCols = [dest for dest, _ in directMap]
srsCols = [src for _, src in directMap]

outdf100[destCols] = df100[srsCols]

# Every cell that is still NaN becomes an empty string
outdf100 = outdf100.replace(np.nan, '')

outdf100

In [None]:
print("Empty lat/lon")
# Rows whose longitude or latitude is blank or NaN are written to
# 'sites_latlon_missing.csv' for inspection and then removed from outdf100.
# NOTE(review): an earlier TODO on this cell said such rows were not being
# dropped because there are too many of them; the code does drop them --
# confirm which behaviour is intended.

# NaN never compares equal to anything (including itself), so missing values
# have to be detected with isna() rather than `== np.nan`.
missingLon = (outdf100['Longitude'] == '') | outdf100['Longitude'].isna()
missingLat = (outdf100['Latitude'] == '') | outdf100['Latitude'].isna()

outdf100purge = outdf100.loc[missingLon | missingLat]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)

outdf100

In [None]:
print("Dropping duplicates...")
# Two rows count as the same site when native id, name, type and both
# coordinates agree (lat/lon were added to the key on 10.24.19).
# The row count is printed before and after the filter.
siteKey = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']
print(len(outdf100.index))
outdf100 = outdf100.drop_duplicates(subset=siteKey).reset_index(drop=True)
print(len(outdf100.index))

In [None]:
# Columns that get the same constant value in every row
print("Hard coded")

outdf100['StateCV'] = "AZ"
outdf100['SiteName'] = "Unspecified"
outdf100['CoordinateMethodCV'] = "Unspecified"
outdf100['EPSGCodeCV'] = 'EPSG:4326'

outdf100

In [None]:
print("Droping duplicates...")

# Blank out NaN cells
outdf100 = outdf100.replace(np.nan, '')
# Final safety net: rows that repeat an earlier row in every column are saved
# to a CSV for inspection and then removed.
outdf100Duplicated = outdf100[outdf100.duplicated()]
if outdf100Duplicated.shape[0]:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

In [None]:
print("Checking required isnot null...")
# Columns that must be filled in for every site.
# ('WaDESiteUUID' was renamed to 'SiteUUID' on 10.24.19, so the list uses the new name.)
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells so that "missing" always means ''
outdf100 = outdf100.replace(np.nan, '')

# Rows in which at least one required column is blank. The filter is driven by
# requiredCols so the list and the check cannot drift apart.
# NOTE(review): no visible cell assigns GNISCodeCV, so every row is expected to
# show up in this report -- confirm whether that column is really mandatory.
outdf100_nullMand = outdf100.loc[(outdf100[requiredCols] == '').any(axis=1)]

if len(outdf100_nullMand.index) > 0:
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# TODO: purge these rows if anything is missing? For now they are only reported for inspection.

In [None]:
print("Writing out...")

# Save the finished sites table as UTF-8 CSV, without the index column
outdf100.to_csv(path_or_buf=out_sitdim, encoding="utf-8", index=False)

print("Done sites")