## Sites_dim
Code to generate sites.csv as input to the WaDE db for WA water rights

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj

In [2]:
# working directory
# All input/output paths used by this notebook are relative to this folder.
# The default is the original author's local checkout; set the
# WADE_WA_WORKING_DIR environment variable to run the notebook on another
# machine without editing this cell.
working_dir = os.environ.get(
    "WADE_WA_WORKING_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Washington/WaterAllocation",
)
os.chdir(working_dir)

In [3]:
# Input files
# Raw point table read by the "Read Inputs" cell; the path is relative to working_dir.
fileInput1 = "RawInputData/D_Point.csv"

# output sites
# Destination of the finished sites table written at the end of the notebook.
out_sitdim = 'ProcessedInputData/sites.csv'

In [4]:
# Column names of the output sites table, in the order they are written to sites.csv.
columns = [
    'SiteID', 'SiteUUID',
    'CoordinateAccuracy', 'CoordinateMethodCV',
    'County', 'EPSGCodeCV', 'Geometry', 'GNISCodeCV',
    'HUC12', 'HUC8',
    'Latitude', 'Longitude',
    'NHDNetworkStatusCV', 'NHDProductCV',
    'SiteName', 'SiteNativeID', 'SitePoint', 'SiteTypeCV',
    'StateCV', 'USGSSiteID',
]

In [5]:
# create target dataframe

# Empty output frame (no rows yet) whose columns are the names in the `columns` list.
# NOTE(review): an earlier comment here said dtypes are "inferred from CO file";
# no dtypes are set on this line -- confirm whether that remark is stale.
outdf100=pd.DataFrame(columns=columns)

In [6]:
# Load the raw point table into df100 and report how many rows were read.
print("Reading inputs...")

# The file is read as ISO-8859-1 (utf-8 was noted as a possible alternative).
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

print(df100.shape[0])
df100.head(5)

Reading inputs...
155062


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,X,Y,OBJECTID,D_Point_ID,D_Point_Type_CD,Location_CD,Assoc_FL,Misc_CD,Position_With_CD,Active_DT,Inactive_DT,Update_TD,Update_User_ID,Comment_DS,Created_TD,Created_User_ID
0,1665873.059,454923.2363,1,200801.0,WL,U,N,,S,,,2013-03-28T09:58:00.000,"""ECY\DKRO461""",,,
1,1816741.203,456744.0551,2,200889.0,MW,G,Y,,S,,,,,,,
2,1813847.794,456733.0063,3,200890.0,MW,G,Y,,S,,,,,,,
3,1571475.128,457350.1307,4,201092.0,WL,U,Y,,S,,,,,,,
4,1715391.547,402037.7843,5,201191.0,MW,G,Y,,S,,,2010-06-11T11:28:19.000,"""ECY\DKRO461""",,,


In [7]:
# Show the source column names available for mapping.
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'D_Point_ID',
 'D_Point_Type_CD',
 'Location_CD',
 'Assoc_FL',
 'Misc_CD',
 'Position_With_CD',
 'Active_DT',
 'Inactive_DT',
 'Update_TD',
 'Update_User_ID',
 'Comment_DS',
 'Created_TD',
 'Created_User_ID']

In [8]:
print("SiteTypeCV...")

# Lookup from the source "D_Point_Type_CD" code to the site type description.
SiteTypeCVDictWA = {
    "GC":"ground water collector",
    "HW":"headworks gravity flow (or surface water device unknown)",
    "ID":"irrigation dam",
    "MW":"monitoring well",
    "PM":"surface water pump",
    "RD":"reservoir dam",
    "WL":"well (or ground water device unknown)"
}


def assignSiteTypeCVWA(colrowValue):
    """Translate one "D_Point_Type_CD" value into its site type description.

    Parameters
    ----------
    colrowValue : object
        Raw cell value. Surrounding whitespace is ignored. Values that are not
        strings are converted with str() before the lookup, so they no longer
        raise an AttributeError.

    Returns
    -------
    str
        The matching description from SiteTypeCVDictWA, or 'Unknown' when the
        value is null, blank, or not a known code. The lookup is case-sensitive.
    """
    if pd.isnull(colrowValue):
        return 'Unknown'

    keyStr = str(colrowValue).strip()  # remove whitespace chars
    # A blank string is simply not a key, so it falls through to 'Unknown' too.
    return SiteTypeCVDictWA.get(keyStr, 'Unknown')


# Derive SiteTypeCV from the raw D_Point_Type_CD code.
# Series.map passes each code straight to the helper; the previous row-wise
# DataFrame.apply(axis=1) built a full row object per record only to read one field.
df100['SiteTypeCV'] = df100['D_Point_Type_CD'].map(assignSiteTypeCVWA)

SiteTypeCV...


In [9]:
print("Project to longitude/ latitude  ")

# use pyproj to project to lat lon
crs_to = CRS('EPSG:4326')  # CRS("WGS84")
# Projection: NAD83 HARN / Washington South (ftUS)
crs_from = CRS("EPSG:2927")
transformer = Transformer.from_crs(crs_from, crs_to)

# Drop rows with no x or y coordinate.
# Note: this blanks NaN in *every* column of df100, not only X and Y; later
# cells rely on missing values being '' rather than NaN.
df100 = df100.replace(np.nan, '')
dropIndex = df100.loc[(df100['X'] == '') | (df100['Y'] == '')].index
if len(dropIndex) > 0:
    df100 = df100.drop(dropIndex)
    df100 = df100.reset_index(drop=True)

# Transform each (X, Y) pair. Iterating over the two columns directly avoids a
# label lookup per row and does not depend on the index being 0..n-1.
lonList = []
latList = []
for x1, y1 in zip(df100['X'], df100['Y']):
    try:
        # As used here, transform() hands back latitude first, then longitude.
        lat, lon = transformer.transform(float(x1), float(y1))
    except Exception:
        # Best effort: a pair that cannot be converted gets blank coordinates
        # and is picked up by the "Empty lat/lon" cell. Catching Exception
        # (rather than a bare except) keeps the loop interruptible.
        lat, lon = '', ''
    lonList.append(lon)
    latList.append(lat)

df100['Longitude'] = lonList
df100['Latitude'] = latList

Project to longitude/ latitude  


  result = method(y)


In [10]:
# print("CoordinateMethodCV...")

# coordMethodCVDictWA = {
#     "BLMPLS":"Bureau of Land Management's Public Land Survey System",
#     "DC_SEC":"Douglas County Sections",
#     "M":"paper map or manuscript",
#     "N":"not applicable (ex., when fieldchecked withGPS)",
#     "O":"orthophotos (DOQQ black and white)",
#     "Q":"quads (DRG24 topos)",
#     "S":"sections (DNR)",
#     "T":"plstiger"
# }

# #coordMethodCVDictWA_var = {
# '''
#     "P########":"parcels by year month day or by year (ex., P20040409 or P2006)"
#     "CYO####":"County Yakima Orthos by year (ex., CYO2011)",
#     "ISSW####":"Image Services Statewide (ex., ISSW2011)",
#     "NAIP####":"NAIP photos by year (ex., NAIP2006)"
# '''
# #}

# # Get coord method based on the field "Position_With_CD" 
# def assigncoordMethodCVWA(colrowValue):
#     # may need to modify capitalization in beneficialUseDictionary
#     if colrowValue == '' or pd.isnull(colrowValue):
#         outList = "Unspecified"
#     else:
#         keyStr = colrowValue.strip()  # remove whitespace chars
#         try:
#             if keyStr[0] == 'P' and len(keyStr) == 9:
#                     outList = "parcels by year month day"
#             elif keyStr[0] == 'P' and len(keyStr) == 5:
#                     outList = "parcels by year"
#             elif keyStr[0:3] == 'CYO':
#                 outList = "County Yakima Orthos by year"
#             elif keyStr[0:4] == 'ISSW':
#                 outList = "Image Services Statewide"
#             elif keyStr[0:4] == 'NAIP':
#                 outList = "NAIP photos by year"
#             else:
#                 outList = coordMethodCVDictWA[keyStr]
#         except:
#             outList = "Unspecified"

#     return outList

# df100 = df100.assign(CoordinateMethodCV='')  #add new column and make it empty
# df100['CoordinateMethodCV'] = df100.apply(lambda row: assigncoordMethodCVWA(row['Position_With_CD']), axis=1)


# Temporary simplification: every site gets the same coordinate method value
# (the per-code mapping is kept commented out in this cell).
df100 = df100.assign(CoordinateMethodCV="Unspecified")

In [11]:
print("Coordinate Accuracy...")

# Lookup from the source "Location_CD" code to the coordinate accuracy description.
coordinateAccuracyDictWA = {
    "C":"field checked (without GPS)",
    "G":"field checked with GPS",
    "P":"proposed (does not exist in real world)",
    "PA":"proposed and All-right (does not exist in real world)",
    "PD":"proposed and Dubious (does not exist in real world)",
    "PM":"proposed and Multiple Dubious (does not exist in real world)",
    "PX":"proposed and Centroid Dubious (does not exist in real world)",
    "U":"unchecked",
    "UA":"unchecked and All-right",
    "UD":"unchecked and Dubious",
    "UM":"unchecked and Multiple Dubious",
    "UX":"unchecked and Centroid Dubious",
    "W":"from well log, unchecked",
    "WA":"from well log, unchecked and All-right",
    "WD":"from well log, unchecked and Dubious",
    "WX":"from well log, unchecked and Centroid Dubious"
}


def assigncoordAccuracyWA(colrowValue):
    """Translate one "Location_CD" value into its coordinate accuracy description.

    Parameters
    ----------
    colrowValue : object
        Raw cell value. Surrounding whitespace is ignored. Values that are not
        strings are converted with str() before the lookup, so they no longer
        raise an AttributeError.

    Returns
    -------
    str
        The matching description from coordinateAccuracyDictWA, or '' when the
        value is null, blank, or not a known code. The lookup is case-sensitive.
    """
    if pd.isnull(colrowValue):
        return ''

    keyStr = str(colrowValue).strip()  # remove whitespace chars
    # A blank string is simply not a key, so it falls through to '' too.
    return coordinateAccuracyDictWA.get(keyStr, '')


# Derive CoordinateAccuracy from the raw Location_CD code.
# Series.map passes each code straight to the helper; the previous row-wise
# DataFrame.apply(axis=1) built a full row object per record only to read one field.
df100['CoordinateAccuracy'] = df100['Location_CD'].map(assigncoordAccuracyWA)

Coordinate Accuracy...


In [12]:
print("Direct mapping columns...")
#
# Directly mapped cells: copy prepared df100 columns into the output frame.
# destCols and srsCols are parallel lists -- each destination column is listed
# at the same position as the source column it is filled from
# (e.g. SiteNativeID comes from D_Point_ID).
#
destCols=['SiteNativeID', 'SiteTypeCV', 'Longitude', 'Latitude', 'CoordinateMethodCV',
          'CoordinateAccuracy']
srsCols=['D_Point_ID', 'SiteTypeCV', 'Longitude', 'Latitude', 'CoordinateMethodCV',
         'CoordinateAccuracy']

outdf100[destCols] = df100[srsCols]

# replace NaN with blank cells (output columns that were not mapped above)
outdf100 = outdf100.replace(np.nan, '') 

Direct mapping columns...


In [13]:
print("Empty lat/lon")
# Rows without a usable coordinate pair are saved to a side file for review
# and then removed from the output table.
# TODO/NOTE(review): the earlier TODO on this cell said such rows are NOT
# dropped because there are too many of them, yet the code does drop them --
# confirm which behaviour is wanted.
#
# A coordinate counts as missing when it is blank ('') or NaN. NaN is detected
# with isna(): a comparison `== np.nan` is always False. The previous mask also
# mixed `|` and `&` without parentheses, which reduced it to testing only
# Longitude == ''. Longitude and Latitude are always filled or blanked together
# in the projection cell, so testing both gives the same rows on this data.
coordCols = ['Longitude', 'Latitude']
missingCoord = (outdf100[coordCols].isna() | outdf100[coordCols].eq('')).any(axis=1)

outdf100purge = outdf100.loc[missingCoord]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)

Empty lat/lon


In [14]:
# Columns that carry the same constant value for every site.
print("Hard coded")

hardCodedValues = {
    'EPSGCodeCV': 'EPSG:4326',
    'SiteName': "Unspecified",
    'StateCV': "WA",
}
for colName, constValue in hardCodedValues.items():
    outdf100[colName] = constValue

Hard coded


In [15]:
print("Dropping duplicates...")
# Keep one row per unique combination of native ID, name, site type and
# coordinates (lat/lon were added to the key on 10.24.19).
dedupKeys = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

print(len(outdf100.index))  # row count before
outdf100 = outdf100.drop_duplicates(subset=dedupKeys).reset_index(drop=True)
print(len(outdf100.index))  # row count after

Dropping duplicates...
155062
155059


In [16]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID already appeared on an earlier row.
siteNativeIDdup = outdf100[outdf100.duplicated(subset=['SiteNativeID'])]

# Flag read by the SiteUUID cell to decide how the UUIDs are built.
siteNIdDup = len(siteNativeIDdup.index) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

print(len(siteNativeIDdup))

Check Site Native IDs are duplicated
Site Native IDs are duplicated
20


In [17]:
print("Adding SiteUUID...")

if siteNIdDup:
    # 10.24.19: native IDs are not unique, so build the UUID from a running
    # sequence number instead: WA_1, WA_2, ...
    # NOTE(review): these UUIDs depend on row position, so they change whenever
    # the input order or the set of rows changes -- confirm that is acceptable.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['SiteUUID'] = ["_".join(["WA", str(seqNo)])
                            for seqNo in range(1, len(outdf100.index) + 1)]
else:
    # Native IDs are unique: prefix each one with the state code; a blank
    # native ID yields a blank UUID.
    outdf100['SiteUUID'] = outdf100['SiteNativeID'].map(
        lambda nativeID: '' if str(nativeID) == '' else "_".join(["WA", str(nativeID)]))

Adding SiteUUID...


In [18]:
print("Droping duplicates...")

# Blank out any remaining NaN so identical rows compare equal.
outdf100 = outdf100.replace(np.nan, '')

# Safety net: save fully identical rows for review, then keep the first of each.
duplicatedMask = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[duplicatedMask]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

Droping duplicates...


In [19]:
print("Checking required isnot null...")
# Columns that must not be blank. This list now drives the check below; it
# previously named a non-existent 'WaDESiteUUID' column and was never used.
# NOTE(review): GNISCodeCV is never populated in this notebook, so every row is
# reported as missing a mandatory field -- confirm whether it is really required.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells
outdf100 = outdf100.replace(np.nan, '')

# Rows where at least one required column is blank.
outdf100_nullMand = outdf100.loc[outdf100[requiredCols].eq('').any(axis=1)]

# Offending rows are only reported to a side file; they stay in outdf100.
if (len(outdf100_nullMand.index) > 0):
    outdf100_nullMand.to_csv('ProcessedInputData/sites_mandatoryFieldMissing.csv')  # index=False,

Checking required isnot null...


In [20]:
print("Writing out...")

# Save the finished sites table to the configured output path, without the row index.
outdf100.to_csv(path_or_buf=out_sitdim, encoding="utf-8", index=False)

print("Done sites")

Writing out...
Done sites


In [21]:
# Display the final sites table for a visual check of the written result.
outdf100

Unnamed: 0,SiteID,SiteUUID,CoordinateAccuracy,CoordinateMethodCV,County,EPSGCodeCV,Geometry,GNISCodeCV,HUC12,HUC8,Latitude,Longitude,NHDNetworkStatusCV,NHDProductCV,SiteName,SiteNativeID,SitePoint,SiteTypeCV,StateCV,USGSSiteID
0,,WA_1,unchecked,Unspecified,,EPSG:4326,,,,,46.580802,-120.398762,,,Unspecified,200801,,well (or ground water device unknown),WA,
1,,WA_2,field checked with GPS,Unspecified,,EPSG:4326,,,,,46.583691,-119.798724,,,Unspecified,200889,,monitoring well,WA,
2,,WA_3,field checked with GPS,Unspecified,,EPSG:4326,,,,,46.583730,-119.810231,,,Unspecified,200890,,monitoring well,WA,
3,,WA_4,unchecked,Unspecified,,EPSG:4326,,,,,46.587173,-120.774208,,,Unspecified,201092,,well (or ground water device unknown),WA,
4,,WA_5,field checked with GPS,Unspecified,,EPSG:4326,,,,,46.435437,-120.202625,,,Unspecified,201191,,monitoring well,WA,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155054,,WA_155055,unchecked and All-right,Unspecified,,EPSG:4326,,,,,46.596856,-120.739066,,,Unspecified,714850,,well (or ground water device unknown),WA,
155055,,WA_155056,unchecked and Centroid Dubious,Unspecified,,EPSG:4326,,,,,46.599094,-120.702581,,,Unspecified,714851,,well (or ground water device unknown),WA,
155056,,WA_155057,unchecked and Centroid Dubious,Unspecified,,EPSG:4326,,,,,46.595637,-120.708010,,,Unspecified,714852,,well (or ground water device unknown),WA,
155057,,WA_155058,unchecked and Centroid Dubious,Unspecified,,EPSG:4326,,,,,46.600990,-120.710829,,,Unspecified,714853,,well (or ground water device unknown),WA,


# Old code, temporarily retained for reference

In [22]:
# Issue with CoordinateMethodCV for the QA database: the values need to be simplified, so the 'keyStr' portions are going to be removed.
# Temporary solution: set all values to Unspecified.




# print("CoordinateMethodCV...")

# coordMethodCVDictWA = {
#     "BLMPLS":"Bureau of Land Management's Public Land Survey System",
#     "DC_SEC":"Douglas County Sections",
#     "M":"paper map or manuscript",
#     "N":"not applicable (ex., when fieldchecked withGPS)",
#     "O":"orthophotos (DOQQ black and white)",
#     "Q":"quads (DRG24 topos)",
#     "S":"sections (DNR)",
#     "T":"plstiger"
# }

# #coordMethodCVDictWA_var = {
# '''
#     "P########":"parcels by year month day or by year (ex., P20040409 or P2006)"
#     "CYO####":"County Yakima Orthos by year (ex., CYO2011)",
#     "ISSW####":"Image Services Statewide (ex., ISSW2011)",
#     "NAIP####":"NAIP photos by year (ex., NAIP2006)"
# '''
# #}

# # Get coord method based on the field "Position_With_CD" 
# def assigncoordMethodCVWA(colrowValue):
#     # may need to modify capitalization in beneficialUseDictionary
#     if colrowValue == '' or pd.isnull(colrowValue):
#         outList = 'Unspecified'
#     else:
#         keyStr = colrowValue.strip()  # remove whitespace chars
#         try:
#             if keyStr[0] == 'P' and len(keyStr) == 9:
#                     outList = "parcels by year month day " + keyStr[1:9]
#             elif keyStr[0] == 'P' and len(keyStr) == 5:
#                     outList = "parcels by year " + keyStr[1:5]
#             elif keyStr[0:3] == 'CYO':
#                 outList = "County Yakima Orthos by year " + keyStr[3:7]
#             elif keyStr[0:4] == 'ISSW':
#                 outList = "Image Services Statewide " + keyStr[4:8]
#             elif keyStr[0:4] == 'NAIP':
#                 outList = "NAIP photos by year " + keyStr[4:8]
#             else:
#                 outList = coordMethodCVDictWA[keyStr]
#         except:
#             outList = 'Unspecified'

#     return outList



# df100 = df100.assign(CoordinateMethodCV='')  #add new column and make it empty
# df100['CoordinateMethodCV'] = df100.apply(lambda row: assigncoordMethodCVWA(row['Position_With_CD']), 
#                                                       axis=1)
# df100

In [23]:
# Scratch check: length of a comma-separated list of site identifiers.
# stringA is not referenced by any other cell visible in this notebook.
siteSuffixes = [
    "17920", "18546", "18536", "18551", "18531", "18538", "18534", "18547",
    "18542", "18548", "18540", "18535", "18544", "18541", "18543", "18539",
    "18549", "18530", "18537", "18545", "18533", "18532", "18550",
]
stringA = ",".join(["WSDE_Site_" + suffix for suffix in siteSuffixes])
print(len(stringA))

367
