## Sites_dim
Code to generate sites.csv as input to the WaDE db for ND (North Dakota) water rights

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj

In [2]:
# Working directory: every input and output file below is addressed relative to it.
working_dir = "./ProcessedInputData"

# The path is relative, so running this cell a second time (when the kernel is
# already inside the directory) used to fail with FileNotFoundError.
# The chdir is skipped only in that one situation; any other unresolved path
# still raises exactly as before.
_cwd_name = os.path.basename(os.getcwd())
_target_name = os.path.basename(os.path.normpath(working_dir))
if os.path.isdir(working_dir) or _cwd_name != _target_name:
    os.chdir(working_dir)

In [3]:
# Input CSV, resolved relative to the working directory.
fileInput1 = 'Permits.csv'

# Name of the sites table written at the end of the notebook.
out_sitdim = "sites.csv"

In [4]:
# Column names of the output sites table, in output order.
# History (10.24.19): 'WaDESiteUUID' was renamed to 'SiteUUID'.
columns = [
    'SiteUUID',
    'SiteNativeID',
    'SiteName',
    'USGSSiteID',
    'SiteTypeCV',
    'Longitude',
    'Latitude',
    'SitePoint',
    'SiteNativeURL',
    'Geometry',
    'CoordinateMethodCV',
    'CoordinateAccuracy',
    'GNISCodeCV',
    'EPSGCodeCV',
    'NHDNetworkStatusCV',
    'NHDProductCV',
    'NHDUpdateDate',
    'NHDReachCode',
    'NHDMeasureNumber',
    'StateCV',
    'HUC8',
    'HUC12',
    'County',
]

# Reference only: database type names kept as whitespace-separated strings.
# No cell visible in this notebook reads `dtypesx`; the data types of the output
# are inferred from the inputs instead.
# NOTE(review): the strings hold 20 type names while `columns` has 23 entries,
# so they do not line up one-to-one - confirm before relying on them.
dtypesx = ['NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
           'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
           'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)']

In [5]:
# Target dataframe: starts with no rows and only the output column names.
# No dtypes are declared here; they are whatever pandas infers once data is assigned.
outdf100 = pd.DataFrame(data=None, columns=columns)

In [6]:
print("Reading inputs...")

# Load the points-of-diversion table.
# The file is decoded as ISO-8859-1; "utf-8" is the alternative encoding to try.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(len(df100))

# For a quick test run, truncate the input here, e.g.:
#   df100 = df100.head(10000)

# Preview the first five rows.
df100.head()

Reading inputs...
12181


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,...,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,...,,,,0,0.0,0.0,0.0,0.0,-99.78988,46.1113
1,2,1E,2,15310236CC,"HYDE, GEORGE H.",8/15/1901,Irrigation,Cancelled,,,...,,,,0,0.0,0.0,0.0,0.0,-103.75216,48.02622
2,3,2B,3,15310236BA,"SLATER, A. L.",9/2/1901,Irrigation,Cancelled,,,...,,,,0,0.0,0.0,0.0,0.0,-103.74691,48.03707
3,4,2D,4,14910026AB,"GUDMUNSEN, ROBERT AND LOWRAINE",1/26/1906,Irrigation,Perfected,4/30/1937,,...,,,,0,0.0,0.0,0.0,0.0,-103.44126,47.70176
4,5,3C,5,15009823DC,"HARTEL, LEMOINE",2/3/1906,Irrigation,Cancelled,1/10/1990,6/7/2017,...,,,,0,0.0,0.0,0.0,0.0,-103.184202,47.791708


In [7]:
# Show every input column name as a plain Python list.
df100.columns.tolist()

['permit_ind',
 'permit_num',
 'pod_index',
 'pod',
 'permit_hol',
 'priority_d',
 'use_type',
 'status',
 'date_issue',
 'date_cance',
 'beneficial',
 'county',
 'hu_sub_bas',
 'aquifer',
 'subaquifer',
 'req_acft',
 'req_acre',
 'req_rate',
 'req_storag',
 'app_acft',
 'app_acre',
 'app_rate',
 'app_storag',
 'pod_status',
 'source',
 'irrigation',
 'source_nam',
 'mainstem',
 'impound_lo',
 'impound_na',
 'return_des',
 'discharge_',
 'period_sta',
 'period_end',
 'return_qua',
 'held_acft',
 'held_acre',
 'held_rate',
 'held_stora',
 'longitude',
 'latitude']

In [8]:
print("Direct mapping columns...")

# Columns copied straight from the input table, as (output column, input column) pairs.
# 'CoordinateMethodCV' and 'CoordinateAccuracy' are not mapped from the input.
_directPairs = [
    ('SiteNativeID', 'pod'),
    ('SiteTypeCV', 'source'),
    ('Longitude', 'longitude'),
    ('Latitude', 'latitude'),
    ('SiteName', 'aquifer'),
]
destCols = [dest for dest, src in _directPairs]
srsCols = [src for dest, src in _directPairs]

# Copy the mapped input columns into the output frame.
outdf100[destCols] = df100[srsCols]

# Replace NaN with blank cells.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Direct mapping columns...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,13007302B,,,Ground Water,-99.789880,46.111300,,,,...,,,,,,,,,,
1,,15310236CC,,,Surface Water,-103.752160,48.026220,,,,...,,,,,,,,,,
2,,15310236BA,,,Surface Water,-103.746910,48.037070,,,,...,,,,,,,,,,
3,,14910026AB,,,Surface Water,-103.441260,47.701760,,,,...,,,,,,,,,,
4,,15009823DC,,,Surface Water,-103.184202,47.791708,,,,...,,,,,,,,,,
5,,15310234DD,,,Surface Water,-103.779470,48.026220,,,,...,,,,,,,,,,
6,,15507714DA,,,Surface Water,-100.520880,48.244860,,,,...,,,,,,,,,,
7,,15110331AA,,,Surface Water,-103.907440,47.860910,,,,...,,,,,,,,,,
8,,15509628C,,,Surface Water,-103.038480,48.215410,,,,...,,,,,,,,,,
9,,15510030BB,,,Surface Water,-103.601270,48.224830,,,,...,,,,,,,,,,


In [9]:
print("Empty lat/lon")
# Rows with a blank or missing coordinate are saved to 'sites_latlon_missing.csv'
# for inspection and then removed from the sites table.
#
# Missing values are detected with isna(): `value == np.nan` is always False,
# so it can never match. Blank cells are detected on the string form of the
# value, which avoids comparing a float column with '' (that comparison only
# produced a warning and no matches).
_coords = outdf100[['Longitude', 'Latitude']]
_missingCoord = (_coords.isna() | _coords.astype(str).eq('')).any(axis=1)

outdf100purge = outdf100.loc[_missingCoord]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_latlon_missing.csv')    # row index is written as well
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Empty lat/lon


  result = method(y)


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,13007302B,,,Ground Water,-99.789880,46.111300,,,,...,,,,,,,,,,
1,,15310236CC,,,Surface Water,-103.752160,48.026220,,,,...,,,,,,,,,,
2,,15310236BA,,,Surface Water,-103.746910,48.037070,,,,...,,,,,,,,,,
3,,14910026AB,,,Surface Water,-103.441260,47.701760,,,,...,,,,,,,,,,
4,,15009823DC,,,Surface Water,-103.184202,47.791708,,,,...,,,,,,,,,,
5,,15310234DD,,,Surface Water,-103.779470,48.026220,,,,...,,,,,,,,,,
6,,15507714DA,,,Surface Water,-100.520880,48.244860,,,,...,,,,,,,,,,
7,,15110331AA,,,Surface Water,-103.907440,47.860910,,,,...,,,,,,,,,,
8,,15509628C,,,Surface Water,-103.038480,48.215410,,,,...,,,,,,,,,,
9,,15510030BB,,,Surface Water,-103.601270,48.224830,,,,...,,,,,,,,,,


In [10]:
print("Dropping duplicates...")
# Rows are unique on the combination of native ID, name, type and coordinates
# (10.24.19: latitude/longitude were added to the key).
_siteKey = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

# Row count before ...
print(len(outdf100.index))
outdf100 = (
    outdf100
    .drop_duplicates(subset=_siteKey)
    .reset_index(drop=True)
)
# ... and after de-duplication.
print(len(outdf100.index))

Dropping duplicates...
12181
11064


In [11]:
# Constant values written to every row.
print("Hard coded")

outdf100["EPSGCodeCV"] = "EPSG:4326"
outdf100["CoordinateMethodCV"] = "Unspecified"
# Rows with a blank site name get a placeholder.
outdf100.loc[outdf100["SiteName"].eq(''), "SiteName"] = "Unspecified"


Hard coded


In [12]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID already appeared on an earlier row.
siteNativeIDdup = outdf100.loc[outdf100.duplicated(subset=['SiteNativeID'])]

# Flag read by the SiteUUID cell to choose how IDs are generated.
siteNIdDup = len(siteNativeIDdup.index) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

# Number of repeated rows found.
print(len(siteNativeIDdup))

Check Site Native IDs are duplicated
Site Native IDs are duplicated
753


In [13]:
print("Adding SiteUUID...")

# SiteUUID is always prefixed with "ND_".
# The values are built column-wise rather than with a row-wise apply(axis=1):
# no temporary column is needed and an empty frame is handled as well.
if siteNIdDup:
    # 10.24.19: native IDs repeat, so a running number (1..N) is used instead.
    # The number follows the current row order of the frame.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['SiteUUID'] = ["ND_" + str(seq) for seq in range(1, len(outdf100.index) + 1)]
else:
    # Native IDs are unique: prefix them, leaving the UUID blank where the ID is blank.
    _nativeIds = outdf100['SiteNativeID'].astype(str)
    outdf100['SiteUUID'] = ("ND_" + _nativeIds).where(_nativeIds != '', '')

outdf100

Adding SiteUUID...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,ND_1,13007302B,Unspecified,,Ground Water,-99.789880,46.111300,,,,...,EPSG:4326,,,,,,,,,
1,ND_2,15310236CC,Unspecified,,Surface Water,-103.752160,48.026220,,,,...,EPSG:4326,,,,,,,,,
2,ND_3,15310236BA,Unspecified,,Surface Water,-103.746910,48.037070,,,,...,EPSG:4326,,,,,,,,,
3,ND_4,14910026AB,Unspecified,,Surface Water,-103.441260,47.701760,,,,...,EPSG:4326,,,,,,,,,
4,ND_5,15009823DC,Unspecified,,Surface Water,-103.184202,47.791708,,,,...,EPSG:4326,,,,,,,,,
5,ND_6,15310234DD,Unspecified,,Surface Water,-103.779470,48.026220,,,,...,EPSG:4326,,,,,,,,,
6,ND_7,15507714DA,Unspecified,,Surface Water,-100.520880,48.244860,,,,...,EPSG:4326,,,,,,,,,
7,ND_8,15110331AA,Unspecified,,Surface Water,-103.907440,47.860910,,,,...,EPSG:4326,,,,,,,,,
8,ND_9,15509628C,Unspecified,,Surface Water,-103.038480,48.215410,,,,...,EPSG:4326,,,,,,,,,
9,ND_10,15510030BB,Unspecified,,Surface Water,-103.601270,48.224830,,,,...,EPSG:4326,,,,,,,,,


In [14]:
print("Droping duplicates...")

# Replace NaN with blank cells.
outdf100 = outdf100.replace(np.nan, '')

# Safety check: rows identical to an earlier row in every column are saved
# to a side file and then removed.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if outdf100Duplicated.shape[0] > 0:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # row index is written as well
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,ND_1,13007302B,Unspecified,,Ground Water,-99.789880,46.111300,,,,...,EPSG:4326,,,,,,,,,
1,ND_2,15310236CC,Unspecified,,Surface Water,-103.752160,48.026220,,,,...,EPSG:4326,,,,,,,,,
2,ND_3,15310236BA,Unspecified,,Surface Water,-103.746910,48.037070,,,,...,EPSG:4326,,,,,,,,,
3,ND_4,14910026AB,Unspecified,,Surface Water,-103.441260,47.701760,,,,...,EPSG:4326,,,,,,,,,
4,ND_5,15009823DC,Unspecified,,Surface Water,-103.184202,47.791708,,,,...,EPSG:4326,,,,,,,,,
5,ND_6,15310234DD,Unspecified,,Surface Water,-103.779470,48.026220,,,,...,EPSG:4326,,,,,,,,,
6,ND_7,15507714DA,Unspecified,,Surface Water,-100.520880,48.244860,,,,...,EPSG:4326,,,,,,,,,
7,ND_8,15110331AA,Unspecified,,Surface Water,-103.907440,47.860910,,,,...,EPSG:4326,,,,,,,,,
8,ND_9,15509628C,Unspecified,,Surface Water,-103.038480,48.215410,,,,...,EPSG:4326,,,,,,,,,
9,ND_10,15510030BB,Unspecified,,Surface Water,-103.601270,48.224830,,,,...,EPSG:4326,,,,,,,,,


In [15]:
print("Checking required isnot null...")
# Columns that must not be blank in any row.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# Replace NaN with blank cells so one blank test covers both cases.
outdf100 = outdf100.replace(np.nan, '')

# A row is reported when at least one required column is blank.
# NOTE(review): no cell visible here assigns GNISCodeCV, so this check may report
# every row - confirm whether that column should be populated.
outdf100_nullMand = outdf100.loc[outdf100[requiredCols].eq('').any(axis=1)]

if outdf100_nullMand.shape[0] > 0:
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # row index is written as well

# TODO: decide whether rows with missing mandatory fields should be purged.
# For now they are only reported for inspection; nothing is removed.

Checking required isnot null...


In [16]:
print("Writing out...")

# Save the sites table as UTF-8 CSV, without the row index.
outdf100.to_csv(path_or_buf=out_sitdim, encoding="utf-8", index=False)

print("Done sites")

Writing out...
Done sites
