## Sites_dim
Code to generate sites.csv as input to the WaDE db for CA (California) water rights

In [5]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
# from pyproj import CRS, Transformer, Proj
from pyproj import transform, Proj

In [7]:
# Working directory: the input workbook is read from here and, because every later
# path is relative, all CSV outputs are written here as well.
# The default is the original hard-coded local path; set the WADE_CA_RAW_INPUT_DIR
# environment variable to run the notebook on another machine without editing this cell.
workingDir = os.environ.get(
    "WADE_CA_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/California/WaterAllocation/RawInputData",
)
os.chdir(workingDir)

In [8]:
# Name of the source workbook, resolved relative to the working directory.
fileInput1 = 'EWRIMS MASTER FLAT FILE DATA DICTIONARY DRAFT 1-17-20.xlsx'

# Name of the sites CSV this notebook writes at the end.
out_sitdim = "sites.csv"

In [9]:
# Column layout of the output sites table, in the order the columns are written.
# (10.24.19: 'WaDESiteUUID' was renamed to 'SiteUUID'.)
columns = [
    # identification
    'SiteUUID', 'SiteNativeID', 'SiteName', 'USGSSiteID', 'SiteTypeCV',
    # location
    'Longitude', 'Latitude', 'SitePoint', 'SiteNativeURL', 'Geometry',
    'CoordinateMethodCV', 'CoordinateAccuracy', 'GNISCodeCV', 'EPSGCodeCV',
    # NHD linkage
    'NHDNetworkStatusCV', 'NHDProductCV', 'NHDUpdateDate', 'NHDReachCode',
    'NHDMeasureNumber',
    # administrative / hydrologic units
    'StateCV', 'HUC8', 'HUC12', 'County',
]

# These are not used currently. Data types inferred from the inputs
# Reference only: each string is a tab-separated run of SQL-style type names.
# No cell visible in this notebook reads dtypesx.
# NOTE(review): the strings list 20 types while `columns` has 23 entries, so the two
# do not line up one-to-one - confirm against the target table before relying on this.
dtypesx = ['NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
           'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
           'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)']

In [10]:
# Empty output frame carrying the column layout listed in `columns`.
# No dtypes are declared here; each column takes its type from the data assigned later.
# NOTE(review): the original comment said dtypes are "inferred from CO file" - presumably
# carried over from another state's notebook; confirm.
outdf100 = pd.DataFrame(columns=columns)

In [11]:
print("Reading inputs...")

# Load the "ewrims_flat_file" sheet of the source workbook; the first row holds the headers.
# `encoding` is deliberately not passed: it is not a pandas.read_excel parameter (Excel
# workbooks carry their own encoding) and current pandas raises TypeError on it.
df100 = pd.read_excel(fileInput1, header=0, sheet_name="ewrims_flat_file", skiprows=0)

# Drop rows that are exact duplicates across every column, then report the remaining count.
df100 = df100.drop_duplicates()
print(len(df100.index))

#df100 = df100.head(10000) # uncomment to process only the first 10,000 rows while testing.

# Blank out missing values so the later steps can test cells against ''.
df100 = df100.replace(np.nan, '')
df100

Reading inputs...
57736


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE
0,0,,,,,Not Determined,,,,,...,,,,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,,,,0
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,,,,0
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,,,,0
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,Pending,1578441600000000000,,1578564981000000000,34,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57731,73078,UN002198,,,,Not Determined,Active,UN002198,,,...,,,,,,,,,,0
57732,73088,UN002199,,,,Not Determined,Active,UN002199,,,...,,,,,,,,,,0
57733,73089,A033099,,,,Appropriative,Pending,A033099,,,...,,,,,,,,,,0
57734,73108,UN002200,,,,Not Determined,Active,UN002200,,,...,,,,,,,,,,0


In [12]:
# Show every source column name as a plain Python list.
df100.columns.tolist()

['WR_WATER_RIGHT_ID',
 'APPLICATION_NUMBER',
 'CERTIFICATE_ID',
 'PERMIT_ID',
 'LICENSE_ID',
 'WATER_RIGHT_TYPE',
 'WATER_RIGHT_STATUS',
 'APPLICATION_NUMBER_PARTY',
 'PWSS_ID',
 'PRIORITY_DATE',
 'RECEIPT_DATE',
 'REJECTION_DATE',
 'APPLICATION_RECD_DATE',
 'APPLICATION_ACCEPTANCE_DATE',
 'PROJECT_TYPE',
 'RECORD_SUMMARY',
 'INCOMPLETE_STATEMENT',
 'NUMBER_OF_PROTESTS',
 'AGENT_NAME',
 'AGENT_ENTITY_TYPE',
 'APPLICATION_PRIMARY_OWNER',
 'PRIMARY_OWNER_ENTITY_TYPE',
 'SUB_TYPE',
 'INI_REPORTED_DIV_AMOUNT',
 'INI_REPORTED_DIV_UNIT',
 'FACE_VALUE_AMOUNT',
 'FACE_VALUE_UNITS',
 'FEE_DUE',
 'FEE_RECEIVED',
 'APPL_FEE_AMOUNT',
 'APPL_FEE_AMT_RECD',
 'MAX_DD_APPL',
 'MAX_DD_UNITS',
 'MAX_DD_ANN',
 'MAX_STORAGE',
 'MAX_TAKEN_FROM_SOURCE',
 'YEAR_DIVERSION_COMMENCED',
 'MAX_BENEFICIALLY_USED',
 'SUPPLEMENTAL_STATEMENT_CYCLE',
 'TYPE_OF_DIVERSION_FACILITY',
 'QUANTITY_OF_WATER_DIVERTED',
 'QOW_DIVERTED_UNIT',
 'QUANTITY_MEASUREMENT_YEAR',
 'MAX_RATE_OF_DIVERSION',
 'MAX_RATE_OF_DIV_UNIT',
 'REC

In [13]:
print("Direct mapping columns...")

# Columns copied straight from the source frame, written as (output column, source column).
# 'CoordinateAccuracy' is not mapped here; the original cell also noted pod_location_id
# without using it.
_directPairs = [
    ('SiteNativeID', 'POD_ID'),
    ('SiteName', 'DIVERSION_SITE_NAME'),
    ('SiteTypeCV', 'TYPE_OF_DIVERSION_FACILITY'),
    ('Longitude', 'LONGITUDE'),
    ('Latitude', 'LATITUDE'),
    ('CoordinateMethodCV', 'LOCATION_METHOD'),
    ('HUC8', 'HUC_8_NUMBER'),
    ('HUC12', 'HUC_12_NUMBER'),
    ('County', 'COUNTY'),
]
destCols = [pair[0] for pair in _directPairs]
srsCols = [pair[1] for pair in _directPairs]

# Positional copy: the i-th source column fills the i-th output column.
outdf100[destCols] = df100[srsCols]

# Any cell still missing (including every unmapped column) becomes an empty string.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Direct mapping columns...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,,,,,,,,,,...,,,,,,,,,,
1,,60498,LAKE DOMINGO,,,-116.273,32.6144,,,,...,,,,,,,,1.81002e+07,1.81002e+11,San Diego
2,,34881,,,,-118.289,34.578,,,,...,,,,,,,,1.80902e+07,1.80902e+11,Los Angeles
3,,28036,,,,-122.054,39.7945,,,,...,,,,,,,,1.80202e+07,1.80202e+11,Glenn
4,,23233,,,,-119.707,37.0026,,,,...,,,,,,,,1.804e+07,1.804e+11,Madera
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57731,,,,,,,,,,,...,,,,,,,,,,
57732,,,,,,,,,,,...,,,,,,,,,,
57733,,,,,,,,,,,...,,,,,,,,,,
57734,,,,,,,,,,,...,,,,,,,,,,


In [14]:
print("Empty lat/lon")
# Rows without a usable coordinate pair are exported to sites_latlon_missing.csv for
# inspection and then removed from the sites table.
# NOTE(review): the earlier TODO said these rows were *not* being dropped because there
# are too many of them, but the code does drop them - confirm which is intended.

# A coordinate counts as missing when it is blank or NaN. `isna()` is used because
# `== np.nan` is always False (NaN never compares equal to anything, itself included).
lonMissing = (outdf100['Longitude'] == '') | outdf100['Longitude'].isna()
latMissing = (outdf100['Latitude'] == '') | outdf100['Latitude'].isna()

outdf100purge = outdf100.loc[lonMissing | latMissing]
if len(outdf100purge.index) > 0:
    # The row index is kept in the export so rows can be traced back to the mapped frame.
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)

outdf100

Empty lat/lon


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,60498,LAKE DOMINGO,,,-116.273,32.6144,,,,...,,,,,,,,1.81002e+07,1.81002e+11,San Diego
1,,34881,,,,-118.289,34.578,,,,...,,,,,,,,1.80902e+07,1.80902e+11,Los Angeles
2,,28036,,,,-122.054,39.7945,,,,...,,,,,,,,1.80202e+07,1.80202e+11,Glenn
3,,23233,,,,-119.707,37.0026,,,,...,,,,,,,,1.804e+07,1.804e+11,Madera
4,,405,,,,-122.756,38.8218,,,,...,,,,,,,,1.80201e+07,1.80201e+11,Lake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51366,,77483,Pond,,,-123.197,39.1899,,,,...,,,,,,,,1.80101e+07,1.80101e+11,Mendocino
51367,,77519,Primary POD,,,-123.979,40.1936,,,,...,,,,,,,,1.80101e+07,1.80101e+11,Humboldt
51368,,77499,Primary POD 1,,,-123.739,40.1038,,,,...,,,,,,,,1.80101e+07,1.80101e+11,Humboldt
51369,,77501,Earthen Bern Lined Pond,,,-123.681,40.9519,,,,...,,,,,,,,1.80102e+07,1.80102e+11,Humboldt


In [15]:
print("Dropping duplicates...")
# Keep the first row for each distinct combination of native ID, name, type and
# coordinates (lat/lon were added to the key on 10.24.19). Row counts are printed
# before and after so the effect is visible.
print(len(outdf100.index))
outdf100 = (
    outdf100
    .drop_duplicates(subset=['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude'])
    .reset_index(drop=True)
)
print(len(outdf100.index))

Dropping duplicates...
51371
51371


In [16]:
# Constant-valued columns: every row gets the same EPSG code and state.
print("Hard coded")

# Bracket assignment is used instead of attribute assignment (outdf100.EPSGCodeCV = ...):
# the attribute form only works when the column already exists and otherwise sets a
# plain Python attribute on the frame without creating a column.
outdf100['EPSGCodeCV'] = 'EPSG:4326'
outdf100['StateCV'] = 'CA'

Hard coded


In [17]:
print("Rows with no value for site name and coordinate method")

# Blank cells in these two columns are given the placeholder "Unspecified".
for _col in ('SiteName', 'CoordinateMethodCV'):
    outdf100.loc[outdf100[_col] == '', _col] = "Unspecified"

outdf100

Rows with no value for site name and coordinate method


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,,60498,LAKE DOMINGO,,,-116.273,32.6144,,,,...,EPSG:4326,,,,,,CA,1.81002e+07,1.81002e+11,San Diego
1,,34881,Unspecified,,,-118.289,34.578,,,,...,EPSG:4326,,,,,,CA,1.80902e+07,1.80902e+11,Los Angeles
2,,28036,Unspecified,,,-122.054,39.7945,,,,...,EPSG:4326,,,,,,CA,1.80202e+07,1.80202e+11,Glenn
3,,23233,Unspecified,,,-119.707,37.0026,,,,...,EPSG:4326,,,,,,CA,1.804e+07,1.804e+11,Madera
4,,405,Unspecified,,,-122.756,38.8218,,,,...,EPSG:4326,,,,,,CA,1.80201e+07,1.80201e+11,Lake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51366,,77483,Pond,,,-123.197,39.1899,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Mendocino
51367,,77519,Primary POD,,,-123.979,40.1936,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51368,,77499,Primary POD 1,,,-123.739,40.1038,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51369,,77501,Earthen Bern Lined Pond,,,-123.681,40.9519,,,,...,EPSG:4326,,,,,,CA,1.80102e+07,1.80102e+11,Humboldt


In [18]:
print("Check Site Native IDs are duplicated")

# Rows whose SiteNativeID already appeared on an earlier row.
siteNativeIDdup = outdf100[outdf100.duplicated(subset=['SiteNativeID'])]

# Flag read by the SiteUUID step to choose how identifiers are built.
siteNIdDup = len(siteNativeIDdup.index) > 0
if siteNIdDup:
    print("Site Native IDs are duplicated")

print(len(siteNativeIDdup))

Check Site Native IDs are duplicated
0


In [19]:
print("Adding SiteUUID...")

# SiteUUID is the state prefix "CA" joined by "_" to an identifier. Values are built
# column-wise rather than with a row-wise DataFrame.apply, which also keeps the cell
# working when the frame has no rows.
if siteNIdDup:
    # 10.24.19: native IDs repeat, so they cannot serve as unique keys; number the
    # rows 1..N after resetting the index and use that running number instead.
    outdf100 = outdf100.reset_index(drop=True)
    outdf100['SiteUUID'] = ["_".join(["CA", str(n)]) for n in range(1, len(outdf100.index) + 1)]
else:
    # Native IDs are unique: reuse them. int() strips a float-style ".0" suffix;
    # rows with a blank native ID keep a blank SiteUUID.
    outdf100['SiteUUID'] = outdf100['SiteNativeID'].map(
        lambda nativeID: '' if str(nativeID) == '' else "_".join(["CA", str(int(nativeID))])
    )

outdf100

Adding SiteUUID...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,CA_60498,60498,LAKE DOMINGO,,,-116.273,32.6144,,,,...,EPSG:4326,,,,,,CA,1.81002e+07,1.81002e+11,San Diego
1,CA_34881,34881,Unspecified,,,-118.289,34.578,,,,...,EPSG:4326,,,,,,CA,1.80902e+07,1.80902e+11,Los Angeles
2,CA_28036,28036,Unspecified,,,-122.054,39.7945,,,,...,EPSG:4326,,,,,,CA,1.80202e+07,1.80202e+11,Glenn
3,CA_23233,23233,Unspecified,,,-119.707,37.0026,,,,...,EPSG:4326,,,,,,CA,1.804e+07,1.804e+11,Madera
4,CA_405,405,Unspecified,,,-122.756,38.8218,,,,...,EPSG:4326,,,,,,CA,1.80201e+07,1.80201e+11,Lake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51366,CA_77483,77483,Pond,,,-123.197,39.1899,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Mendocino
51367,CA_77519,77519,Primary POD,,,-123.979,40.1936,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51368,CA_77499,77499,Primary POD 1,,,-123.739,40.1038,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51369,CA_77501,77501,Earthen Bern Lined Pond,,,-123.681,40.9519,,,,...,EPSG:4326,,,,,,CA,1.80102e+07,1.80102e+11,Humboldt


In [20]:
print("Droping duplicates...")

# Safety pass: rows identical to an earlier row in every column are exported for
# inspection (row index included) and then removed.
outdf100Duplicated = outdf100[outdf100.duplicated()]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,CA_60498,60498,LAKE DOMINGO,,,-116.273,32.6144,,,,...,EPSG:4326,,,,,,CA,1.81002e+07,1.81002e+11,San Diego
1,CA_34881,34881,Unspecified,,,-118.289,34.578,,,,...,EPSG:4326,,,,,,CA,1.80902e+07,1.80902e+11,Los Angeles
2,CA_28036,28036,Unspecified,,,-122.054,39.7945,,,,...,EPSG:4326,,,,,,CA,1.80202e+07,1.80202e+11,Glenn
3,CA_23233,23233,Unspecified,,,-119.707,37.0026,,,,...,EPSG:4326,,,,,,CA,1.804e+07,1.804e+11,Madera
4,CA_405,405,Unspecified,,,-122.756,38.8218,,,,...,EPSG:4326,,,,,,CA,1.80201e+07,1.80201e+11,Lake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51366,CA_77483,77483,Pond,,,-123.197,39.1899,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Mendocino
51367,CA_77519,77519,Primary POD,,,-123.979,40.1936,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51368,CA_77499,77499,Primary POD 1,,,-123.739,40.1038,,,,...,EPSG:4326,,,,,,CA,1.80101e+07,1.80101e+11,Humboldt
51369,CA_77501,77501,Earthen Bern Lined Pond,,,-123.681,40.9519,,,,...,EPSG:4326,,,,,,CA,1.80102e+07,1.80102e+11,Humboldt


In [21]:
print("Checking required isnot null...")
# Columns that must not be blank. 'SiteUUID' replaces the stale name 'WaDESiteUUID'
# (renamed 10.24.19), and this list now drives the check instead of sitting unused.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells so a single comparison against '' catches both cases
outdf100 = outdf100.replace(np.nan, '')

# Rows with a blank in at least one required column.
# NOTE(review): no mapping step visible in this notebook populates GNISCodeCV, so this
# presumably flags every row - confirm whether GNISCodeCV should really be required.
outdf100_nullMand = outdf100.loc[(outdf100[requiredCols] == '').any(axis=1)]

if len(outdf100_nullMand.index) > 0:
    # Report only: flagged rows are exported (row index included) but stay in outdf100.
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# ToDO: purge these cells if there is any missing? #For now left to be inspected and reported

Checking required isnot null...


In [22]:
print("Writing out...")

# Write the finished sites table, without the row index, as UTF-8 CSV to out_sitdim.
outdf100.to_csv(path_or_buf=out_sitdim, index=False, encoding="utf-8")

print("Done sites")

Writing out...
Done sites
