## Sites_dim
Code to generate sites.csv as input to the WaDE database for AZ (Arizona) water rights.

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.parser import parse
from pyproj import CRS, Transformer, Proj

In [2]:
# Working directory: every file name used below is resolved relative to it.
# The chdir is relative, so it is guarded to keep this cell re-runnable; an
# unguarded second run would look for ProcessedInputData inside itself and
# raise FileNotFoundError.
working_dir = "ProcessedInputData"
if os.path.basename(os.getcwd()) != working_dir:
    os.chdir(working_dir)

In [3]:
# Column names of the sites output table, in output order.
# (10.24.19: 'WaDESiteUUID' was renamed to 'SiteUUID'.)
columns = [
    'SiteUUID',
    'SiteNativeID',
    'SiteName',
    'USGSSiteID',
    'SiteTypeCV',
    'Longitude',
    'Latitude',
    'SitePoint',
    'SiteNativeURL',
    'Geometry',
    'CoordinateMethodCV',
    'CoordinateAccuracy',
    'GNISCodeCV',
    'EPSGCodeCV',
    'NHDNetworkStatusCV',
    'NHDProductCV',
    'NHDUpdateDate',
    'NHDReachCode',
    'NHDMeasureNumber',
    'StateCV',
    'HUC8',
    'HUC12',
    'County',
]

# Tab-separated type names, kept for reference only: they are not used
# currently, and the data types are inferred from the inputs instead.
dtypesx = [
    '\t'.join(['NVarChar(55)', 'NVarChar(50)', 'NVarChar(500)', 'NVarChar(250)',
               'NVarChar(100)', 'Double', 'Double', 'Geometry']),
    '\t'.join(['NVarChar(250)', 'Geometry', 'NVarChar(100)', 'NVarChar(255)',
               'NVarChar(50)', 'NVarChar(50)', 'NVarChar(50)']),
    '\t'.join(['NVarChar(50)', 'Date', 'NVarChar(50)', 'NVarChar(50)', 'NChar(5)']),
]

In [4]:
# Empty output frame that already carries the sites column layout.
# No dtypes are declared here; they are inferred when data is assigned.
outdf100 = pd.DataFrame(data=None, index=None, columns=columns)

In [5]:
# Source CSVs, read relative to the working directory:
#   fileInput1 - well registry
#   fileInput2 - GWSI sites
fileInput1, fileInput2 = "Well_Registry_Wells55.csv", "GWSI_Sites.csv"

# Name of the sites file written by the final cell.
out_sitdim = 'sites.csv'

In [6]:
print("Reading inputs...")

# Well registry: load every column, then drop rows that are exact duplicates.
# The row count is printed before and after de-duplication.
df200 = pd.read_csv(fileInput1, encoding="ISO-8859-1")  # alternatively encoding="utf-8"
print(len(df200.index))
df200 = df200.drop_duplicates()
print(len(df200))

# All columns present in GWSI_Sites, for reference:
# "X", "Y", "OBJECTID", "SITE_ID", "LOCAL_ID", "REG_ID", "WELL_TYPE", "DD_LAT", "DD_LONG",
# "LAT_NAD27", "LONG_NAD27", "WELL_ALT", "WATER_USE", "WELL_DEPTH", "CASE_DIAM", "DRILL_DATE",
# "WL_COUNT", "LASTWLDATE", "WL_DTW", "WL_ELEV", "SOURCE", "IDXBK"

# Only this subset of the GWSI columns is loaded.
cols_GWSI = ["SITE_ID", "LOCAL_ID", "REG_ID", "WELL_TYPE", "DD_LAT", "DD_LONG",
             "WATER_USE", "DRILL_DATE", "LASTWLDATE", "WL_DTW", "SOURCE", "IDXBK"]

df300 = pd.read_csv(fileInput2, encoding="ISO-8859-1", usecols=cols_GWSI)  # alternatively encoding="utf-8"
print(len(df300.index))
# Fix: this step used to de-duplicate df200 a second time, so df300 was never
# de-duplicated and the two printed counts were always identical.
df300 = df300.drop_duplicates()
print(len(df300))

Reading inputs...


  interactivity=interactivity, compiler=compiler, result=result)


216408
216408
45460
45460


In [7]:
print("Join tables...")

# Inner join of the well registry (REGISTRY_ID) with the GWSI sites (REG_ID):
# only rows with a match in both tables are kept. Missing values are then
# turned into empty strings in the same expression.
# For a quick test run, the result can be truncated with .head(10000).
df100 = (
    df200
    .merge(df300, how='inner', left_on='REGISTRY_ID', right_on='REG_ID')
    .replace(np.nan, '')
)

df100

Join tables...


Unnamed: 0,X,Y,OBJECTID,PROGRAM,REGISTRY_ID,OWNER_NAME,RGR_PUMP_DATA,WELLTYPE,WELL_TYPE_GROUP,DLIC_NUM,...,REG_ID,WELL_TYPE,DD_LAT,DD_LONG,WATER_USE,DRILL_DATE,LASTWLDATE,WL_DTW,SOURCE,IDXBK
0,-112.498143,33.784977,2,55,60001,"SFI GRAND VISTA, LLC",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,...,60001.0,INDEX,33.785056,-112.497806,INDUSTRIAL,7/2/1984,12/7/2018,489,,BK02
1,-110.231423,31.346352,10,55,84460,RICHARD RICHARDS,NO,EXEMPT,EXEMPT,152,...,84460.0,GWSI,31.347036,-110.230244,UNUSED,6/16/1980,6/16/1980,150,,
2,-114.574586,34.863445,15,55,84630,JOHN CLAYPOOL,NO,NON-EXEMPT,NON-EXEMPT,157,...,84630.0,GWSI,34.863611,-114.573889,DOMESTIC,6/20/1980,6/20/1980,18,,
3,-114.576929,34.890564,16,55,84633,HUBSON,NO,NON-EXEMPT,NON-EXEMPT,157,...,84633.0,GWSI,34.890833,-114.575833,DOMESTIC,6/18/1980,6/18/1980,18,,
4,-111.498588,32.604010,20,55,84660,"KEP INC,",NO,NON-EXEMPT - REPLACEMENT WELL IN NEW LOCATION,NON-EXEMPT,1,...,84660.0,GWSI,32.604056,-111.497833,UNUSED,,,,,
5,-114.060088,35.617904,26,55,84693,"DESERT COMMUNITIES, INC",NO,EXEMPT,EXEMPT,111,...,84693.0,GWSI,35.617500,-114.060000,UNUSED,6/30/1980,,,,
6,-112.501829,34.806309,32,55,84735,MARC M. SNYDER,NO,EXEMPT,EXEMPT,54,...,84735.0,GWSI,34.805556,-112.501389,DOMESTIC,7/2/1980,3/3/1994,377,,
7,-112.698166,33.432670,39,55,84785,STOTZ DAIRY,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,84785.0,GWSI,33.432303,-112.694694,STOCK,8/27/1980,9/24/1992,160.1,,
8,-112.696010,33.432618,40,55,84786,"STOTZ DAIRY,",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,84786.0,GWSI,33.432303,-112.694694,STOCK,8/4/1980,11/20/1997,154.2,,
9,-113.941227,33.866950,61,55,84826,JUAN RAMON DEL REAL,NO,EXEMPT,EXEMPT,138,...,84826.0,GWSI,33.867056,-113.942389,DOMESTIC,9/30/1980,11/17/2004,151.9,,


In [8]:
# Keep a single row per REGISTRY_ID (the first one encountered), even when a
# registry entry matched several sites/pds in the join.
print("Dropping duplicates...")

df100 = (
    df100
    .drop_duplicates(subset=['REGISTRY_ID'])
    .reset_index(drop=True)
)

print(len(df100.index))

df100

Dropping duplicates...
24026


Unnamed: 0,X,Y,OBJECTID,PROGRAM,REGISTRY_ID,OWNER_NAME,RGR_PUMP_DATA,WELLTYPE,WELL_TYPE_GROUP,DLIC_NUM,...,REG_ID,WELL_TYPE,DD_LAT,DD_LONG,WATER_USE,DRILL_DATE,LASTWLDATE,WL_DTW,SOURCE,IDXBK
0,-112.498143,33.784977,2,55,60001,"SFI GRAND VISTA, LLC",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,...,60001.0,INDEX,33.785056,-112.497806,INDUSTRIAL,7/2/1984,12/7/2018,489,,BK02
1,-110.231423,31.346352,10,55,84460,RICHARD RICHARDS,NO,EXEMPT,EXEMPT,152,...,84460.0,GWSI,31.347036,-110.230244,UNUSED,6/16/1980,6/16/1980,150,,
2,-114.574586,34.863445,15,55,84630,JOHN CLAYPOOL,NO,NON-EXEMPT,NON-EXEMPT,157,...,84630.0,GWSI,34.863611,-114.573889,DOMESTIC,6/20/1980,6/20/1980,18,,
3,-114.576929,34.890564,16,55,84633,HUBSON,NO,NON-EXEMPT,NON-EXEMPT,157,...,84633.0,GWSI,34.890833,-114.575833,DOMESTIC,6/18/1980,6/18/1980,18,,
4,-111.498588,32.604010,20,55,84660,"KEP INC,",NO,NON-EXEMPT - REPLACEMENT WELL IN NEW LOCATION,NON-EXEMPT,1,...,84660.0,GWSI,32.604056,-111.497833,UNUSED,,,,,
5,-114.060088,35.617904,26,55,84693,"DESERT COMMUNITIES, INC",NO,EXEMPT,EXEMPT,111,...,84693.0,GWSI,35.617500,-114.060000,UNUSED,6/30/1980,,,,
6,-112.501829,34.806309,32,55,84735,MARC M. SNYDER,NO,EXEMPT,EXEMPT,54,...,84735.0,GWSI,34.805556,-112.501389,DOMESTIC,7/2/1980,3/3/1994,377,,
7,-112.698166,33.432670,39,55,84785,STOTZ DAIRY,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,84785.0,GWSI,33.432303,-112.694694,STOCK,8/27/1980,9/24/1992,160.1,,
8,-112.696010,33.432618,40,55,84786,"STOTZ DAIRY,",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,84786.0,GWSI,33.432303,-112.694694,STOCK,8/4/1980,11/20/1997,154.2,,
9,-113.941227,33.866950,61,55,84826,JUAN RAMON DEL REAL,NO,EXEMPT,EXEMPT,138,...,84826.0,GWSI,33.867056,-113.942389,DOMESTIC,9/30/1980,11/17/2004,151.9,,


In [9]:
# Show every column name available after the join.
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'PROGRAM',
 'REGISTRY_ID',
 'OWNER_NAME',
 'RGR_PUMP_DATA',
 'WELLTYPE',
 'WELL_TYPE_GROUP',
 'DLIC_NUM',
 'APPROVED',
 'INSTALLED',
 'WELL_DEPTH',
 'WATER_LEVEL',
 'CASING_DEPTH',
 'CASING_DIAMETER',
 'CASING_TYPE',
 'PUMP_TYPE',
 'PUMP_POWER',
 'PUMPRATE',
 'TESTEDRATE',
 'DRAW_DOWN',
 'COMPLETION_REPORT_STATUS',
 'DRILL_LOG',
 'WELL_CANCELLED',
 'CADASTRAL',
 'COUNTY',
 'WATERSHED',
 'BASIN_NAME',
 'SUBBASIN_NAME',
 'AMA',
 'QUAD_CODE',
 'WHOLE_TOWNSHIP',
 'HALF_TOWNSHIP',
 'NORTHSOUTH',
 'WHOLE_RANGE',
 'HALF_RANGE',
 'EASTWEST',
 'SECTION',
 'QUARTER_160_ACRE',
 'QACRE160DIR',
 'QUARTER_40_ACRE',
 'QACRE40DIR',
 'QUARTER_10_ACRE',
 'QACRE10DIR',
 'UTM_X_METERS',
 'UTM_Y_METERS',
 'APPLICATION_DATE',
 'ADDRESS1',
 'ADDRESS2',
 'CITY',
 'STATE',
 'ZIP',
 'ZIP4',
 'SITE_ID',
 'LOCAL_ID',
 'REG_ID',
 'WELL_TYPE',
 'DD_LAT',
 'DD_LONG',
 'WATER_USE',
 'DRILL_DATE',
 'LASTWLDATE',
 'WL_DTW',
 'SOURCE',
 'IDXBK']

In [10]:
print("Adding SiteUUID...")

# SiteUUID is the literal prefix 'AZ' joined to the string form of SITE_ID
# with an underscore, e.g. SITE_ID 334708112295301 -> 'AZ_334708112295301'.
# It is built column-wise rather than with a row-wise apply(): the resulting
# strings are the same, there is no per-row Python call, and it also works
# when the frame has no rows.
df100 = df100.assign(SiteUUID='AZ_' + df100['SITE_ID'].astype(str))

df100

Adding SiteUUID...


Unnamed: 0,X,Y,OBJECTID,PROGRAM,REGISTRY_ID,OWNER_NAME,RGR_PUMP_DATA,WELLTYPE,WELL_TYPE_GROUP,DLIC_NUM,...,WELL_TYPE,DD_LAT,DD_LONG,WATER_USE,DRILL_DATE,LASTWLDATE,WL_DTW,SOURCE,IDXBK,SiteUUID
0,-112.498143,33.784977,2,55,60001,"SFI GRAND VISTA, LLC",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,...,INDEX,33.785056,-112.497806,INDUSTRIAL,7/2/1984,12/7/2018,489,,BK02,AZ_334708112295301
1,-110.231423,31.346352,10,55,84460,RICHARD RICHARDS,NO,EXEMPT,EXEMPT,152,...,GWSI,31.347036,-110.230244,UNUSED,6/16/1980,6/16/1980,150,,,AZ_312049110134902
2,-114.574586,34.863445,15,55,84630,JOHN CLAYPOOL,NO,NON-EXEMPT,NON-EXEMPT,157,...,GWSI,34.863611,-114.573889,DOMESTIC,6/20/1980,6/20/1980,18,,,AZ_345149114342601
3,-114.576929,34.890564,16,55,84633,HUBSON,NO,NON-EXEMPT,NON-EXEMPT,157,...,GWSI,34.890833,-114.575833,DOMESTIC,6/18/1980,6/18/1980,18,,,AZ_345327114343301
4,-111.498588,32.604010,20,55,84660,"KEP INC,",NO,NON-EXEMPT - REPLACEMENT WELL IN NEW LOCATION,NON-EXEMPT,1,...,GWSI,32.604056,-111.497833,UNUSED,,,,,,AZ_323616111295201
5,-114.060088,35.617904,26,55,84693,"DESERT COMMUNITIES, INC",NO,EXEMPT,EXEMPT,111,...,GWSI,35.617500,-114.060000,UNUSED,6/30/1980,,,,,AZ_353705114033601
6,-112.501829,34.806309,32,55,84735,MARC M. SNYDER,NO,EXEMPT,EXEMPT,54,...,GWSI,34.805556,-112.501389,DOMESTIC,7/2/1980,3/3/1994,377,,,AZ_344820112300501
7,-112.698166,33.432670,39,55,84785,STOTZ DAIRY,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,GWSI,33.432303,-112.694694,STOCK,8/27/1980,9/24/1992,160.1,,,AZ_332557112414801
8,-112.696010,33.432618,40,55,84786,"STOTZ DAIRY,",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,GWSI,33.432303,-112.694694,STOCK,8/4/1980,11/20/1997,154.2,,,AZ_332557112414501
9,-113.941227,33.866950,61,55,84826,JUAN RAMON DEL REAL,NO,EXEMPT,EXEMPT,138,...,GWSI,33.867056,-113.942389,DOMESTIC,9/30/1980,11/17/2004,151.9,,,AZ_335203113563401


In [11]:
print("Project to longitude/ latitude  ")

# Source coordinates: UTM_X_METERS / UTM_Y_METERS in NAD83 / UTM zone 12N
# (EPSG:26912). Target: WGS84 (EPSG:4326).
# (If the input were NAD27, the source CRS would be EPSG:4267.)
crs_from = CRS("EPSG:26912")
crs_to = CRS('EPSG:4326')  # CRS("WGS84")
# always_xy is not set, so transform() returns values in the target CRS's own
# axis order; the unpacking below treats that as (lat, lon).
transformer = Transformer.from_crs(crs_from, crs_to)

# Drop rows that have no x or no y coordinate (missing values are blanked first).
df100 = df100.replace(np.nan, '')
missing_xy = (df100['UTM_X_METERS'] == '') | (df100['UTM_Y_METERS'] == '')
df100 = df100.loc[~missing_xy].reset_index(drop=True)

lonList = []
latList = []
# Iterate over the two columns directly so the loop does not depend on the
# frame having a default 0..n-1 index.
for x1, y1 in zip(df100['UTM_X_METERS'], df100['UTM_Y_METERS']):
    try:
        lat, lon = transformer.transform(float(x1), float(y1))
    except (ValueError, TypeError):
        # float() could not parse the coordinate: leave both values blank so
        # the later empty lat/lon check picks the row up. A bare `except:`
        # would also swallow KeyboardInterrupt and hide unrelated errors.
        lat, lon = '', ''
    lonList.append(lon)
    latList.append(lat)

df100['Longitude'] = lonList
df100['Latitude'] = latList

df100

Project to longitude/ latitude  


  result = method(y)


Unnamed: 0,X,Y,OBJECTID,PROGRAM,REGISTRY_ID,OWNER_NAME,RGR_PUMP_DATA,WELLTYPE,WELL_TYPE_GROUP,DLIC_NUM,...,DD_LONG,WATER_USE,DRILL_DATE,LASTWLDATE,WL_DTW,SOURCE,IDXBK,SiteUUID,Longitude,Latitude
0,-112.498143,33.784977,2,55,60001,"SFI GRAND VISTA, LLC",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,...,-112.497806,INDUSTRIAL,7/2/1984,12/7/2018,489,,BK02,AZ_334708112295301,-112.498131,33.784972
1,-110.231423,31.346352,10,55,84460,RICHARD RICHARDS,NO,EXEMPT,EXEMPT,152,...,-110.230244,UNUSED,6/16/1980,6/16/1980,150,,,AZ_312049110134902,-110.231413,31.346347
2,-114.574586,34.863445,15,55,84630,JOHN CLAYPOOL,NO,NON-EXEMPT,NON-EXEMPT,157,...,-114.573889,DOMESTIC,6/20/1980,6/20/1980,18,,,AZ_345149114342601,-114.574574,34.863440
3,-114.576929,34.890564,16,55,84633,HUBSON,NO,NON-EXEMPT,NON-EXEMPT,157,...,-114.575833,DOMESTIC,6/18/1980,6/18/1980,18,,,AZ_345327114343301,-114.576918,34.890559
4,-111.498588,32.604010,20,55,84660,"KEP INC,",NO,NON-EXEMPT - REPLACEMENT WELL IN NEW LOCATION,NON-EXEMPT,1,...,-111.497833,UNUSED,,,,,,AZ_323616111295201,-111.498577,32.604005
5,-114.060088,35.617904,26,55,84693,"DESERT COMMUNITIES, INC",NO,EXEMPT,EXEMPT,111,...,-114.060000,UNUSED,6/30/1980,,,,,AZ_353705114033601,-114.060076,35.617898
6,-112.501829,34.806309,32,55,84735,MARC M. SNYDER,NO,EXEMPT,EXEMPT,54,...,-112.501389,DOMESTIC,7/2/1980,3/3/1994,377,,,AZ_344820112300501,-112.501817,34.806304
7,-112.698166,33.432670,39,55,84785,STOTZ DAIRY,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,-112.694694,STOCK,8/27/1980,9/24/1992,160.1,,,AZ_332557112414801,-112.698155,33.432665
8,-112.696010,33.432618,40,55,84786,"STOTZ DAIRY,",YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,4,...,-112.694694,STOCK,8/4/1980,11/20/1997,154.2,,,AZ_332557112414501,-112.695999,33.432613
9,-113.941227,33.866950,61,55,84826,JUAN RAMON DEL REAL,NO,EXEMPT,EXEMPT,138,...,-113.942389,DOMESTIC,9/30/1980,11/17/2004,151.9,,,AZ_335203113563401,-113.941216,33.866945


In [12]:
print("Direct mapping columns...")

# Columns copied over unchanged: output column -> source column in df100.
direct_map = {
    'SiteNativeID': 'SITE_ID',
    'SiteUUID': 'SiteUUID',
    'SiteTypeCV': 'WELL_TYPE',
    'Longitude': 'Longitude',
    'Latitude': 'Latitude',
}
destCols = list(direct_map.keys())
srsCols = list(direct_map.values())

outdf100[destCols] = df100[srsCols]

# Columns that received no data hold NaN; show them as blank cells instead.
outdf100 = outdf100.replace(np.nan, '')

outdf100

Direct mapping columns...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,AZ_334708112295301,334708112295301,,,INDEX,-112.498131,33.784972,,,,...,,,,,,,,,,
1,AZ_312049110134902,312049110134902,,,GWSI,-110.231413,31.346347,,,,...,,,,,,,,,,
2,AZ_345149114342601,345149114342601,,,GWSI,-114.574574,34.863440,,,,...,,,,,,,,,,
3,AZ_345327114343301,345327114343301,,,GWSI,-114.576918,34.890559,,,,...,,,,,,,,,,
4,AZ_323616111295201,323616111295201,,,GWSI,-111.498577,32.604005,,,,...,,,,,,,,,,
5,AZ_353705114033601,353705114033601,,,GWSI,-114.060076,35.617898,,,,...,,,,,,,,,,
6,AZ_344820112300501,344820112300501,,,GWSI,-112.501817,34.806304,,,,...,,,,,,,,,,
7,AZ_332557112414801,332557112414801,,,GWSI,-112.698155,33.432665,,,,...,,,,,,,,,,
8,AZ_332557112414501,332557112414501,,,GWSI,-112.695999,33.432613,,,,...,,,,,,,,,,
9,AZ_335203113563401,335203113563401,,,GWSI,-113.941216,33.866945,,,,...,,,,,,,,,,


In [13]:
print("Empty lat/lon")

# Rows whose Longitude or Latitude is blank or NaN are saved to
# 'sites_latlon_missing.csv' for inspection and then removed from the output.
# NOTE(review): an earlier TODO on this cell said such rows were *not* being
# dropped because there are too many of them, but the code does drop them;
# confirm which behaviour is intended.
#
# NaN is detected with isna(): a comparison such as `col == np.nan` is always
# False, so it can never match a missing value.
missing_lon = (outdf100['Longitude'] == '') | outdf100['Longitude'].isna()
missing_lat = (outdf100['Latitude'] == '') | outdf100['Latitude'].isna()
outdf100purge = outdf100.loc[missing_lon | missing_lat]

if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_latlon_missing.csv')    #index=False,
    outdf100 = outdf100.drop(outdf100purge.index).reset_index(drop=True)

outdf100

Empty lat/lon


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,AZ_334708112295301,334708112295301,,,INDEX,-112.498131,33.784972,,,,...,,,,,,,,,,
1,AZ_312049110134902,312049110134902,,,GWSI,-110.231413,31.346347,,,,...,,,,,,,,,,
2,AZ_345149114342601,345149114342601,,,GWSI,-114.574574,34.863440,,,,...,,,,,,,,,,
3,AZ_345327114343301,345327114343301,,,GWSI,-114.576918,34.890559,,,,...,,,,,,,,,,
4,AZ_323616111295201,323616111295201,,,GWSI,-111.498577,32.604005,,,,...,,,,,,,,,,
5,AZ_353705114033601,353705114033601,,,GWSI,-114.060076,35.617898,,,,...,,,,,,,,,,
6,AZ_344820112300501,344820112300501,,,GWSI,-112.501817,34.806304,,,,...,,,,,,,,,,
7,AZ_332557112414801,332557112414801,,,GWSI,-112.698155,33.432665,,,,...,,,,,,,,,,
8,AZ_332557112414501,332557112414501,,,GWSI,-112.695999,33.432613,,,,...,,,,,,,,,,
9,AZ_335203113563401,335203113563401,,,GWSI,-113.941216,33.866945,,,,...,,,,,,,,,,


In [14]:
print("Dropping duplicates...")

# A site counts as a duplicate when its native ID, name, type, longitude and
# latitude all match an earlier row (lat/lon were added to the key on 10.24.19).
# The row count is printed before and after.
dedup_keys = ['SiteNativeID', 'SiteName', 'SiteTypeCV', 'Longitude', 'Latitude']

print(len(outdf100.index))
outdf100 = outdf100.drop_duplicates(subset=dedup_keys).reset_index(drop=True)
print(len(outdf100.index))

Dropping duplicates...
24026
24026


In [15]:
# hardcoded columns
print("Hard coded")

outdf100.EPSGCodeCV = 'EPSG:4326'
outdf100.SiteName = "Unspecified"
outdf100.CoordinateMethodCV = "Unspecified"
outdf100.StateCV = "AZ"

outdf100

Hard coded


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,AZ_334708112295301,334708112295301,Unspecified,,INDEX,-112.498131,33.784972,,,,...,EPSG:4326,,,,,,AZ,,,
1,AZ_312049110134902,312049110134902,Unspecified,,GWSI,-110.231413,31.346347,,,,...,EPSG:4326,,,,,,AZ,,,
2,AZ_345149114342601,345149114342601,Unspecified,,GWSI,-114.574574,34.863440,,,,...,EPSG:4326,,,,,,AZ,,,
3,AZ_345327114343301,345327114343301,Unspecified,,GWSI,-114.576918,34.890559,,,,...,EPSG:4326,,,,,,AZ,,,
4,AZ_323616111295201,323616111295201,Unspecified,,GWSI,-111.498577,32.604005,,,,...,EPSG:4326,,,,,,AZ,,,
5,AZ_353705114033601,353705114033601,Unspecified,,GWSI,-114.060076,35.617898,,,,...,EPSG:4326,,,,,,AZ,,,
6,AZ_344820112300501,344820112300501,Unspecified,,GWSI,-112.501817,34.806304,,,,...,EPSG:4326,,,,,,AZ,,,
7,AZ_332557112414801,332557112414801,Unspecified,,GWSI,-112.698155,33.432665,,,,...,EPSG:4326,,,,,,AZ,,,
8,AZ_332557112414501,332557112414501,Unspecified,,GWSI,-112.695999,33.432613,,,,...,EPSG:4326,,,,,,AZ,,,
9,AZ_335203113563401,335203113563401,Unspecified,,GWSI,-113.941216,33.866945,,,,...,EPSG:4326,,,,,,AZ,,,


In [16]:
print("Droping duplicates...")

# Blank out any remaining NaN so that otherwise identical rows compare equal.
outdf100 = outdf100.replace(np.nan, '')

# Final safety net: rows that repeat an earlier row in every column are saved
# to a side file and then removed from the output.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if not outdf100Duplicated.empty:
    outdf100Duplicated.to_csv("sites_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,SiteUUID,SiteNativeID,SiteName,USGSSiteID,SiteTypeCV,Longitude,Latitude,SitePoint,SiteNativeURL,Geometry,...,EPSGCodeCV,NHDNetworkStatusCV,NHDProductCV,NHDUpdateDate,NHDReachCode,NHDMeasureNumber,StateCV,HUC8,HUC12,County
0,AZ_334708112295301,334708112295301,Unspecified,,INDEX,-112.498131,33.784972,,,,...,EPSG:4326,,,,,,AZ,,,
1,AZ_312049110134902,312049110134902,Unspecified,,GWSI,-110.231413,31.346347,,,,...,EPSG:4326,,,,,,AZ,,,
2,AZ_345149114342601,345149114342601,Unspecified,,GWSI,-114.574574,34.863440,,,,...,EPSG:4326,,,,,,AZ,,,
3,AZ_345327114343301,345327114343301,Unspecified,,GWSI,-114.576918,34.890559,,,,...,EPSG:4326,,,,,,AZ,,,
4,AZ_323616111295201,323616111295201,Unspecified,,GWSI,-111.498577,32.604005,,,,...,EPSG:4326,,,,,,AZ,,,
5,AZ_353705114033601,353705114033601,Unspecified,,GWSI,-114.060076,35.617898,,,,...,EPSG:4326,,,,,,AZ,,,
6,AZ_344820112300501,344820112300501,Unspecified,,GWSI,-112.501817,34.806304,,,,...,EPSG:4326,,,,,,AZ,,,
7,AZ_332557112414801,332557112414801,Unspecified,,GWSI,-112.698155,33.432665,,,,...,EPSG:4326,,,,,,AZ,,,
8,AZ_332557112414501,332557112414501,Unspecified,,GWSI,-112.695999,33.432613,,,,...,EPSG:4326,,,,,,AZ,,,
9,AZ_335203113563401,335203113563401,Unspecified,,GWSI,-113.941216,33.866945,,,,...,EPSG:4326,,,,,,AZ,,,


In [17]:
print("Checking required isnot null...")

# Columns that must not be blank. The list now drives the check below; it
# previously held the pre-rename name 'WaDESiteUUID' and was never used, while
# the filter repeated the column names by hand.
# NOTE(review): none of the cells shown in this notebook fill GNISCodeCV, so
# every row may end up in the report file; confirm whether that is intended.
requiredCols = ['SiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

# replace NaN with blank cells so that a single comparison against '' finds
# every missing value
outdf100 = outdf100.replace(np.nan, '')

# A row is reported when at least one required column is blank.
blank_required = (outdf100[requiredCols] == '').any(axis=1)
outdf100_nullMand = outdf100.loc[blank_required]

if len(outdf100_nullMand.index) > 0:
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,

# TODO: decide whether rows with a missing mandatory field should be purged.
# For now they are only written out for inspection; outdf100 keeps them.

Checking required isnot null...


In [18]:
print("Writing out...")

# Save the finished sites table as UTF-8 CSV, without the index column.
outdf100.to_csv(path_or_buf=out_sitdim, encoding="utf-8", index=False)

print("Done sites")

Writing out...
Done sites
