# Sites_dim
Code to generate sites.csv as input to the WaDE db

In [None]:
%%cmd
pip install sodapy

In [None]:
import numpy as np
import pandas as pd
from sodapy import socrata
import os
from pyproj import CRS, Transformer

In [None]:
# working directory
working_dir = "C:/tseg/jupyterWaDE"
os.chdir(working_dir)

To run the following cell, make sure the input csv file is in the working directory. To obtain the data, go to the following link and download the table: WATER_MASTER (Master Table containing Water Right and Exchange Information). 
https://www.waterrights.utah.gov/cgi-bin/pubdump.exe?DBNAME=WRDB&SECURITYKEY=wrt2012access

In [None]:
### 3. Read Utah input csv file 
#(file must be already downloaded and stored in the working directory)

# input csv
input_csv = 'Water_Master.csv'
df100 = pd.readcsv(input_csv)

In [None]:
#column names
columns=['WaDESiteUUID', 'SiteNativeID', 'SiteName', 'USGSSiteID', 'SiteTypeCV', 'Longitude_x', 'Latitude_y',
          'SitePoint', 'SiteNativeURL', 'Geometry', 'CoordinateMethodCV', 'CoordinateAccuracy', 'GNISCodeCV',
          'EPSGCodeCV', 'NHDNetworkStatusCV', 'NHDProductCV', 'NHDUpdateDate', 'NHDReachCode', 'NHDMeasureNumber',
          'StateCV']

# These are not used currently. Data types inferred from the inputs 
dtypesx = ['NVarChar(55)	NVarChar(50)	NVarChar(500)	NVarChar(250)	NVarChar(100)	Double	Double	Geometry',
           'NVarChar(250)	Geometry	NVarChar(100)	NVarChar(255)	NVarChar(50)	NVarChar(50)	NVarChar(50)',
           'NVarChar(50)	Date	NVarChar(50)	NVarChar(50)	NChar(5)']

In [None]:
# create target dataframe

#assumes dtypes inferred from CO file
outdf100=pd.DataFrame(columns=columns)

In [None]:
# Utah directly mapped cells
destCols=['SiteNativeID']
srsCols=['WRCHEX']

outdf100[destCols] = df100[srsCols]

# UT temporary columns--these are further processed to get mapped columns below
srsdestCols = ['POD_TYPE','X_UTM','Y_UTM']
outdf100[srsdestCols] = df100[srsdestCols]

#replace blank cells by NaN
outdf100 = outdf100.replace('', np.nan) 

###### Utah SiteTypeCV

Get SiteTypeCV based on the field "POD_TYPE" and map:

    Blank to “unknown” 
    
    A to Abandoned
    
    D to Drain
    
    C, F, N, or P to Sewage
    
    G to Spring
    
    R to Point of Rediversion
    
    S to Surface
    
    T – Point of Return
    
    U - Underground


In [None]:
# UT SiteTypeCV mapping 

#beneficialUseDictionary
siteTypedict = {
    "A":"Abandoned",
    "D":"Drain",
    "C":"Sewage",
    "F":"Sewage",
    "N":"Sewage",
    "P":"Sewage",
    "G":"Spring",
    "R":"Point of Rediversion",
    "S":"Surface",
    "T":"Point of Return",
    "U":"Underground"
}

# temporary column 'POD_TYPE'  
#outdf100['POD_TYPE'] = df100['POD_TYPE']

nanIndex = outdf100.loc[outdf100['POD_TYPE'].isnull()].index
# find no-loop approach
for ix in range(len(outdf100.index)):
    #if rank == 0: print(ix)
    if ix in nanInex:
        outdf100.loc[ix, 'SiteTypeCV'] = 'Unknown'
    else:
        siteTypeListStrStr = outdf100.loc[ix, 'POD_TYPE']
        siteTypeListStr = siteTypeListStrStr.strip()  # remove whitespace chars
        outdf100.loc[ix, 'SiteTypeCV'] = ",".join(siteTypedict[inx] for inx in list(str(siteTypeListStr)))  

# drop the temporary column
# outdf100 = outdf100.drop(columns=['POD_TYPE'])

##### Utah longitude and latitude coordinates 
Project the x and y (UTM NAD 83) coordinates to WGS84 lat lon
project from the North American Datum of 1983, UTM Zone 12 North, Meters as units
to the World Geodetic System 1984 (WGS84)

Longitude_x <--- X_UTM
Latitude_y <--- Y_UTM

In [None]:
# UT temporary columns  
#outdf100['X_UTM'] = df100['X_UTM']
#outdf100['Y_UTM'] = df100['Y_UTM']

In [None]:
# use pyproj to project to lat lon

crs_from = CRS('EPSG:4326') #CRS("WGS84")
crs_to = CRS("EPSG:26912")
transformer = Transformer.from_crs(crs_from, crs_to)

X_UTM = outdf100['X_UTM'] 
Y_UTM = outdf100['Y_UTM'] 
lonX = []
latY = []
for x1, y1 in X_UTM, Y_UTM:
    lon, lat = transformer.transform(x1, y1)
    lonX.append(lon)
    latY.append(lat)
    
outdf100['Longitude_x'] = lonX
outdf100['Latitude_y'] = latY

In [2]:
A=[1, 2, 3]
B = [2, 4, 6]

uu=[]
zz=[]
for (x,y) in (A,B):
    u,z=x*y
    uu.append(u)
    zz.append(z)

ValueError: too many values to unpack (expected 2)

In [None]:
# UT drop temp columns
outdf100 = outdf100.drop(columns=srsdestCols)

In [None]:
# Dropping duplicates

#filter the whole table based on a unique combination of site ID, SiteName, SiteType
outdf100 = outdf100.drop_duplicates(subset=['SiteNativeID', 'SiteName', 'SiteTypeCV'])   #
outdf100 = outdf100.reset_index(drop=True)

In [None]:
#Dropping empty lat/lon

#drop the sites with no long and lat.
outdf100 = outdf100.replace('', np.nan) #replace blank strings by NaN
outdf100purge = outdf100.loc[(outdf100['Longitude_x'].isnull()) | (outdf100['Latitude_y'].isnull())]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('sites_missing.csv')    #index=False,
    dropIndex = outdf100.loc[(outdf100['Longitude_x'].isnull()) | (outdf100['Latitude_y'].isnull())].index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)

In [None]:
# hardcoded columns 

# hard code "Unknown" for SiteTypeCV value if it is missing
#outdf100 = outdf100.replace('', np.nan) #replace blank strings by NaN
outdf100.loc[outdf100['SiteTypeCV'].isnull(),'SiteTypeCV']='Unknown'
#hardcoded
outdf100.EPSGCodeCV = 'EPSG:4326'

In [None]:
#Adding UUID

#ToDO: no-loop approach?
for ix in range(len(outdf100.index)):
    outdf100.loc[ix, 'WaDESiteUUID'] = "_".join(["UT",str(outdf100.loc[ix, 'SiteNativeID'])])

In [None]:
#print("Checking required isnot null...")

#9.9.19: Adel: check all 'required' (not NA) columns have value (not empty)
requiredCols=['WaDESiteUUID', 'SiteName', 'CoordinateMethodCV', 'GNISCodeCV', 'EPSGCodeCV']

#replace blank strings by NaN, if there are any
outdf100 = outdf100.replace('', np.nan)

# check if any cell of these columns is null
#outdf100_nullMand = outdf100.loc[outdf100.isnull().any(axis=1)] --for all cols
outdf100_nullMand = outdf100.loc[(outdf100["WaDESiteUUID"].isnull()) |
                                (outdf100["SiteName"].isnull()) | (outdf100["CoordinateMethodCV"].isnull()) |
                                (outdf100["GNISCodeCV"].isnull())|(outdf100["EPSGCodeCV"].isnull())]
#outdf100_nullMand = outdf100.loc[[False | (outdf100[varName].isnull()) for varName in requiredCols]]

if(len(outdf100_nullMand.index) > 0):
    outdf100_nullMand.to_csv('sites_mandatoryFieldMissing.csv')  # index=False,
    
#ToDO: purge these cells if there is any missing? #For now left to be inspected and reported

In [None]:
#print("Writing out...")

# output csv
sites_csv = 'sites.csv'
#write out
outdf100.to_csv(siteCSV, index=False)