# Pre-processing Nevada Site Specific data for WaDEQA upload.
Date Updated: 01/26/2022
Purpose: Pre-process the Nevada site-specific (SS) data into one master file for simple DataFrame creation and extraction.

In [None]:
# Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # Show up to 999 columns when a DataFrame is displayed, so wide frames are not truncated in the notebook

In [None]:
# Working Directory
# Make the raw-input folder the current directory so the relative file names used
# for reading and writing resolve against it.
workingDir = "G:/Shared drives/WaDE Data/Nevada/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Input Data

In [None]:
# Dataframe Creation - Timeseries
# Read the timeseries CSV (file name is relative to the current working directory).
inputFile1 = "Surface_Water_Monitoring_Sites_and_Measures.csv"
df_ts = pd.read_csv(inputFile1)
print(len(df_ts))  # row count, useful for comparing against the joined frame
df_ts.head(1)  # preview the first record

In [None]:
# Dataframe Creation - Location Info
# Read the site location records from the shapefile with geopandas.
fileInput = "Shapefile/SurfaceWaterMonitoringSites.shp"
df_loc = gpd.read_file(fileInput)
print(len(df_loc))  # number of location records
df_loc.head(1)  # preview the first record

In [None]:
# Left join timeseries -to- location: df_ts['Site_name'] is matched against df_loc['Site_Name'].
# The two key columns differ only in letter case, so both are kept in the merged frame.
# how='left' keeps every timeseries row; a site name that occurs more than once in df_loc
# would duplicate the matching timeseries rows, so compare the printed count with len(df_ts).
df_tsloc = pd.merge(df_ts, df_loc, left_on='Site_name', right_on='Site_Name', how='left')
print(len(df_tsloc))
df_tsloc.head(1)

In [None]:
# Review the column names, dtypes and non-null counts of the merged frame.
df_tsloc.info()

## Clean Up / Output DataFrame

In [None]:
# Create temporary main dataframe.
# Each entry maps an output column name to the merged-input column it is copied from;
# the insertion order below is the column order of the output dataframe.
source_columns = {
    # Water Source Info
    'in_WaterSourceName': 'Source_Nam',
    # Site Info
    'in_County': 'County',
    'in_Latitude': 'Lat_DD_NAD',
    'in_Longitude': 'Lon_DD_NAD',
    'in_SiteName': 'Site_Name',
    'in_SiteNativeID': 'AutoID',
    'in_SiteTypeCV': 'Source_Des',
    # Site Variable Amount Info ('Units' is a helper column, dropped after unit conversion)
    'Units': 'Units',
    'in_Amount': 'Discharge',
    'in_ReportYearCV': 'Measure_date',
    'in_TimeframeStart': 'Measure_date',
    'in_TimeframeEnd': 'Measure_date',
}
# Columns that are cast to float while being copied.
float_columns = {'in_Latitude', 'in_Longitude', 'in_Amount'}

dfout = pd.DataFrame(index=df_tsloc.index)
for out_name, src_name in source_columns.items():
    src_values = df_tsloc[src_name]
    dfout[out_name] = src_values.astype(float) if out_name in float_columns else src_values

print(len(dfout))
dfout.head(1)

In [None]:
# Convert all GPM amounts to CFS for simplicity.

def convertGPMtoCFS(Aval, Uval):
    """Return the amount expressed in CFS when its unit is GPM.

    Aval: numeric amount.
    Uval: unit label; it is converted to a string, stripped and lower-cased
        before being compared with "gpm".

    Amounts whose unit is anything other than "gpm" are returned unchanged.
    """
    unit = str(Uval).strip().lower()
    if unit != "gpm":
        return Aval
    # 1 gallon per minute expressed in cubic feet per second.
    return Aval * 0.00222800926

# Convert pairwise over the amount and unit columns. Iterating the two columns directly
# avoids building a full row Series per record, as DataFrame.apply(axis=1) does.
dfout['in_Amount'] = [
    convertGPMtoCFS(amount, unit)
    for amount, unit in zip(dfout['in_Amount'], dfout['Units'])
]
# The unit label is only needed for the conversion step above.
dfout = dfout.drop(['Units'], axis=1)
dfout.head()

In [None]:
# Derive the report year and the timeframe start/end from the measurement date.
# Values that cannot be parsed as dates become NaT (errors='coerce').

# ReportYearCV: parse the date, reduce it to its year, then store the yearly period as a
# string (astype(str)), not as an int.
# NOTE(review): rows with a missing date presumably end up as the text 'NaT' - confirm.
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], errors = 'coerce')
dfout['in_ReportYearCV'] = pd.to_datetime(dfout["in_ReportYearCV"].dt.strftime('%Y'))
dfout['in_ReportYearCV'] = dfout['in_ReportYearCV'].dt.to_period('Y').astype(str)

# TimeframeStart: format as MM/DD/YYYY text and parse it back. The column stays a
# datetime; the date-only format drops any time-of-day component.
dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'], errors = 'coerce')
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

# TimeframeEnd: same treatment as TimeframeStart (both come from the same measurement date).
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'], errors = 'coerce')
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout.head()

In [None]:
# For SiteTypeCV change nan values to Unspecified

def changeNanSiteTypeCV(STcv):
    """Return the site type as a stripped string, or "Unspecified" when it is missing.

    STcv: a single (scalar) site type value.

    A value counts as missing when it is null (None, NaN, NaT, pd.NA), when it is
    empty or whitespace-only, or when it is the text 'nan'.
    """
    # Test for null BEFORE converting to str: after str() a null is just text
    # ("None", "nan", ...) and pd.isnull() can no longer detect it.
    if pd.isnull(STcv):
        return "Unspecified"
    text = str(STcv).strip()
    if text == "" or text == 'nan':
        return "Unspecified"
    return text

# Only one column is read, so map the helper over that column instead of running a
# row-wise lambda across the whole frame. Missing values are still passed to the helper.
dfout['in_SiteTypeCV'] = dfout['in_SiteTypeCV'].map(changeNanSiteTypeCV)
dfout.head()

In [None]:
# Add Water Source Type based on SiteTypeCV

def addWaterSourceType(WSTcv):
    """Classify a site type as "Groundwater" or "Surface Water".

    WSTcv: site type value; it is converted to a string, stripped and
        lower-cased before matching.

    Returns "Groundwater" for "wells", "well", "flowing well" and "spring";
    every other value yields "Surface Water".
    """
    groundwater_site_types = ("wells", "well", "flowing well", "spring")
    site_type = str(WSTcv).strip().lower()
    return "Groundwater" if site_type in groundwater_site_types else "Surface Water"

# The water source type depends only on the site type column, so map the helper over
# that column instead of running a row-wise lambda across the whole frame.
dfout['in_WaterSourceTypeCV'] = dfout['in_SiteTypeCV'].map(addWaterSourceType)
dfout.head()

## WaDE Custom Elements (due to missing info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a WaDE custom water source native ID.

    colrowValue: the value to embed in the ID; it is converted with str().

    Returns the text "WaDENV_WS" followed by that string.
    """
    return f"WaDENV_WS{colrowValue!s}"

# One row per distinct water source name, keeping the index label of its first occurrence.
# .copy() makes the lookup table an independent frame before a column is added to it.
dfWaterSourceNativeID = dfout[['in_WaterSourceName']].drop_duplicates().copy()

# Number the distinct names 1..N in order of first appearance, then turn each number into
# an ID. Mapping over the single Count column replaces the row-wise apply on the frame.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID assigned to water source name A.

    Reads the module-level dfWaterSourceNativeID table and returns the
    'in_WaterSourceNativeID' of the first row whose 'in_WaterSourceName'
    equals A, or '' when no row matches.
    """
    name_matches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    matching_ids = dfWaterSourceNativeID.loc[name_matches, 'in_WaterSourceNativeID']
    if matching_ids.empty:
        return ''
    return matching_ids.iloc[0]

# The lookup depends only on the water source name, so map the helper over that column
# instead of running a row-wise lambda across the whole frame.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceName'].map(retrieveWaterSourceNativeID)
dfout.head(1)

## Inspect Data

In [None]:
# Check the column dtypes and non-null counts of the output dataframe.
dfout.info()

In [None]:
# List the distinct site type values present in the output.
dfout['in_SiteTypeCV'].unique()

In [None]:
# List the distinct report year values present in the output.
dfout['in_ReportYearCV'].unique()

In [None]:
# Inspect the amount distribution with a box plot of in_Amount.

# NOTE: the very large values are expected - at least two sites report more than 40,000 CFS.
sns.boxplot(data=dfout, x="in_Amount")

## Export Output Dataframe

In [None]:
# Export the finished dataframe to CSV (relative path, so it is written to the current
# working directory); index=False leaves the row index out of the file.
dfout.to_csv('P_nvOSMaster.csv', index=False)  # The output