# Preprocessing Nebraska Reservoir and Observation Site data for WaDE

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Nebraska/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## NEDNR API Data Retreival
- get site data, save to local copy for future use
- get timeseries data, save to local copy for future use

In [3]:
# already done

# %%time
# # Retrieve list of only NeDNRstream gage stations
# #############################################################################

# # API retrieval
# url = "https://nednr.nebraska.gov/IwipApi/api/v1/StreamGage/GetStationList"
# responseD = json.loads(requests.get(url).text)
# DtL = responseD['Results']
# length = len(DtL)

# # create dataframe and store
# dfsg = pd.DataFrame()
# for i in range(length):
#     row = pd.DataFrame([DtL[i]])
#     dfsg = pd.concat([dfsg, row])

# # Use only NeNDR Active provided sites
# dfsg = dfsg[dfsg['SourceName'] == 'NeDNR']


# # Update StationNumber values must have 8 digits, with leading 0s
# def updateStationNumber(x):
#     x = str(x).strip()
#     if len(x) == 4:
#         x = "0000" + x
#     if len(x) == 5:
#         x = "000" + x
#     if len(x) == 6:
#         x = "00" + x
#     if len(x) == 7:
#         x = "0" + x
#     return x

# dfsg['StationNumber'] = dfsg.apply(lambda row: updateStationNumber(row['StationNumber']), axis=1)

# # Exporting output files.
# dfsg.to_csv('api_StreamGageGetStationList.zip', compression=dict(method='zip', archive_name='api_StreamGageGetStationList.csv'), index=False)

# print(len(dfsg))
# dfsg

In [4]:
# # Get list of StationNumber, use for timeseries api
# streamgageIdList = dfsg['StationNumber'].tolist()   
# print(len(streamgageIdList))
# streamgageIdList

In [5]:
# # Year list
# yearList = ["2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009",
#             "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019",
#             "2020", "2021", "2022"]

In [6]:
# already done

# %%time
# # get timeseries results
# # use StationNumber in url
# # use yearList

# # create empty dataframe
# dfts = pd.DataFrame()

# sgLength = len(streamgageIdList)
# ylLength = len(yearList)

# for i in range(sgLength):
#     serviceStr = "DailyMeanByYear" # change here
#     for j in range(ylLength):
#         url = "https://nednr.nebraska.gov/IwipApi/api/v1/StreamGage/" + serviceStr + "?StationNumber=" + str(streamgageIdList[i]) + "&MeanYear=" + str(yearList[j])
#         print(url)
#         try:
#             responseD = json.loads(requests.get(url).text)
#             DtL = responseD['Results']

#             # store in dataframe
#             dftemp = pd.DataFrame()
#             length = len(DtL)
#             for x in range(length):
#                 row = pd.DataFrame([DtL[x]])
#                 row['url'] = url
#                 row['service'] = serviceStr
#                 dftemp = pd.concat([dftemp, row])

#             dfts = pd.concat([dfts, dftemp])
#         except:
#             dftemp = pd.DataFrame()
#             dftemp['url'] = url
#             dfts = pd.concat([dfts, dftemp])
#             print("Error, issue with API return.")

# # Update StationNumber values must have 8 digits, with leading 0s  
# def updateStationNumber(x):
#     x = str(x).strip()
#     if len(x) == 4:
#         x = "0000" + x
#     if len(x) == 5:
#         x = "000" + x
#     if len(x) == 6:
#         x = "00" + x
#     if len(x) == 7:
#         x = "0" + x
#     return x
# dfts['StationNumber'] = dfts.apply(lambda row: updateStationNumber(row['StationNumber']), axis=1)

# # Exporting output files.
# dfts.to_csv('api_BDailyMeanByYear.zip', compression=dict(method='zip', archive_name='api_BDailyMeanByYear.csv'), index=False)

# print(len(dfts))
# dfts

## NEDNR Local Data
- site info
- timeseries info

In [7]:
# Input File - gage stations
fileInput = "NEDNR SW Data/api_StreamGageGetStationList.zip"
dfsg = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsg:
    dfsg['WaDEUUID'] = "sgNE" + dfsg.index.astype(str)
    dfsg.to_csv('NEDNR SW Data/api_StreamGageGetStationList.zip', compression=dict(method='zip', archive_name='api_StreamGageGetStationList.csv'), index=False)

print(len(dfsg))
dfsg.head()

209


Unnamed: 0,StationName,SourceName,StationNumber,RiverBasin,FieldOffice,StationTypeDescription,Downstream,TimeZone,Elevation,ElevationUnits,Longitude,Latitude,IsActive,WaDEUUID
0,Hat Creek at Montrose,NeDNR,62500,White River,Bridgeport,Stream,1500,MST,3635.12,ft,-103.7428,42.92226,True,sgNE0
1,White River at Crawford,NeDNR,6444000,White River,Bridgeport,Stream,2300,MST,3658.29,ft,-103.4177,42.68664,True,sgNE1
2,Whitney Reservoir from White River,NeDNR,410000,White River,Bridgeport,Reservoir,2350,MST,0.0,ft,-103.3094,42.79211,True,sgNE2
3,Niobrara River at Wyoming-Nebraska State,NeDNR,6454000,Niobrara River,Bridgeport,Stream,4005,MST,4685.99,ft,-104.0516,42.65252,True,sgNE3
4,Johnson Canal from Niobrara River,NeDNR,72000,Niobrara River,Bridgeport,Canal/Pump,4010,MST,0.0,ft,-103.9648,42.61062,True,sgNE4


In [8]:
# Input File - timeseries
fileInput = "NEDNR SW Data/api_BDailyMeanByYear.zip"
dfts = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfts:
    dfts['WaDEUUID'] = "tsNE" + dfts.index.astype(str)
    dfts.to_csv('NEDNR SW Data/api_BDailyMeanByYear.zip', compression=dict(method='zip', archive_name='api_BDailyMeanByYear.csv'), index=False)

print(len(dfts))
dfts.head()

1271609


Unnamed: 0,StationNumber,Date,Value,Status,Qualifier,Unit,url,service,WaDEUUID
0,62500,2012-05-18T00:00:00-05:00,3.4,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE0
1,62500,2012-05-19T00:00:00-05:00,2.6,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE1
2,62500,2012-05-20T00:00:00-05:00,1.9,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE2
3,62500,2012-05-21T00:00:00-05:00,1.3,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE3
4,62500,2012-05-22T00:00:00-05:00,0.81,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE4


In [9]:
 #Merging dataframes into one, using left-join.
dfin1 = pd.merge(dfts, dfsg, on='StationNumber', how='left')
print(len(dfin1))
dfin1.head(1)

1271609


Unnamed: 0,StationNumber,Date,Value,Status,Qualifier,Unit,url,service,WaDEUUID_x,StationName,SourceName,RiverBasin,FieldOffice,StationTypeDescription,Downstream,TimeZone,Elevation,ElevationUnits,Longitude,Latitude,IsActive,WaDEUUID_y
0,62500,2012-05-18T00:00:00-05:00,3.4,Approved,,ft^3/s,https://nednr.nebraska.gov/IwipApi/api/v1/Stre...,DailyMeanByYear,tsNE0,Hat Creek at Montrose,NeDNR,White River,Bridgeport,Stream,1500,MST,3635.12,ft,-103.7428,42.92226,True,sgNE0


In [10]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "NEssro_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NEssro_V1" # for flow

# Organization Info
df['in_OrganizationUUID'] = "NEssro_O1" # NEDNR

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provdied
df['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['Latitude']
df['in_Longitude'] = dfin1['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "Gage" 
df['in_SiteName'] = dfin1['StationName']
df['in_SiteNativeID'] = dfin1['StationNumber']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1['StationTypeDescription']
df['in_StateCV'] = "NE"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin1['Value']
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = "WaDE Blank"
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin1['Date']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['Date']
df['in_TimeframeStart'] = dfin1['Date']

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

1271609


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,tsNE0,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,,Surface Water,,,,4326,,,,42.92226,-103.7428,,,Gage,Hat Creek at Montrose,62500,,Stream,NE,,3.4,,,WaDE Blank,,,,,,,,,,,,2012-05-18T00:00:00-05:00,,2012-05-18T00:00:00-05:00,2012-05-18T00:00:00-05:00
1,tsNE1,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,,Surface Water,,,,4326,,,,42.92226,-103.7428,,,Gage,Hat Creek at Montrose,62500,,Stream,NE,,2.6,,,WaDE Blank,,,,,,,,,,,,2012-05-19T00:00:00-05:00,,2012-05-19T00:00:00-05:00,2012-05-19T00:00:00-05:00
2,tsNE2,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,,Surface Water,,,,4326,,,,42.92226,-103.7428,,,Gage,Hat Creek at Montrose,62500,,Stream,NE,,1.9,,,WaDE Blank,,,,,,,,,,,,2012-05-20T00:00:00-05:00,,2012-05-20T00:00:00-05:00,2012-05-20T00:00:00-05:00
3,tsNE3,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,,Surface Water,,,,4326,,,,42.92226,-103.7428,,,Gage,Hat Creek at Montrose,62500,,Stream,NE,,1.3,,,WaDE Blank,,,,,,,,,,,,2012-05-21T00:00:00-05:00,,2012-05-21T00:00:00-05:00,2012-05-21T00:00:00-05:00
4,tsNE4,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,,Surface Water,,,,4326,,,,42.92226,-103.7428,,,Gage,Hat Creek at Montrose,62500,,Stream,NE,,0.81,,,WaDE Blank,,,,,,,,,,,,2012-05-22T00:00:00-05:00,,2012-05-22T00:00:00-05:00,2012-05-22T00:00:00-05:00


## University of Nebraska-Lincoln GW Data Local Data
- site info
- timeseries info

In [11]:
# Input File - well location info
fileInput = "University of NebraskaLincoln GW Data/Well_Info.zip"
dfwl = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfwl:
    dfwl['WaDEUUID'] = "wlNE" + dfwl.index.astype(str)
    dfwl.to_csv('University of NebraskaLincoln GW Data/Well_Info.zip', compression=dict(method='zip', archive_name='Well_Info.csv'), index=False)

print(len(dfwl))
dfwl.head()

23990


  dfwl = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")


Unnamed: 0,CSD_ID,DNR_Well_ID,REGNUM,Legal,County,NRD,TWN,RNG,RLTR,SEC,QTR,LatDD,LongDD,LLCoordMeth,Agency,Predevelopment,Surf_Elev,Surf_Elev_Method,Constr_Date,WellType,Well_Dpth,Casing_Dia,Scrn_Top,Scrn_Bot,Scrn_Lngth,Slot_Size,Scrn_Top2,Scrn_Bot2,Scrn_Lngth2,Scrn_Top3,Scrn_Bot3,Scrn_Lngth3,WaDEUUID
0,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.6845,-98.6816,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.0,,,,,,,,,,,,wlNE0
1,08N 14W 03 CBB,,,8N 14W 3CBB,BUFFALO,CENTRAL PLATTE,8,14,W,3,CBB,40.69077,-98.89306,,CPNRD,,2067.2,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE1
2,08N 14W 04 CBC,,,8N 14W 4CBC,BUFFALO,CENTRAL PLATTE,8,14,W,4,CBC,40.68934,-98.91206,,CPNRD,,2075.86,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE2
3,08N 14W 17 ADD,,,8N 14W 17ADD,KEARNEY,TRI-BASIN,8,14,W,17,ADD,40.64878,-98.91551,,CPNRD,,2084.75,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE3
4,08N 14W 17 DAD,,,8N 14W 17DAD,KEARNEY,TRI-BASIN,8,14,W,17,DAD,40.64591,-98.91551,,CPNRD,,2084.63,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE4


In [12]:
# Input File - water level timeseries
fileInput = "University of NebraskaLincoln GW Data/Water_Level_Data.zip"
dfwld = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfwld:
    dfwld['WaDEUUID'] = "wldNE" + dfwld.index.astype(str)
    dfwld.to_csv('University of NebraskaLincoln GW Data/Water_Level_Data.zip', compression=dict(method='zip', archive_name='Water_Level_Data.csv'), index=False)

print(len(dfwld))
dfwld.head()

  dfwld = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")


777343


Unnamed: 0,CSD_ID,YearMsr,DateMsr,Season,WatLevel,WaDEUUID
0,403956101531901,1995,4/15/1995,Spring,113.17,wldNE0
1,G-056892,1995,4/15/1995,Spring,127.32,wldNE1
2,404001099463601,1995,4/7/1995,Spring,132.26,wldNE2
3,404002098053101,1995,5/13/1995,Spring,83.42,wldNE3
4,404007099294001,1995,3/15/1995,Spring,8.12,wldNE4


In [13]:
 #Merging dataframes into one, using left-join.
dfin2 = pd.merge(dfwl, dfwld, on='CSD_ID', how='left')
print(len(dfin2))
dfin2.head(1)

654488


Unnamed: 0,CSD_ID,DNR_Well_ID,REGNUM,Legal,County,NRD,TWN,RNG,RLTR,SEC,QTR,LatDD,LongDD,LLCoordMeth,Agency,Predevelopment,Surf_Elev,Surf_Elev_Method,Constr_Date,WellType,Well_Dpth,Casing_Dia,Scrn_Top,Scrn_Bot,Scrn_Lngth,Slot_Size,Scrn_Top2,Scrn_Bot2,Scrn_Lngth2,Scrn_Top3,Scrn_Bot3,Scrn_Lngth3,WaDEUUID_x,YearMsr,DateMsr,Season,WatLevel,WaDEUUID_y
0,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.6845,-98.6816,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.0,,,,,,,,,,,,wlNE0,2002.0,4/15/2002,Spring,12.33,wldNE27133


In [14]:
dfin2

Unnamed: 0,CSD_ID,DNR_Well_ID,REGNUM,Legal,County,NRD,TWN,RNG,RLTR,SEC,QTR,LatDD,LongDD,LLCoordMeth,Agency,Predevelopment,Surf_Elev,Surf_Elev_Method,Constr_Date,WellType,Well_Dpth,Casing_Dia,Scrn_Top,Scrn_Bot,Scrn_Lngth,Slot_Size,Scrn_Top2,Scrn_Bot2,Scrn_Lngth2,Scrn_Top3,Scrn_Bot3,Scrn_Lngth3,WaDEUUID_x,YearMsr,DateMsr,Season,WatLevel,WaDEUUID_y
0,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.68450,-98.68160,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.00000,,,,,,,,,,,,wlNE0,2002.00000,4/15/2002,Spring,12.33000,wldNE27133
1,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.68450,-98.68160,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.00000,,,,,,,,,,,,wlNE0,2003.00000,4/15/2003,Spring,14.63000,wldNE32619
2,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.68450,-98.68160,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.00000,,,,,,,,,,,,wlNE0,2013.00000,4/23/2013,Spring,15.10000,wldNE49974
3,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.68450,-98.68160,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.00000,,,,,,,,,,,,wlNE0,2004.00000,4/15/2004,Spring,15.89000,wldNE113897
4,08N 12W 04 CCA,,,8N 12W 4CCA,ADAMS,LITTLE BLUE,8,12,W,4,CCA,40.68450,-98.68160,,LBNRD,,2005.36,Interpolated from 1m LiDAR,,,100.00000,,,,,,,,,,,,wlNE0,2008.00000,4/15/2008,Spring,13.20000,wldNE136068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654483,W-4S,,,24N 10W 33C,WHEELER,UPPER ELKHORN,24,10,W,33,C,42.00850,-98.48952,,LLNRD,,2054.24000,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE23988,2006.00000,10/24/2006,Fall,17.57000,wldNE89838
654484,W-4S,,,24N 10W 33C,WHEELER,UPPER ELKHORN,24,10,W,33,C,42.00850,-98.48952,,LLNRD,,2054.24000,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE23988,2020.00000,3/23/2020,Spring,13.15000,wldNE96923
654485,W-4S,,,24N 10W 33C,WHEELER,UPPER ELKHORN,24,10,W,33,C,42.00850,-98.48952,,LLNRD,,2054.24000,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE23988,2021.00000,3/22/2021,Spring,14.85000,wldNE100121
654486,W-4S,,,24N 10W 33C,WHEELER,UPPER ELKHORN,24,10,W,33,C,42.00850,-98.48952,,LLNRD,,2054.24000,Interpolated from 1m LiDAR,,,,,,,,,,,,,,,wlNE23988,2022.00000,3/17/2022,Spring,16.70000,wldNE108404


In [15]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "NEssro_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "NEssro_V2" # for water level depth

# Organization Info
df['in_OrganizationUUID'] = "NEssro_O2" # University of Nebraska-Lincoln

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provdied
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = dfin2['County']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin2['LatDD']
df['in_Longitude'] = dfin2['LongDD']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "Gage" 
df['in_SiteName'] = ""
df['in_SiteNativeID'] = dfin2['CSD_ID']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "NE"
df['in_USGSSiteID'] = ""

# Site VariableAmounts Info
df['in_Amount'] = dfin2['WatLevel']
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = "WaDE Blank"
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin2['YearMsr']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin2['DateMsr']
df['in_TimeframeStart'] = dfin2['DateMsr']

outdf2 = df.copy()
outdf2 = outdf2.drop_duplicates().reset_index(drop=True)
print(len(outdf2))
outdf2.head()

654034


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,wlNE0,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,,Groundwater,,,ADAMS,4326,,,,40.6845,-98.6816,,,Gage,,08N 12W 04 CCA,,,NE,,12.33,,,WaDE Blank,,,,,,,,,,,,2002.0,,4/15/2002,4/15/2002
1,wlNE0,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,,Groundwater,,,ADAMS,4326,,,,40.6845,-98.6816,,,Gage,,08N 12W 04 CCA,,,NE,,14.63,,,WaDE Blank,,,,,,,,,,,,2003.0,,4/15/2003,4/15/2003
2,wlNE0,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,,Groundwater,,,ADAMS,4326,,,,40.6845,-98.6816,,,Gage,,08N 12W 04 CCA,,,NE,,15.1,,,WaDE Blank,,,,,,,,,,,,2013.0,,4/23/2013,4/23/2013
3,wlNE0,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,,Groundwater,,,ADAMS,4326,,,,40.6845,-98.6816,,,Gage,,08N 12W 04 CCA,,,NE,,15.89,,,WaDE Blank,,,,,,,,,,,,2004.0,,4/15/2004,4/15/2004
4,wlNE0,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,,Groundwater,,,ADAMS,4326,,,,40.6845,-98.6816,,,Gage,,08N 12W 04 CCA,,,NE,,13.2,,,WaDE Blank,,,,,,,,,,,,2008.0,,4/15/2008,4/15/2008


In [16]:
# Concatenate dataframes
frames = [outdf1, outdf2]
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

1925643


## Clean Data / data types

In [17]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [18]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [19]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Hat Creek At Montrose', 'White River At Crawford',
       'Niobrara River At Wyomingnebraska State',
       'Johnson Canal From Niobrara River',
       'Lakotah Canal From Niobrara River', 'Niobrara River At 33 Ranch',
       'Earnest Canal South From Niobrara River',
       'Earnest Canal North From Niobrara River',
       'Mcginleystover Canal From Niobrara River',
       'Cook Canal No 1 From Niobrara River',
       'Harrisneece Canal From Niobrara River', 'Niobrara River At Agate',
       'Labelle Canal From Niobrara River',
       'Mettlen Canal From Niobrara River',
       'Bennettkay Canal From Niobrara River',
       'Moorekay Canal From Niobrara River',
       'Geo Hitshew Canal From Niobrara River',
       'Mclaughlin Canal From Niobrara River',
       'Excelsior Canal From Niobrara River',
       'Hughes Canal From Niobrara River',
       'Pioneer Canal From Niobrara River',
       'Niobrara River Above Box Butte Reservoir',
       'Niobrara River Below Box Butte Res

In [20]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['', 'Adams', 'Buffalo', 'Kearney', 'Dawson', 'Phelps', 'Frontier',
       'Hall', 'Perkins', 'Merrick', 'Keith', 'Howard', 'Custer',
       'Lincoln', 'Nance', 'Webster', 'Dundy', 'Thayer', 'Hooker',
       'Harlan', 'Brown', 'Cherry', 'Franklin', 'Furnas', 'Richardson',
       'Nuckolls', 'Hitchcock', 'Red Willow', 'Jefferson', 'Pawnee',
       'Gage', 'Chase', 'Johnson', 'Saline', 'Hayes', 'Fillmore',
       'Gosper', 'Clay', 'Nemaha', 'Hamilton', 'Lancaster', 'Otoe',
       'Seward', 'York', 'Cass', 'Greeley', 'Deuel', 'Cheyenne', 'Duel',
       'Kimball', 'Sarpy', 'Polk', 'Saunders', 'Sherman', 'Butler',
       'Madison', 'Douglas', 'Garden', 'Platte', 'Banner', 'Logan',
       'Valley', 'Arthur', 'Colfax', 'Dodge', 'Mcpherson', 'Washington',
       'Morrill', 'Boone', 'Burt', 'Scotts Bluff', 'Loup', 'Cuming',
       'Wheeler', 'Garfield', 'Blaine', 'Grant', 'Stanton', 'Thomas',
       'Keya Paha', 'Scottsbluff', 'Antelope', 'Sioux', 'Box Butte',
       'Thurston', 'Sheridan

In [21]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [22]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [23]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater'], dtype=object)

In [24]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Stream', 'Canal/Pump', 'Return', 'Reservoir', ''], dtype=object)

In [25]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Hat Creek At Montrose', 'White River At Crawford',
       'Niobrara River At Wyomingnebraska State',
       'Johnson Canal From Niobrara River',
       'Lakotah Canal From Niobrara River', 'Niobrara River At 33 Ranch',
       'Earnest Canal South From Niobrara River',
       'Earnest Canal North From Niobrara River',
       'Mcginleystover Canal From Niobrara River',
       'Cook Canal No 1 From Niobrara River',
       'Harrisneece Canal From Niobrara River', 'Niobrara River At Agate',
       'Labelle Canal From Niobrara River',
       'Mettlen Canal From Niobrara River',
       'Bennettkay Canal From Niobrara River',
       'Moorekay Canal From Niobrara River',
       'Geo Hitshew Canal From Niobrara River',
       'Mclaughlin Canal From Niobrara River',
       'Excelsior Canal From Niobrara River',
       'Hughes Canal From Niobrara River',
       'Pioneer Canal From Niobrara River',
       'Niobrara River Above Box Butte Reservoir',
       'Niobrara River Below Box Butte Res

In [26]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['', 'Adams', 'Buffalo', 'Kearney', 'Dawson', 'Phelps', 'Frontier',
       'Hall', 'Perkins', 'Merrick', 'Keith', 'Howard', 'Custer',
       'Lincoln', 'Nance', 'Webster', 'Dundy', 'Thayer', 'Hooker',
       'Harlan', 'Brown', 'Cherry', 'Franklin', 'Furnas', 'Richardson',
       'Nuckolls', 'Hitchcock', 'Red Willow', 'Jefferson', 'Pawnee',
       'Gage', 'Chase', 'Johnson', 'Saline', 'Hayes', 'Fillmore',
       'Gosper', 'Clay', 'Nemaha', 'Hamilton', 'Lancaster', 'Otoe',
       'Seward', 'York', 'Cass', 'Greeley', 'Deuel', 'Cheyenne', 'Duel',
       'Kimball', 'Sarpy', 'Polk', 'Saunders', 'Sherman', 'Butler',
       'Madison', 'Douglas', 'Garden', 'Platte', 'Banner', 'Logan',
       'Valley', 'Arthur', 'Colfax', 'Dodge', 'Mcpherson', 'Washington',
       'Morrill', 'Boone', 'Burt', 'Scotts Bluff', 'Loup', 'Cuming',
       'Wheeler', 'Garfield', 'Blaine', 'Grant', 'Stanton', 'Thomas',
       'Keya Paha', 'Scottsbluff', 'Antelope', 'Sioux', 'Box Butte',
       'Thurston', 'Sheridan

In [27]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['WaDE Blank'], dtype=object)

In [28]:
# Ensure Latitude entry is either numireic or blank, no 0 entries
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("").replace(" ", "")
outdf['in_Latitude'].unique()

array([42.92226 , 42.68664 , 42.65252 , ..., 42.008545, 42.0085  ,
       40.22045 ])

In [29]:
# Ensure Longitude entry is either numireic or blank, no 0 entries
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("").replace(" ", "")
outdf['in_Longitude'].unique()

array([-103.7428  , -103.4177  , -104.0516  , ...,  -98.489546,
        -98.489522, -100.63083 ])

In [30]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').replace(0,"").fillna("")
outdf['in_Amount'].unique()

array([3.4, 2.6, 1.9, ..., 55.57000006, 55.54000006, 234.04], dtype=object)

In [31]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

array(['2012-05-18T00:00:00.000000000', '2012-05-19T00:00:00.000000000',
       '2012-05-20T00:00:00.000000000', ...,
       '1934-12-12T00:00:00.000000000', '1942-03-26T00:00:00.000000000',
       '1968-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [32]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

array(['2012-05-18T00:00:00.000000000', '2012-05-19T00:00:00.000000000',
       '2012-05-20T00:00:00.000000000', ...,
       '1934-12-12T00:00:00.000000000', '1942-03-26T00:00:00.000000000',
       '1968-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [33]:
# extract year out
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True)
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
outdf['in_ReportYearCV'].unique()

array(['2012-05-18T00:00:00-05:00', '2012-05-19T00:00:00-05:00',
       '2012-05-20T00:00:00-05:00', ..., 1885.0, 1924.0, 1900.0],
      dtype=object)

In [34]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = WaterSourceNativeIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2'], dtype=object)

## Export Outputs

In [35]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925643 entries, 0 to 1925642
Data columns (total 47 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   WaDEUUID                          object        
 1   in_MethodUUID                     object        
 2   in_VariableSpecificUUID           object        
 3   in_OrganizationUUID               object        
 4   in_Geometry                       object        
 5   in_GNISFeatureNameCV              object        
 6   in_WaterQualityIndicatorCV        object        
 7   in_WaterSourceName                object        
 8   in_WaterSourceNativeID            object        
 9   in_WaterSourceTypeCV              object        
 10  in_CoordinateAccuracy             object        
 11  in_CoordinateMethodCV             object        
 12  in_County                         object        
 13  in_EPSGCodeCV                     int64         
 14  in_GNISCodeCV     

In [36]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,tsNE0,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,wadeID1,Surface Water,,,,4326,,,,42.92226,-103.74280,,,Gage,Hat Creek At Montrose,62500,,Stream,NE,,3.40000,,,WaDE Blank,,,,,,,,,,,,2012-05-18T00:00:00-05:00,,2012-05-18,2012-05-18
1,tsNE1,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,wadeID1,Surface Water,,,,4326,,,,42.92226,-103.74280,,,Gage,Hat Creek At Montrose,62500,,Stream,NE,,2.60000,,,WaDE Blank,,,,,,,,,,,,2012-05-19T00:00:00-05:00,,2012-05-19,2012-05-19
2,tsNE2,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,wadeID1,Surface Water,,,,4326,,,,42.92226,-103.74280,,,Gage,Hat Creek At Montrose,62500,,Stream,NE,,1.90000,,,WaDE Blank,,,,,,,,,,,,2012-05-20T00:00:00-05:00,,2012-05-20,2012-05-20
3,tsNE3,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,wadeID1,Surface Water,,,,4326,,,,42.92226,-103.74280,,,Gage,Hat Creek At Montrose,62500,,Stream,NE,,1.30000,,,WaDE Blank,,,,,,,,,,,,2012-05-21T00:00:00-05:00,,2012-05-21,2012-05-21
4,tsNE4,NEssro_M1,NEssro_V1,NEssro_O1,,,Fresh,,wadeID1,Surface Water,,,,4326,,,,42.92226,-103.74280,,,Gage,Hat Creek At Montrose,62500,,Stream,NE,,0.81000,,,WaDE Blank,,,,,,,,,,,,2012-05-22T00:00:00-05:00,,2012-05-22,2012-05-22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925638,wlNE23988,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,wadeID2,Groundwater,,,Wheeler,4326,,,,42.00850,-98.48952,,,Gage,,W-4S,,,NE,,17.57000,,,WaDE Blank,,,,,,,,,,,,2006.00000,,2006-10-24,2006-10-24
1925639,wlNE23988,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,wadeID2,Groundwater,,,Wheeler,4326,,,,42.00850,-98.48952,,,Gage,,W-4S,,,NE,,13.15000,,,WaDE Blank,,,,,,,,,,,,2020.00000,,2020-03-23,2020-03-23
1925640,wlNE23988,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,wadeID2,Groundwater,,,Wheeler,4326,,,,42.00850,-98.48952,,,Gage,,W-4S,,,NE,,14.85000,,,WaDE Blank,,,,,,,,,,,,2021.00000,,2021-03-22,2021-03-22
1925641,wlNE23988,NEssro_M1,NEssro_V2,NEssro_O2,,,Fresh,,wadeID2,Groundwater,,,Wheeler,4326,,,,42.00850,-98.48952,,,Gage,,W-4S,,,NE,,16.70000,,,WaDE Blank,,,,,,,,,,,,2022.00000,,2022-03-17,2022-03-17


In [37]:
# Export the output dataframe
outdf.to_csv('Pssro_neMain.zip', compression=dict(method='zip', archive_name='Pssro_neMain.csv'), index=False)  # The output, save as a zip