# Pre-processing Colorado Site Specific Reservoir and Gage data for WaDE upload.

Notes:
- Using two different API Colorado CDSS REST web service. 1) [**Division Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-waterclasses) api for Division 1-7 site specific information. 2) [**Annual WDID Time Series Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-divrecyear) api using sites of interest wdid list produced from Division 1-7.

In [1]:
# Needed Libararies

# Working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# Visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# Working with API
import requests
import io
import json

# Cleanup
import re
import time
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Colorado/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Load Site Data

In [3]:
# done already

# Query by Division (1-7) to get a full list of WDIDs per division.
# Plug in "division=1" etc into API request.
# Save results as Division1.csv, etc.
# Rinse and Repeat to retreive all data for Divisions 1-7.

# url = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/waterclasses/?division=7&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"
# responseD = json.loads(requests.get(url).text)
# LD = responseD['ResultList']

# df_ts = pd.DataFrame()
# for n in range(len(LD)):
#     row = pd.DataFrame([LD[n]])
#     df_ts = df_ts.append(row)
# df_ts

# #Exporting to Finished File
# df_ts.to_csv('Division7.csv', index=False)  # The output

In [4]:
# Division 1
fileInput = "Success/Division1.csv"
dfs1 = pd.read_csv(fileInput)
print(len(dfs1))
dfs1.head(1)

50000


  dfs1 = pd.read_csv(fileInput)


Unnamed: 0,divrectype,availableTimesteps,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps.1,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,Year,30100500,0100500 DivComment,100500,,HOOVER DITCH,,,,,,,,,,,,,,,,,,,1950-10-31T00:00:00,2019-10-31T00:00:00,2020-09-30T16:00:45.72,1,1,WELD,1950-10-31T00:00:00,Year,SOUTH PLATTE RIVER,166.09,DITCH,201759.0,A,Active Structure with contemporary diversion r...,,,,S,5.0 N,64.0 W,15.0,,SE,NE,,,,,539867.0,4472548.0,40.402594,-104.530193


In [5]:
# Division 2
fileInput = "Success/Division2.csv"
dfs2 = pd.read_csv(fileInput)
print(len(dfs2))
dfs2.head(1)

50000


  dfs2 = pd.read_csv(fileInput)


Unnamed: 0,divrectype,availableTimesteps,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,Year,31000500,1000500 DivComment,1000500,,SEVENTY FOUR DITCH,,,,,,,,,,,,,,,,,,,1971-10-31T00:00:00,1978-10-31T00:00:00,2004-06-09T17:09:50.573,2,10,EL PASO,1971-10-31T00:00:00,MONUMENT CREEK,25.61,DITCH,193535.0,H,Historical structure only - no longer exists o...,,,,S,11.0 S,67.0 W,15.0,,SE,NW,,,,,510398.2,4327298.6,39.094762,-104.879767


In [6]:
# Division 3
fileInput = "Success/Division3.csv"
dfs3 = pd.read_csv(fileInput)
print(len(dfs3))
dfs3.head(1)

21282


Unnamed: 0,divrectype,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,32000500,2000500 DivComment,2000500,,ADAMS D 1,,,,,,,,,,,,,,,,,,,1970-10-31T00:00:00,2013-10-31T00:00:00,2014-04-04T13:51:40.89,3,20,RIO GRANDE,1970-10-31T00:00:00,Year,RIO GRANDE,,DITCH,1385432.0,U,Active structure but diversion records are not...,,,,N,40.0 N,5.0 E,27.0,,SE,SW,,,,,375104.8,4171056.9,37.678221,-106.416346


In [7]:
# Division 4
fileInput = "Success/Division4.csv"
dfs4 = pd.read_csv(fileInput)
print(len(dfs4))
dfs4.head(1)

24273


Unnamed: 0,divrectype,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,32800500,2800500 DivComment,2800500,,ADAMS NO 1 DITCH,,,,,,,,,,,,,,,,,,,1991-10-31T00:00:00,1993-10-31T00:00:00,2001-09-26T19:02:08.89,4,28,GUNNISON,1991-10-31T00:00:00,Year,TOMICHI CREEK,5.62,DITCH,189371.0,A,Active Structure with contemporary diversion r...,,,,N,49.0 N,1.0 E,6.0,SE,SE,NW,,,,,333922.0,4267277.0,38.538421,-106.905554


In [8]:
# Division 5
fileInput = "Success/Division5.csv"
dfs5 = pd.read_csv(fileInput)
print(len(dfs5))
dfs5.head(1)

50000


  dfs5 = pd.read_csv(fileInput)


Unnamed: 0,divrectype,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,33600501,3600501 DivComment,3600501,,ABBETT AND BENNETT DITCH,,,,,,,,,,,,,,,,,,,1967-10-31T00:00:00,2019-10-31T00:00:00,2020-02-04T18:15:49.233,5,36,SUMMIT,1967-10-31T00:00:00,Year,SPRUCE CREEK,4.04,DITCH,175604.0,A,Active Structure with contemporary diversion r...,,,,S,2.0 S,81.0 W,2.0,,NW,SE,,,,,378567.0,4419574.0,39.91757,-106.420881


In [9]:
# Division 6
fileInput = "Success/Division6.csv"
dfs6 = pd.read_csv(fileInput)
print(len(dfs6))
dfs6.head(1)

20835


  dfs6 = pd.read_csv(fileInput)


Unnamed: 0,divrectype,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,34300500,4300500 DivComment,4300500,,ADAMS & OWENS DITCH,,,,,,,,,,,,,,,,,,,1977-10-31T00:00:00,2019-10-31T00:00:00,2020-02-06T12:30:34.11,6,43,RIO BLANCO,1977-10-31T00:00:00,Year,WEST CREEK,7.66,DITCH,172009.0,A,Active Structure with contemporary diversion r...,,,,S,4.0 S,102.0 W,33.0,,NW,NW,,,,,169228.0,4397443.0,39.662817,-108.855604


In [10]:
# Division 7
fileInput = "Success/Division7.csv"
dfs7 = pd.read_csv(fileInput)
print(len(dfs7))
dfs7.head(1)

29123


Unnamed: 0,divrectype,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivComment,32900500,2900500 DivComment,2900500,,A D ARCHULETA DITCH,,,,,,,,,,,,,,,,,,,1965-10-31T00:00:00,1979-10-31T00:00:00,2009-06-15T15:20:23.87,7,29,ARCHULETA,1965-10-31T00:00:00,Year,SAN JUAN RIVER,174.73,DITCH,1454998.0,H,Historical structure only - no longer exists o...,,,,N,34.0 N,2.0 W,26,,SE,NW,,,,,318843.9,4114927.6,37.163225,-107.04027


In [11]:
# Concatenate
frames = [dfs1, dfs2, dfs3, dfs4, dfs5, dfs6, dfs7]
dfs = pd.concat(frames)
print(len(dfs))

245513


In [12]:
# want DivTotal, Year, and Active Records
dfs = dfs[dfs["divrectype"] == 'DivTotal']
dfs = dfs[dfs["availableTimesteps"] == 'Year']
dfs = dfs[dfs["ciuCode"] == 'A']
print(len(dfs))

8950


In [13]:
# fix wdid values that are less then 7 chars long.
def formatWDIDValue(colVal):
    n = str(colVal)
    if len(n) < 7:
        outString = "0" + n
    else:
        outString = n

    return int(outString)
dfs['wdid'] = dfs.apply(lambda row: formatWDIDValue(row['wdid']), axis=1)
dfs['wdid'].unique()

array([ 100539,  100565,  100568, ..., 7807002, 7807003, 7817003],
      dtype=int64)

In [14]:
# drop duplicates
dfs = dfs.drop_duplicates(subset='wdid', keep="first").reset_index(drop=True)
print(len(dfs))
dfs.head(1)

8950


Unnamed: 0,divrectype,availableTimesteps,waterclassNum,wcIdentifier,wdid,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified,availableTimesteps.1,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg
0,DivTotal,Year,10100539,0100539 Total (Diversions),100539,,MIDDLEMIST DITCH 2,,,,,,,,,,,,,,,,,,,2007-10-31T00:00:00,2007-10-31T00:00:00,2008-04-07T13:22:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,MIDDLEMIST CREEK,32.97,DITCH,195017.0,A,Active Structure with contemporary diversion r...,,,,S,6.0 S,57.0 W,25.0,SW,SW,SW,,,,,609064.0,4372109.0,39.491687,-103.731685


## Retrieve Timeseries info

In [15]:
# done already

# %%time

# # create wdid list for API
# wdidList = dfs['wdid'].tolist()
# # Split list into catagories that are 100 long. Issue with CO API timing out after too long.
# wdidListB = [wdidList[i:i + 100] for i in range(0, len(wdidList), 100)]

# # Use list of WDIDs (from Division data) as inputs, retreive time series data.

# # Time Series Dataframe
# dfts = pd.DataFrame()

# str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# for i in range(len(wdidListB)):
#     lstC = wdidListB[i]
#     lstCa = '%2C'.join([str(n) for n in lstC]) 
    
#     url = str2 + lstCa + str3
#     responseD = json.loads(requests.get(url).text)
#     LD = responseD['ResultList']
    
#     for n in range(len(LD)):
#         row = pd.DataFrame([LD[n]])
#         dfts = dfts.append(row)

# print(len(dfts))
# dfts.head()

In [16]:
# done already

# # Exporting finished time series file for records sake.
# dfts.to_excel('P_TimeSeries.xlsx', index=False)  # The output

In [17]:
# timeseris data
fileInput = "P_TimeSeries.xlsx"
dfts = pd.read_excel(fileInput)
print(len(dfts))
dfts.head(1)

232982


Unnamed: 0,wdid,waterClassNum,wcIdentifier,measInterval,measCount,dataMeasDate,dataValue,measUnits,obsCode,approvalStatus,modified
0,100539,54342,0100539 S:1 F: U:3 T: G: To:,Annual,1,2007,0.92,ACFT,U,Approved,2008-04-07T13:22:00


In [18]:
# use measInterval = 'Annual' data
dfts = dfts[dfts['measInterval'] == "Annual"].reset_index(drop=True)
print(len(dfts))
dfts.head(1)

231430


Unnamed: 0,wdid,waterClassNum,wcIdentifier,measInterval,measCount,dataMeasDate,dataValue,measUnits,obsCode,approvalStatus,modified
0,100539,54342,0100539 S:1 F: U:3 T: G: To:,Annual,1,2007,0.92,ACFT,U,Approved,2008-04-07T13:22:00


## Output DataFrame

In [19]:
# Merging division & wdid dataframes into one, using left-join.
# need to create a unique join key, use wdid & waterclassNum value.

dfs['key'] = dfs['wdid'].astype(str) + dfs['waterclassNum'].astype(str)
dfts['key'] = dfts['wdid'].astype(str) + dfts['waterClassNum'].astype(str) 

df = pd.merge(dfs, dfts, on='key', how='left')
print(len(df))
df.head()

105514


Unnamed: 0,divrectype,availableTimesteps,waterclassNum,wcIdentifier_x,wdid_x,wdidAcctId,structureName,wdidAcctName,sourceCode,sourceDescr,fromWdid,fromWdidAcctId,fromStructureName,fromWdidAcctName,useCode,useDescr,typeCode,typeDescr,groupWdid,groupStructureName,toWdid,toWdidAcctId,toStructureName,toWdidAcctName,wcDescr,porStart,porEnd,porLastModified,division,waterDistrict,county,modified_x,availableTimesteps.1,waterSource,streamMile,structureType,gnisId,ciuCode,ciuCodeLong,subdistrict,designatedBasinName,managementDistrictName,pm,township,range,section,q10,q40,q160,coordsew,coordsewDir,coordsns,coordsnsDir,utmX,utmY,latdecdeg,longdecdeg,key,wdid_y,waterClassNum,wcIdentifier_y,measInterval,measCount,dataMeasDate,dataValue,measUnits,obsCode,approvalStatus,modified_y
0,DivTotal,Year,10100539,0100539 Total (Diversions),100539,,MIDDLEMIST DITCH 2,,,,,,,,,,,,,,,,,,,2007-10-31T00:00:00,2007-10-31T00:00:00,2008-04-07T13:22:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,MIDDLEMIST CREEK,32.97,DITCH,195017.0,A,Active Structure with contemporary diversion r...,,,,S,6.0 S,57.0 W,25.0,SW,SW,SW,,,,,609064.0,4372109.0,39.491687,-103.731685,10053910100539,100539.0,10100539.0,0100539 Total (Diversion),Annual,1.0,2007.0,0.92,ACFT,U,Approved,2008-04-07T13:22:00
1,DivTotal,Year,10100565,0100565 Total (Diversions),100565,,MAGUIRE DITCH,,,,,,,,,,,,,,,,,,,1980-10-31T00:00:00,2010-10-31T00:00:00,2011-03-30T10:41:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,WEST BIJOU CREEK,58.16,DITCH,184779.0,A,Active Structure with contemporary diversion r...,,KIOWA-BIJOU,,S,9.0 S,62.0 W,16.0,SW,NE,NE,1304.0,E,951.0,N,557541.0,4347071.0,39.271083,-104.332951,10056510100565,100565.0,10100565.0,0100565 Total (Diversion),Annual,1.0,1980.0,226.0,ACFT,*,Approved,2001-08-16T17:46:00
2,DivTotal,Year,10100565,0100565 Total (Diversions),100565,,MAGUIRE DITCH,,,,,,,,,,,,,,,,,,,1980-10-31T00:00:00,2010-10-31T00:00:00,2011-03-30T10:41:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,WEST BIJOU CREEK,58.16,DITCH,184779.0,A,Active Structure with contemporary diversion r...,,KIOWA-BIJOU,,S,9.0 S,62.0 W,16.0,SW,NE,NE,1304.0,E,951.0,N,557541.0,4347071.0,39.271083,-104.332951,10056510100565,100565.0,10100565.0,0100565 Total (Diversion),Annual,1.0,1983.0,800.0,ACFT,*,Approved,2001-08-16T17:46:00
3,DivTotal,Year,10100565,0100565 Total (Diversions),100565,,MAGUIRE DITCH,,,,,,,,,,,,,,,,,,,1980-10-31T00:00:00,2010-10-31T00:00:00,2011-03-30T10:41:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,WEST BIJOU CREEK,58.16,DITCH,184779.0,A,Active Structure with contemporary diversion r...,,KIOWA-BIJOU,,S,9.0 S,62.0 W,16.0,SW,NE,NE,1304.0,E,951.0,N,557541.0,4347071.0,39.271083,-104.332951,10056510100565,100565.0,10100565.0,0100565 Total (Diversion),Annual,1.0,1984.0,900.0,ACFT,*,Approved,2001-08-16T17:46:00
4,DivTotal,Year,10100565,0100565 Total (Diversions),100565,,MAGUIRE DITCH,,,,,,,,,,,,,,,,,,,1980-10-31T00:00:00,2010-10-31T00:00:00,2011-03-30T10:41:00,1,1,ELBERT,2011-06-02T08:52:50.99,Year,WEST BIJOU CREEK,58.16,DITCH,184779.0,A,Active Structure with contemporary diversion r...,,KIOWA-BIJOU,,S,9.0 S,62.0 W,16.0,SW,NE,NE,1304.0,E,951.0,N,557541.0,4347071.0,39.271083,-104.332951,10056510100565,100565.0,10100565.0,0100565 Total (Diversion),Annual,1.0,1985.0,950.0,ACFT,U,Approved,2011-02-21T15:30:00


In [20]:
dfout = pd.DataFrame(index=df.index)

# Water Source Info
dfout['in_WaterSourceName'] = df['waterSource']
dfout['in_WaterSourceTypeCV'] = df['waterSource']

# Site Info
dfout['in_County'] = df['county']
dfout['in_GNISCodeCV'] = df['gnisId']
dfout['in_Latitude'] = df['latdecdeg'].astype(float)
dfout['in_Longitude'] = df['longdecdeg'].astype(float)
dfout['in_PODorPOUSite'] = "Observation Site"
dfout['in_SiteName'] = df['structureName']
dfout['in_SiteNativeID'] = df['wdid_x']
dfout['in_SiteTypeCV'] = df['structureType']

# Site VariableAmounts Info
dfout['in_Amount'] = df['dataValue'].astype(float)
dfout['in_BeneficialUseCategory'] = "DivTotal"
dfout['in_ReportYearCV'] = df['dataMeasDate']
dfout['in_TimeframeEnd'] = "" # will fill in below with dataMeasDate value
dfout['in_TimeframeStart'] = "" # will fill in below with dataMeasDate value

print(len(dfout))
dfout.head(1)

105514


Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,MIDDLEMIST CREEK,MIDDLEMIST CREEK,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007.0,,


## WaDE Custom Elements (due to missing sate info)

In [21]:
# Create WaterSourceTypeCV
    
def createWaterSourceTypeCV(valA):
    if "GROUNDWATER:" in valA:
        outString = "Groundwater"
    else:
        outString = "Surface Water"
        
    return outString

dfout['in_WaterSourceTypeCV'] = dfout.apply(lambda row: createWaterSourceTypeCV( row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater'], dtype=object)

In [22]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDECO_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = dfout['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceName'] == A)& 
                                       (dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
dfout.head(3)

Wall time: 41 s


Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID
0,MIDDLEMIST CREEK,Surface Water,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007.0,,,WaDECO_WS1
1,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,226.0,DivTotal,1980.0,,,WaDECO_WS2
2,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,800.0,DivTotal,1983.0,,,WaDECO_WS2


In [23]:
def creatVarspec(WST):
    WST = WST.strip()
    outString = "Discharge Flow_Annual_DivTotal_" + WST
    return outString

dfout['in_VariableSpecificCV'] = dfout.apply(lambda row: creatVarspec(row['in_WaterSourceTypeCV']), axis=1)
dfout['in_VariableSpecificCV'].unique()

array(['Discharge Flow_Annual_DivTotal_Surface Water',
       'Discharge Flow_Annual_DivTotal_Groundwater'], dtype=object)

In [24]:
# Drop null values of ReportYearCV. Convert to int

dfout = dfout.dropna(subset=['in_ReportYearCV']).reset_index(drop=True)
dfout['in_ReportYearCV'] = dfout['in_ReportYearCV'].astype(int)
print(len(dfout))
dfout.head(1)

105513


Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID,in_VariableSpecificCV
0,MIDDLEMIST CREEK,Surface Water,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007,,,WaDECO_WS1,Discharge Flow_Annual_DivTotal_Surface Water


In [25]:
# TimeframeStart & TimeframeEnd

dfout['in_TimeframeStart'] = '01/01/' + dfout['in_ReportYearCV'].astype(str)
dfout['in_TimeframeEnd'] = '12/31/' + dfout['in_ReportYearCV'].astype(str)
dfout.head(1)

Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID,in_VariableSpecificCV
0,MIDDLEMIST CREEK,Surface Water,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007,12/31/2007,01/01/2007,WaDECO_WS1,Discharge Flow_Annual_DivTotal_Surface Water


In [26]:
# in_Latitude & in_Longitude
dfout['in_Latitude'] = pd.to_numeric(dfout['in_Latitude'], errors='coerce').fillna(0)
dfout['in_Longitude'] = pd.to_numeric(dfout['in_Longitude'], errors='coerce').fillna(0)
dfout.head(1)

Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID,in_VariableSpecificCV
0,MIDDLEMIST CREEK,Surface Water,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007,12/31/2007,01/01/2007,WaDECO_WS1,Discharge Flow_Annual_DivTotal_Surface Water


## Review & Export Output

In [27]:
dfout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105513 entries, 0 to 105512
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   in_WaterSourceName        105513 non-null  object 
 1   in_WaterSourceTypeCV      105513 non-null  object 
 2   in_County                 105257 non-null  object 
 3   in_GNISCodeCV             58772 non-null   float64
 4   in_Latitude               105513 non-null  float64
 5   in_Longitude              105513 non-null  float64
 6   in_PODorPOUSite           105513 non-null  object 
 7   in_SiteName               105513 non-null  object 
 8   in_SiteNativeID           105513 non-null  int64  
 9   in_SiteTypeCV             105513 non-null  object 
 10  in_Amount                 105513 non-null  float64
 11  in_BeneficialUseCategory  105513 non-null  object 
 12  in_ReportYearCV           105513 non-null  int32  
 13  in_TimeframeEnd           105513 non-null  o

In [28]:
dfout

Unnamed: 0,in_WaterSourceName,in_WaterSourceTypeCV,in_County,in_GNISCodeCV,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID,in_VariableSpecificCV
0,MIDDLEMIST CREEK,Surface Water,ELBERT,195017.0,39.491687,-103.731685,Observation Site,MIDDLEMIST DITCH 2,100539,DITCH,0.92,DivTotal,2007,12/31/2007,01/01/2007,WaDECO_WS1,Discharge Flow_Annual_DivTotal_Surface Water
1,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,226.00,DivTotal,1980,12/31/1980,01/01/1980,WaDECO_WS2,Discharge Flow_Annual_DivTotal_Surface Water
2,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,800.00,DivTotal,1983,12/31/1983,01/01/1983,WaDECO_WS2,Discharge Flow_Annual_DivTotal_Surface Water
3,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,900.00,DivTotal,1984,12/31/1984,01/01/1984,WaDECO_WS2,Discharge Flow_Annual_DivTotal_Surface Water
4,WEST BIJOU CREEK,Surface Water,ELBERT,184779.0,39.271083,-104.332951,Observation Site,MAGUIRE DITCH,100565,DITCH,950.00,DivTotal,1985,12/31/1985,01/01/1985,WaDECO_WS2,Discharge Flow_Annual_DivTotal_Surface Water
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105508,DEVIL CREEK,Surface Water,ARCHULETA,184283.0,37.238734,-107.233864,Observation Site,KEYAH GRANDE AUG PLAN REACH,7817003,REACH,3.10,DivTotal,2017,12/31/2017,01/01/2017,WaDECO_WS654,Discharge Flow_Annual_DivTotal_Surface Water
105509,DEVIL CREEK,Surface Water,ARCHULETA,184283.0,37.238734,-107.233864,Observation Site,KEYAH GRANDE AUG PLAN REACH,7817003,REACH,3.61,DivTotal,2018,12/31/2018,01/01/2018,WaDECO_WS654,Discharge Flow_Annual_DivTotal_Surface Water
105510,DEVIL CREEK,Surface Water,ARCHULETA,184283.0,37.238734,-107.233864,Observation Site,KEYAH GRANDE AUG PLAN REACH,7817003,REACH,1.92,DivTotal,2019,12/31/2019,01/01/2019,WaDECO_WS654,Discharge Flow_Annual_DivTotal_Surface Water
105511,DEVIL CREEK,Surface Water,ARCHULETA,184283.0,37.238734,-107.233864,Observation Site,KEYAH GRANDE AUG PLAN REACH,7817003,REACH,3.84,DivTotal,2020,12/31/2020,01/01/2020,WaDECO_WS654,Discharge Flow_Annual_DivTotal_Surface Water


In [29]:
# Exporting to Finished File.
dfout.to_csv('P_coSSRGMain.csv', index=False)  # The output