# Pre-processing Colorado Site Specific Reservoir and Gage data for WaDE upload.

Notes:
- Uses two different Colorado CDSS REST web service APIs: 1) the [**Division Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-waterclasses) API for Division 1-7 site-specific information, and 2) the [**Annual WDID Time Series Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-divrecyear) API, queried with the WDID list of sites of interest produced from the Division 1-7 data.

In [None]:
# Needed Libraries

# Working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Working with API
import requests
import io
import json

# Cleanup
import re
import time
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Working Directory
# os.chdir() makes this folder the process's current directory, so the relative
# input/output file paths used in the cells below resolve against it.
workingDir = "G:/Shared drives/WaDE Data/Colorado/SS_ReservoirsGages/RawInputData"
os.chdir(workingDir)

## Load Site Data

In [None]:
# done already

# Query by Division (1-7) to get a full list of WDIDs per division.
# Plug in "division=1" etc into API request.
# Save results as Division1.csv, etc.
# Rinse and repeat to retrieve all data for Divisions 1-7.

# url = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/waterclasses/?division=7&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"
# responseD = json.loads(requests.get(url).text)
# LD = responseD['ResultList']

# df_ts = pd.DataFrame()
# for n in range(len(LD)):
#     row = pd.DataFrame([LD[n]])
#     df_ts = df_ts.append(row)
# df_ts

# #Exporting to Finished File
# df_ts.to_csv('Division7.csv', index=False)  # The output

In [None]:
# Division 1: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division1.csv"
dfs1 = pd.read_csv(fileInput)
print(dfs1.shape[0])
dfs1.iloc[:1]

In [None]:
# Division 2: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division2.csv"
dfs2 = pd.read_csv(fileInput)
print(dfs2.shape[0])
dfs2.iloc[:1]

In [None]:
# Division 3: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division3.csv"
dfs3 = pd.read_csv(fileInput)
print(dfs3.shape[0])
dfs3.iloc[:1]

In [None]:
# Division 4: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division4.csv"
dfs4 = pd.read_csv(fileInput)
print(dfs4.shape[0])
dfs4.iloc[:1]

In [None]:
# Division 5: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division5.csv"
dfs5 = pd.read_csv(fileInput)
print(dfs5.shape[0])
dfs5.iloc[:1]

In [None]:
# Division 6: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division6.csv"
dfs6 = pd.read_csv(fileInput)
print(dfs6.shape[0])
dfs6.iloc[:1]

In [None]:
# Division 7: load the saved division extract, report its row count,
# and preview the first record.
fileInput = "Success/Division7.csv"
dfs7 = pd.read_csv(fileInput)
print(dfs7.shape[0])
dfs7.iloc[:1]

In [None]:
# Concatenate
# Stack the seven division tables row-wise into a single frame.
frames = [
    dfs1, dfs2, dfs3, dfs4,
    dfs5, dfs6, dfs7,
]
dfs = pd.concat(objs=frames)
print(dfs.shape[0])

In [None]:
# want DivTotal, Year, and Active Records
# All three conditions are combined into one row mask and applied in one step.
dfs = dfs[
    (dfs["divrectype"] == 'DivTotal')
    & (dfs["availableTimesteps"] == 'Year')
    & (dfs["ciuCode"] == 'A')
]
print(dfs.shape[0])

In [None]:
# fix wdid values that are less than 7 chars long.
def formatWDIDValue(colVal):
    """Return the WDID held in *colVal* as an integer.

    The value is converted to text, given one leading "0" when that text is
    shorter than 7 characters, and then parsed with ``int``.

    NOTE(review): ``int`` discards the leading zero again, so the padding does
    not survive in the returned value -- confirm whether a zero-padded string
    was intended.
    """
    text = str(colVal)
    padded = text if len(text) >= 7 else "0" + text
    return int(padded)
# Normalise every WDID with formatWDIDValue. Mapping over the single column
# replaces a row-wise DataFrame.apply, which builds a Series for every row
# just to read one field from it.
dfs['wdid'] = dfs['wdid'].map(formatWDIDValue)
dfs['wdid'].unique()

In [None]:
# drop duplicates
# Keep only the first row seen for each wdid, then renumber the rows from 0.
dfs = dfs.drop_duplicates(subset=['wdid'], keep="first")
dfs = dfs.reset_index(drop=True)
print(dfs.shape[0])
dfs.iloc[:1]

## Retrieve Timeseries info

In [None]:
# done already

# %%time

# # create wdid list for API
# wdidList = dfs['wdid'].tolist()
# # Split list into batches that are 100 long. Issue with CO API timing out after too long.
# wdidListB = [wdidList[i:i + 100] for i in range(0, len(wdidList), 100)]

# # Use list of WDIDs (from Division data) as inputs, retrieve time series data.

# # Time Series Dataframe
# dfts = pd.DataFrame()

# str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# for i in range(len(wdidListB)):
#     lstC = wdidListB[i]
#     lstCa = '%2C'.join([str(n) for n in lstC]) 
    
#     url = str2 + lstCa + str3
#     responseD = json.loads(requests.get(url).text)
#     LD = responseD['ResultList']
    
#     for n in range(len(LD)):
#         row = pd.DataFrame([LD[n]])
#         dfts = dfts.append(row)

# print(len(dfts))
# dfts.head()

In [None]:
# done already

# # Exporting finished time series file for records sake.
# dfts.to_excel('P_TimeSeries.xlsx', index=False)  # The output

In [None]:
# time series data: load the saved workbook, report its row count,
# and preview the first record.
fileInput = "P_TimeSeries.xlsx"
dfts = pd.read_excel(fileInput)
print(dfts.shape[0])
dfts.iloc[:1]

In [None]:
# use measInterval = 'Annual' data
# Select the annual rows first, then renumber the remaining rows from 0.
dfts = dfts.loc[dfts['measInterval'] == "Annual"]
dfts = dfts.reset_index(drop=True)
print(dfts.shape[0])
dfts.iloc[:1]

## Output DataFrame

In [None]:
# Merging division & wdid dataframes into one, using left-join.
# Need a unique join key, built from the wdid & water class number values.
# A separator is placed between the two parts: the wdid text is not fixed-width,
# so plain concatenation is ambiguous (e.g. "12" + "345" and "123" + "45" both
# give "12345") and could join unrelated records.

dfs['key'] = dfs['wdid'].astype(str) + "_" + dfs['waterclassNum'].astype(str)
dfts['key'] = dfts['wdid'].astype(str) + "_" + dfts['waterClassNum'].astype(str)

# Left join keeps every division record, including those without a time series match.
df = pd.merge(dfs, dfts, on='key', how='left')
print(len(df))
df.head()

In [None]:
# Assemble the WaDE input columns from the merged frame in a single constructor
# call; scalar values are repeated for every row of df.index.
dfout = pd.DataFrame(
    {
        # Water Source Info
        'in_WaterSourceName': df['waterSource'],
        'in_WaterSourceTypeCV': df['waterSource'],

        # Site Info
        'in_County': df['county'],
        'in_GNISCodeCV': df['gnisId'],
        'in_Latitude': df['latdecdeg'].astype(float),
        'in_Longitude': df['longdecdeg'].astype(float),
        'in_PODorPOUSite': "Gage",
        'in_SiteName': df['structureName'],
        'in_SiteNativeID': df['wdid_x'],
        'in_SiteTypeCV': df['structureType'],

        # Site VariableAmounts Info
        'in_Amount': df['dataValue'].astype(float),
        'in_BeneficialUseCategory': "DivTotal",
        'in_ReportYearCV': df['dataMeasDate'],
        'in_TimeframeEnd': "",  # will fill in below with dataMeasDate value
        'in_TimeframeStart': "",  # will fill in below with dataMeasDate value
    },
    index=df.index,
)

print(dfout.shape[0])
dfout.iloc[:1]

## WaDE Custom Elements (due to missing state info)

In [None]:
# Create WaterSourceTypeCV
    
def createWaterSourceTypeCV(valA):
    """Classify a water source description as groundwater or surface water.

    Returns "Groundwater" when *valA* contains the exact (case-sensitive)
    marker "GROUNDWATER:", otherwise "Surface Water".
    """
    return "Groundwater" if "GROUNDWATER:" in valA else "Surface Water"

# Replace the raw water source text with its type classification. Mapping over
# the single column replaces a row-wise DataFrame.apply, which builds a Series
# for every row just to read one field from it.
dfout['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV'].map(createWaterSourceTypeCV)
dfout['in_WaterSourceTypeCV'].unique()

In [None]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a water source native ID: the prefix "WaDECO_WS" followed by the text form of *colrowValue*."""
    return f"WaDECO_WS{colrowValue!s}"

# Lookup table: one row per distinct (water source name, water source type) pair.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = dfout['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct pairs 1..N in row order and turn each number into an ID.
# dftemp shares the lookup table's index, so the assignment lines up row by row.
# The IDs are produced by mapping over the Count column directly rather than
# by a row-wise DataFrame.apply over a one-column frame.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the native ID for a (water source name, water source type) pair.

    A -- water source name; B -- water source type.
    Reads the module-level ``dfWaterSourceNativeID`` table and returns the
    'in_WaterSourceNativeID' of the first row whose name equals A and whose
    type equals B. Returns '' when both inputs are empty strings, when both
    are null, or when no row matches.
    """
    # Nothing to look up when both inputs are blank or both are missing.
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    lookup = dfWaterSourceNativeID
    nameMatches = lookup['in_WaterSourceName'] == A
    typeMatches = lookup['in_WaterSourceTypeCV'] == B
    matchedIDs = lookup.loc[nameMatches & typeMatches, 'in_WaterSourceNativeID']

    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the native ID for each row's (name, type) pair. The two columns are
# walked in lockstep instead of using a row-wise DataFrame.apply, which builds
# a Series for every row just to read two fields from it.
dfout['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(dfout['in_WaterSourceName'], dfout['in_WaterSourceTypeCV'])
]
dfout.head(3)

In [None]:
def creatVarspec(WST):
    """Return "Stream Gage_Annual_DivTotal_" followed by *WST* with surrounding whitespace stripped."""
    return "Stream Gage_Annual_DivTotal_" + WST.strip()

# Derive the variable-specific label from each row's water source type. Mapping
# over the single column replaces a row-wise DataFrame.apply, which builds a
# Series for every row just to read one field from it.
dfout['in_VariableSpecificCV'] = dfout['in_WaterSourceTypeCV'].map(creatVarspec)
dfout['in_VariableSpecificCV'].unique()

In [None]:
# Drop null values of ReportYearCV. Convert to int
# Rows without a report year are removed first, so the integer cast below
# never sees a missing value.

dfout = dfout.dropna(subset=['in_ReportYearCV'])
dfout = dfout.reset_index(drop=True)
dfout['in_ReportYearCV'] = dfout['in_ReportYearCV'].astype(int)
print(dfout.shape[0])
dfout.iloc[:1]

In [None]:
# TimeframeStart & TimeframeEnd
# Each report year becomes a January 1 start and a December 31 end, written as
# MM/DD/YYYY text; radd() puts the fixed prefix in front of the year text.

dfout['in_TimeframeStart'] = dfout['in_ReportYearCV'].astype(str).radd('01/01/')
dfout['in_TimeframeEnd'] = dfout['in_ReportYearCV'].astype(str).radd('12/31/')
dfout.iloc[:1]

In [None]:
# Exporting to Finished File.
# Written relative to the current working directory; index=False leaves the
# DataFrame's row index out of the CSV.
dfout.to_csv('P_coSSRGMain.csv', index=False)  # The output