# Pre-processing Colorado Site Specific data for WaDEQA upload.
Date Updated: 03/09/2021
Purpose:  To pre-process the Colorado data into one master file for simple DataFrame creation and extraction

Notes:
- Using two different Colorado CDSS REST web service APIs: 1) the [**Division Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-waterclasses) api for Division 1-7 site specific information, and 2) the [**Annual WDID Time Series Data**](https://dwr.state.co.us/Rest/GET/Help/Api/GET-api-v2-structures-divrec-divrecyear) api, using the WDID list of sites of interest produced from Divisions 1-7.
- Retrieving WDID data that is divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
- Full script run time took the better part of the morning.  Will save each division incremental step as an individual output file for ease of review.

In [None]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
import requests
import json
from datetime import datetime
# Show up to 999 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 999

In [None]:
#Working Directory and Input File
# Every relative path used later in this notebook is resolved against this directory.
# NOTE(review): machine-specific absolute path; edit it before running on another computer.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Colorado/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

In [None]:
# Query by Division (1-7) to get a full list of WDIDs per division.
# Plug in "division=1" etc into API request.
# Save results as Division1.csv, etc.
# Rinse and repeat to retrieve all data for Divisions 1-7.

# url = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/waterclasses/?division=7&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"
# responseD = json.loads(requests.get(url).text)
# LD = responseD['ResultList']

# df_ts = pd.DataFrame()
# for n in range(len(LD)):
#     row = pd.DataFrame([LD[n]])
#     df_ts = df_ts.append(row)
# df_ts

# #Exporting to Finished File
# df_ts.to_csv('Division7.csv', index=False)  # The output

# Division 1

In [None]:
# The Division 1 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div1ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 1.
fileInput = "Success/Division1.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div1ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div1ts = pd.DataFrame(tsRecords)

print(len(df_div1ts))
df_div1ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div1ts = df_div1ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div1ts.
# Writes the merged Division 1 frame to an Excel workbook (relative path), without the index column.
df_div1ts.to_excel('P_Div1WDIDTimeSeries.xlsx', index=False)  # The output

# Division 2

In [None]:
# The Division 2 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div2ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 2.
fileInput = "Success/Division2.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div2ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div2ts = pd.DataFrame(tsRecords)

print(len(df_div2ts))
df_div2ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div2ts = df_div2ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div2ts.
# Writes the merged Division 2 frame to an Excel workbook (relative path), without the index column.
df_div2ts.to_excel('P_Div2WDIDTimeSeries.xlsx', index=False)  # The output

# Division 3

In [None]:
# The Division 3 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div3ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 3.
fileInput = "Success/Division3.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div3ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div3ts = pd.DataFrame(tsRecords)

print(len(df_div3ts))
df_div3ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div3ts = df_div3ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div3ts.
# Writes the merged Division 3 frame to an Excel workbook (relative path), without the index column.
df_div3ts.to_excel('P_Div3WDIDTimeSeries.xlsx', index=False)  # The output

# Division 4

In [None]:
# The Division 4 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div4ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 4.
fileInput = "Success/Division4.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div4ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div4ts = pd.DataFrame(tsRecords)

print(len(df_div4ts))
df_div4ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div4ts = df_div4ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div4ts.
# Writes the merged Division 4 frame to an Excel workbook (relative path), without the index column.
df_div4ts.to_excel('P_Div4WDIDTimeSeries.xlsx', index=False)  # The output

# Division 5

In [None]:
# The Division 5 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div5ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 5.
fileInput = "Success/Division5.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div5ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div5ts = pd.DataFrame(tsRecords)

print(len(df_div5ts))
df_div5ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div5ts = df_div5ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div5ts.
# Writes the merged Division 5 frame to an Excel workbook (relative path), without the index column.
df_div5ts.to_excel('P_Div5WDIDTimeSeries.xlsx', index=False)  # The output

# Division 6

In [None]:
# The Division 6 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div6ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 6.
fileInput = "Success/Division6.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div6ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div6ts = pd.DataFrame(tsRecords)

print(len(df_div6ts))
df_div6ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div6ts = df_div6ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div6ts.
# Writes the merged Division 6 frame to an Excel workbook (relative path), without the index column.
df_div6ts.to_excel('P_Div6WDIDTimeSeries.xlsx', index=False)  # The output

# Division 7

In [None]:
# The Division 7 Time Series Dataframe.
# Starts out empty; the API retrieval cell for this division populates it.
df_div7ts = pd.DataFrame()

In [None]:
# Read in division csv file.
# Change input to Division 7.
fileInput = "Success/Division7.csv"
# Parse wdid as text so that any leading zeros stored in the file are preserved.
dftemp = pd.read_csv(fileInput, dtype={'wdid': str})
print(len(dftemp))
dftemp.head(3)

In [None]:
# Trim down division dataframe for divrectype = WaterClass, availableTimesteps = Year, and ciuCode = A (for active) values only.
# An explicit copy is taken so the wdid column can be reassigned afterwards without a SettingWithCopyWarning.
dftemp = dftemp[
    (dftemp.divrectype == 'WaterClass')
    & (dftemp.availableTimesteps == 'Year')
    & (dftemp.ciuCode == 'A')
].copy()

# Fix wdid values that are less than 7 chars long.  Convert to string.
def formatWDIDValue(colVal):
    """Return colVal as a string, left-padded with '0' to at least 7 characters.

    Values whose string form is already 7 or more characters long are
    returned unchanged (as strings).
    """
    # zfill pads with as many zeros as needed, not just a single one.
    return str(colVal).zfill(7)
# Format the wdid column directly; a row-wise apply is slower and can hand back
# a DataFrame (not a column) when dftemp has no rows.
dftemp['wdid'] = dftemp['wdid'].map(formatWDIDValue)

print(len(dftemp))
dftemp.head(3)

In [None]:
# Keep the first row seen for each wdid, then pull the wdid column out as a plain list.
dftemp = dftemp[~dftemp.duplicated(subset='wdid', keep='first')]
wdidlist = dftemp.wdid.tolist()
type(wdidlist)

In [None]:
# Break the wdid list into consecutive batches of at most 100 entries.
wdidlistB = []
for start in range(0, len(wdidlist), 100):
    wdidlistB.append(wdidlist[start:start + 100])

In [None]:
# Use the batched lists of WDIDs (from Division data) as inputs, retrieve time series data.
# Change dataframe to df_div7ts

str2 = "https://dwr.state.co.us/Rest/GET/api/v2/structures/divrec/divrecyear/?wdid="
# NOTE(review): the API key is hard-coded in source; consider loading it from an environment variable.
str3 = "%2C&apiKey=wAC6ZmzcPJ30dyy6nYu6jQmG7BBedcem"

# Gather every returned record first and build the frame once. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each call. Building the frame
# from scratch also makes this cell safe to re-run without duplicating rows.
# NOTE(review): only a single response per batch is read; confirm no batch spans multiple pages.
tsRecords = []
for lstC in wdidlistB:
    lstCa = '%2C'.join([str(n) for n in lstC])

    url = str2 + lstCa + str3
    response = requests.get(url, timeout=300)  # do not hang forever on a stalled connection
    response.raise_for_status()  # surface HTTP errors instead of parsing an error body
    responseD = response.json()
    LD = responseD['ResultList']
    tsRecords.extend(LD)

df_div7ts = pd.DataFrame(tsRecords)

print(len(df_div7ts))
df_div7ts.tail(3)

In [None]:
# Left-join the division attributes onto the time series records by wdid.
df_div7ts = df_div7ts.merge(dftemp, how='left', on='wdid')

In [None]:
# Exporting to Finished File.
# Change dataframe to df_div7ts.
# Writes the merged Division 7 frame to an Excel workbook (relative path), without the index column.
df_div7ts.to_excel('P_Div7WDIDTimeSeries.xlsx', index=False)  # The output

# Concatenate

In [None]:
# Concatenate the seven division frames into one master frame.
# ignore_index gives dfout one unique 0..N-1 row index instead of repeating each
# division's own row labels, so later index-aligned column assignments are unambiguous.
frames = [df_div1ts, df_div2ts, df_div3ts, df_div4ts, df_div5ts, df_div6ts, df_div7ts]
dfout = pd.concat(frames, ignore_index=True)

In [None]:
# Report the combined row count, then display the combined frame.
print(dfout.shape[0])
dfout

# WaDE Custom Elements (due to missing state info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a custom water source ID: the 'WaDECO_WS' prefix followed by str(colrowValue)."""
    prefix = "WaDECO_WS"
    return prefix + str(colrowValue)

# One row per distinct water source name, kept in order of first appearance in dfout.
dfWaterSourceNativeID = (
    dfout[['waterSource']]
    .rename(columns={'waterSource': 'in_WaterSourceName'})
    .drop_duplicates()
)

# Number the distinct names 1..N and turn each number into a native ID.
# Series.map is used rather than a row-wise apply, which is slower and can
# hand back a DataFrame (not a column) when there are no rows.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source name A.

    Returns '' when A is an empty string or null, or when no row of
    dfWaterSourceNativeID has that name; otherwise returns the first
    matching in_WaterSourceNativeID value.
    """
    if A == '' or pd.isnull(A):
        return ''
    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    matchedIDs = dfWaterSourceNativeID.loc[nameMatches, 'in_WaterSourceNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Resolve each distinct, non-null water source name once instead of once per row,
# then map the results back onto every row.
idBySource = {
    sourceName: retrieveWaterSourceNativeID(sourceName)
    for sourceName in dfout['waterSource'].dropna().unique()
}
# Null names are not in the mapping and come back as NaN; they become '',
# which is what retrieveWaterSourceNativeID returns for null input.
dfout['in_WaterSourceNativeID'] = dfout['waterSource'].map(idBySource).fillna('')
dfout.head(3)

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp SiteNativeID dataframe of unique site.
def assignSiteUUID(colrowValue):
    """Build a custom site ID: the 'WaDECO_S' prefix followed by str(colrowValue)."""
    prefix = "WaDECO_S"
    return prefix + str(colrowValue)

# One row per distinct (latitude, longitude, site type, site name) combination,
# kept in order of first appearance in dfout.
dfSiteNativeID = (
    dfout[['latdecdeg', 'longdecdeg', 'structureType', 'structureName']]
    .rename(columns={
        'latdecdeg': 'in_Latitude',
        'longdecdeg': 'in_Longitude',
        'structureType': 'in_SiteTypeCV',
        'structureName': 'in_SiteName',
    })
    .drop_duplicates()
)

# Number the distinct sites 1..N and turn each number into a native ID.
# Series.map is used rather than a row-wise apply, which is slower and can
# hand back a DataFrame (not a column) when there are no rows.
dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp["Count"].map(assignSiteUUID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveSiteNativeID(A, B, C, D):
    """Look up the custom site native ID for one site.

    A, B, C and D are compared against the in_Latitude, in_Longitude,
    in_SiteTypeCV and in_SiteName columns of dfSiteNativeID respectively.

    Returns '' when A and B are both empty strings or both null, or when no
    row matches; otherwise returns the first matching in_SiteNativeID value.
    A null argument matches a null cell, so sites with a missing name, type
    or single coordinate still resolve to their ID.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    def _sameValue(column, value):
        # NaN never compares equal to anything, so match missing values explicitly.
        if pd.isnull(value):
            return dfSiteNativeID[column].isnull()
        return dfSiteNativeID[column] == value

    rowMatches = (
        _sameValue('in_Latitude', A)
        & _sameValue('in_Longitude', B)
        & _sameValue('in_SiteTypeCV', C)
        & _sameValue('in_SiteName', D)
    )
    matchedIDs = dfSiteNativeID.loc[rowMatches, 'in_SiteNativeID']
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Resolve each distinct site once instead of once per row, then join the IDs back.
siteCols = ['latdecdeg', 'longdecdeg', 'structureType', 'structureName']
uniqueSites = dfout[siteCols].drop_duplicates()
uniqueSites['in_SiteNativeID'] = [
    retrieveSiteNativeID(*siteKey)
    for siteKey in uniqueSites.itertuples(index=False, name=None)
]
# A left merge keeps dfout's row order, and uniqueSites has one row per key, so the
# merged column lines up with dfout positionally; to_numpy() avoids index alignment.
dfout['in_SiteNativeID'] = (
    dfout[siteCols]
    .merge(uniqueSites, how='left', on=siteCols)['in_SiteNativeID']
    .to_numpy()
)
dfout.head(3)

In [None]:
# TimeframeStart & TimeframeEnd

# Prefix the dataMeasDate text with a fixed month/day to form the start and end date strings.
dfout['inTimeframeStart'] = dfout['dataMeasDate'].astype(str).radd('01/01/')
dfout['inTimeframeEnd'] = dfout['dataMeasDate'].astype(str).radd('12/31/')
dfout

In [None]:
# Exporting to Finished File.
# Writes the combined master frame (dfout) to a csv file (relative path), without the index column.
dfout.to_csv('P_coSSMaster.csv', index=False)  # The output