# Pre-processing Idaho Site Specific data for WaDEQA upload.
Date Updated: 03/09/2021
Purpose:  To pre-process the Idaho ss data into one master file for simple DataFrame creation and extraction

Notes:
- Runtime to acquire the time series data was approximately 4 hrs.  Will save output as csv to save time in future.
- Combined site info and time series info together on unique timeseries id.



#### Idaho Nested Data Aqua API Data
The Aqua Info interface has an API behind it.  Here's a sample using the “Pole Creek” site in the Upper Salmon Basin, using the GUID (99db207c15774d1c9a2f2a9daad85efa) in the URL, followed by the requested time range (fromDate / toDate).
- https://research.idwr.idaho.gov/apps/hydrologic/aquainfo/api/telemetry/99db207c15774d1c9a2f2a9daad85efa?fromDate=2020-09-21T14:33:47.134Z&toDate=2020-10-21T14:33:47.134Z

The GUID is inside each object’s “locDescription” property.
- https://research.idwr.idaho.gov/apps/hydrologic/aquainfo/api/telemetry/locations

In [4]:
import os
import numpy as np
import pandas as pd
import requests
import json
pd.set_option('display.max_columns', 999)  # Show (up to 999) columns when a DataFrame is displayed in Jupyter

# Working directory that holds the raw input files; relative paths used below
# (including the exported csv) resolve against it.
workingDir = (
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/"
    "Idaho/SiteSpecificAmounts/RawInputData"
)
os.chdir(workingDir)

In [None]:
# The general API search.  Returns a nested data list (228 entries when this
# notebook was written): one record per location, each carrying its own list
# of datasets.

url = "https://research.idwr.idaho.gov/apps/hydrologic/aquainfo/api/telemetry/locations"
# (connect, read) timeouts in seconds so a stalled server cannot hang the notebook.
response = requests.get(url, timeout=(10, 120))
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
responseList = response.json()

print(type(responseList))
print(len(responseList))
responseList

In [None]:
# Step 1 of using the API: flatten the nested locations list into one row per
# (location, dataset) pair.  Step 2 (the time series cell) takes this output and
# calls the other API format with each dataset uniqueId plus its start and end time.

url = "https://research.idwr.idaho.gov/apps/hydrologic/aquainfo/api/telemetry/locations"
# (connect, read) timeouts in seconds so a stalled server cannot hang the notebook.
response = requests.get(url, timeout=(10, 120))
# Fail here on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
responseList = response.json()

# Output column -> key of the location record returned by the API.
locationKeys = {
    'locationName': 'locationName',
    'identifier': 'identifier',
    'loc_uniqueId': 'uniqueId',
    'locationType': 'locationType',
    'latitude': 'latitude',
    'longitude': 'longitude',
    'srid': 'srid',
}

# Output column -> key of each record inside a location's 'datasets' list.
datasetKeys = {
    'ds_uniqueId': 'uniqueId',
    'parameter': 'parameter',
    'unit': 'unit',
    'rawStartTime': 'rawStartTime',
    'rawEndTime': 'rawEndTime',
    'label': 'label',
}

outdfColumns = ['locationName', 'identifier', 'loc_uniqueId', 'locationType', 'latitude', 'longitude', 'srid',
                'ds_uniqueId', 'parameter', 'unit', 'rawStartTime', 'rawEndTime', 'label']


def flattenLocations(locations):
    """Return one dict per (location, dataset) pair of the nested API list.

    locations is the parsed JSON list; every record must carry the keys named
    in locationKeys plus 'datasets' (a missing key raises KeyError).

    - 'datasets' is None: one row with empty-string dataset columns.
    - 'datasets' is an empty list: one row with the dataset columns left out
      (they become NaN in the DataFrame).
    - otherwise: one row per dataset, each repeating the location columns.
    """
    rows = []
    for location in locations:
        siteInfo = {column: location[key] for column, key in locationKeys.items()}
        datasets = location['datasets']

        if datasets is None:
            row = dict(siteInfo)
            row.update({column: "" for column in datasetKeys})
            rows.append(row)
        elif len(datasets) == 0:
            rows.append(siteInfo)
        else:
            for dataset in datasets:
                row = dict(siteInfo)
                row.update({column: dataset[key] for column, key in datasetKeys.items()})
                rows.append(row)
    return rows


# Build the frame once from the collected rows: growing a DataFrame one cell at
# a time re-allocates it on every assignment.  dtype=object keeps the values
# exactly as the API returned them (no numeric coercion).
outdf = pd.DataFrame(flattenLocations(responseList), columns=outdfColumns, dtype=object)

print(len(outdf))
outdf.head(3)

In [None]:
# Creating outputString: the request url for one dataset's time series.

str1 = "https://research.idwr.idaho.gov/apps/hydrologic/aquainfo/api/telemetry/"

def assignoutputString(A, B, C):
    """Return the time series url for one dataset, or "Unspecified".

    A -- dataset uniqueId; an empty string or a null value means there is
         nothing to request and yields "Unspecified".
    B -- value placed after "?fromDate=" (converted with str()).
    C -- value placed after "&toDate=" (converted with str()).
    """
    if A == '' or pd.isnull(A):
        return "Unspecified"
    return "".join([str1, str(A), "?fromDate=", str(B), "&toDate=", str(C)])

# Build the request url for every row.  Iterating the three columns directly
# (instead of DataFrame.apply with axis=1) also works when outdf has no rows:
# apply() on an empty frame hands back a DataFrame, which cannot be assigned to
# a single column.  It also avoids building a Series object for every row.
outdf['outputString'] = [
    assignoutputString(dsUniqueId, startTime, endTime)
    for dsUniqueId, startTime, endTime in zip(outdf['ds_uniqueId'], outdf['rawStartTime'], outdf['rawEndTime'])
]
print(len(outdf))
outdf = outdf.reset_index(drop=True)  # later cells address rows by position 0..n-1
outdf.head(3)

In [None]:
# Keep only the rows whose "label" is exactly "Daily" or "daily"; drop the rest.
isDaily = (outdf['label'] == "Daily") | (outdf['label'] == "daily")
outdf = outdf[isDaily]
print(len(outdf))
outdf = outdf.reset_index(drop=True)  # renumber rows 0..n-1 after the filter
outdf.head(3)

In [None]:
# The TS API search.

tsdf = pd.DataFrame(columns=['locationName', 'loc_uniqueId',
                             'timeStamp', 'numericValue1']) 

tsSearchIndex = 0
outdfIndex = 0
tsdfIndex = 0

while outdfIndex < len(outdf):
    
    url = outdf.loc[outdfIndex, 'outputString']
    print(outdfIndex)
    print(url)
    
    if url == "Unspecified":
        # Copy exsisting rows from outdf to tsdf.
        tsdf.loc[tsdfIndex, "locationName"] = outdf.loc[outdfIndex, "locationName"]  
        tsdf.loc[tsdfIndex, "loc_uniqueId"] = outdf.loc[outdfIndex, "loc_uniqueId"] 
        tsdfIndex += 1
    else:
        #The URl
        resD = json.loads(requests.get(url).text)
        resDL = resD['points']
        print("Length is: ", len(resDL))
        
        tsSearchIndex = 0
        while tsSearchIndex < len(resDL):
            
            # Copy exsisting rows from outdf to tsdf.
            tsdf.loc[tsdfIndex, "locationName"] = outdf.loc[outdfIndex, "locationName"]  
            tsdf.loc[tsdfIndex, "loc_uniqueId"] = outdf.loc[outdfIndex, "loc_uniqueId"]
            
            #Time Series serach index.
            LtD = resDL[tsSearchIndex]
    
            #timeStamp
            LtD_ts = LtD['timeStamp']
            tsdf.loc[tsdfIndex, 'timeStamp'] = LtD_ts
            
            #numericValue1
            LtD_nv1 = LtD['numericValue1']
            tsdf.loc[tsdfIndex, 'numericValue1'] = LtD_nv1
            
            
#             print("outdfIndex is " + str(outdfIndex) + ", tsdfIndex is " + str(tsdfIndex) + ", tsSearchIndex is " + str(tsSearchIndex))
#             tsdflist = tsdf.loc[tsdfIndex].tolist()
#             print(tsdflist)
            
            tsdfIndex += 1
            tsSearchIndex += 1
    
    # Advanced Index Counter
    outdfIndex += 1

print(len(tsdf))
tsdf.head(3)

In [None]:
#Merge
outdf = pd.merge(outdf, tsdf, left_on='loc_uniqueId', right_on='loc_uniqueId', how='left')
print(len(outdf))
outdf.head(3)

In [None]:
#Update datatype of date to fit WaDE 2.0 structure
# Parse the timeStamp column into timezone-aware (UTC) datetimes.
utcTimeStamps = pd.to_datetime(outdf['timeStamp'], utc=True)
outdf['timeStamp'] = utcTimeStamps
outdf

In [None]:
#Extract year value for ReportYearCV
# A missing timestamp (NaT) makes .year float-valued, so years would be
# exported as e.g. 2020.0.  The nullable Int64 dtype keeps whole-number years
# and leaves the missing ones empty.
reportYear = pd.Series(pd.DatetimeIndex(outdf['timeStamp']).year, index=outdf.index)
outdf['in_ReportYear'] = reportYear.astype('Int64')
outdf

In [26]:
#Exporting to Finished File
# Written relative to the current working directory, without the row index.
outputFile = 'P_idSSMaster.csv'  # The output
outdf.to_csv(outputFile, index=False)