# Pre-processing Montana Site Specific data for WaDEQA upload.
- Date Updated: 09/13/2021
- Purpose:  To pre-process the Montana ss data into one master file for simple DataFrame creation and extraction

### Goal
- Create sites_input.csv, contains location and site information.
- Create cleaned input_timeseries.csv with a native site ID field.

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Working Directory and Input Files
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Montana/SiteSpecificAmounts/RawInputData"
os.chdir(workingDir)

### DataFrame Creation

In [2]:
# Dataframe Creation - location data
inputFile1 = "MGS_locations.csv"
df_loc = pd.read_csv(inputFile1)
print(len(df_loc))
df_loc.head(1)

174


Unnamed: 0,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag
0,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1


In [3]:
# Dataframe Creation - datasets data
inputFile2 = "MGS_datasets.csv"
df_dase = pd.read_csv(inputFile2)
print(len(df_dase))
df_dase.head(1)

509


Unnamed: 0,OID_,SensorCode,SensorID,SensorLabel,Comment,Description,isPublished,LocationName,LocationCode,LocationID,SubLocationCode,TimeSeriesType,Parameter,UnitOfMeasure,UtcOffset,ComputationMethod,ComputationPeriod,LastModifiedTime,ExtendedAttributeValues,ParameterLabel
0,1,Discharge.Daily Average@40A 1500,e069deacfba143b3ba052cc39714d44d,Daily Average,,,1,NF Musselshell near Delphine,40A 1500,b947d0c2364e4948a8032baac8904bef,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge


In [4]:
# Dataframe Creation - timeseries data
inputFile3 = "MGS_timeseries.csv"
df_ts = pd.read_csv(inputFile3)
print(len(df_ts))
df_ts.head(1)

26459294


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName
0,1,e069deacfba143b3ba052cc39714d44d,1/30/1981 0:00:00,,,,,,


### Location & Site Information

In [5]:
# Only working with ParameterLabel = Discharge, ComputationMethod = Mean & ComputationPeriod = Daily values for now.
df_dasetemp = df_dase[(df_dase['SensorLabel'] == 'discharge') |
                      (df_dase['SensorLabel'] == 'stage')]
print(len(df_dasetemp))
df_dasetemp.head()

131


Unnamed: 0,OID_,SensorCode,SensorID,SensorLabel,Comment,Description,isPublished,LocationName,LocationCode,LocationID,SubLocationCode,TimeSeriesType,Parameter,UnitOfMeasure,UtcOffset,ComputationMethod,ComputationPeriod,LastModifiedTime,ExtendedAttributeValues,ParameterLabel
1,2,Discharge.discharge@40A 1500,d3310b1c958446fe99a69e8997d848d8,discharge,,,1,NF Musselshell near Delphine,40A 1500,b947d0c2364e4948a8032baac8904bef,,ProcessorDerived,QR,ft^3/s,#NAME?,Unknown,Unknown,2/2/2021 6:24,[],Discharge
4,5,Discharge.discharge@40A 2000,0f1ce86e86f841ae91996afb7ced5ac7,discharge,,,1,NF Musselshell blw Bair Reservoir,40A 2000,b85ca58a37784f2aa2fb33588dc4cfe9,,ProcessorDerived,QR,ft^3/s,#NAME?,Unknown,Unknown,2/2/2021 6:24,[],Discharge
5,6,Stage.stage@40A 2000,94664635d58f42c4afad73ab0094a396,stage,,,1,NF Musselshell blw Bair Reservoir,40A 2000,b85ca58a37784f2aa2fb33588dc4cfe9,,ProcessorBasic,HG,ft,#NAME?,Unknown,Unknown,2/2/2021 6:16,[],Stage
8,9,Discharge.discharge@40A 10000,8efc886fb5364b46998d82168cb09504,discharge,,,1,Barber Canal,40A 10000,f412dc9558214d9f82ca6dd0ebcc3970,,ProcessorDerived,QR,ft^3/s,#NAME?,Unknown,Unknown,2/2/2021 6:24,[],Discharge
11,12,Discharge.discharge@40A 3000,ad9340452c964165abe86957f46a7092,discharge,,,1,NF Musselshell Diversion Canal,40A 3000,61da4aa318334518843094d1f7d8fe0e,,ProcessorDerived,QR,ft^3/s,#NAME?,Unknown,Unknown,2/2/2021 6:24,[],Discharge


In [6]:
# Create VariableSpecificCV

def createVariableSpecificCV(A):
    if A == 'discharge':
        outString = "Reservoirs and Gages - Daily - Discharge"
    if A == 'stage':
        outString = "Reservoirs and Gages - Daily - Stage"
    return outString


df_dasetemp['in_VariableSpecificCV'] = df_dasetemp.apply(lambda row: createVariableSpecificCV(row['SensorLabel']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dasetemp['in_VariableSpecificCV'] = df_dasetemp.apply(lambda row: createVariableSpecificCV(row['SensorLabel']), axis=1)


In [7]:
df_dasetemp['in_VariableSpecificCV'].nunique()

2

In [8]:
# trim down dataset to those fields that are needed.
df_dasetemp_sub = df_dasetemp[['LocationCode', 'SensorID', 'SensorLabel', 'TimeSeriesType', 'Parameter', 'UnitOfMeasure', 'ComputationMethod', 'ComputationPeriod', 'LastModifiedTime', 'ParameterLabel', 'in_VariableSpecificCV']]
print(len(df_dasetemp_sub))
df_dasetemp_sub.head()

131


Unnamed: 0,LocationCode,SensorID,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
1,40A 1500,d3310b1c958446fe99a69e8997d848d8,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
4,40A 2000,0f1ce86e86f841ae91996afb7ced5ac7,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
5,40A 2000,94664635d58f42c4afad73ab0094a396,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
8,40A 10000,8efc886fb5364b46998d82168cb09504,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
11,40A 3000,ad9340452c964165abe86957f46a7092,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge


In [9]:
# merge location info with trimmed down dataset data
df_loctemp=pd.DataFrame()
df_loctemp = pd.merge(df_loc, df_dasetemp_sub, on='LocationCode', how='left')
print(len(df_loctemp))
df_loctemp.head()

178


Unnamed: 0,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag,SensorID,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
0,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,d3310b1c958446fe99a69e8997d848d8,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
1,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,0f1ce86e86f841ae91996afb7ced5ac7,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
2,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,94664635d58f42c4afad73ab0094a396,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
3,3,f412dc9558214d9f82ca6dd0ebcc3970,40A 10000,Barber Canal,All Locations.DNRC SWP,Hydrology Station,0,-109.4012,46.3379,-PT7H,2/4/2021 15:34:28,ft,3877.729,Deadman's Basin Outlet Canal-Barber Canal (Son...,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,8efc886fb5364b46998d82168cb09504,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
4,4,61da4aa318334518843094d1f7d8fe0e,40A 3000,NF Musselshell Diversion Canal,All Locations.DNRC SWP,Hydrology Station,0,-110.418,46.5359,-PT7H,2/4/2021 15:34:56,ft,4958.962,North Fork Musselshell River Diversion Canal b...,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,ad9340452c964165abe86957f46a7092,discharge,ProcessorDerived,QR,ft^3/s,Unknown,Unknown,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge


### Cleaned Timeseries

In [10]:
# list of SensorID to find and perserve in timesers data
SensorIDList = df_dasetemp_sub['SensorID'].tolist()
SensorIDList

['d3310b1c958446fe99a69e8997d848d8',
 '0f1ce86e86f841ae91996afb7ced5ac7',
 '94664635d58f42c4afad73ab0094a396',
 '8efc886fb5364b46998d82168cb09504',
 'ad9340452c964165abe86957f46a7092',
 'f0f03c49f9944af39eb668a184c8c7d2',
 'c1f909c9847f459cacc46d3dbf7f3a29',
 '67fcc028e0204a248ae9a34a28c8d4ae',
 '6d5e953edb584ed799d810572e64d422',
 '6196528d99684286b84011601e8a4dcb',
 '7c7b10a3025141db97c7038dfcf3f72d',
 '2e0fd79a9271434cbbcf7e2f0ceb9866',
 'bfb6ca552ec04a0db930bbfe4b5d6c50',
 '960c04db33534b13b38b994977ab81e1',
 '93a2769ed5c24ff3bf55d75cbc8dc2cd',
 '5a7c87006b5c4e249f133d26ceda2b1f',
 'dca16bd831a74092bc7db432d169ab1f',
 '095c0d42e78b4aa383b15373c69dad60',
 '8af7ac69fb77446db3ab13a44bccaf01',
 '4ce6bc2a90fd496cb6517a13ae898db8',
 'ccef46acfa11457fada830f3f284ca59',
 'acfa1ec28c75401189bdd72384da0d4c',
 'adf5a87a4867405e88b7f17a2a948839',
 'ca975822eaae44df9c0cf90eb7d0e890',
 'b42f0874a0a646c2a9bfc25fd469b012',
 'c379f918a1f74090b7d335ddb1f9ee3c',
 '3f5ebd5d76f84daca1a37714d2bdd1cf',
 

In [11]:
# use only those timeseries rows who's SensorID is in the list
df_tstemp = df_ts
df_tstemp = df_tstemp[df_tstemp['SensorID'].isin(SensorIDList)].drop_duplicates().reset_index(drop=True)
print(len(df_tstemp))
df_tstemp.head()

10645520


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName
0,194407,94664635d58f42c4afad73ab0094a396,4/12/2006 12:00:00,,,,,,
1,194408,94664635d58f42c4afad73ab0094a396,4/13/2006 12:00:00,,,,,,
2,194409,94664635d58f42c4afad73ab0094a396,4/14/2006 12:00:00,,,,,,
3,194410,94664635d58f42c4afad73ab0094a396,4/15/2006 12:00:00,,,,,,
4,194411,94664635d58f42c4afad73ab0094a396,4/16/2006 12:00:00,,,,,,


In [12]:
# extract date and time values from Timestamp field
# -------------------------------------------------

#convert from string to datetime
df_tstemp['Timestamp'] = pd.to_datetime(df_tstemp['Timestamp']) 

# extract date, year and time, create three new fields
df_tstemp['Timestamp_Date'] = df_tstemp['Timestamp'].dt.date
df_tstemp['Timestamp_Date'] = pd.to_datetime(df_tstemp['Timestamp_Date'], errors = 'coerce')
df_tstemp['Timestamp_Date'] = pd.to_datetime(df_tstemp['Timestamp_Date'].dt.strftime('%m/%d/%Y'))

# Year
df_tstemp['Timestamp_Year'] = pd.DatetimeIndex(df_tstemp['Timestamp_Date']).year

# time
df_tstemp['Timestamp_Time'] = df_tstemp['Timestamp'].dt.time

print(len(df_tstemp))
df_tstemp.head()

10645520


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time
0,194407,94664635d58f42c4afad73ab0094a396,2006-04-12 12:00:00,,,,,,,2006-04-12,2006,12:00:00
1,194408,94664635d58f42c4afad73ab0094a396,2006-04-13 12:00:00,,,,,,,2006-04-13,2006,12:00:00
2,194409,94664635d58f42c4afad73ab0094a396,2006-04-14 12:00:00,,,,,,,2006-04-14,2006,12:00:00
3,194410,94664635d58f42c4afad73ab0094a396,2006-04-15 12:00:00,,,,,,,2006-04-15,2006,12:00:00
4,194411,94664635d58f42c4afad73ab0094a396,2006-04-16 12:00:00,,,,,,,2006-04-16,2006,12:00:00


In [13]:
# remove unused fields
# -------------------------------------------------
df_tstemp = df_tstemp.drop(['OID_', 'GradeCode', 'GradeName', 'ApprovalLevel'], axis=1)
df_tstemp = df_tstemp.drop_duplicates().reset_index(drop=True)
df_tstemp

Unnamed: 0,SensorID,Timestamp,RecordedValue,Method,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time
0,94664635d58f42c4afad73ab0094a396,2006-04-12 12:00:00,,,,2006-04-12,2006,12:00:00
1,94664635d58f42c4afad73ab0094a396,2006-04-13 12:00:00,,,,2006-04-13,2006,12:00:00
2,94664635d58f42c4afad73ab0094a396,2006-04-14 12:00:00,,,,2006-04-14,2006,12:00:00
3,94664635d58f42c4afad73ab0094a396,2006-04-15 12:00:00,,,,2006-04-15,2006,12:00:00
4,94664635d58f42c4afad73ab0094a396,2006-04-16 12:00:00,,,,2006-04-16,2006,12:00:00
...,...,...,...,...,...,...,...,...
10455751,1654933d997b4207b4c9429005a74569,2021-09-07 14:30:00,54.907559,DefaultNone,Provisional,2021-09-07,2021,14:30:00
10455752,05aeabe35a554edb824f42ae1bca3621,2021-09-07 13:45:00,469.388196,DefaultNone,Provisional,2021-09-07,2021,13:45:00
10455753,05aeabe35a554edb824f42ae1bca3621,2021-09-07 14:00:00,469.388196,DefaultNone,Provisional,2021-09-07,2021,14:00:00
10455754,05aeabe35a554edb824f42ae1bca3621,2021-09-07 14:15:00,469.388196,DefaultNone,Provisional,2021-09-07,2021,14:15:00


### Export out csv

In [14]:
#Merge ts with database
dfsupertemp = pd.merge(df_tstemp, df_loctemp, on='SensorID', how='left')

print(len(dfsupertemp))
dfsupertemp.head()

10455756


Unnamed: 0,SensorID,Timestamp,RecordedValue,Method,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
0,94664635d58f42c4afad73ab0094a396,2006-04-12 12:00:00,,,,2006-04-12,2006,12:00:00,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
1,94664635d58f42c4afad73ab0094a396,2006-04-13 12:00:00,,,,2006-04-13,2006,12:00:00,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
2,94664635d58f42c4afad73ab0094a396,2006-04-14 12:00:00,,,,2006-04-14,2006,12:00:00,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
3,94664635d58f42c4afad73ab0094a396,2006-04-15 12:00:00,,,,2006-04-15,2006,12:00:00,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage
4,94664635d58f42c4afad73ab0094a396,2006-04-16 12:00:00,,,,2006-04-16,2006,12:00:00,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,stage,ProcessorBasic,HG,ft,Unknown,Unknown,2/2/2021 6:16,Stage,Reservoirs and Gages - Daily - Stage


In [15]:
# Export out to CSV.
# df_tstemp.to_csv('P_mnSSTimeSeries.csv', index=False) # The output.
# df_loctemp.to_csv('P_mnSSSiteInfo.csv', index=False) # The site information.
dfsupertemp.to_csv('P_mtSSMaster.csv', index=False) # The output.

In [16]:
dfsupertemp['UnitOfMeasure'].value_counts()

ft^3/s    10031502
ft          424254
Name: UnitOfMeasure, dtype: int64