# Pre-processing Montana Site Specific data for WaDEQA upload.
- Date Updated: 09/13/2021
- Purpose: To pre-process the Montana site-specific (SS) data into one master file for simple DataFrame creation and extraction.

### Goal
- Create sites_input.csv, which contains location and site information.
- Create cleaned input_timeseries.csv with a native site ID field.

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
# Let wide DataFrames display up to 999 columns instead of being truncated.
pd.set_option('display.max_columns', 999)

# Working Directory and Input Files
# The raw-input folder can be overridden with the WADE_MT_RAW_INPUT_DIR
# environment variable so the notebook can run on another machine; when the
# variable is not set, the original location is used unchanged.
workingDir = os.environ.get(
    "WADE_MT_RAW_INPUT_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Montana/SiteSpecificAmounts/RawInputData",
)
# All input files below are opened by bare file name, relative to this folder.
os.chdir(workingDir)

### DataFrame Creation

In [2]:
# Load the location records (MGS_locations.csv) into a DataFrame,
# report the row count, and preview the first record.
inputFile1 = "MGS_locations.csv"
df_loc = pd.read_csv(filepath_or_buffer=inputFile1)
print(df_loc.shape[0])
df_loc.iloc[:1]

174


Unnamed: 0,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag
0,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1


In [3]:
# Load the dataset (sensor) records (MGS_datasets.csv) into a DataFrame,
# report the row count, and preview the first record.
inputFile2 = "MGS_datasets.csv"
df_dase = pd.read_csv(filepath_or_buffer=inputFile2)
print(df_dase.shape[0])
df_dase.iloc[:1]

509


Unnamed: 0,OID_,SensorCode,SensorID,SensorLabel,Comment,Description,isPublished,LocationName,LocationCode,LocationID,SubLocationCode,TimeSeriesType,Parameter,UnitOfMeasure,UtcOffset,ComputationMethod,ComputationPeriod,LastModifiedTime,ExtendedAttributeValues,ParameterLabel
0,1,Discharge.Daily Average@40A 1500,e069deacfba143b3ba052cc39714d44d,Daily Average,,,1,NF Musselshell near Delphine,40A 1500,b947d0c2364e4948a8032baac8904bef,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge


In [4]:
# Load the timeseries records (MGS_timeseries.csv) into a DataFrame,
# report the row count, and preview the first record.
inputFile3 = "MGS_timeseries.csv"
df_ts = pd.read_csv(filepath_or_buffer=inputFile3)
print(df_ts.shape[0])
df_ts.iloc[:1]

26459294


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName
0,1,e069deacfba143b3ba052cc39714d44d,1/30/1981 0:00:00,,,,,,


### Location & Site Information

In [5]:
# Only working with SensorLabel = Daily Average, & ParameterLabel = Discharge or Stage, to recreate their plots.
isDailyAverage = df_dase['SensorLabel'] == 'Daily Average'
isDischargeOrStage = df_dase['ParameterLabel'].isin(['Discharge', 'Stage'])

# .copy() makes df_dasetemp an independent frame. A column is assigned to it
# later in the notebook, and assigning into a filtered slice of df_dase would
# raise pandas' SettingWithCopyWarning.
df_dasetemp = df_dase[isDailyAverage & isDischargeOrStage].copy()

print(len(df_dasetemp))
df_dasetemp.head()

172


Unnamed: 0,OID_,SensorCode,SensorID,SensorLabel,Comment,Description,isPublished,LocationName,LocationCode,LocationID,SubLocationCode,TimeSeriesType,Parameter,UnitOfMeasure,UtcOffset,ComputationMethod,ComputationPeriod,LastModifiedTime,ExtendedAttributeValues,ParameterLabel
0,1,Discharge.Daily Average@40A 1500,e069deacfba143b3ba052cc39714d44d,Daily Average,,,1,NF Musselshell near Delphine,40A 1500,b947d0c2364e4948a8032baac8904bef,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge
3,4,Discharge.Daily Average@40A 2000,71a11087f6ce4e25b753b429143e9b23,Daily Average,,,1,NF Musselshell blw Bair Reservoir,40A 2000,b85ca58a37784f2aa2fb33588dc4cfe9,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge
7,8,Discharge.Daily Average@40A 10000,53271ee869f147a4b2e1d4bf6d8c5f30,Daily Average,,,1,Barber Canal,40A 10000,f412dc9558214d9f82ca6dd0ebcc3970,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge
10,11,Discharge.Daily Average@40A 3000,2db3dc36ee104d15825a70e189b1a972,Daily Average,,,1,NF Musselshell Diversion Canal,40A 3000,61da4aa318334518843094d1f7d8fe0e,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge
13,14,Discharge.Daily Average@40A 4080,2a35ab6f412e42eca0ee527848845c04,Daily Average,,,1,Martinsdale Supply Canal,40A 4080,35143516212943bf8c60b01287ad82f0,,ProcessorDerived,QR,ft^3/s,#NAME?,Mean,Daily,2/2/2021 6:24,[],Discharge


In [6]:
# Create VariableSpecificCV

def createVariableSpecificCV(A):
    """Return the VariableSpecificCV string for a ParameterLabel value.

    Parameters
    ----------
    A : str
        ParameterLabel value. Only 'Discharge' and 'Stage' are supported.

    Returns
    -------
    str
        "Reservoirs and Gages - Daily - Discharge" for 'Discharge', or
        "Reservoirs and Gages - Daily - Stage" for 'Stage'.

    Raises
    ------
    ValueError
        If A is any other value (including a missing value). Previously this
        case surfaced as an UnboundLocalError on the return statement.
    """
    variableSpecificByLabel = {
        'Discharge': "Reservoirs and Gages - Daily - Discharge",
        'Stage': "Reservoirs and Gages - Daily - Stage",
    }
    try:
        return variableSpecificByLabel[A]
    except (KeyError, TypeError):
        # KeyError: unknown label; TypeError: unhashable input such as a list.
        raise ValueError(
            "Unsupported ParameterLabel {!r}; expected one of {}".format(
                A, sorted(variableSpecificByLabel))
        ) from None

# Derive in_VariableSpecificCV from ParameterLabel alone. Series.map calls the
# function once per value, avoiding the row-wise DataFrame.apply(axis=1), and
# it also yields an (empty) Series when the frame has no rows.
df_dasetemp['in_VariableSpecificCV'] = df_dasetemp['ParameterLabel'].map(createVariableSpecificCV)

In [7]:
# Keep only the dataset columns that are needed downstream.
neededColumns = [
    'LocationCode',
    'SensorID',
    'SensorLabel',
    'TimeSeriesType',
    'Parameter',
    'UnitOfMeasure',
    'ComputationMethod',
    'ComputationPeriod',
    'LastModifiedTime',
    'ParameterLabel',
    'in_VariableSpecificCV',
]
df_dasetemp_sub = df_dasetemp[neededColumns]
print(df_dasetemp_sub.shape[0])
df_dasetemp_sub.head()

172


Unnamed: 0,LocationCode,SensorID,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
0,40A 1500,e069deacfba143b3ba052cc39714d44d,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
3,40A 2000,71a11087f6ce4e25b753b429143e9b23,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
7,40A 10000,53271ee869f147a4b2e1d4bf6d8c5f30,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
10,40A 3000,2db3dc36ee104d15825a70e189b1a972,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
13,40A 4080,2a35ab6f412e42eca0ee527848845c04,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge


In [8]:
# Attach the trimmed dataset (sensor) columns to every location record.
# Left join on LocationCode: each location row is kept, and is repeated once
# per matching dataset row.
df_loctemp = df_loc.merge(df_dasetemp_sub, how='left', on='LocationCode')
print(df_loctemp.shape[0])
df_loctemp.head()

208


Unnamed: 0,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag,SensorID,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
0,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,e069deacfba143b3ba052cc39714d44d,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
1,2,b85ca58a37784f2aa2fb33588dc4cfe9,40A 2000,NF Musselshell blw Bair Reservoir,All Locations.DNRC SWP,Hydrology Station,0,-110.5535,46.5747,-PT7H,2/4/2021 15:33:18,ft,5232.45,North Fork Musselshell River below Bair Reservoir,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,71a11087f6ce4e25b753b429143e9b23,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
2,3,f412dc9558214d9f82ca6dd0ebcc3970,40A 10000,Barber Canal,All Locations.DNRC SWP,Hydrology Station,0,-109.4012,46.3379,-PT7H,2/4/2021 15:34:28,ft,3877.729,Deadman's Basin Outlet Canal-Barber Canal (Son...,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,53271ee869f147a4b2e1d4bf6d8c5f30,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
3,4,61da4aa318334518843094d1f7d8fe0e,40A 3000,NF Musselshell Diversion Canal,All Locations.DNRC SWP,Hydrology Station,0,-110.418,46.5359,-PT7H,2/4/2021 15:34:56,ft,4958.962,North Fork Musselshell River Diversion Canal b...,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,2db3dc36ee104d15825a70e189b1a972,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
4,5,35143516212943bf8c60b01287ad82f0,40A 4080,Martinsdale Supply Canal,All Locations.DNRC SWP,Hydrology Station,0,-110.3088,46.454,-PT7H,2/4/2021 15:36:09,ft,4802.065,Martinsdale Supply Canal abv Drop,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,2a35ab6f412e42eca0ee527848845c04,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge


### Cleaned Timeseries

In [9]:
# SensorID values to find and preserve in the timeseries data
SensorIDList = df_dasetemp_sub['SensorID'].to_list()
SensorIDList

['e069deacfba143b3ba052cc39714d44d',
 '71a11087f6ce4e25b753b429143e9b23',
 '53271ee869f147a4b2e1d4bf6d8c5f30',
 '2db3dc36ee104d15825a70e189b1a972',
 '2a35ab6f412e42eca0ee527848845c04',
 '9503f913f4cc4b72b38c80b255df8350',
 '14a757b647494fdba34994f095e96338',
 '6f661234169148079928df7bd407488e',
 '2cfb4bff6ae1401b9e672c70e40d347a',
 '4aa99f1e5b43450b9c397b46db4fbceb',
 '51260fdfca314f67bcace6f74bbffecd',
 'c67e545d7ad9487899ce7c54f505f1ac',
 'a007457fc29b478c8d15a24af5b1a7f3',
 'd945aaa229f34ff4a61ba35e0f44cbd0',
 'db593296d9234c5e89c164dab10da459',
 '80c1cc56ada24a4fb22839cb38deb962',
 '686b2aade1e24ddaac3fba90ea32841e',
 '0184446d3369443d93ce546daee12614',
 '34a8fcb433634d6587df0057fc04b1c2',
 'e630525fbdbf45a38d87c77bb61169c0',
 '603bcbadfd034ee08e562ecb53d9a8ad',
 '575973c03b0a4751820c740a1b6560c6',
 '986aa0055e7a4947bf3ef1c66467c255',
 '7167bd3b11b44a09b7c91be2718abed8',
 '78dcdf21ae20457aa5962451d9ad0761',
 '9cf36c21cb014a4bbc7bb05d346ff02f',
 '510d280b9d3347d188b26c8c83b8b7a6',
 

In [10]:
# Keep only the timeseries rows whose SensorID is in SensorIDList, then
# drop exact duplicate rows and renumber the index from 0.
keepRows = df_ts['SensorID'].isin(SensorIDList)
df_tstemp = df_ts[keepRows]
df_tstemp = df_tstemp.drop_duplicates()
df_tstemp = df_tstemp.reset_index(drop=True)
print(df_tstemp.shape[0])
df_tstemp.head()

328776


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName
0,1,e069deacfba143b3ba052cc39714d44d,1/30/1981 0:00:00,,,,,,
1,2,e069deacfba143b3ba052cc39714d44d,1/14/1982 12:00:00,,,,,,
2,3,e069deacfba143b3ba052cc39714d44d,1/15/1983 0:00:00,,,,,,
3,4,e069deacfba143b3ba052cc39714d44d,12/31/1983 0:00:00,,,,,,
4,5,e069deacfba143b3ba052cc39714d44d,12/29/1984 12:00:00,,,,,,


In [11]:
# extract date and time values from Timestamp field
# -------------------------------------------------

# convert from string to datetime
df_tstemp['Timestamp'] = pd.to_datetime(df_tstemp['Timestamp'])

# Date: the timestamp truncated to midnight, kept as a datetime column.
# .dt.normalize() gives the same values as the previous
# date -> datetime -> '%m/%d/%Y' string -> datetime round trip, without
# formatting and re-parsing every row.
# NOTE(review): assumes the parsed timestamps are timezone-naive, as in the
# sample rows shown above -- confirm for the full file.
df_tstemp['Timestamp_Date'] = df_tstemp['Timestamp'].dt.normalize()

# Year
df_tstemp['Timestamp_Year'] = df_tstemp['Timestamp_Date'].dt.year

# time
df_tstemp['Timestamp_Time'] = df_tstemp['Timestamp'].dt.time

print(len(df_tstemp))
df_tstemp.head()

328776


Unnamed: 0,OID_,SensorID,Timestamp,RecordedValue,GradeCode,GradeName,Method,ApprovalLevel,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time
0,1,e069deacfba143b3ba052cc39714d44d,1981-01-30 00:00:00,,,,,,,1981-01-30,1981,00:00:00
1,2,e069deacfba143b3ba052cc39714d44d,1982-01-14 12:00:00,,,,,,,1982-01-14,1982,12:00:00
2,3,e069deacfba143b3ba052cc39714d44d,1983-01-15 00:00:00,,,,,,,1983-01-15,1983,00:00:00
3,4,e069deacfba143b3ba052cc39714d44d,1983-12-31 00:00:00,,,,,,,1983-12-31,1983,00:00:00
4,5,e069deacfba143b3ba052cc39714d44d,1984-12-29 12:00:00,,,,,,,1984-12-29,1984,12:00:00


In [12]:
# Drop the columns that are not used downstream, then remove the rows that
# became exact duplicates and renumber the index from 0.
unusedColumns = ['OID_', 'GradeCode', 'GradeName', 'ApprovalLevel']
df_tstemp = df_tstemp.drop(columns=unusedColumns)
df_tstemp = df_tstemp.drop_duplicates()
df_tstemp = df_tstemp.reset_index(drop=True)
df_tstemp

Unnamed: 0,SensorID,Timestamp,RecordedValue,Method,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time
0,e069deacfba143b3ba052cc39714d44d,1981-01-30 00:00:00,,,,1981-01-30,1981,00:00:00
1,e069deacfba143b3ba052cc39714d44d,1982-01-14 12:00:00,,,,1982-01-14,1982,12:00:00
2,e069deacfba143b3ba052cc39714d44d,1983-01-15 00:00:00,,,,1983-01-15,1983,00:00:00
3,e069deacfba143b3ba052cc39714d44d,1983-12-31 00:00:00,,,,1983-12-31,1983,00:00:00
4,e069deacfba143b3ba052cc39714d44d,1984-12-29 12:00:00,,,,1984-12-29,1984,12:00:00
...,...,...,...,...,...,...,...,...
327326,a7fa1a92160140ed9d141a2343a48719,2021-09-06 00:00:00,50.152872,DefaultNone,Provisional,2021-09-06,2021,00:00:00
327327,a7fa1a92160140ed9d141a2343a48719,2021-09-07 00:00:00,48.973710,DefaultNone,Provisional,2021-09-07,2021,00:00:00
327328,7dc38718740d4213920f5be684981377,2021-09-07 00:00:00,3.123750,DefaultNone,Provisional,2021-09-07,2021,00:00:00
327329,c0925856c3c74440afd2045c7f0dbc7c,2021-09-06 00:00:00,150.129749,DefaultNone,Provisional,2021-09-06,2021,00:00:00


### Export out csv

In [13]:
# Attach the location/dataset columns to every timeseries row.
# Left join on SensorID: every timeseries row is kept.
dfsupertemp = df_tstemp.merge(df_loctemp, how='left', on='SensorID')
print(dfsupertemp.shape[0])
dfsupertemp.head()

327331


Unnamed: 0,SensorID,Timestamp,RecordedValue,Method,ApprovalName,Timestamp_Date,Timestamp_Year,Timestamp_Time,OID_,LocationID,LocationCode,LocationName,LocationPath,LocationType,IsExternalLocation,Longitude,Latitude,UtcOffset,LastModified,ElevationUnits,Elevation,Description,Tags,ExtendedAttributeValues,StatusDesc,ActiveFlag,SensorLabel,TimeSeriesType,Parameter,UnitOfMeasure,ComputationMethod,ComputationPeriod,LastModifiedTime,ParameterLabel,in_VariableSpecificCV
0,e069deacfba143b3ba052cc39714d44d,1981-01-30 00:00:00,,,,1981-01-30,1981,00:00:00,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
1,e069deacfba143b3ba052cc39714d44d,1982-01-14 12:00:00,,,,1982-01-14,1982,12:00:00,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
2,e069deacfba143b3ba052cc39714d44d,1983-01-15 00:00:00,,,,1983-01-15,1983,00:00:00,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
3,e069deacfba143b3ba052cc39714d44d,1983-12-31 00:00:00,,,,1983-12-31,1983,00:00:00,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge
4,e069deacfba143b3ba052cc39714d44d,1984-12-29 12:00:00,,,,1984-12-29,1984,12:00:00,1,b947d0c2364e4948a8032baac8904bef,40A 1500,NF Musselshell near Delphine,All Locations.DNRC SWP,Hydrology Station,0,-110.5768,46.6094,-PT7H,2/4/2021 15:20:51,ft,5380.533,North Fork Musselshell River near Delphine,"[{""UniqueId"": ""1c1aabaa26e8495bae4b3f69f0fd617...",[],Real-time,1,Daily Average,ProcessorDerived,QR,ft^3/s,Mean,Daily,2/2/2021 6:24,Discharge,Reservoirs and Gages - Daily - Discharge


In [14]:
# Write the merged master table to CSV in the working directory.
# index=False leaves the DataFrame's row index out of the file.
dfsupertemp.to_csv(path_or_buf='P_mtSSMaster.csv', index=False)

In [16]:
# Sanity check: number of exported rows per ParameterLabel.
dfsupertemp.loc[:, 'ParameterLabel'].value_counts()

Discharge    276970
Stage         50361
Name: ParameterLabel, dtype: int64