# Preprocessing Oregon Reservoir and Gage data for WaDE

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Oregon/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Site Data

In [3]:
# Input File - OWRD_gages.csv
sgInput = "OWRD_gages.csv"
dfsg = pd.read_csv(sgInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsg:
    dfsg['WaDEUUID'] = "orSRG" + dfsg.index.astype(str)
    dfsg.to_csv('OWRD_gages.csv', index=False)

print(len(dfsg))
dfsg.head(1)

572


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,3/1/1910 0:00,9/30/2011 0:00,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [4]:
# Drop null rows for the time periods

dfsg = dfsg.dropna(subset=['period_of_', 'period_of1']).reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

432


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,3/1/1910 0:00,9/30/2011 0:00,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [5]:
# convert start and end data to MM/DD/YYYY format.

dfsg['period_of_'] = pd.to_datetime(dfsg['period_of_'], utc=True)
dfsg['period_of_'] = pd.to_datetime(dfsg["period_of_"].dt.strftime('%m/%d/%Y').astype(str))

dfsg['period_of1'] = pd.to_datetime(dfsg['period_of1'], utc=True)
dfsg['period_of1'] = pd.to_datetime(dfsg["period_of1"].dt.strftime('%m/%d/%Y').astype(str))

dfsg.head(1)

Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,1910-03-01,2011-09-30,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [8]:
# Only use post-1950 start recrods

dfsg = dfsg[dfsg['period_of_'] > '1950-01-01']
print(f'min period_of_: ', min(dfsg['period_of_']))
print(len(dfsg))
dfsg.head(1)

min period_of_:  1950-04-20 00:00:00
222


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
6,8,65,126,10388001,1952-10-01,1991-09-30,"ANA R + SUMMER LAKE CN NR SUMMER LAKE, OR",D,S,S,1300700010,-120.749443,42.994999,Lake,Oregon,SC,12,17120005,,30,S,17.0,E,6,,,0,0.0,,OWRD,,0.0,0.0,0,0,Goose & Summer Lake,Spring,Stream,Discontinued,,39,10,1982,1991,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,orSRG8


In [9]:
# only use Active site records

dfsg = dfsg[dfsg['station__1'] == 'Active'].drop_duplicates().reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

97


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,13,78,139,10391050,1989-03-01,2014-09-30,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",A,L,S,1300600010,-121.134564,43.142203,Lake,Oregon,SC,11,17120005,,0,,0.0,,0,,,4460,0.0,YR,OWRD,,146.0,285.0,1,0,Goose & Summer Lake,,Stream,Active,Year-round,11,12,1989,2000,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG13


In [10]:
# Get list of StationNumber
streamgageIdList = dfsg['station_nb'].tolist()
print(len(streamgageIdList))
streamgageIdList

97


['10391050',
 '10392500',
 '11339995',
 '11340695',
 '11491400',
 '11495900',
 '11499100',
 '13274400',
 '13281200',
 '13282550',
 '13284900',
 '13317850',
 '13318060',
 '13318210',
 '13318920',
 '13318960',
 '13319900',
 '13320300',
 '13330050',
 '13330300',
 '13331450',
 '14010800',
 '14019109',
 '14019110',
 '14024100',
 '14024300',
 '14026897',
 '14029550',
 '14029780',
 '14029900',
 '14030820',
 '14031050',
 '14031600',
 '14032400',
 '14032650',
 '14034800',
 '14039380',
 '14039500',
 '14046890',
 '14047100',
 '14055600',
 '14069699',
 '14073520',
 '14074900',
 '14076050',
 '14076100',
 '14079800',
 '14080590',
 '14082550',
 '14083400',
 '14085200',
 '14085700',
 '14087300',
 '14095250',
 '14095255',
 '14100850',
 '14153800',
 '14194300',
 '14202450',
 '14202920',
 '14204530',
 '14204800',
 '14206200',
 '14206295',
 '14299000',
 '14299137',
 '14299150',
 '14300100',
 '14301300',
 '14304350',
 '14306030',
 '14306820',
 '14306900',
 '14320700',
 '14327120',
 '14327122',
 '14327137',

In [11]:
# Get list of start date
startDateList = dfsg['period_of_'].astype(str).tolist()
print(len(startDateList))
startDateList

97


['1989-03-01',
 '2014-11-01',
 '1980-10-01',
 '1976-04-01',
 '1973-10-01',
 '2008-07-22',
 '1973-10-01',
 '2008-10-01',
 '1976-06-23',
 '1999-05-22',
 '1969-11-15',
 '1992-10-01',
 '1977-07-01',
 '1992-10-01',
 '1992-10-01',
 '1996-10-01',
 '1992-10-01',
 '1996-10-01',
 '1995-08-01',
 '1995-06-01',
 '1995-08-16',
 '1969-10-01',
 '2004-10-01',
 '2004-10-01',
 '2006-01-26',
 '2006-01-21',
 '2005-10-01',
 '2006-10-01',
 '1993-04-29',
 '1992-11-09',
 '1980-10-01',
 '1992-11-09',
 '2009-10-01',
 '2007-08-15',
 '2006-10-03',
 '1960-10-01',
 '1994-01-12',
 '1951-10-01',
 '2003-10-01',
 '1972-07-18',
 '1969-10-01',
 '2006-10-01',
 '1997-11-03',
 '1970-10-30',
 '2000-05-18',
 '2007-04-01',
 '1960-10-01',
 '1981-04-21',
 '1999-11-17',
 '1999-09-15',
 '1955-04-01',
 '2009-10-01',
 '1967-10-01',
 '1999-09-28',
 '1999-07-19',
 '1968-04-01',
 '1976-10-01',
 '1958-10-01',
 '2008-01-02',
 '1972-10-01',
 '2008-01-01',
 '2008-01-01',
 '2008-02-23',
 '2008-01-01',
 '1977-09-13',
 '2009-10-01',
 '1974-10-

In [12]:
# Get list of end date
endDateList = dfsg['period_of1'].astype(str).tolist()
print(len(endDateList))
endDateList

97


['2014-09-30',
 '2016-09-30',
 '2012-09-30',
 '2014-09-30',
 '2014-09-30',
 '2011-09-30',
 '2014-09-30',
 '2012-09-30',
 '2014-09-30',
 '2014-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2014-09-30',
 '2012-09-30',
 '2012-09-30',
 '2021-09-30',
 '2015-09-30',
 '2012-09-30',
 '2019-09-30',
 '2016-09-30',
 '2016-09-30',
 '2012-09-30',
 '2012-09-30',
 '2016-09-30',
 '2012-09-30',
 '2016-09-30',
 '2012-09-30',
 '2016-09-30',
 '2016-10-30',
 '2011-09-30',
 '2016-09-30',
 '2014-09-30',
 '2014-09-30',
 '2012-09-30',
 '2015-09-30',
 '2022-02-28',
 '2011-09-30',
 '2011-09-30',
 '2012-09-30',
 '2011-09-30',
 '2014-09-30',
 '2016-09-30',
 '2015-09-30',
 '2015-09-30',
 '2011-09-30',
 '2012-09-30',
 '2014-09-30',
 '2015-09-30',
 '2015-09-30',
 '2014-09-30',
 '2011-09-30',
 '2011-09-30',
 '2011-09-30',
 '2011-09-30',
 '2015-09-30',
 '2015-09-30',
 '2015-09-30',
 '2015-09-30',
 '2015-09-30',
 '2015-09-30',
 '2015-09-30',
 '2019-09-30',
 '2014-09-30',
 '2014-09-

## Timeseries Data

In [13]:
# already done

# %%time
# # get timeseries results

# # create empty dataframe
# dfts = pd.DataFrame()

# sglength = len(streamgageIdList)
# for i in range(sglength):
#     url = "https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=" + str(streamgageIdList[i]) + "&start_date=" + str(startDateList[i]) + "&end_date=" + str(endDateList[i]) + "&dataset=MDF&format=html"
#     print(url)
#     try:
#         # store in dataframe
#         dftemp = pd.DataFrame()
#         dftemp = pd.read_csv(url, sep="\t")
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
    
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
#         print("Error, issue with API return.")

# dfts.to_csv('timeseriesData.csv', index=False)  # The output.
# print(len(dfts))
# dfts.head()

https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=10391050&start_date=1989-03-01&end_date=2014-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=10392500&start_date=2014-11-01&end_date=2016-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=11339995&start_date=1980-10-01&end_date=2012-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=11340695&start_date=1976-04-01&end_date=2014-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=11491400&start_date=1973-10-01&end_date=2014-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=11495900&start_date=2008-07-22&end_date=2011-09-30&dataset=MDF&format=html
https://ap

https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14085200&start_date=1955-04-01&end_date=2015-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14085700&start_date=2009-10-01&end_date=2015-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14087300&start_date=1967-10-01&end_date=2014-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14095250&start_date=1999-09-28&end_date=2011-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14095255&start_date=1999-07-19&end_date=2011-09-30&dataset=MDF&format=html
https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=14100850&start_date=1968-04-01&end_date=2011-09-30&dataset=MDF&format=html
https://ap

Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url
0,10391050,03-01-1989,11.0,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...
1,10391050,03-02-1989,9.6,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...
2,10391050,03-03-1989,6.1,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...
3,10391050,03-04-1989,6.2,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...
4,10391050,03-05-1989,13.0,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...


In [15]:
# Input File - timeseriesData.csv
# read the dataset from a zip
fileInput = "timeseriesData.zip"
dfts = pd.read_csv(fileInput)

print(len(dfts))
dfts.head(1)

723226


  dfts = pd.read_csv(fileInput)


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url
0,10391050,03-01-1989,11.0,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...


In [16]:
# only use published data

dfts = dfts[dfts['published_status'] == 'Published'].drop_duplicates().reset_index(drop=True)
print(len(dfts))
dfts.head(1)

557668


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url
0,10391050,03-01-1989,11.0,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...


## Output Dataframe

In [17]:
# Merging dataframes into one, using left-join.
df = pd.merge(dfts, dfsg, left_on='<pre>station_nbr', right_on='station_nb', how='left')
print(len(df))
df.head(1)

557668


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,10391050,03-01-1989,11.0,Published,,,01-26-2023 09:09,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,13,78,139,10391050,1989-03-01,2014-09-30,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",A,L,S,1300600010,-121.134564,43.142203,Lake,Oregon,SC,11,17120005,,0,,0.0,,0,,,4460,0.0,YR,OWRD,,146.0,285.0,1,0,Goose & Summer Lake,,Stream,Active,Year-round,11,12,1989,2000,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG13


In [18]:
dfout = pd.DataFrame(index=df.index)

# Water Source Info
dfout['in_WaterSourceTypeCV'] = df['streamfl_1']

# Site Info
dfout['in_County'] = df['county_nam']
dfout['in_Latitude'] = df['latitude_d']
dfout['in_Longitude'] = df['longitude_']
dfout['in_PODorPOUSite'] = "Gage"
dfout['in_SiteName'] = df['station_na']
dfout['in_SiteNativeID'] = df['station_nb']
dfout['in_SiteTypeCV'] = df['source_t_1']

# Site VariableAmounts Info
dfout['in_Amount'] = df['mean_daily_flow_cfs']
dfout['in_BeneficialUseCategory'] = "Unspecified"
dfout['in_ReportYearCV'] = df['record_date']
dfout['in_TimeframeEnd'] = df['record_date']
dfout['in_TimeframeStart'] = df['record_date']

print(len(dfout))
dfout.head(1)

557668


Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,03-01-1989,03-01-1989,03-01-1989


In [19]:
# drop duplicate values
dfout = dfout.drop_duplicates().reset_index(drop=True)
print(len(dfout))

557668


## Cleaning Output

In [20]:
dfout.info()
dfout.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557668 entries, 0 to 557667
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   in_WaterSourceTypeCV      557668 non-null  object 
 1   in_County                 557668 non-null  object 
 2   in_Latitude               557668 non-null  float64
 3   in_Longitude              557668 non-null  float64
 4   in_PODorPOUSite           557668 non-null  object 
 5   in_SiteName               557668 non-null  object 
 6   in_SiteNativeID           557668 non-null  object 
 7   in_SiteTypeCV             557668 non-null  object 
 8   in_Amount                 557668 non-null  float64
 9   in_BeneficialUseCategory  557668 non-null  object 
 10  in_ReportYearCV           557668 non-null  object 
 11  in_TimeframeEnd           557668 non-null  object 
 12  in_TimeframeStart         557668 non-null  object 
dtypes: float64(3), object(10)
memory usage: 55.3

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,03-01-1989,03-01-1989,03-01-1989


In [21]:
# Fixing empty water source types

def fixWaterSourceTypeCV(val):
    val = str(val).strip()
    if val == "" or val == " " or pd.isnull(val) or val == "nan":
        outString = "Unspecified"
    else:
        outString = val
    return outString

dfout['in_WaterSourceTypeCV'] = dfout.apply(lambda row: fixWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceTypeCV'].unique()

array(['Unspecified', 'Runoff', 'Spring'], dtype=object)

In [22]:
# Fixing empty site types names

def fixSiteTypeCV(val):
    val = str(val).strip()
    if val == "" or val == " " or pd.isnull(val) or val == "nan":
        outString = "Unspecified"
    else:
        outString = val
    return outString

dfout['in_SiteTypeCV'] = dfout.apply(lambda row: fixSiteTypeCV(row['in_SiteTypeCV']), axis=1)
dfout['in_SiteTypeCV'].unique()

array(['Stream', 'Diversion'], dtype=object)

In [23]:
# Convert History Year to MM/DD/YYYY format.
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'], utc=True)
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'], utc=True)
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,03-01-1989,1989-03-01,1989-03-01


In [24]:
# in_Latitude & in_Longitude
dfout['in_Latitude'] = pd.to_numeric(dfout['in_Latitude'], errors='coerce').fillna(0)
dfout['in_Longitude'] = pd.to_numeric(dfout['in_Longitude'], errors='coerce').fillna(0)
dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,03-01-1989,1989-03-01,1989-03-01


In [25]:
# extract year out
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], utc=True)
dfout['in_ReportYearCV'] = pd.to_datetime(dfout["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
dfout['in_ReportYearCV'] = dfout['in_ReportYearCV'].dt.year
dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,1989,1989-03-01,1989-03-01


## WaDE Custom Elements (due to missing info)

In [26]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDEOR_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
    if not (ml.empty):  # check if the series is empty
        outList = ml.iloc[0]
    else:
        outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceNativeID'].unique()

Wall time: 1min 56s


array(['WaDEOR_WS1', 'WaDEOR_WS2', 'WaDEOR_WS3'], dtype=object)

## Export Outputs

In [27]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

in_WaterSourceTypeCV                object
in_County                           object
in_Latitude                        float64
in_Longitude                       float64
in_PODorPOUSite                     object
in_SiteName                         object
in_SiteNativeID                     object
in_SiteTypeCV                       object
in_Amount                          float64
in_BeneficialUseCategory            object
in_ReportYearCV                      int64
in_TimeframeEnd             datetime64[ns]
in_TimeframeStart           datetime64[ns]
in_WaterSourceNativeID              object
dtype: object


In [28]:
dfout

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID
0,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,11.0,Unspecified,1989,1989-03-01,1989-03-01,WaDEOR_WS1
1,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,9.6,Unspecified,1989,1989-03-02,1989-03-02,WaDEOR_WS1
2,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,6.1,Unspecified,1989,1989-03-03,1989-03-03,WaDEOR_WS1
3,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,6.2,Unspecified,1989,1989-03-04,1989-03-04,WaDEOR_WS1
4,Unspecified,Lake,43.142203,-121.134564,Gage,"BUCK CR AB PAULINA MARSH NR SILVER LAKE, OR",10391050,Stream,13.0,Unspecified,1989,1989-03-05,1989-03-05,WaDEOR_WS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
557663,Runoff,Deschutes,44.254312,-121.550199,Gage,WHYCHUS CR BL TSID DIV NR SISTERS,14076020,Stream,26.0,Unspecified,2016,2016-09-26,2016-09-26,WaDEOR_WS2
557664,Runoff,Deschutes,44.254312,-121.550199,Gage,WHYCHUS CR BL TSID DIV NR SISTERS,14076020,Stream,27.0,Unspecified,2016,2016-09-27,2016-09-27,WaDEOR_WS2
557665,Runoff,Deschutes,44.254312,-121.550199,Gage,WHYCHUS CR BL TSID DIV NR SISTERS,14076020,Stream,27.0,Unspecified,2016,2016-09-28,2016-09-28,WaDEOR_WS2
557666,Runoff,Deschutes,44.254312,-121.550199,Gage,WHYCHUS CR BL TSID DIV NR SISTERS,14076020,Stream,24.0,Unspecified,2016,2016-09-29,2016-09-29,WaDEOR_WS2


In [29]:
#Exporting to Finished File
dfout.to_csv('P_orSSRGMain.csv', index=False)  # The output

In [30]:
dfout['in_WaterSourceTypeCV'].unique()

array(['Unspecified', 'Runoff', 'Spring'], dtype=object)

In [31]:
dfout['in_Latitude'].unique()

array([43.142203  , 43.922727  , 42.272314  , 42.17742   , 42.910953  ,
       42.49652778, 42.48666763, 44.57527542, 44.910045  , 45.012919  ,
       45.05685   , 45.071528  , 45.268544  , 45.266642  , 45.351944  ,
       45.353994  , 45.124822  , 45.211914  , 45.489692  , 45.537875  ,
       45.60875   , 45.885003  , 45.912217  , 45.866329  , 45.483971  ,
       45.48407   , 45.695441  , 45.853722  , 45.859968  , 45.721456  ,
       45.77435   , 45.764786  , 45.795456  , 45.863392  , 45.91109   ,
       45.262642  , 44.31645   , 44.42307778, 44.91027832, 44.953786  ,
       43.575578  , 44.347587  , 44.089328  , 44.116472  , 44.287949  ,
       44.318225  , 44.178215  , 44.239786  , 44.333966  , 44.407383  ,
       44.299416  , 44.450949  , 44.367806  , 44.801469  , 44.801195  ,
       45.25981111, 43.7092    , 45.364959  , 45.460472  , 45.480803  ,
       45.510778  , 45.502097  , 45.520114  , 45.489961  , 45.893442  ,
       45.87360764, 45.88500214, 45.85842   , 45.575115  , 44.81