# Preprocessing Oregon Reservoir and Gage data for WaDE

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Oregon/SS_ReservoirsObservationSites/RawInputData"
os.chdir(workingDir)

## Site Data

In [3]:
# Input File - OWRD_gages.csv
sgInput = "OWRD_gages.csv"
dfsg = pd.read_csv(sgInput)

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfsg:
    dfsg['WaDEUUID'] = "orSRG" + dfsg.index.astype(str)
    dfsg.to_csv('OWRD_gages.csv', index=False)

print(len(dfsg))
dfsg.head(1)

572


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,3/1/1910 0:00,9/30/2011 0:00,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [4]:
# Drop null rows for the time periods

dfsg = dfsg.dropna(subset=['period_of_', 'period_of1']).reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

432


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,3/1/1910 0:00,9/30/2011 0:00,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [5]:
# only use Active site records

dfsg = dfsg[dfsg['station__1'] == 'Active'].drop_duplicates().reset_index(drop=True)
print(len(dfsg))
dfsg.head(1)

171


Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,3/1/1910 0:00,9/30/2011 0:00,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [6]:
# convert start and end data to MM/DD/YYYY format.

dfsg['period_of_'] = pd.to_datetime(dfsg['period_of_'], utc=True)
dfsg['period_of_'] = pd.to_datetime(dfsg["period_of_"].dt.strftime('%m/%d/%Y').astype(str))

dfsg['period_of1'] = pd.to_datetime(dfsg['period_of1'], utc=True)
dfsg['period_of1'] = pd.to_datetime(dfsg["period_of1"].dt.strftime('%m/%d/%Y').astype(str))

dfsg.head(1)

Unnamed: 0,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID
0,0,13,74,10366000,1910-03-01,2011-09-30,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12,17120007,,40,S,23.0,E,25,SE,,4580,4560.83,YR,OWRD,,194.0,189.0,1,35,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65,70,1911,2011,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,orSRG0


In [7]:
# Get list of StationNumber
streamgageIdList = dfsg['station_nb'].tolist()
print(len(streamgageIdList))
streamgageIdList

171


['10366000',
 '10371500',
 '10378500',
 '10384000',
 '10387500',
 '10388000',
 '10390000',
 '10391050',
 '10392500',
 '10393500',
 '10406500',
 '11339995',
 '11340695',
 '11491400',
 '11494000',
 '11495900',
 '11497500',
 '11499100',
 '11502950',
 '11503500',
 '11504120',
 '11510000',
 '13274400',
 '13281200',
 '13282550',
 '13284900',
 '13317850',
 '13318060',
 '13318210',
 '13318920',
 '13318960',
 '13319900',
 '13320000',
 '13320300',
 '13325500',
 '13330000',
 '13330050',
 '13330300',
 '13330500',
 '13331450',
 '14010000',
 '14010800',
 '14012100',
 '14012300',
 '14012500',
 '14019109',
 '14019110',
 '14021000',
 '14022500',
 '14023500',
 '14024100',
 '14024300',
 '14025000',
 '14026000',
 '14026897',
 '14029000',
 '14029550',
 '14029780',
 '14029800',
 '14029900',
 '14030000',
 '14030500',
 '14030820',
 '14031050',
 '14031500',
 '14031600',
 '14032000',
 '14032400',
 '14032650',
 '14034800',
 '14037500',
 '14038560',
 '14039380',
 '14039500',
 '14040500',
 '14042500',
 '14043700',

In [8]:
# Get list of start date
startDateList = dfsg['period_of_'].astype(str).tolist()
print(len(startDateList))
startDateList

171


['1910-03-01',
 '1922-10-01',
 '1910-10-01',
 '1912-04-03',
 '1926-03-24',
 '1928-03-01',
 '1905-01-01',
 '1989-03-01',
 '2014-11-01',
 '1903-06-01',
 '1911-03-25',
 '1980-10-01',
 '1976-04-01',
 '1973-10-01',
 '1912-05-01',
 '2008-07-22',
 '1912-04-19',
 '1973-10-01',
 '1927-07-13',
 '1922-11-12',
 '1927-07-13',
 '1929-07-06',
 '2008-10-01',
 '1976-06-23',
 '1999-05-22',
 '1969-11-15',
 '1992-10-01',
 '1977-07-01',
 '1992-10-01',
 '1992-10-01',
 '1996-10-01',
 '1992-10-01',
 '1911-08-01',
 '1996-10-01',
 '1924-02-01',
 '1912-09-01',
 '1995-08-01',
 '1995-06-01',
 '1915-04-01',
 '1995-08-16',
 '1903-02-01',
 '1969-10-01',
 '1932-05-19',
 '1929-06-01',
 '1941-04-01',
 '2004-10-01',
 '2004-10-01',
 '1903-11-01',
 '1921-05-01',
 '1918-11-01',
 '2006-01-26',
 '2006-01-21',
 '1921-05-01',
 '1903-10-01',
 '2005-10-01',
 '1925-10-01',
 '2006-10-01',
 '1993-04-29',
 '1929-10-01',
 '1992-11-09',
 '1926-03-01',
 '1926-03-01',
 '1980-10-01',
 '1992-11-09',
 '1925-10-01',
 '2009-10-01',
 '1926-01-

In [9]:
# Get list of end date
endDateList = dfsg['period_of1'].astype(str).tolist()
print(len(endDateList))
endDateList

171


['2011-09-30',
 '2016-09-30',
 '2014-09-30',
 '2016-09-30',
 '2014-09-30',
 '2016-09-30',
 '2011-09-30',
 '2014-09-30',
 '2016-09-30',
 '2019-09-30',
 '2012-09-30',
 '2012-09-30',
 '2014-09-30',
 '2014-09-30',
 '2015-09-30',
 '2011-09-30',
 '2012-09-30',
 '2014-09-30',
 '1927-09-03',
 '1927-07-31',
 '1927-09-30',
 '2014-09-30',
 '2012-09-30',
 '2014-09-30',
 '2014-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2012-09-30',
 '2014-09-30',
 '2012-09-30',
 '2014-09-30',
 '2012-09-30',
 '1941-09-30',
 '2012-09-30',
 '2021-09-30',
 '2015-09-30',
 '2015-10-31',
 '2012-09-30',
 '2019-09-30',
 '2019-09-30',
 '2019-09-30',
 '2019-09-30',
 '1948-02-21',
 '2016-09-30',
 '2016-09-30',
 '2016-09-30',
 '2014-09-30',
 '2014-09-30',
 '2012-09-30',
 '2012-09-30',
 '2014-09-30',
 '2014-09-30',
 '2016-09-30',
 '2016-09-30',
 '2012-09-30',
 '2016-09-30',
 '2016-09-30',
 '2012-09-30',
 '2016-09-30',
 '2016-09-30',
 '2016-09-30',
 '2016-10-30',
 '2016-09-30',
 '2011-09-30',
 '2014-09-

## Timeseries Data

In [10]:
# done already

# %%time
# # get timeseries results

# # create empty dataframe
# dfts = pd.DataFrame()

# sglength = len(streamgageIdList)
# for i in range(sglength):
#     url = "https://apps.wrd.state.or.us/apps/sw/hydro_near_real_time/hydro_download.aspx?station_nbr=" + str(streamgageIdList[i]) + "&start_date=" + str(startDateList[i]) + "&end_date=" + str(endDateList[i]) + "&dataset=MDF&format=html"
#     print(url)
#     try:
#         # store in dataframe
#         dftemp = pd.DataFrame()
#         dftemp = pd.read_csv(url, sep="\t")
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
    
#     except:
#         dftemp = pd.DataFrame()
#         dftemp['url'] = url
#         dfts = pd.concat([dfts, dftemp])
#         print("Error, issue with API return.")

# dfts.to_csv('timeseriesData.csv', index=False)  # The output.
# print(len(dfts))
# dfts.head()

In [11]:
# Input File - timeseriesData.csv
# read the dataset from a zip
fileInput = "timeseriesData.zip"
dfts = pd.read_csv(fileInput)

print(len(dfts))
dfts.head(1)

  dfts = pd.read_csv(fileInput)


2973172


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url,WaDEUUID
0,10366000,03-01-1910,2610.0,Published,,,10-19-2022 12:03,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,orRG0


In [12]:
# only use published data

dfts = dfts[dfts['published_status'] == 'Published'].drop_duplicates().reset_index(drop=True)
print(len(dfts))
dfts.head(1)

2183814


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url,WaDEUUID
0,10366000,03-01-1910,2610.0,Published,,,10-19-2022 12:03,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,orRG0


## Output Dataframe

In [13]:
# Merging dataframes into one, using left-join.
df = pd.merge(dfts, dfsg, left_on='<pre>station_nbr', right_on='station_nb', how='left')
print(len(df))
df.head(1)

2183814


Unnamed: 0,<pre>station_nbr,record_date,mean_daily_flow_cfs,published_status,estimated,revised,download_date,url,WaDEUUID_x,OID_,OBJECTID,lkp_gaging,station_nb,period_of_,period_of1,station_na,station_st,streamflow,source_typ,streamcode,longitude_,latitude_d,county_nam,state_name,owrd_regio,wm_distric,hydrologic,meridian,township,township_c,range,range_char,sctn,qtr160,qtr40,elevation,elevation_,current_op,most_recen,cooperator,published_,owrd_area,ws_charact,flood_regi,basin_name,streamfl_1,source_t_1,station__1,current__1,nbr_of_com,nbr_of_pea,peak_flow_,peak_flow1,near_real_,near_real1,daily_proc,stage_inst,flow_insta,mean_daily,measured_f,volume_mid,stage_midn,mean_dai_1,mean_dai_2,rating_cur,water_temp,water_te_1,water_te_2,water_te_3,water_te_4,air_temp_i,air_temp_m,air_temp_1,air_temp_2,precipitat,WaDEUUID_y
0,10366000,03-01-1910,2610.0,Published,,,10-19-2022 12:03,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,orRG0,0.0,13.0,74.0,10366000,1910-03-01,2011-09-30,"TWENTYMILE CR NR ADEL, OR",A,R,S,1300800570,-119.963672,42.072466,Lake,Oregon,SC,12.0,17120007.0,,40.0,S,23.0,E,25.0,SE,,4580.0,4560.83,YR,OWRD,,194.0,189.0,1.0,35.0,Goose & Summer Lake,Runoff,Stream,Active,Year-round,65.0,70.0,1911.0,2011.0,https://apps.wrd.state.or.us/apps/sw/hydro_nea...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,orSRG0


In [14]:
dfout = pd.DataFrame(index=df.index)

# Water Source Info
dfout['in_WaterSourceTypeCV'] = df['streamfl_1']

# Site Info
dfout['in_County'] = df['county_nam']
dfout['in_Latitude'] = df['latitude_d']
dfout['in_Longitude'] = df['longitude_']
dfout['in_PODorPOUSite'] = "Gage"
dfout['in_SiteName'] = df['station_na']
dfout['in_SiteNativeID'] = df['station_nb']
dfout['in_SiteTypeCV'] = df['source_t_1']

# Site VariableAmounts Info
dfout['in_Amount'] = df['mean_daily_flow_cfs']
dfout['in_BeneficialUseCategory'] = "Unspecified"
dfout['in_ReportYearCV'] = df['record_date']
dfout['in_TimeframeEnd'] = df['record_date']
dfout['in_TimeframeStart'] = df['record_date']

print(len(dfout))
dfout.head(1)

2183814


Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,03-01-1910,03-01-1910,03-01-1910


In [15]:
# drop duplicate values
dfout = dfout.drop_duplicates().reset_index(drop=True)
print(len(dfout))

2076760


## Cleaning Output

In [16]:
dfout.info()
dfout.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2076760 entries, 0 to 2076759
Data columns (total 13 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   in_WaterSourceTypeCV      object 
 1   in_County                 object 
 2   in_Latitude               float64
 3   in_Longitude              float64
 4   in_PODorPOUSite           object 
 5   in_SiteName               object 
 6   in_SiteNativeID           object 
 7   in_SiteTypeCV             object 
 8   in_Amount                 float64
 9   in_BeneficialUseCategory  object 
 10  in_ReportYearCV           object 
 11  in_TimeframeEnd           object 
 12  in_TimeframeStart         object 
dtypes: float64(3), object(10)
memory usage: 206.0+ MB


Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,03-01-1910,03-01-1910,03-01-1910


In [17]:
# Fixing empty water source types

def fixWaterSourceTypeCV(val):
    val = str(val).strip()
    if val == "" or val == " " or pd.isnull(val) or val == "nan":
        outString = "Unspecified"
    else:
        outString = val
    return outString

dfout['in_WaterSourceTypeCV'] = dfout.apply(lambda row: fixWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceTypeCV'].unique()

array(['Runoff', 'Unspecified', 'Spring'], dtype=object)

In [18]:
# Fixing empty site types names

def fixSiteTypeCV(val):
    val = str(val).strip()
    if val == "" or val == " " or pd.isnull(val) or val == "nan":
        outString = "Unspecified"
    else:
        outString = val
    return outString

dfout['in_SiteTypeCV'] = dfout.apply(lambda row: fixSiteTypeCV(row['in_SiteTypeCV']), axis=1)
dfout['in_SiteTypeCV'].unique()

array(['Stream', 'Diversion', 'Unspecified'], dtype=object)

In [19]:
# Convert History Year to MM/DD/YYYY format.
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout['in_TimeframeEnd'], utc=True)
dfout['in_TimeframeEnd'] = pd.to_datetime(dfout["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

dfout['in_TimeframeStart'] = pd.to_datetime(dfout['in_TimeframeStart'], utc=True)
dfout['in_TimeframeStart'] = pd.to_datetime(dfout["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))

dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,03-01-1910,1910-03-01,1910-03-01


In [20]:
# in_Latitude & in_Longitude
dfout['in_Latitude'] = pd.to_numeric(dfout['in_Latitude'], errors='coerce').fillna(0)
dfout['in_Longitude'] = pd.to_numeric(dfout['in_Longitude'], errors='coerce').fillna(0)
dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,03-01-1910,1910-03-01,1910-03-01


In [21]:
# extract year out
dfout['in_ReportYearCV'] = pd.to_datetime(dfout['in_ReportYearCV'], utc=True)
dfout['in_ReportYearCV'] = pd.to_datetime(dfout["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
dfout['in_ReportYearCV'] = dfout['in_ReportYearCV'].dt.year
dfout.head(1)

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,1910,1910-03-01,1910-03-01


## WaDE Custom Elements (due to missing info)

In [22]:
%%time
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDEOR_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceTypeCV'] == A), 'in_WaterSourceNativeID']
    if not (ml.empty):  # check if the series is empty
        outList = ml.iloc[0]
    else:
        outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceTypeCV']), axis=1)
dfout['in_WaterSourceNativeID'].unique()

Wall time: 6min 26s


array(['WaDEOR_WS1', 'WaDEOR_WS2', 'WaDEOR_WS3'], dtype=object)

## Export Outputs

In [23]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

in_WaterSourceTypeCV                object
in_County                           object
in_Latitude                        float64
in_Longitude                       float64
in_PODorPOUSite                     object
in_SiteName                         object
in_SiteNativeID                     object
in_SiteTypeCV                       object
in_Amount                          float64
in_BeneficialUseCategory            object
in_ReportYearCV                      int64
in_TimeframeEnd             datetime64[ns]
in_TimeframeStart           datetime64[ns]
in_WaterSourceNativeID              object
dtype: object


In [24]:
dfout

Unnamed: 0,in_WaterSourceTypeCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,in_Amount,in_BeneficialUseCategory,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceNativeID
0,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2610.0,Unspecified,1910,1910-03-01,1910-03-01,WaDEOR_WS1
1,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,2020.0,Unspecified,1910,1910-03-02,1910-03-02,WaDEOR_WS1
2,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,1020.0,Unspecified,1910,1910-03-03,1910-03-03,WaDEOR_WS1
3,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,600.0,Unspecified,1910,1910-03-04,1910-03-04,WaDEOR_WS1
4,Runoff,Lake,42.072466,-119.963672,Gage,"TWENTYMILE CR NR ADEL, OR",10366000,Stream,550.0,Unspecified,1910,1910-03-05,1910-03-05,WaDEOR_WS1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2076755,Unspecified,Deschutes,43.993711,-121.378892,Gage,"ARNOLD CN NR BEND, OR",14065500,Diversion,48.0,Unspecified,1928,1928-11-06,1928-11-06,WaDEOR_WS2
2076756,Unspecified,Deschutes,43.993711,-121.378892,Gage,"ARNOLD CN NR BEND, OR",14065500,Diversion,48.0,Unspecified,1928,1928-11-07,1928-11-07,WaDEOR_WS2
2076757,Unspecified,Deschutes,43.993711,-121.378892,Gage,"ARNOLD CN NR BEND, OR",14065500,Diversion,48.0,Unspecified,1928,1928-11-08,1928-11-08,WaDEOR_WS2
2076758,Unspecified,Deschutes,43.993711,-121.378892,Gage,"ARNOLD CN NR BEND, OR",14065500,Diversion,48.0,Unspecified,1928,1928-11-09,1928-11-09,WaDEOR_WS2


In [25]:
#Exporting to Finished File
dfout.to_csv('P_orSSRGMain.csv', index=False)  # The output

In [26]:
dfout['in_WaterSourceTypeCV'].unique()

array(['Runoff', 'Unspecified', 'Spring'], dtype=object)

In [27]:
dfout['in_Latitude'].unique()

array([42.072466  , 42.188502  , 42.424893  , 42.684954  , 42.982278  ,
       42.99491   ,  0.        , 43.113556  , 43.142203  , 43.922727  ,
       43.715937  , 42.1558342 , 42.272314  , 42.17742   , 42.910953  ,
       42.656147  , 42.49652778, 42.447585  , 42.48666763, 42.766077  ,
       42.763672  , 42.725902  , 42.15639114, 44.57527542, 44.910045  ,
       45.012919  , 45.05685   , 45.071528  , 45.268544  , 45.266642  ,
       45.351944  , 45.353994  , 45.124822  , 45.155539  , 45.211914  ,
       45.274927  , 45.439114  , 45.489692  , 45.537875  , 45.526822  ,
       45.60875   , 45.829925  , 45.885003  , 45.928485  , 45.945127  ,
       45.97541289, 45.912217  , 45.866329  , 45.671524  , 45.549078  ,
       45.608842  , 45.483971  , 45.48407   , 45.651801  , 45.677157  ,
       45.695441  , 45.723458  , 45.853722  , 45.859968  , 45.805879  ,
       45.721456  , 45.732792  , 45.735614  , 45.77435   , 45.764786  ,
       45.807118  , 45.795456  , 45.54650278, 45.863392  , 45.91