# Working with CO Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in shp file, and coverting to WKT for ReportingUnit geometry.

Separating tasks out into two processes. 
- 1) create aggregate amount time series input.
- 2) create reporting unit input.

In [1]:
#Needed Libararies
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
#Setting work directory, reading inputs, creating dataframe
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Colorado/AggregatedAmounts"
os.chdir(workingDir)

#CSV input file
fileInput = "RawInputData/DWR__Surface_Water_Supply_Index_Component_by_HUC_input.xlsx"
df = pd.read_excel(fileInput)

#Shapefile input
shapeInput = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Colorado/AggregatedAmounts/RawInputData/HUC8/CO_HUC8.shp')
dfshape = pd.DataFrame(shapeInput)

In [3]:
#check csv input
df.head(3)

Unnamed: 0,Basin,HUC8,HUC8 Name,Report Year,Report Month,Component Type,Component ID,Component Name,Component Volume,Component NEP
0,South Platte,10190001,South Platte Headwater,2010,10,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,61.97
1,South Platte,10190001,South Platte Headwater,2010,11,ReservoirStorage,6016010,ANTERO RESERVOIR,19600.0,64.93
2,South Platte,10190001,South Platte Headwater,2010,12,ReservoirStorage,6016010,ANTERO RESERVOIR,19700.0,62.44


In [4]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

Basin                object
HUC8                  int64
HUC8 Name            object
Report Year           int64
Report Month          int64
Component Type       object
Component ID         object
Component Name       object
Component Volume    float64
Component NEP       float64
dtype: object


In [5]:
#check shp input
dfshape.head(3)

Unnamed: 0,TNMID,METASOURCE,SOURCEDATA,SOURCEORIG,SOURCEFEAT,LOADDATE,GNIS_ID,AREAACRES,AREASQKM,STATES,HUC8,NAME,Shape_Leng,Shape_Le_1,Shape_Area,geometry
0,{4CCAA733-584D-4347-A7F3-4E664ADA8B9B},,,,,2012-06-11,0,1104144.63,4468.32,"CO,NM",11080001,Canadian Headwaters,3.943636,3.943636,0.450856,"POLYGON ((-104.18034 36.92065, -104.18082 36.9..."
1,{A5C1EEA1-80EE-49EC-9473-DF41AD004E7F},,,,,2017-09-20,0,2082907.01,8429.23,"CO,NM",13020101,Upper Rio Grande,6.44145,6.44145,0.847277,"POLYGON ((-105.23391 37.03586, -105.23421 37.0..."
2,{72C226B6-F4BB-484B-8ADD-88C896D0DF72},,,,,2012-06-11,0,2021138.98,8179.27,"CO,NM",13020102,Rio Chama,4.926198,4.926198,0.822519,"POLYGON ((-106.58821 37.13834, -106.58788 37.1..."


In [6]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfshape.dtypes)

TNMID           object
METASOURCE      object
SOURCEDATA      object
SOURCEORIG      object
SOURCEFEAT      object
LOADDATE        object
GNIS_ID          int64
AREAACRES      float64
AREASQKM       float64
STATES          object
HUC8            object
NAME            object
Shape_Leng     float64
Shape_Le_1     float64
Shape_Area     float64
geometry      geometry
dtype: object


In [7]:
#TimeframeStart and End

#Concatenate strings
df['TimeframeStart'] = df['Report Year'].astype(str) + "-" + df['Report Month'].astype(str) + "-01"
df['TimeframeEnd'] = df['Report Year'].astype(str) + "-" + df['Report Month'].astype(str) + "-28"

#Changing datatype of used date fields. 
df['TimeframeStart'] = pd.to_datetime(df['TimeframeStart'], errors = 'coerce')
df['TimeframeStart'] = pd.to_datetime(df["TimeframeStart"].dt.strftime('%m/%d/%Y'))
df['TimeframeEnd'] = pd.to_datetime(df['TimeframeEnd'], errors = 'coerce')
df['TimeframeEnd'] = pd.to_datetime(df["TimeframeEnd"].dt.strftime('%m/%d/%Y'))

df

Unnamed: 0,Basin,HUC8,HUC8 Name,Report Year,Report Month,Component Type,Component ID,Component Name,Component Volume,Component NEP,TimeframeStart,TimeframeEnd
0,South Platte,10190001,South Platte Headwater,2010,10,ReservoirStorage,06016010,ANTERO RESERVOIR,19300.0,61.97,2010-10-01,2010-10-28
1,South Platte,10190001,South Platte Headwater,2010,11,ReservoirStorage,06016010,ANTERO RESERVOIR,19600.0,64.93,2010-11-01,2010-11-28
2,South Platte,10190001,South Platte Headwater,2010,12,ReservoirStorage,06016010,ANTERO RESERVOIR,19700.0,62.44,2010-12-01,2010-12-28
3,South Platte,10190001,South Platte Headwater,2011,1,ReservoirStorage,06016010,ANTERO RESERVOIR,19900.0,61.19,2011-01-01,2011-01-28
4,South Platte,10190001,South Platte Headwater,2011,2,ReservoirStorage,06016010,ANTERO RESERVOIR,20000.0,82.34,2011-02-01,2011-02-28
...,...,...,...,...,...,...,...,...,...,...,...,...
15895,Yampa-White,14050001,Upper Yampa,2020,10,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,2956.0,45.47,2020-10-01,2020-10-28
15896,Yampa-White,14050001,Upper Yampa,2020,11,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3343.0,51.55,2020-11-01,2020-11-28
15897,Yampa-White,14050001,Upper Yampa,2020,12,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3824.0,34.04,2020-12-01,2020-12-28
15898,Yampa-White,14050001,Upper Yampa,2021,1,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,4214.0,33.04,2021-01-01,2021-01-28


In [8]:
#Dropping fields we don't need.
df = df.drop(['Basin', 'Component NEP', 'Report Month'], axis=1)
df

Unnamed: 0,HUC8,HUC8 Name,Report Year,Component Type,Component ID,Component Name,Component Volume,TimeframeStart,TimeframeEnd
0,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19300.0,2010-10-01,2010-10-28
1,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19600.0,2010-11-01,2010-11-28
2,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19700.0,2010-12-01,2010-12-28
3,10190001,South Platte Headwater,2011,ReservoirStorage,06016010,ANTERO RESERVOIR,19900.0,2011-01-01,2011-01-28
4,10190001,South Platte Headwater,2011,ReservoirStorage,06016010,ANTERO RESERVOIR,20000.0,2011-02-01,2011-02-28
...,...,...,...,...,...,...,...,...,...
15895,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,2956.0,2020-10-01,2020-10-28
15896,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3343.0,2020-11-01,2020-11-28
15897,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3824.0,2020-12-01,2020-12-28
15898,14050001,Upper Yampa,2021,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,4214.0,2021-01-01,2021-01-28


In [9]:
#Final Data Type Check
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

HUC8                         int64
HUC8 Name                   object
Report Year                  int64
Component Type              object
Component ID                object
Component Name              object
Component Volume           float64
TimeframeStart      datetime64[ns]
TimeframeEnd        datetime64[ns]
dtype: object


In [10]:
df

Unnamed: 0,HUC8,HUC8 Name,Report Year,Component Type,Component ID,Component Name,Component Volume,TimeframeStart,TimeframeEnd
0,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19300.0,2010-10-01,2010-10-28
1,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19600.0,2010-11-01,2010-11-28
2,10190001,South Platte Headwater,2010,ReservoirStorage,06016010,ANTERO RESERVOIR,19700.0,2010-12-01,2010-12-28
3,10190001,South Platte Headwater,2011,ReservoirStorage,06016010,ANTERO RESERVOIR,19900.0,2011-01-01,2011-01-28
4,10190001,South Platte Headwater,2011,ReservoirStorage,06016010,ANTERO RESERVOIR,20000.0,2011-02-01,2011-02-28
...,...,...,...,...,...,...,...,...,...
15895,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,2956.0,2020-10-01,2020-10-28
15896,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3343.0,2020-11-01,2020-11-28
15897,14050001,Upper Yampa,2020,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,3824.0,2020-12-01,2020-12-28
15898,14050001,Upper Yampa,2021,ReservoirStorage,YAMRESCO,YAMCOLO RESERVOIR,4214.0,2021-01-01,2021-01-28


In [11]:
#The Output - csv file
df.to_csv('RawInputData/P_COagg.csv', index=False)

### Creting Geometry Input File for ReportingUnits

Geometry field takes up too much space, and is mostly repeated and redundant.  Can improve by making shorter, separate reporting unit input file.

In [12]:
dfru = df
dfru = dfru.drop_duplicates(subset=['HUC8 Name']).reset_index()
dfru.head(3)

Unnamed: 0,index,HUC8,HUC8 Name,Report Year,Component Type,Component ID,Component Name,Component Volume,TimeframeStart,TimeframeEnd
0,0,10190001,South Platte Headwater,2010,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,2010-10-01,2010-10-28
1,125,10190003,Middle South Platte-Cherry Creek,2010,ReservoirStorage,6016020,BARR LAKE,6448.0,2010-10-01,2010-10-28
2,250,10190007,Cache La Poudre,2010,ReservoirStorage,6016030,BLACK HOLLOW RESERVOIR,3171.0,2010-10-01,2010-10-28


In [13]:
#Dropping fields we don't need.
dfru = dfru.drop(['index'], axis=1)
dfru.head(3)

Unnamed: 0,HUC8,HUC8 Name,Report Year,Component Type,Component ID,Component Name,Component Volume,TimeframeStart,TimeframeEnd
0,10190001,South Platte Headwater,2010,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,2010-10-01,2010-10-28
1,10190003,Middle South Platte-Cherry Creek,2010,ReservoirStorage,6016020,BARR LAKE,6448.0,2010-10-01,2010-10-28
2,10190007,Cache La Poudre,2010,ReservoirStorage,6016030,BLACK HOLLOW RESERVOIR,3171.0,2010-10-01,2010-10-28


In [14]:
#Fixing datatype to propery match dfs.
dfshape['HUC8'] = dfshape['HUC8'].astype(int)

#transfer WKT gemetry from dfshape
def retrieveGeometry(colrowValue, dfshape):
    if (colrowValue == '') or (pd.isnull(colrowValue)):
        outList = ''
    else:
        ml = dfshape.loc[(dfshape['HUC8'] == colrowValue), 'geometry']
        if not(ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfru['Geometry'] = dfru.apply(lambda row: retrieveGeometry(row['HUC8'], dfshape), axis=1)
dfru.head(3)

Unnamed: 0,HUC8,HUC8 Name,Report Year,Component Type,Component ID,Component Name,Component Volume,TimeframeStart,TimeframeEnd,Geometry
0,10190001,South Platte Headwater,2010,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,2010-10-01,2010-10-28,POLYGON ((-105.8665960599602 39.49414401803205...
1,10190003,Middle South Platte-Cherry Creek,2010,ReservoirStorage,6016020,BARR LAKE,6448.0,2010-10-01,2010-10-28,POLYGON ((-104.2221499983237 40.63784999797906...
2,10190007,Cache La Poudre,2010,ReservoirStorage,6016030,BLACK HOLLOW RESERVOIR,3171.0,2010-10-01,2010-10-28,POLYGON ((-105.3982514163698 41.14762056443755...


In [15]:
#The Output - csv file
dfru.to_csv('RawInputData/P_COagg_ru.csv', index=False)