# Working with CO Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in shp file, and coverting to WKT for ReportingUnit geometry.

Separating tasks out into two processes. 
- 1) create aggregate amount time series input.
- 2) create reporting unit input.

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Colorado/AggregatedAmounts"
os.chdir(workingDir)

In [3]:
# Input file
fileInput = "RawInputData/DWR__Surface_Water_Supply_Index_Component_by_HUC_input.xlsx"
df = pd.read_excel(fileInput)
df.head(3)

Unnamed: 0,Basin,HUC8,HUC8 Name,Report Year,Report Month,Component Type,Component ID,Component Name,Component Volume,Component NEP
0,South Platte,10190001,South Platte Headwater,2010,10,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,61.97
1,South Platte,10190001,South Platte Headwater,2010,11,ReservoirStorage,6016010,ANTERO RESERVOIR,19600.0,64.93
2,South Platte,10190001,South Platte Headwater,2010,12,ReservoirStorage,6016010,ANTERO RESERVOIR,19700.0,62.44


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15900 entries, 0 to 15899
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Basin             15900 non-null  object 
 1   HUC8              15900 non-null  int64  
 2   HUC8 Name         15900 non-null  object 
 3   Report Year       15900 non-null  int64  
 4   Report Month      15900 non-null  int64  
 5   Component Type    15900 non-null  object 
 6   Component ID      15900 non-null  object 
 7   Component Name    15900 non-null  object 
 8   Component Volume  15900 non-null  float64
 9   Component NEP     15900 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 1.2+ MB


In [5]:
#TimeframeStart and End

#Concatenate strings
df['in_TimeframeStart'] = df['Report Year'].astype(str) + "-" + df['Report Month'].astype(str) + "-01"
df['in_TimeframeEnd'] = df['Report Year'].astype(str) + "-" + df['Report Month'].astype(str) + "-28"

#Changing datatype of used date fields. 
df['in_TimeframeStart'] = pd.to_datetime(df['in_TimeframeStart'], errors = 'coerce')
df['in_TimeframeStart'] = pd.to_datetime(df["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
df['in_TimeframeEnd'] = pd.to_datetime(df['in_TimeframeEnd'], errors = 'coerce')
df['in_TimeframeEnd'] = pd.to_datetime(df["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))

df.head()

Unnamed: 0,Basin,HUC8,HUC8 Name,Report Year,Report Month,Component Type,Component ID,Component Name,Component Volume,Component NEP,in_TimeframeStart,in_TimeframeEnd
0,South Platte,10190001,South Platte Headwater,2010,10,ReservoirStorage,6016010,ANTERO RESERVOIR,19300.0,61.97,2010-10-01,2010-10-28
1,South Platte,10190001,South Platte Headwater,2010,11,ReservoirStorage,6016010,ANTERO RESERVOIR,19600.0,64.93,2010-11-01,2010-11-28
2,South Platte,10190001,South Platte Headwater,2010,12,ReservoirStorage,6016010,ANTERO RESERVOIR,19700.0,62.44,2010-12-01,2010-12-28
3,South Platte,10190001,South Platte Headwater,2011,1,ReservoirStorage,6016010,ANTERO RESERVOIR,19900.0,61.19,2011-01-01,2011-01-28
4,South Platte,10190001,South Platte Headwater,2011,2,ReservoirStorage,6016010,ANTERO RESERVOIR,20000.0,82.34,2011-02-01,2011-02-28


In [6]:
# VariableSpecificCV
df['in_VariableSpecificCV'] = df['Component Type'].astype(str) + "_Monthly_Unspecified_Surface Water"
df['in_VariableSpecificCV'].unique()

array(['ReservoirStorage_Monthly_Unspecified_Surface Water',
       'ForecastedRunoff_Monthly_Unspecified_Surface Water',
       'PrevMoStreamflow_Monthly_Unspecified_Surface Water'], dtype=object)

## Shapefile Data

In [7]:
#Shapefile input
shapeInput = gpd.read_file('RawInputData/HUC8/CO_HUC8.shp')
dfshapetemp = pd.DataFrame(shapeInput)
dfshapetemp.head(1)

Unnamed: 0,TNMID,METASOURCE,SOURCEDATA,SOURCEORIG,SOURCEFEAT,LOADDATE,GNIS_ID,AREAACRES,AREASQKM,STATES,HUC8,NAME,Shape_Leng,Shape_Le_1,Shape_Area,geometry
0,{4CCAA733-584D-4347-A7F3-4E664ADA8B9B},,,,,2012-06-11,0,1104144.63,4468.32,"CO,NM",11080001,Canadian Headwaters,3.943636,3.943636,0.450856,"POLYGON ((-104.18034 36.92065, -104.18082 36.9..."


In [8]:
#check shp input Water Plan Area
dfshape = pd.DataFrame()

columnsList = ['RU_ID', 'geometry']
dfshape = pd.DataFrame(columns=columnsList)
dfshape['RU_ID'] = dfshapetemp['HUC8']
dfshape['geometry'] = dfshapetemp['geometry']
dfshape = dfshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfshape.head(3)

Unnamed: 0,RU_ID,geometry
0,11080001,"POLYGON ((-104.18034 36.92065, -104.18082 36.9..."
1,13020101,"POLYGON ((-105.23391 37.03586, -105.23421 37.0..."
2,13020102,"POLYGON ((-106.58821 37.13834, -106.58788 37.1..."


## Export

In [9]:
df.to_csv('RawInputData/P_coAggMaster.csv', index=False) # The master Output - csv file
dfshape.to_csv('RawInputData/P_coGeometry.csv', index=False) # the shapefile output - csv file