# Pre-processing Overlay data for WaDE upload.
- Purpose: To preprocess state overlay data into one main file for simple DataFrame creation and extraction.

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
# Pandas display configuration for this notebook.
def _fixedFiveDecimals(x):
    """Render a float with five fixed decimals (suppresses scientific notation)."""
    return '%.5f' % x

pd.set_option('display.max_columns', 999)  # show (up to 999) columns of a DataFrame in the notebook output
pd.set_option('display.float_format', _fixedFiveDecimals)

In [2]:
# ---- working directory ----
# NOTE(review): absolute, machine-specific path. The relative "RawInputData/..." paths
# used later in this notebook are resolved against this folder after os.chdir().
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/California/Overlays" # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)  # plain string: the old f-prefix had no placeholders

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/California/Overlays


## Overlay Area Data #1

In [4]:
# Input File
inputFile = "RawInputData/shapefiles/2019_SGMA_Basins.zip"
dfin1 = gpd.read_file(inputFile)
dfin1 = dfin1.replace(np.nan, "")  # blank out missing values

# WaDE UUID tracker for data assessment
# Built only when the source has no WaDEUUID column yet ("ov" + row index);
# the tagged table is then saved as a zipped CSV.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "ov" + dfin1.index.astype(str)
    zipOptions = dict(method='zip', archive_name='2019_SGMA_Basins.csv')
    dfin1.to_csv('RawInputData/2019_SGMA_Basins.zip', compression=zipOptions, index=False)

print(len(dfin1))
dfin1.head()

515


Unnamed: 0,Basin_Numb,Basin_Subb,Basin_Name,Basin_Su_1,Priority,GlobalID,Shape_Leng,Shape_Area,geometry,WaDEUUID
0,9-029,9-029,POTRERO VALLEY,POTRERO VALLEY,Very Low,{BA840110-5DB5-44A9-85AF-66FD1E9AE423},25722.24136,11550262.4601,"POLYGON ((-12977172.634 3849059.804, -12977057...",ov0
1,7-047,7-047,JACUMBA VALLEY,JACUMBA VALLEY,Very Low,{2D783C60-320D-497E-A0EE-16FCD58F1DC1},32398.06375,14165369.5953,"POLYGON ((-12933485.880 3849572.704, -12933243...",ov1
2,9-028,9-028,CAMPO VALLEY,CAMPO VALLEY,Very Low,{44FF5F9C-5D24-4313-8325-195B414B5EE1},49955.00108,20246644.0036,"POLYGON ((-12966516.362 3847596.752, -12966417...",ov2
3,7-061,7-061,DAVIES VALLEY,DAVIES VALLEY,Very Low,{D00C7D62-89F6-4827-82BD-8C7DF4D4A461},32087.40007,20446555.5825,"POLYGON ((-12914405.958 3855002.869, -12914517...",ov3
4,9-027,9-027,COTTONWOOD VALLEY,COTTONWOOD VALLEY,Very Low,{982B7183-EA63-46E8-A57D-FA11F4EB690D},50378.90626,22010436.4624,"POLYGON ((-12967760.558 3863695.656, -12967750...",ov4


In [5]:
# Input File
inputFile = "RawInputData/sgma_2019_basin_prioritization_dashboard_data.zip"
dfin2 = pd.read_csv(inputFile, encoding="ISO-8859-1")
dfin2 = dfin2.replace(np.nan, "")  # blank out missing values

print(len(dfin2))
dfin2.head()

515


Unnamed: 0,Basin_Subbasin_Number,Basin_Subbasin_Name,Hydrologic_Region,Region_Office,Basin_Area_Acres,Basin_Area_SqMi,Project_Phase,C1_Population_Census,C1_Population_SqMi,C1_Priority_Points,C2_Population_Projection,C2_Population_Growth,C2_Zero_2010_Population,C2_Negative_or_No_Growth,C2_Postive_Growth_and_2010_Pop_1000,C2_Density_50_and_2010_Population_25000,C2_Priority_Points,C3_Public_Supply_Groundwater_Wells,C3_Public_Supply_Wells_SqMi,C3_Priority_Points,C4_Total_Groundwater_Wells,C4_Production_Wells_SqMi,C4_Priority_Points,C5_Irrigated_Acres,C5_Irrigated_Acres_SqMi,C5_Priority_Points,C6_Agricultural_Groundwater_AF,C6_Agricultural_ Surface_ Water_AF,C6_Agricultural_Total_Applied_Water_AF,C6_Agricultural_Groundwater_Percent_Supply,C6_Agricultural_Surface_Water_Percent_Supply,C6_Urban_Groundwater_AF,C6_Urban_Surface_Water_AF,C6_Urban_Purchased_Surface_Water_AF,C6_Urban_Total_Water_Use_AF,C6_Urban_Groundwater_Percent_Supply,C6_Urban_Surface&Purchased_Water_Percent_Supply,C6_Other_Groundwater,C6_Other_Surface_Water,C6_Groundwater_Use_AF,C6_Surface_Water_Use_AF,C6_Total_Water_Use_AF,C6a_Groundwater_Use_AF_BasinAcre,C6a_Points,C6b_Groundwater_Percent_Supply,C6b_Surface_Water_Percent_Supply,C6b_Points,C6_Priority_Points,C7_Impacts_Declining_Groundwater_Levels_Points,C7_Impacts_Declining_Groundwater_Levels_Comment,C7_Impacts_Subsidence_Points,C7_Impacts_Subsidence_Comment,C7_Impacts_Salt_Intrusion_Points,C7_Impacts_Salt_Intrusion_Comment,C7_Impacts_WQ_Detects,C7_WQ_MCL_Exceed,C7_WQ_Average_MCL_Exceedance,C7_WQ_Average_MCL_Exceed_Points,C7_WQ_Unique_Wells_Excd,C7_WQ_Wells_Excd_Per_PSW,C7_WQ_Wells_Excd_Per_PSW_Points,C7_WQ_Calculated_Points,C7_WQ_Other,C7_Impacts_Water_Quality_Points,C7_Impacts_Water_Quality_Comment,C7_Impacts_Total_Points,C7_Priority_Points,C8a_Streamflow_Points,C8a_Habitat_Points,C8a_Monitoring,C8a_Monitoring_and_GroundwaterThreshold_Adverse,C8a_Adverse_Adjustment,C8a_AdverseComment,C8a_HabitatSF_Priority_Points,C8b_BasinOtherInfo_Priority_Points,C8b_Ba
sinOtherInfoComment,C8a_and_C8b_Priority_Points,C8c_2kGroundwater,C8c_9.5kGW_NoDocImpacts,C8c_Adjudication,C8c_Groundwater_NonAdj_AF,C8c_9.5kGW_NonAdj,C8c_CriticalOverdraft,C8c_OutOfBasinGWExports_SubstitutionTransfers,Substitution_Year,Substitution_Amt,Substitution_PriorityPoints,Total_Priority_Points,Priority,Priority_Change_2014_to_2018,WaDEUUID
0,1-001,Smith River Plain,North Coast,NRO,40434.48606,63.17888,1,24604,389,2,27633,12.31,False,False,False,False,2,19,0.3,3,1309,20.71,5,5457,86.37,2,5578.66,3346.3,8924.96,62.5,37.5,3563,2683,0,6246,57.0,43.0,0,0,9141.66,6029.3,15170.96,0.22,2,60.0,39.0,4,3.0,-,No documented Groundwater Level Declines\r\nSo...,0,No documented groundwater extraction induced i...,0,Salinity monitoring found salt water intrusion...,409,17,16.95,5,5,0.26,1,6,0,2,0,2.0,0,1,1,True,Monitored with no Declining GW Levels,0,,0,0,,0,False,True,False,0,False,False,False,0,0,0,0.0,Very Low,M/H to L/VL,re0
1,1-002.01,Klamath River Valley - Tulelake,North Coast,NRO,110521.3526,172.68961,1,2407,14,1,2434,1.12,False,False,False,True,0,7,0.04,1,260,1.5,1,60230,348.77,4,31294.0,113039.0,144333.0,21.7,78.3,784,0,0,784,100.0,0.0,0,0,32078.0,113039.0,145117.0,0.29,3,22.0,77.0,2,2.5,7.5,1) Long term hydrographs show groundwater leve...,0,No documented groundwater extraction induced i...,0,No documented Saline Intrusion,44,9,1.97,1,5,0.71,2,3,0,1,0,8.5,2,1,1,True,Monitored with Declining GW Levels and > 0.16 ...,0,,2,3,"Complex water rights system - other state, fed...",5,False,False,False,0,False,False,False,0,0,0,16.5,Medium,No Change,re1
2,1-002.02,Klamath River Valley - Lower Klamath,North Coast,NRO,75330.28412,117.70357,1,62,1,0,94,51.61,False,False,True,True,0,0,0.0,0,34,0.28,1,17208,146.19,3,2752.0,41047.0,43799.0,6.3,93.7,2,0,0,2,100.0,0.0,0,0,2754.0,41047.0,43801.0,0.03,1,6.0,93.0,1,1.0,-,No documented Groundwater Level Declines,0,No documented groundwater extraction induced i...,0,No documented Saline Intrusion,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,0,1,1,True,Monitored with no Declining GW Levels,0,,0,0,,0,False,True,False,0,False,False,False,0,0,0,0.0,Very Low,No Change,re2
3,1-003,Butte Valley,North Coast,NRO,79738.99549,124.59218,1,1464,12,1,1651,12.77,False,False,False,True,0,4,0.03,1,325,2.6,2,23420,187.97,3,46858.0,9438.0,56296.0,83.2,16.8,556,0,0,556,100.0,0.0,0,0,47414.0,9438.0,56852.0,0.59,4,83.0,16.0,5,4.5,7.5,1) CASGEM/WDL/GWIDS: Long term hydrographs sho...,0,No documented groundwater extraction induced i...,0,No documented Saline Intrusion,88,8,1.21,1,2,0.5,2,3,0,1,0,8.5,2,1,1,True,Monitored with Declining GW Levels and > 0.16 ...,0,,2,0,,2,False,False,False,0,False,False,False,0,0,0,15.5,Medium,No Change,re3
4,1-004,Shasta Valley - Shasta Valley,North Coast,NRO,218215.0317,340.96099,2,13070,38,1,19375,48.24,False,False,False,True,0,18,0.05,1,1782,5.22,3,34838,102.17,3,21623.49951,77432.68717,99056.18667,21.8,78.2,1866,1230,0,3096,60.3,39.7,0,0,23489.83579,78662.29778,102152.1336,0.1,2,23.0,77.0,2,2.0,-,No documented Groundwater Level Declines,0,No documented groundwater extraction induced i...,0,No documented Saline Intrusion,333,9,6.58,5,3,0.17,1,6,0,2,0,2.0,0,1,1,True,Monitored with no Declining GW Levels,1,During the irrigation season (April 1 to Octob...,2,3,Complex GW management issues internally and ex...,5,False,False,False,0,False,False,False,0,0,0,15.0,Medium,No Change,re4


In [None]:
# create output for Overlay Area #1 dataframe
# Template cell: every in_* field starts as an empty-string placeholder and is meant to be
# replaced with the matching source column / constant for this overlay.
df = pd.DataFrame()

# Data Assessment UUID
# Assigned first so the output frame takes its row index from dfin1.
df['WaDEUUID'] = dfin1['WaDEUUID']

# Date Info
df['in_Date'] = ""
df['in_Year'] = ""

# Organization Info
df['in_OrganizationUUID'] = ""

# ReportingUnit Info
df['in_EPSGCodeCV'] = ""
df['in_ReportingUnitName'] = ""
df['in_ReportingUnitNativeID'] = ""
df['in_ReportingUnitProductVersion'] = ""
df['in_ReportingUnitTypeCV'] = ""
df['in_ReportingUnitUpdateDate'] = ""
df['in_StateCV'] = ""
df['in_Geometry'] = ""

# RegulatoryOverlay Info
df['in_OversightAgency'] = ""
df['in_RegulatoryDescription'] = ""
df['in_RegulatoryName'] = ""
df['in_RegulatoryOverlayNativeID'] = ""
df['in_RegulatoryStatusCV'] = ""
df['in_RegulatoryStatute'] = ""
df['in_RegulatoryStatuteLink'] = ""
df['in_StatutoryEffectiveDate'] = ""  # was `d""`, an invalid string prefix that made the whole cell a SyntaxError
df['in_StatutoryEndDate'] = ""
df['in_RegulatoryOverlayTypeCV'] = ""
df['in_WaterSourceTypeCV'] = ""

# Remove exact duplicate rows and renumber before keeping a copy as outdf1.
df = df.drop_duplicates().reset_index(drop=True)
outdf1 = df.copy()
print(len(outdf1))
outdf1.head()

## Overlay Area #2

In [None]:
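# TODO: repeat the Overlay Area #1 steps for each additional overlay source, saving the result as outdf2 (then outdf3, ...).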

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
# Only outdf1 is built in this notebook; listing the never-defined outdf2 raised a NameError
# on a fresh "Run All". Add further frames here once their cells exist.
frames = [outdf1] # list all out dataframes here
outdf = pd.concat(frames)
# Drop exact duplicate rows, renumber, and blank out any NaN introduced by the concat.
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as a stripped string, mapping missing values to "".

    Parameters
    ----------
    val : any
        Cell value to clean.

    Returns
    -------
    str
        "" when ``val`` is a null scalar (None, NaN, NaT, pd.NA), is blank after
        stripping, or is the literal text "nan"; otherwise ``str(val).strip()``.
    """
    # The null test must run on the raw value: once passed through str(), a missing
    # value becomes ordinary text ("None", "NaT", "<NA>") and pd.isnull() is never True.
    # is_scalar() guards against list-like values, for which pd.isnull() returns an array.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    outString = str(val).strip()
    if outString == "nan":
        return ""
    return outString

In [None]:
# Clean the column directly: a row-wise DataFrame.apply(axis=1) builds a Series per row
# just to read one field, and misbehaves on an empty frame.
outdf['in_ReportingUnitName'] = outdf['in_ReportingUnitName'].apply(ensureEmptyString)
outdf['in_ReportingUnitName'].unique()

In [None]:
# Clean the column directly: a row-wise DataFrame.apply(axis=1) builds a Series per row
# just to read one field, and misbehaves on an empty frame.
outdf['in_RegulatoryDescription'] = outdf['in_RegulatoryDescription'].apply(ensureEmptyString)
outdf['in_RegulatoryDescription'].unique()

In [None]:
# Clean the column directly: a row-wise DataFrame.apply(axis=1) builds a Series per row
# just to read one field, and misbehaves on an empty frame.
outdf['in_RegulatoryName'] = outdf['in_RegulatoryName'].apply(ensureEmptyString)
outdf['in_RegulatoryName'].unique()

In [None]:
#Update datatype of StatutoryEffectiveDate to fit WaDE 2.0 structure
# Unparseable / empty values become NaT (errors='coerce'). normalize() resets the time part to
# midnight, which for timezone-naive dates matches the former strftime('%m/%d/%Y') -> to_datetime
# round trip without going through strings.
outdf['in_StatutoryEffectiveDate'] = pd.to_datetime(outdf['in_StatutoryEffectiveDate'], errors='coerce').dt.normalize()
outdf['in_StatutoryEffectiveDate'].unique()

# Geometry Data
- For attaching geometry to overlay reporting unit area info.

#### Overlay Area #1 shapefile info

In [None]:
# Input File / or use same input as above
shapeInputFile = "RawInputData/shapefiles/{enter file name here}.zip"

gdfin1 = gpd.read_file(shapeInputFile)
reprojectedGeometry = gdfin1['geometry'].to_crs(epsg=4326)  # Realign Geometry Projection
gdfin1['geometry'] = reprojectedGeometry
print(len(gdfin1))
gdfin1.head()

In [None]:
# plot shape info to map
# Quick visual check of the geometries held in gdfin1.
gdfin1.plot()

In [None]:
# create geometry output for Overlay Area #1 dataframe
# (the former `df = pd.DataFrame()` here was never used and only clobbered the earlier `df`)
columnsList = ['in_ReportingUnitNativeID', 'geometry']
goutdf1 = pd.DataFrame(columns=columnsList, index=gdfin1.index)

goutdf1['in_ReportingUnitNativeID'] = ""  # in_ReportingUnitNativeID needs to match the source used in the equivalent dataframe above
goutdf1['geometry'] = gdfin1['geometry']
# Drop exact duplicate (id, geometry) rows and renumber.
goutdf1 = goutdf1.drop_duplicates().reset_index(drop=True)
print(len(goutdf1))
goutdf1.head()

#### Overlay Area #2 shapefile info

In [None]:
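# TODO: repeat the Overlay Area #1 shapefile steps for each additional overlay source, saving the result as goutdf2 (then goutdf3, ...).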

#### Concatenate goutdf shapefile info into single output

In [None]:
# Concatenate geometry DataFrames
# Only goutdf1 is built in this notebook; listing the never-defined goutdf2 raised a NameError
# on a fresh "Run All". Add further frames here once their cells exist.
frames = [goutdf1] # add goutdf dataframes here
goutdf = pd.concat(frames).reset_index(drop=True)

print(len(goutdf))
goutdf.head()

## Export Data

In [None]:
# Print the dtype of every outdf column with row/column display limits lifted for this cell only.
unlimitedDisplay = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimitedDisplay):
    print(outdf.dtypes)

In [None]:
# Print the dtype of every goutdf column with row/column display limits lifted for this cell only.
unlimitedDisplay = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimitedDisplay):
    print(goutdf.dtypes)

In [None]:
# Export out to CSV.
# Each frame is written as a single CSV inside a zip archive under RawInputData/.
mainZipOptions = dict(method='zip', archive_name='Pov_Main.csv')
geometryZipOptions = dict(method='zip', archive_name='P_Geometry.csv')
outdf.to_csv('RawInputData/Pov_Main.zip', compression=mainZipOptions, index=False)  # The output, save as a zip
goutdf.to_csv('RawInputData/P_Geometry.zip', compression=geometryZipOptions, index=False)  # The output geometry.