# Pre-processing "XX" Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse


## Data Input

In [3]:
# Input File - water_use_data_Source
fileInput = "RawInputData/water_use_data_Source.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/water_use_data_Source.zip", compression=dict(method='zip', archive_name="water_use_data_Source.csv"), index=False)

print(len(dfin1))
dfin1.head(1)

8844


Unnamed: 0,System ID,System Name,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Source Comments,WaDEUUID
0,1000,Leeds Domestic Water Users Association,10000001,Oak Grove Spring (WS001),Active,37.30895,-113.42877,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,*Source 1 springs and Source 2 well are combin...,in10


In [4]:
# Input File - water_use_data_System
fileInput = "RawInputData/water_use_data_System.zip"
dfin2 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv("RawInputData/water_use_data_System.zip", compression=dict(method='zip', archive_name="water_use_data_System.csv"), index=False)

print(len(dfin2))
dfin2.head(1)

22514


  dfin2 = pd.read_csv(fileInput).replace(np.nan, "")


Unnamed: 0,System ID,System Name,System Type,System Status,History Year,Date Received,County,Use Cooling Percent,Use Process Percent,Use Domestic Percent,Use Miscellaneous Percent,Irrigation (Lawn and Garden) Percent,Acres Irrigated,Irrigation (Agriculture),Acres Irrigated.1,DEQ ID,Population,Domestic Use,Commercial Use,Industrial Use,Institutional Use,Total Use,Method of Measurement,Domestic Connections,Commercial Connections,Industrial Connections,Institutional Connections,Total Connections,Peak Date,Peak Demand,Peak Demand Units,Peak Use Include,Peak Measurement Type,Peak Wholesale Volume,Peak Wholesale Volume Units,ERC Yes/No,ERC Value,Secondary Domestic Percent,Secondary Industrial Percent,Secondary Commercial Percent,Secondary Institutional Percent,Secondary Agriculture Percent,Secondary Domestic Connections,Secondary Industrial Connections,Secondary Commercial Connections,Secondary Institutional Connections,Secondary Agriculture Connections,Secondary Irrigation (Lawn and Garden) Acres,Secondary Irrigation (Agriculture) Acres,Secondary Metered Domestic Connections,Secondary Metered Industrial Connections,Secondary Metered Commercial Connections,Secondary Metered Institutional Connections,Secondary Metered Agriculture Connections,Secondary Metered Domestic Use,Secondary Metered Commercial Use,Secondary Metered Industrial Use,Secondary Metered Institutional Use,Secondary Metered Agriculture Use,System Comments,WaDEUUID
0,1000,Leeds Domestic Water Users Association,Public,Active,1960,12/31/1960,Washington,,,,,,,,,27010.0,0.0,0.0,0.0,0.0,,18301270.92,,0.0,0.0,0.0,,52.0,,,,,,,,Y,Not Avail,,,,,,,,,,,,,,,,,,,,,,,(1988)1. Spring totals do not include overflow...,in20


In [5]:
# Input File - CulinaryWaterServiceArea.shp
shapefileInput = "RawInputData/shapefiles/CulinaryWaterServiceArea.zip" # ziped folder of the shp file
dfPoUshapetemp = gpd.read_file(shapefileInput)
dfPoUshapetemp.head(1)

Unnamed: 0,FID_1,WRENAME,WRNAME,DWNAME,SYSTEMTYPE,WATERRESID,WRID,DWSYSNUM,WRLINK,WHOLESALER,LABEL,STATE,COUNTY,BASIN,SUBAREA,SUBAREANAM,LANUM,LANAME,ENDYEAR,DATASOURCE,SOURCEDATE,EDITOR,EDITDATE,STATUS,wade_cent_,wade_cent1,Shape_Leng,SHAPE_Le_1,Shape_Le_2,Shape_Area,geometry
0,1,Irontown,Irontown,Irontown,C,564,11358,UTAH11070,https://www.waterrights.utah.gov/asp_apps/view...,,,Utah,Iron,Cedar/Beaver,06-03-01,Escalante Desert,06-03-01a,Escalante Desert,2022,DWRe/Supplier,2021-09-08,ADAMCLARK,2021-11-19,Active,-113.44596,37.60243,6132.9725,6132.9725,0.06338,7e-05,"POLYGON ((-113.45270 37.60395, -113.45335 37.6..."


## Work with Data

#### Source POD Data

In [6]:
# left merge
dfin1 = dfin1.merge(dfin2[['System ID', 'System Name', 'County']], left_on='System ID', right_on='System ID', how='left').replace(np.nan, "")
print(len(dfin1))
dfin1.head(1)

187055


Unnamed: 0,System ID,System Name_x,Source ID,Source Name,Source Status,Lat NAD83,Lon NAD83,Source Type,Diversion Type,Use Type,Year,Units,Method of Measurement,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Total,Source Comments,WaDEUUID,System Name_y,County
0,1000,Leeds Domestic Water Users Association,10000001,Oak Grove Spring (WS001),Active,37.30895,-113.42877,Spring,Withdrawal,Water Supplier,,,,,,,,,,,,,,,,,*Source 1 springs and Source 2 well are combin...,in10,Leeds Domestic Water Users Association,Washington


In [7]:
# fix blank / null WaterSourcetypeCV
# simplify to WaDE specific categories

wsTypeDict = {
    "Well" : "Groundwater",
    "Well/Spring" : "Groundwater",
    "Well Field" : "Groundwater",
    "Well/Stream" : "Groundwater",
    "Tunnel" : "Groundwater",
    "Drain" : "Groundwater",
    "Stream" : "Surface Water",
    "Spring" : "Surface Water",
    "Reservoir" : "Surface Water",
    "Lake" : "Surface Water"}

def fixWaterSourceTypeCV(valA):
    valA = str(valA).strip()
    if valA == "" or pd.isnull(valA):
        outString = "WaDE Blank"
    else:
        try:
            outString = wsTypeDict[valA]
        except:
            outString = "WaDE Blank"
    return outString

dfin1['in_WaterSourceTypeCV'] = dfin1.apply(lambda row: fixWaterSourceTypeCV(row['Source Type']), axis=1)
dfin1['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater'], dtype=object)

In [8]:
# Loop data list.  Use this to search for specific fields.
monthUseList =  ['Jan',     'Feb',     'Mar',     'Apr',     'May',    'Jun',      'Jul',     'Aug',     'Sep',     'Oct',     'Nov',     'Dec',     'Total']
aggregatList =  ["Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Monthly", "Annual"]
startDateList = ["01/01/",  "02/01/",  "03/01/",  "04/01/",  "05/01/",  "06/01/",  "07/01/",  "08/01/",  "09/01/",  "10/01/",  "11/01/",  "12/01/",  "01/01/"]
endDateList =   ["01/31/",  "02/28/",  "03/31/",  "04/30/",  "05/31/",  "06/30/",  "07/31/",  "08/31/",  "09/30/",  "10/31/",  "11/30/",  "12/31/",  "12/31/"]

In [10]:
# create output POD dataframe
df = pd.DataFrame()

# for each value in variableTypeList
for x in monthUseList:
    
    # Data Assessment UUID
    df['WaDEUUID'] = "WaDEUUID"
    
    # Method Info
    df['in_MethodUUID'] = "" # will create sa portion below
    df['in_MethodTypeCV'] = dfin1['Method of Measurement']
    
    # Variable Info
    df['in_VariableSpecificUUID'] = "" # will create sa portion below
    df['in_AggregationIntervalUnitCV'] = aggregatList[x]
    df['in_VariableCV'] = dfin1['Diversion Type']
    
    # Organization Info
    df['in_OrganizationUUID'] = "UTssps_O1"
    
    # WaterSource Info
    df['in_Geometry'] = ""
    df['in_GNISFeatureNameCV'] = ""
    df['in_WaterQualityIndicatorCV'] = "Fresh"
    df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
    df['in_WaterSourceNativeID'] = "" # auto fill in below
    df['in_WaterSourceTypeCV'] = dfin1['in_WaterSourceTypeCV'] # need this for auto fill below
    
    # Site Info
    df['in_CoordinateAccuracy'] = "WaDE Blank"
    df['in_CoordinateMethodCV'] = "WaDE Blank"
    df['in_County'] = dfin1['County']
    df['in_EPSGCodeCV'] = 4326
    df['in_Geometry'] = ""
    df['in_GNISCodeCV'] = ""
    df['in_HUC12'] = ""
    df['in_HUC8'] = ""
    df['in_Latitude'] = dfin1['Lat NAD83']
    df['in_Longitude'] = dfin1['Lon NAD83']
    df['in_NHDNetworkStatusCV'] = ""
    df['in_NHDProductCV'] = ""
    df['in_PODorPOUSite'] = "POD"
    df['in_SiteName'] = dfin1['Source Name']
    df['in_SiteNativeID'] = dfin['Source ID']
    df['in_SitePoint'] = ""
    df['in_SiteTypeCV'] = dfin1['Source Type']
    df['in_StateCV'] = "UT"
    df['in_USGSSiteID'] = ""
       
    # Site VariableAmounts Info
    df['in_Amount'] = dfin1[monthUseList[x]]
    df['in_AllocationCropDutyAmount'] = ""
    df['in_AssociatedNativeAllocationIDs'] = ""
    df['in_BeneficialUseCategory'] = dfin1['Use Type']
    df['in_CommunityWaterSupplySystem'] = dfin1['System Name']
    df['in_CropTypeCV'] = ""
    df['in_CustomerTypeCV'] = ""
    df['in_DataPublicationDate'] = ""
    df['in_DataPublicationDOI'] = ""
    df['in_Geometry'] = ""
    df['in_IrrigatedAcreage'] = ""
    df['in_IrrigationMethodCV'] = ""
    df['in_PopulationServed'] = dfin1['Population']
    df['in_PowerGeneratedGWh'] = ""
    df['in_PowerType'] = ""
    df['in_PrimaryUseCategory'] = "" # auto fill in below
    df['in_ReportYearCV'] =  dfin1['Year']
    df['in_SDWISIdentifier'] = ""
    df['in_TimeframeEnd'] = endDateList[x] + dfin1['Year'].astype(str)
    df['in_TimeframeStart'] = startDateList[x] + dfin1['Year']

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

TypeError: list indices must be integers or slices, not str

## Concatenate POD and POU Data.  Make needed changes

In [None]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
frames = [outdf1, outdf2]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [None]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

In [None]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

In [None]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

In [None]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

In [None]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

In [None]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

In [None]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].replace("", 0).fillna(0).astype(int).astype(str)
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE MethodUUID
# ----------------------------------------------------------------------------------------------------



outdf['MethodUUID'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: {enter string entries here}

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = [""] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# PoU Shapefile Data
shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # ziped folder of the shp file

dfPoUshapetemp = gpd.read_file(shapefileInput)
dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
print(len(dfPoUshapetemp))
dfPoUshapetemp.head()

In [None]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assing values to temp dataframe based on shapefile input
# for in_SiteNativeID assure ID value is the same as that listed above for POU info.
dfPoUshape['in_SiteNativeID'] = "POU" + ""
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
print(len(dfPoUshape))
dfPoUshape.head()

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=dict(method='zip', archive_name='Pwr_wu_Main.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.