# Pre-processing Montana Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [None]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [None]:
# ---- working directory ----
# Folder holding the Montana water allocation files; the relative paths used below resolve against it.
workingDirString = "G:/Shared drives/WaDE Data/Montana/WaterAllocation"
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

## Point of Diversion Data

In [None]:
# Input File - point of diversion records (zipped csv); NaN cells are turned into empty strings
FI_PoD = "RawInputData/PointsofDiversion.zip"
dfinPOD = pd.read_csv(FI_PoD)
dfinPOD = dfinPOD.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# When the column is absent, tag every row with "d<row index>" and write the tagged table back to the zip.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "d" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(
        'RawInputData/PointsofDiversion.zip',
        compression=dict(method='zip', archive_name='PointsofDiversion.csv'),
        index=False,
    )

print(len(dfinPOD))
dfinPOD.head()

In [None]:
# separate out PERIOD_OF_DIVERSIONS to start date
def SeparatePERIODOFDIVERSIONSStartFunc(val):
    """Return the start date of the first period in a PERIOD_OF_DIVERSIONS value.

    The value is converted to a string and stripped.  Blank or "nan" input
    gives "".  Otherwise only the text before the first ";" is considered and
    the stripped text before the first "to" in it is returned.
    """
    text = str(val).strip()
    if text in ("", "nan"):
        return ""
    firstPeriod = text.partition(";")[0].strip()
    return firstPeriod.partition("to")[0].strip()

# Parse the column directly (Series.apply) instead of building a full row object for every record
dfinPOD['in_AllocationTimeframeStart'] = dfinPOD['PERIOD_OF_DIVERSIONS'].apply(SeparatePERIODOFDIVERSIONSStartFunc)
dfinPOD['in_AllocationTimeframeStart'].unique()

In [None]:
# separate out PERIOD_OF_DIVERSIONS to end date
def SeparatePERIODOFDIVERSIONSEndFunc(val):
    """Return the end date of the first period in a PERIOD_OF_DIVERSIONS value.

    The value is converted to a string and stripped.  Blank or "nan" input
    gives "".  Otherwise only the text before the first ";" is considered and
    the stripped text following the first "to" in it is returned.  A value
    with no "to" has no end date, so "" is returned instead of raising
    IndexError.
    """
    val = str(val).strip()
    if val == "" or val == "nan":
        return ""
    if ";" in val:
        val = val.split(";")[0].strip()
    parts = val.split("to")
    if len(parts) < 2:
        # single date or free text: there is nothing after a "to" to report
        return ""
    return parts[1].strip()

# Parse the column directly (Series.apply) instead of building a full row object for every record
dfinPOD['in_AllocationTimeframeEnd'] = dfinPOD['PERIOD_OF_DIVERSIONS'].apply(SeparatePERIODOFDIVERSIONSEndFunc)
dfinPOD['in_AllocationTimeframeEnd'].unique()

In [None]:
# convert and round MAX_FLOW
def ConvertMAXFLOWFunc(val):
    """Convert a MAX_FLOW entry to a flow value in cubic feet per second (CFS).

    The value is converted to a string and stripped, then:
      - blank, "nan", or a bare unit ("CFS", "GPM", "POF") gives "";
      - "<number> CFS" gives the number as a float;
      - "<number> GPM" gives the number converted from US gallons per minute;
      - any other value containing "POF" gives "";
      - anything else is parsed as a float.

    Raises ValueError when the numeric part cannot be parsed by float().
    """
    # 1 US gallon = 231 in^3 and 1 ft^3 = 1728 in^3, so 1 GPM = 231 / 1728 / 60 CFS
    # (about 0.002228).  The value used before, 0.0026757..., is the imperial-gallon factor.
    gpmToCfs = 231.0 / 1728.0 / 60.0

    val = str(val).strip()
    if val in ("", "nan", "CFS", "GPM", "POF"):
        return ""
    if "CFS" in val:
        return float(val.split(" ")[0].strip())
    if "GPM" in val:
        return float(val.split(" ")[0].strip()) * gpmToCfs
    if "POF" in val:
        return ""
    return float(val)

# Convert the column directly (Series.apply) instead of building a full row object for every record
dfinPOD['MAX_FLOW'] = dfinPOD['MAX_FLOW'].apply(ConvertMAXFLOWFunc)
dfinPOD['MAX_FLOW'].unique()

In [None]:
# create output POD dataframe
# Maps the cleaned point-of-diversion input (dfinPOD) onto the WaDE "in_*" columns.
# Columns assigned a string literal carry that same value on every row.
df = pd.DataFrame()

# Data Assessment UUID
# assigned first so df takes dfinPOD's row index before the constant columns are filled in
df['WaDEUUID'] = dfinPOD['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "MTwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "MTwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "MTwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = dfinPOD['SOURCE_NAME']
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provided
df['in_WaterSourceTypeCV'] = dfinPOD['SOURCE_TYPE']

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Blank"
df['in_CoordinateMethodCV'] = "WaDE Blank"
df['in_County'] = dfinPOD['COUNTY']
df['in_EPSGCodeCV'] = "4326"
# in_Geometry was already created under WaterSource Info; this repeats the same empty value
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD['wadeLatitude']
df['in_Longitude'] = dfinPOD['wadeLongitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = dfinPOD['PODV_ID_SEQ']
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfinPOD['MEANS_OF_DIV']
df['in_StateCV'] = "MT"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD['MAX_FLOW']  # already run through ConvertMAXFLOWFunc above
df['in_AllocationLegalStatusCV'] = dfinPOD['WR_STATUS']
df['in_AllocationNativeID'] = dfinPOD['WR_NUMBER']
df['in_AllocationOwner'] = dfinPOD['OWNERS']
df['in_AllocationPriorityDate'] = dfinPOD['ENF_PRTY_DT_DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOD['in_AllocationTimeframeEnd']
df['in_AllocationTimeframeStart'] = dfinPOD['in_AllocationTimeframeStart']
df['in_AllocationTypeCV'] = dfinPOD['WR_TYPE']
df['in_AllocationVolume_AF'] = dfinPOD['MAX_VOL']
df['in_BeneficialUseCategory'] = dfinPOD['PURPOSES']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "0" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOD['MAX_ACRES']
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOD['URL_ABSTRACT']

# keep an independent copy, drop exact duplicate rows and renumber the rows
outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.head()

## Place of Use Data

In [None]:
# Input File - place of use data (zipped shapefile); NaN cells are turned into empty strings
FI_POU = "RawInputData/shapefiles/PlaceofUse.zip"
dfinPOU = gpd.read_file(FI_POU)
dfinPOU = dfinPOU.replace(np.nan, "")
dfinPOU = dfinPOU.reset_index()  # the previous row index is kept as an 'index' column

# WaDE UUID tracker for data assessment.
# When the column is absent, tag every row with "u<row index>" and save the tagged table as a zipped csv.
if 'WaDEUUID' not in dfinPOU:
    dfinPOU['WaDEUUID'] = "u" + dfinPOU.index.astype(str)
    dfinPOU.to_csv(
        'RawInputData/PlaceofUse.zip',
        compression=dict(method='zip', archive_name='PlaceofUse.csv'),
        index=False,
    )

print(len(dfinPOU))
dfinPOU.head()

In [None]:
# separate out PERIOD_OF_DIVERSIONS to start date

# replace "-" char with a "/" char to match POD data
# NOTE: every ";" is also turned into ",", so values read from this column afterwards contain "," where the source had ";"
dfinPOU['PERIOD_OF_'] = dfinPOU['PERIOD_OF_'].str.replace("-", "/").str.replace(";", ",")

def SeparatePERIODOFDIVERSIONSStartFunc(val):
    """Return the start date of the first period in a period-of-diversion value.

    The value is converted to a string and stripped.  Blank or "nan" input
    gives "".  Otherwise the text is cut at the first ";" (when present) and
    the stripped text before the first "to" is returned.
    """
    periodText = str(val).strip()
    if periodText == "" or periodText == "nan":
        return ""
    leadingPeriod = periodText.split(";", 1)[0].strip()
    startText, _, _ = leadingPeriod.partition("to")
    return startText.strip()

# Parse the column directly (Series.apply) instead of building a full row object for every record
dfinPOU['in_AllocationTimeframeStart'] = dfinPOU['PERIOD_OF_'].apply(SeparatePERIODOFDIVERSIONSStartFunc)
dfinPOU['in_AllocationTimeframeStart'].unique()

In [None]:
# separate out PERIOD_OF_DIVERSIONS to end date
def SeparatePERIODOFDIVERSIONSEndFunc(val):
    """Return the end date of the first period in a period-of-diversion value.

    The value is converted to a string and stripped.  Blank or "nan" input
    gives "".  Multiple periods may be separated by ";" or by ","; only the
    first period is used, and the stripped text following its first "to" is
    returned.  A period with no "to" has no end date, so "" is returned
    instead of raising IndexError.
    """
    val = str(val).strip()
    if val == "" or val == "nan":
        return ""
    # the POU period column has its ";" separators rewritten to "," before
    # this function runs, so both separators must end the first period
    firstPeriod = val.replace(";", ",").split(",")[0].strip()
    parts = firstPeriod.split("to")
    if len(parts) < 2:
        return ""
    return parts[1].strip()

# Parse the column directly (Series.apply) instead of building a full row object for every record
dfinPOU['in_AllocationTimeframeEnd'] = dfinPOU['PERIOD_OF_'].apply(SeparatePERIODOFDIVERSIONSEndFunc)
dfinPOU['in_AllocationTimeframeEnd'].unique()

In [None]:
# convert and round MAX_FLOW
def ConvertMAXFLOWFunc(val):
    """Convert a MAX_FLOW entry to a flow value in cubic feet per second (CFS).

    The value is converted to a string and stripped, then:
      - blank, "nan", or a bare unit ("CFS", "GPM", "POF") gives "";
      - "<number> CFS" gives the number as a float;
      - "<number> GPM" gives the number converted from US gallons per minute;
      - any other value containing "POF" gives "";
      - anything else is parsed as a float.

    Raises ValueError when the numeric part cannot be parsed by float().
    """
    # 1 US gallon = 231 in^3 and 1 ft^3 = 1728 in^3, so 1 GPM = 231 / 1728 / 60 CFS
    # (about 0.002228).  The value used before, 0.0026757..., is the imperial-gallon factor.
    usGpmToCfs = 231.0 / 1728.0 / 60.0

    flowText = str(val).strip()
    if flowText in ("", "nan", "CFS", "GPM", "POF"):
        return ""
    if "CFS" in flowText:
        return float(flowText.split(" ")[0].strip())
    if "GPM" in flowText:
        return float(flowText.split(" ")[0].strip()) * usGpmToCfs
    if "POF" in flowText:
        return ""
    return float(flowText)

# Convert the column directly (Series.apply) instead of building a full row object for every record
dfinPOU['MAX_FLOW'] = dfinPOU['MAX_FLOW'].apply(ConvertMAXFLOWFunc)
dfinPOU['MAX_FLOW'].unique()

In [None]:
# Left-join outPOD to ensure water source information matches POD records

# outPOD holds one row per point of diversion, so a water right with several PODs on the
# same source would repeat each of its POU rows once per POD.  Collapse the lookup to its
# distinct (water right, source name, source type) combinations before joining.
podSourceLookup = outPOD[['in_AllocationNativeID', 'in_WaterSourceName', 'in_WaterSourceTypeCV']].drop_duplicates()
dfinPOU = pd.merge(dfinPOU, podSourceLookup, left_on='WR_NUMBER', right_on='in_AllocationNativeID', how='left')
print(len(dfinPOU))
dfinPOU.head(1)

In [None]:
# create output POU dataframe
# Maps the cleaned place-of-use input (dfinPOU) onto the WaDE "in_*" columns.
# Columns assigned a string literal carry that same value on every row.
df = pd.DataFrame()

# Data Assessment UUID
# assigned first so df takes dfinPOU's row index before the constant columns are filled in
df['WaDEUUID'] = dfinPOU['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "MTwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "MTwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "MTwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfinPOU['in_WaterSourceName'] # from POD info
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provided
df['in_WaterSourceTypeCV'] = dfinPOU['in_WaterSourceTypeCV'] # from POD info

# Site Info
df['in_CoordinateAccuracy'] = "WaDE Blank"
df['in_CoordinateMethodCV'] = "Centroid of Area"
df['in_County'] = dfinPOU['COUNTY']
df['in_EPSGCodeCV'] = "4326"
# in_Geometry was already created under WaterSource Info; this repeats the same empty value
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOU['wadeLatitu']
df['in_Longitude'] = dfinPOU['wadeLongit']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POU"  # "Place of Use"
df['in_SiteName'] = ""
# "u" + the row number kept in the 'index' column; the geometry table further down builds the same key
df['in_SiteNativeID'] = "u" + dfinPOU['index'].astype(str).str.strip() 
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "MT"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOU['MAX_FLOW']  # already run through ConvertMAXFLOWFunc above
df['in_AllocationLegalStatusCV'] =dfinPOU['WR_STATUS']
df['in_AllocationNativeID'] = dfinPOU['WR_NUMBER']
df['in_AllocationOwner'] = dfinPOU['OWNERS']
df['in_AllocationPriorityDate'] = dfinPOU['ENF_PRTY_D']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfinPOU['in_AllocationTimeframeEnd']
df['in_AllocationTimeframeStart'] = dfinPOU['in_AllocationTimeframeStart']
df['in_AllocationTypeCV'] = dfinPOU['WR_TYPE']
df['in_AllocationVolume_AF'] = dfinPOU['MAX_VOL']
df['in_BeneficialUseCategory'] = dfinPOU['PURPOSES']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "0" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = dfinPOU['MAX_ACRES']
df['in_IrrigationMethodCV'] = dfinPOU['IRR_TYPE']
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOU['URL_ABSTRA']


# keep an independent copy, drop exact duplicate rows and renumber the rows
outPOU = df.copy()
outPOU = outPOU.drop_duplicates().reset_index(drop=True)
print(len(outPOU))
outPOU.head()

## Concatenate POD and POU Data.  Make needed changes

In [None]:
# Concatenate dataframes
frames = [outPOD, outPOU]  # list all out dataframes here

# stack the frames, drop exact duplicate rows, renumber the rows, then blank out NaN cells
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates()
outdf = outdf.reset_index(drop=True)
outdf = outdf.replace(np.nan, "")
print(len(outdf))

## Clean Data / data types

In [None]:
# swap every ';' in the beneficial use entries for a comma ','
outdf['in_BeneficialUseCategory'] = (
    outdf['in_BeneficialUseCategory']
    .str.replace(";", ",")
)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# swap every ';' in the owner entries for a comma ','
outdf['in_AllocationOwner'] = (
    outdf['in_AllocationOwner']
    .str.replace(";", ",")
)
outdf['in_AllocationOwner'].unique()

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return Val as a title-cased string without special characters.

    The value is converted to a string, the characters $ @ & . ; / ) ( - are
    removed, the result is title-cased, every run of two or more spaces is
    collapsed to a single space, and leading/trailing whitespace is stripped.
    """
    Val = str(Val)
    # raw string: "\)" in a plain string literal is an invalid escape sequence
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # removing characters can leave runs of several spaces; collapse them all
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val

In [None]:
# Clean the column directly (Series.apply) instead of building a full row object for every record
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Clean the column directly (Series.apply) instead of building a full row object for every record
outdf['in_County'] = outdf['in_County'].apply(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Clean the column directly (Series.apply) instead of building a full row object for every record
outdf['in_SiteName'] = outdf['in_SiteName'].apply(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Clean the column directly (Series.apply) instead of building a full row object for every record
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].apply(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return val as a stripped string, with blank or "nan" text mapped to ""."""
    cleaned = str(val).strip()
    return "" if cleaned in ("", "nan") else cleaned

In [None]:
# Normalize the column directly (Series.apply) instead of building a full row object for every record
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].apply(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Normalize the column directly (Series.apply) instead of building a full row object for every record
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].apply(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Normalize the column directly (Series.apply) instead of building a full row object for every record
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].apply(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Normalize the column directly (Series.apply) instead of building a full row object for every record
outdf['in_SiteName'] = outdf['in_SiteName'].apply(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Normalize the column directly (Series.apply) instead of building a full row object for every record
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].apply(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# Normalize the column directly (Series.apply), then list every distinct beneficial use in sorted order
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].apply(ensureEmptyString)
uniqueList = sorted({i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

In [None]:
# Ensure Latitude entry is numeric; non-numeric values and 0 become an empty string
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric; non-numeric values and 0 become an empty string
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Changing datatype of Priority Date to date fields entry
# values that cannot be parsed become NaT; normalize() sets the time of day to midnight
# without formatting every date to text and parsing it back
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce').dt.normalize()
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Ensure Flow entry is numeric and rounded to 2 decimals; non-numeric values and 0 become an empty string
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Ensure Volume entry is numeric and rounded to 2 decimals; non-numeric values and 0 become an empty string
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the WaDE custom ID for a counter value: "wId" followed by the value as text."""
    return "wId" + str(colRowValue)

# one row per distinct (water source name, water source type) pair found in outdf
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# number the distinct pairs 1..n (sharing dfTempID's index) and turn each number into a "wId<n>" string
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# lookup key is name and type joined with no separator
# NOTE(review): two different pairs whose joined text is identical would share one key - confirm this cannot occur
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
# linkKey -> generated ID, read by retrieveIdValueFunc
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID when there is one, otherwise the generated ID.

    checkVal is converted to a string and stripped; when it is not empty it is
    returned as is.  Otherwise valA and valB are each converted to a stripped
    string, joined without a separator, and looked up in the module-level
    IdDict (KeyError when the key is absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    lookupKey = "".join(str(part).strip() for part in (valA, valB))
    return IdDict[lookupKey]

# row by row: keep a non-blank in_WaterSourceNativeID (stripped), otherwise look up the generated ID by name + type
outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Build a WaDE custom ID by prefixing the text form of colRowValue with "wId"."""
    idSuffix = str(colRowValue)
    return "wId" + idSuffix

# one row per distinct (latitude, longitude, site name, site type) combination found in outdf
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# number the distinct combinations 1..n (sharing dfTempID's index) and turn each number into a "wId<n>" string
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# lookup key is the four text values joined with no separator
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
# linkKey -> generated ID; this rebinds the IdDict name used for water sources earlier
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID when there is one, otherwise the generated ID.

    checkVal is converted to a string and stripped; when it is not empty it is
    returned as is.  Otherwise valA..valD are each converted to a stripped
    string, joined without a separator, and looked up in the module-level
    IdDict (KeyError when the key is absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = [str(part).strip() for part in (valA, valB, valC, valD)]
    return IdDict["".join(keyParts)]

# row by row: keep a non-blank in_SiteNativeID (stripped), otherwise look up the generated ID
# by latitude + longitude + site name + site type
outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

## Drop non-Active AllocationLegalStatusCV Water Rights
- For MT, we don't want water rights whose legal status is one of: WITHDRAWN, TERMINATED, PENDING, CANCELLED, DISMISSED, SUSPENDED, DENIED, SEVERED, EXPIRED, REVOKED.

In [None]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# legal status values whose rows are removed
dropLegalStatusList = [
    'WITHDRAWN', 'TERMINATED', 'PENDING', 'CANCELLED', 'DISMISSED',
    'SUSPENDED', 'DENIED', 'SEVERED', 'EXPIRED', 'REVOKED',
]  # enter string entries here

# keep only the rows whose legal status is not in the list, then renumber the rows
outdf = outdf[~outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList)].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [None]:
# # PoU Shapefile Data
# shapefileInput = "RawInputData/shapefiles/{enter file name here}.zip" # ziped folder of the shp file

# dfPoUshapetemp = gpd.read_file(shapefileInput)
# dfPoUshapetemp['geometry'] = dfPoUshapetemp['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()


# use above POU input file
# NOTE(review): unlike the commented-out loader above, no to_crs(epsg=4326) is applied to this
# geometry - confirm the input shapefile is already in EPSG:4326
dfPoUshapetemp = dfinPOU.copy()
# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [None]:
# create temp dataframe to hold native ID and geometry from shapefile input
columnsList = ['in_SiteNativeID', 'geometry']
dfPoUshape = pd.DataFrame(columns=columnsList)

# assign values to the temp dataframe based on the shapefile input
# the in_SiteNativeID value is built the same way as for the POU info above: "u" + the 'index' column
dfPoUshape['in_SiteNativeID'] = "u" + dfPoUshapetemp['index'].astype(str).str.strip()
dfPoUshape['geometry'] = dfPoUshapetemp['geometry']

# exact duplicate (ID, geometry) rows are dropped, keeping the first; the row labels are left as they are
dfPoUshape = dfPoUshape.drop_duplicates()
print(len(dfPoUshape))
dfPoUshape.head()

## Export Data

In [None]:
# print the column names, non-null counts and dtypes of the final table before export
outdf.info()

In [None]:
# display the final table for a visual check
outdf

In [None]:
# Export the output dataframe
# change output name / abbreviation to match native state provider and wade data type
# both files are written as zip archives containing one csv each, without the row index
outdf.to_csv('RawInputData/Pwr_mtMain.zip', compression=dict(method='zip', archive_name='Pwr_Main.csv'), index=False)  # The output, save as a zip
dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.