# Pre-processing Site-Specific Public Supply Time Series data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
# All relative paths used later (RawInputData/...) are resolved against this folder.
workingDirString = "G:/Shared drives/WaDE Data/AmericanSamoa/SS_PublicSupplyWaterUse" # set working directory folder string here
os.chdir(workingDirString)
# interpolate the path into the f-string (the original f-string had no placeholder)
print(f'The working Directory is: {workingDirString}')

The working Directory is: G:/Shared drives/WaDE Data/AmericanSamoa/SS_PublicSupplyWaterUse


## Data Input

In [3]:
# Input File - 'Final_Aggregated_Data'
fileInput = "RawInputData/Final_Aggregated_Data.zip"
dfin1 = pd.read_csv(fileInput)
dfin1 = dfin1.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with "in1" + its row index and write
# the tagged table back to the same zip, so a later read finds the column and
# skips this step.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv(
        "RawInputData/Final_Aggregated_Data.zip",
        compression={"method": 'zip', "archive_name": "Final_Aggregated_Data.csv"},
        index=False,
    )

print(len(dfin1))
dfin1.head(1)

4487


Unnamed: 0,SiteNativeID,BeneficialUseCategory,TimeframeStart,TimeframeEnd,Amount,VariableCV,ReportYear,WaDEUUID
0,Aasu,Commercial,2021-12-01,2021-12-31,5008.0,Consumptive Use,2021,in10


In [4]:
# Input File - 'Sites'
fileInput = "RawInputData/Sites.zip"
dfin2 = pd.read_csv(fileInput)
dfin2 = dfin2.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# When the column is absent, tag each row with "in2" + its row index and write
# the tagged table back to the same zip, so a later read finds the column and
# skips this step.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv(
        "RawInputData/Sites.zip",
        compression={"method": 'zip', "archive_name": "Sites.csv"},
        index=False,
    )

print(len(dfin2))
dfin2.head(1)

134


Unnamed: 0,Lat,Long,SiteNativeID,SiteTypeCV,WaDEUUID
0,-14.28384,-170.55358,Aunuu,Village (aggregation of individual water mete...,in20


In [5]:
# Left-join input data into single dataframe

# Normalise the join key on both sides (cast to text, trim surrounding blanks)
# so that stray whitespace cannot prevent a match.
dfin1['SiteNativeID'] = dfin1['SiteNativeID'].astype(str).str.strip()
dfin2['SiteNativeID'] = dfin2['SiteNativeID'].astype(str).str.strip()

# Keep every time-series row; site attributes are attached where the key matches.
dfin = dfin1.merge(
    dfin2,
    how='left',
    left_on='SiteNativeID',
    right_on='SiteNativeID',
)
print(len(dfin))
dfin.head()

4487


Unnamed: 0,SiteNativeID,BeneficialUseCategory,TimeframeStart,TimeframeEnd,Amount,VariableCV,ReportYear,WaDEUUID_x,Lat,Long,SiteTypeCV,WaDEUUID_y
0,Aasu,Commercial,2021-12-01,2021-12-31,5008.0,Consumptive Use,2021,in10,-14.30227,-170.75793,Village (aggregation of individual water mete...,in251
1,Aasu,Domestic,2021-12-01,2021-12-31,641171.0,Consumptive Use,2021,in11,-14.30227,-170.75793,Village (aggregation of individual water mete...,in251
2,Aasu,Commercial,2022-01-01,2022-01-31,3592.0,Consumptive Use,2022,in12,-14.30227,-170.75793,Village (aggregation of individual water mete...,in251
3,Aasu,Domestic,2022-01-01,2022-01-31,511265.0,Consumptive Use,2022,in13,-14.30227,-170.75793,Village (aggregation of individual water mete...,in251
4,Aasu,Commercial,2022-02-01,2022-02-28,3323.0,Consumptive Use,2022,in14,-14.30227,-170.75793,Village (aggregation of individual water mete...,in251


## Work with Data

In [6]:
# determine POD or POU, use SiteType value

# trim surrounding white space from the site type before it is compared below
dfin = dfin.assign(SiteTypeCV=dfin['SiteTypeCV'].str.strip())

# assign PODorPOUSite value based on SiteTypeCV entry
def CheckPODOrPOUFunc(val):
    """Return "POD" when the trimmed text of `val` is exactly "Withdrawal", otherwise "POU".

    `val` is converted with str() first, so any input type is accepted; anything
    that is not the literal "Withdrawal" (including blanks and missing values)
    falls through to "POU".
    """
    return "POD" if str(val).strip() == "Withdrawal" else "POU"

# Classify each row from its site type, then show how many rows fall in each class.
dfin['in_PODorPOUSite'] = dfin['SiteTypeCV'].map(CheckPODOrPOUFunc)
dfin['in_PODorPOUSite'].value_counts()

in_PODorPOUSite
POU    3522
POD     965
Name: count, dtype: int64

In [7]:
# determine CommunityWaterSupplySystem name, break apart SiteNativeID to identify

def CreateCommunityWaterSupplySystemFunc(val):
    """Return the part of the trimmed text of `val` that precedes the first underscore.

    Text without an underscore is returned whole (after trimming).
    Example: "Aasu_Well_128" -> "Aasu".
    """
    head, _sep, _rest = str(val).strip().partition("_")
    return head

# Derive the system name from each site's native ID, then list the row count per system.
dfin['in_CommunityWaterSupplySystem'] = dfin['SiteNativeID'].map(CreateCommunityWaterSupplySystemFunc)
dfin['in_CommunityWaterSupplySystem'].value_counts()

in_CommunityWaterSupplySystem
Tafuna         226
Malaeloa       224
Pago Pago      181
Iliili         159
Pavaiai        151
              ... 
Alega           26
Onenoa          26
Masausi         26
Utumea East     26
Amaua           26
Name: count, Length: 64, dtype: int64

### POD Data

In [8]:
# create output POD dataframe
# Maps the merged input frame (dfin) onto the 'in_*' staging columns.
# No row filter is applied here: every dfin row is carried over, and the
# in_PODorPOUSite column tells POD rows from POU rows.
# Column order in the result follows the order of first assignment below.
df = pd.DataFrame()

# Data Assessment UUID
# WaDEUUID_x is the tracker column from the left side of the merge (the aggregated
# time-series input). Assigning a Series first also gives df its row index, so the
# scalar assignments that follow are broadcast to every row.
df['WaDEUUID'] = dfin['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "ASIssps_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "" # determine below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = dfin['VariableCV']

# Organization Info
df['in_OrganizationUUID'] = "ASIssps_OR1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = "Unspecified" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "Unspecified"
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # repeat of the assignment under WaterSource Info: same value, column position unchanged
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin['Lat']
df['in_Longitude'] = dfin['Long']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfin['in_PODorPOUSite']
df['in_SiteName'] = dfin['SiteNativeID']
df['in_SiteNativeID'] = dfin['SiteNativeID'] # will use same value as name as that is how AS keeps their records separate.
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin['SiteTypeCV']
df['in_StateCV'] = "AS"
df['in_USGSSiteID'] = ""
   
# Site VariableAmounts Info
df['in_Amount'] = dfin['Amount']
df['in_AllocationCropDutyAmount'] = ""
df['in_AssociatedNativeAllocationIDs'] = ""
df['in_BeneficialUseCategory'] = dfin['BeneficialUseCategory']
df['in_CommunityWaterSupplySystem'] = dfin['in_CommunityWaterSupplySystem'] # see above
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_Geometry'] = ""  # third assignment of in_Geometry; again the same empty value
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerGeneratedGWh'] = ""
df['in_PowerType'] = ""
df['in_PrimaryUseCategory'] = df['in_BeneficialUseCategory']  # primary use mirrors the beneficial use category
df['in_ReportYearCV'] =  dfin['ReportYear']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin['TimeframeEnd']
df['in_TimeframeStart'] = dfin['TimeframeStart']

# Work on a copy so df itself is left as built above.
outdf1 = df.copy()
# NOTE(review): WaDEUUID is generated per input row, so this is not expected to
# drop anything (row count printed below matches the input) - confirm if inputs change.
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

4487


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in10,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,5008.0,,,Commercial,Aasu,,,,,,,,,,Commercial,2021,,2021-12-31,2021-12-01
1,in11,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,641171.0,,,Domestic,Aasu,,,,,,,,,,Domestic,2021,,2021-12-31,2021-12-01
2,in12,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,3592.0,,,Commercial,Aasu,,,,,,,,,,Commercial,2022,,2022-01-31,2022-01-01
3,in13,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,511265.0,,,Domestic,Aasu,,,,,,,,,,Domestic,2022,,2022-01-31,2022-01-01
4,in14,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,3323.0,,,Commercial,Aasu,,,,,,,,,,Commercial,2022,,2022-02-28,2022-02-01


### POU Data

In [9]:
# repeat above

## Concatenate POD and POU Data.  Make needed changes

In [10]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here

# Stack the frames, drop exact duplicate rows, renumber the index from 0,
# and turn any remaining NaN into an empty string.
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

4487


## Clean Data / data types

In [11]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return a cleaned, title-cased version of the text of `Val`.

    Steps, in order: convert to str; delete every occurrence of the characters
    $ @ & . ; / ) ( - ; title-case the result; collapse each double space into a
    single space (one pass); trim surrounding whitespace; drop trailing commas.
    """
    Val = str(Val)
    # Raw string: in a normal literal "\)" is an invalid escape sequence and
    # makes Python emit a SyntaxWarning when the cell is compiled.
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

  Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')


In [12]:
# Strip special characters from the water source names, then list the distinct results.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

array(['Unspecified'], dtype=object)

In [13]:
# Strip special characters from the county names, then list the distinct results.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

array([''], dtype=object)

In [14]:
# Strip special characters from the site names, then list the distinct results.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

array(['Aasu', 'Afao', 'Afono', 'Agugulu', 'Alao', 'Alega', 'Alofau',
       'Amaluia', 'Amanave', 'Amaua', 'Amouli', 'Aoa', 'Aoloau', 'Asili',
       'Atuu', 'Aua', 'Auasi', 'Aunuu', 'Auto', 'Avaio', 'Fagaalu',
       'Fagaitua', 'Fagalii', 'Fagamalo', 'Faganeanea', 'Fagasa',
       'Fagatogo', 'Failolo', 'Faleniu', 'Fatumafuti', 'Futiga', 'Iliili',
       'Laulii', 'Leloaloa', 'Leone', 'Malaeimi', 'Malaeloa', 'Maloata',
       'Mapusagafou', 'Masausi', 'Masefau', 'Matuu', 'Mesepa', 'Nua',
       'Nuuuli', 'Ofu', 'Olosega', 'Onenoa', 'Pagai', 'Pago Pago',
       'Pavaiai', 'Poloa', 'Sailele', 'Seetaga', 'Tafuna', 'Taputimu',
       'Tau', 'Tula', 'Utulei', 'Utumea East', 'Utumea West', 'Vailoatai',
       'Vaitogi', 'Vatia', 'Aasu_Well_128', 'Afono_Well_176',
       'Alao_Well_161', 'Aoa_Well_151', 'Aoa_Well_152', 'Aua_Well_97',
       'Aua_Well_99', 'Aunuu_Well_302', 'Fagaalu_Well_179',
       'Fagaitua_Well_164', 'Fagasa_Well_143', 'Fagasa_Well_144',
       'Fagatogo_Well_101', 'Ili

In [15]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return the trimmed text of `val`, or "" when `val` carries no value.

    "No value" means: a missing scalar (None, NaN, NaT, pd.NA), a blank or
    whitespace-only string, or the literal text "nan". Anything else is
    returned as str(val) with surrounding whitespace removed.
    """
    # The null test has to look at the raw value: after str() a missing value is
    # ordinary text ("None", "NaT", ...) and pd.isnull() can no longer detect it.
    # is_scalar() guards against list-like input, for which isnull() returns an array.
    if pd.api.types.is_scalar(val) and pd.isnull(val):
        return ""
    text = str(val).strip()
    if text == "" or text == "nan":
        return ""
    return text

In [16]:
# Normalise blank / "nan" water source names to "", then list the distinct results.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['Unspecified'], dtype=object)

In [17]:
# Normalise blank / "nan" water source types to "", then list the distinct results.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater'], dtype=object)

In [18]:
# Normalise blank / "nan" site types to "", then list the distinct results.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

array(['Village (aggregation of individual water meter use within each village boundary)',
       '', 'Withdrawal'], dtype=object)

In [19]:
# Normalise blank / "nan" site names to "", then list the distinct results.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

array(['Aasu', 'Afao', 'Afono', 'Agugulu', 'Alao', 'Alega', 'Alofau',
       'Amaluia', 'Amanave', 'Amaua', 'Amouli', 'Aoa', 'Aoloau', 'Asili',
       'Atuu', 'Aua', 'Auasi', 'Aunuu', 'Auto', 'Avaio', 'Fagaalu',
       'Fagaitua', 'Fagalii', 'Fagamalo', 'Faganeanea', 'Fagasa',
       'Fagatogo', 'Failolo', 'Faleniu', 'Fatumafuti', 'Futiga', 'Iliili',
       'Laulii', 'Leloaloa', 'Leone', 'Malaeimi', 'Malaeloa', 'Maloata',
       'Mapusagafou', 'Masausi', 'Masefau', 'Matuu', 'Mesepa', 'Nua',
       'Nuuuli', 'Ofu', 'Olosega', 'Onenoa', 'Pagai', 'Pago Pago',
       'Pavaiai', 'Poloa', 'Sailele', 'Seetaga', 'Tafuna', 'Taputimu',
       'Tau', 'Tula', 'Utulei', 'Utumea East', 'Utumea West', 'Vailoatai',
       'Vaitogi', 'Vatia', 'Aasu_Well_128', 'Afono_Well_176',
       'Alao_Well_161', 'Aoa_Well_151', 'Aoa_Well_152', 'Aua_Well_97',
       'Aua_Well_99', 'Aunuu_Well_302', 'Fagaalu_Well_179',
       'Fagaitua_Well_164', 'Fagasa_Well_143', 'Fagasa_Well_144',
       'Fagatogo_Well_101', 'Ili

In [20]:
# Normalise blank / "nan" county values to "", then list the distinct results.
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

array([''], dtype=object)

In [21]:
# Normalise blank / "nan" beneficial-use entries to "".
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)

# An entry may hold several comma-separated categories: join all entries,
# split on commas, trim each piece, and show the distinct pieces in sorted order.
uniqueList = sorted({
    piece.strip()
    for piece in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')
})
uniqueList

['Commercial', 'Domestic', 'Industrial', 'Production']

In [22]:
# Ensure Latitude entry is numeric; 0 and unparseable values become "" for removal
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Latitude'].unique()

array([-14.302265, -14.322683, -14.260274, -14.326378, -14.262692,
       -14.275551, -14.276086, -14.323087, -14.323255, -14.268287,
       -14.273231, -14.263652, -14.303573, -14.321193, -14.268744,
       -14.271191, -14.268337, -14.283838, -14.272675, -14.274369,
       -14.290319, -14.265926, -14.312442, -14.30108, -14.299348,
       -14.288178, -14.283737, -14.325378, -14.327548, -14.296449,
       -14.354383, -14.344307, -14.283528, -14.26617, -14.334376,
       -14.309598, '', -14.30854, -14.31788, -14.259672, -14.259154,
       -14.295858, -14.319156, -14.32141, -14.307647, -14.254069,
       -14.268132, -14.274006, -14.333156, -14.317759, -14.261929,
       -14.321285, -14.330737, -14.352669, -14.252489, -14.283568,
       -14.268461, -14.32385, -14.357329, -14.354405, -14.253836,
       -14.309801, -14.26778, -14.26244, -14.26502, -14.26459, -14.2687,
       -14.26776, -14.286658, -14.291027, -14.26756, -14.29271,
       -14.290934, -14.28003, -14.341723, -14.347566, -14.346

In [23]:
# Ensure Longitude entry is numeric; 0 and unparseable values become "" for removal
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Longitude'].unique()

array([-170.757932, -170.80129, -170.651214, -170.820111, -170.569131,
       -170.640331, -170.600952, -170.78905, -170.833062, -170.626089,
       -170.588341, -170.586874, -170.77944, -170.794625, -170.68572,
       -170.662121, -170.577498, -170.553583, -170.630945, -170.635282,
       -170.685467, -170.615297, -170.824097, -170.798623, -170.696623,
       -170.724944, -170.693464, -170.824424, -170.743654, -170.678754,
       -170.760214, -170.740577, -170.656751, -170.678005, -170.779596,
       -170.735128, '', -170.811071, -170.751117, -170.608484, -170.6314,
       -170.68974, -170.743009, -170.807108, -170.713961, -170.578844,
       -170.604702, -170.70403, -170.7527, -170.832867, -170.598508,
       -170.812586, -170.725407, -170.770655, -170.56933, -170.683608,
       -170.571409, -170.817104, -170.77675, -170.740559, -170.675167,
       -170.771491, -170.65048, -170.56889, -170.58598, -170.58577,
       -170.66118, -170.6607, -170.556946, -170.688929, -170.61795,
       -

In [24]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
outdf['in_Amount'] = (
    pd.to_numeric(outdf['in_Amount'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_Amount'].unique()

array([5008.0, 641171.0, 3592.0, ..., 865000.0, 904000.0, 883000.0],
      dtype=object)

In [25]:
# Ensure PopulationServed entry is numeric WITH 0 entries (no blank strings):
# unparseable values become 0, everything is rounded to a whole number and
# stored as its text form.
outdf['in_PopulationServed'] = (
    pd.to_numeric(outdf['in_PopulationServed'], errors='coerce')
    .round()
    .replace("", 0)
    .fillna(0)
    .astype(int)
    .astype(str)
)
outdf['in_PopulationServed'].unique()

array(['0'], dtype=object)

In [26]:
# Convert TimeframeEnd to a timezone-naive date (YYYY-MM-DD at midnight).
# Values are parsed as UTC; entries that cannot be parsed become NaT and are kept
# as NaT. (Filling them with "" would turn the column into object dtype and the
# .dt accessor below would then raise.)
outdf['in_TimeframeEnd'] = (
    pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the UTC wall-clock value
    .dt.normalize()        # set the time of day to 00:00:00
)
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
['2021-12-31 00:00:00', '2022-01-31 00:00:00', '2022-02-28 00:00:00',
 '2022-03-31 00:00:00', '2022-04-30 00:00:00', '2022-05-31 00:00:00',
 '2022-06-30 00:00:00', '2022-07-31 00:00:00', '2022-08-31 00:00:00',
 '2022-09-30 00:00:00', '2022-11-30 00:00:00', '2022-12-31 00:00:00',
 '2023-01-31 00:00:00', '2023-02-28 00:00:00', '2023-03-31 00:00:00',
 '2023-04-30 00:00:00', '2023-05-31 00:00:00', '2023-06-30 00:00:00',
 '2023-07-31 00:00:00', '2023-08-31 00:00:00', '2023-09-30 00:00:00',
 '2023-10-31 00:00:00', '2023-11-30 00:00:00', '2023-12-31 00:00:00',
 '2024-01-31 00:00:00', '2024-02-29 00:00:00', '2024-04-30 00:00:00',
 '2022-10-31 00:00:00', '2021-11-30 00:00:00', '2024-05-31 00:00:00',
 '2024-03-31 00:00:00', '2021-10-31 00:00:00']
Length: 32, dtype: datetime64[ns]

In [27]:
# Convert TimeframeStart to a timezone-naive date (YYYY-MM-DD at midnight).
# Values are parsed as UTC; entries that cannot be parsed become NaT and are kept
# as NaT. (Filling them with "" would turn the column into object dtype and the
# .dt accessor below would then raise.)
outdf['in_TimeframeStart'] = (
    pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
    .dt.tz_localize(None)  # drop the UTC marker, keep the UTC wall-clock value
    .dt.normalize()        # set the time of day to 00:00:00
)
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
['2021-12-01 00:00:00', '2022-01-01 00:00:00', '2022-02-01 00:00:00',
 '2022-03-01 00:00:00', '2022-04-01 00:00:00', '2022-05-01 00:00:00',
 '2022-06-01 00:00:00', '2022-07-01 00:00:00', '2022-08-01 00:00:00',
 '2022-09-01 00:00:00', '2022-11-01 00:00:00', '2022-12-01 00:00:00',
 '2023-01-01 00:00:00', '2023-02-01 00:00:00', '2023-03-01 00:00:00',
 '2023-04-01 00:00:00', '2023-05-01 00:00:00', '2023-06-01 00:00:00',
 '2023-07-01 00:00:00', '2023-08-01 00:00:00', '2023-09-01 00:00:00',
 '2023-10-01 00:00:00', '2023-11-01 00:00:00', '2023-12-01 00:00:00',
 '2024-01-01 00:00:00', '2024-02-01 00:00:00', '2024-04-01 00:00:00',
 '2022-10-01 00:00:00', '2021-11-01 00:00:00', '2024-05-01 00:00:00',
 '2024-03-01 00:00:00', '2021-10-01 00:00:00']
Length: 32, dtype: datetime64[ns]

In [28]:
# Store the report year as whole-number text; blank or missing years become "0"
outdf['in_ReportYearCV'] = (
    outdf['in_ReportYearCV']
    .replace("", 0)
    .fillna(0)
    .astype(int)
    .astype(str)
)
outdf['in_ReportYearCV'].unique()

array(['2021', '2022', '2023', '2024'], dtype=object)

In [29]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Build the variable-specific label "<inV>_<inAIU>_<inPU>_<inWST>".

    Each argument is converted to str and trimmed; `inPU` is additionally
    title-cased. The four parts are joined with underscores.
    """
    parts = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(parts)

# Build the variable-specific label for every row from its four source columns,
# then list the distinct labels.
outdf['in_VariableSpecificCV'] = [
    createVariableSpecificCV(variableCV, intervalUnit, beneficialUse, sourceType)
    for variableCV, intervalUnit, beneficialUse, sourceType in zip(
        outdf['in_VariableCV'],
        outdf['in_AggregationIntervalUnitCV'],
        outdf['in_BeneficialUseCategory'],
        outdf['in_WaterSourceTypeCV'],
    )
]
outdf['in_VariableSpecificCV'].unique()

array(['Consumptive Use_Monthly_Commercial_Groundwater',
       'Consumptive Use_Monthly_Domestic_Groundwater',
       'Consumptive Use_Monthly_Industrial_Groundwater',
       'Withdrawal_Monthly_Production_Groundwater'], dtype=object)

In [30]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return "wadeId" followed by the text form of `colRowValue`."""
    prefix = "wadeId"
    return prefix + str(colRowValue)

# One row per distinct (water source name, water source type) pair, compared as trimmed text.
dfTempID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'].astype(str).str.strip(),
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the distinct pairs 1..N in order of first appearance and turn each number into an ID.
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID.index) + 1)}, index=dfTempID.index)
dfTempID['in_WaterSourceNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# Lookup: concatenated name + type  ->  generated ID.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_WaterSourceNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID if there is one, otherwise look one up in IdDict.

    `checkVal` is converted to str and trimmed; if it is non-empty it is returned
    as is. Otherwise the key is the trimmed text of `valA` followed by the trimmed
    text of `valB`, and the matching IdDict entry is returned (KeyError if absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

# Fill blank water source native IDs from the lookup (non-blank IDs are kept),
# then list the distinct IDs.
outdf['in_WaterSourceNativeID'] = [
    retrieveIdValueFunc(nativeId, sourceName, sourceType)
    for nativeId, sourceName, sourceType in zip(
        outdf['in_WaterSourceNativeID'],
        outdf['in_WaterSourceName'],
        outdf['in_WaterSourceTypeCV'],
    )
]
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1'], dtype=object)

In [31]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return "wadeId" followed by the text form of `colRowValue`.

    Same behavior as the earlier definition of this name in the notebook.
    """
    return "".join(["wadeId", str(colRowValue)])

# One row per distinct (latitude, longitude, site name, site type) combination, compared as trimmed text.
dfTempID = pd.DataFrame({
    'in_Latitude': outdf['in_Latitude'].astype(str).str.strip(),
    'in_Longitude': outdf['in_Longitude'].astype(str).str.strip(),
    'in_SiteName': outdf['in_SiteName'].astype(str).str.strip(),
    'in_SiteTypeCV': outdf['in_SiteTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the distinct combinations 1..N in order of first appearance and turn each number into an ID.
dfTempCount = pd.DataFrame({"Count": range(1, len(dfTempID.index) + 1)}, index=dfTempID.index)
dfTempID['in_SiteNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# Lookup: concatenated latitude + longitude + name + type  ->  generated ID.
dfTempID['linkKey'] = (
    dfTempID['in_Latitude'].astype(str)
    + dfTempID['in_Longitude'].astype(str)
    + dfTempID['in_SiteName'].astype(str)
    + dfTempID['in_SiteTypeCV'].astype(str)
)
IdDict = dict(zip(dfTempID['linkKey'].astype(str), dfTempID['in_SiteNativeID'].values))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID if there is one, otherwise look one up in IdDict.

    `checkVal` is converted to str and trimmed; if it is non-empty it is returned
    as is. Otherwise the key is the trimmed text of `valA`, `valB`, `valC` and
    `valD` concatenated in that order, and the matching IdDict entry is returned
    (KeyError if absent).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

# Fill blank site native IDs from the lookup (non-blank IDs are kept),
# then list the distinct IDs.
outdf['in_SiteNativeID'] = [
    retrieveIdValueFunc(nativeId, latitude, longitude, siteName, siteType)
    for nativeId, latitude, longitude, siteName, siteType in zip(
        outdf['in_SiteNativeID'],
        outdf['in_Latitude'],
        outdf['in_Longitude'],
        outdf['in_SiteName'],
        outdf['in_SiteTypeCV'],
    )
]
outdf['in_SiteNativeID'].unique()

array(['Aasu', 'Afao', 'Afono', 'Agugulu', 'Alao', 'Alega', 'Alofau',
       'Amaluia', 'Amanave', 'Amaua', 'Amouli', 'Aoa', 'Aoloau', 'Asili',
       'Atuu', 'Aua', 'Auasi', 'Aunuu', 'Auto', 'Avaio', 'Fagaalu',
       'Fagaitua', 'Fagalii', 'Fagamalo', 'Faganeanea', 'Fagasa',
       'Fagatogo', 'Failolo', 'Faleniu', 'Fatumafuti', 'Futiga', 'Iliili',
       'Laulii', 'Leloaloa', 'Leone', 'Malaeimi', 'Malaeloa', 'Maloata',
       'Mapusagafou', 'Masausi', 'Masefau', 'Matuu', 'Mesepa', 'Nua',
       'Nuuuli', 'Ofu', 'Olosega', 'Onenoa', 'Pagai', 'Pago Pago',
       'Pavaiai', 'Poloa', 'Sailele', 'Seetaga', 'Tafuna', 'Taputimu',
       'Tau', 'Tula', 'Utulei', 'Utumea East', 'Utumea West', 'Vailoatai',
       'Vaitogi', 'Vatia', 'Aasu_Well_128', 'Afono_Well_176',
       'Alao_Well_161', 'Aoa_Well_151', 'Aoa_Well_152', 'Aua_Well_97',
       'Aua_Well_99', 'Aunuu_Well_302', 'Fagaalu_Well_179',
       'Fagaitua_Well_164', 'Fagasa_Well_143', 'Fagasa_Well_144',
       'Fagatogo_Well_101', 'Ili

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [32]:
# # PoU Shapefile Data
# # see above for input

# print(len(dfPoUshapetemp))
# dfPoUshapetemp.head()

In [33]:
# # create temp dataframe to hold native ID and geometry from shapefile input
# columnsList = ['in_SiteNativeID', 'geometry']
# dfPoUshape = pd.DataFrame(columns=columnsList)

# # assign values to temp dataframe based on shapefile input
# # for in_SiteNativeID, ensure the ID value is the same as that listed above for POU info.
# dfPoUshape['in_SiteNativeID'] = # "u" + Use same ID as up above 
# dfPoUshape['geometry'] = dfPoUshapetemp['geometry']
# dfPoUshape = dfPoUshape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
# print(len(dfPoUshape))
# dfPoUshape.head()

## Export Outputs

In [34]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 50 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   WaDEUUID                          4487 non-null   object        
 1   in_MethodUUID                     4487 non-null   object        
 2   in_VariableSpecificUUID           4487 non-null   object        
 3   in_AggregationIntervalUnitCV      4487 non-null   object        
 4   in_VariableCV                     4487 non-null   object        
 5   in_OrganizationUUID               4487 non-null   object        
 6   in_Geometry                       4487 non-null   object        
 7   in_GNISFeatureNameCV              4487 non-null   object        
 8   in_WaterQualityIndicatorCV        4487 non-null   object        
 9   in_WaterSourceName                4487 non-null   object        
 10  in_WaterSourceNativeID            4487 non-null 

In [35]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_Amount,in_AllocationCropDutyAmount,in_AssociatedNativeAllocationIDs,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_IrrigatedAcreage,in_IrrigationMethodCV,in_PopulationServed,in_PowerGeneratedGWh,in_PowerType,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,in10,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,5008.00000,,,Commercial,Aasu,,,,,,,0,,,Commercial,2021,,2021-12-31,2021-12-01,Consumptive Use_Monthly_Commercial_Groundwater
1,in11,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,641171.00000,,,Domestic,Aasu,,,,,,,0,,,Domestic,2021,,2021-12-31,2021-12-01,Consumptive Use_Monthly_Domestic_Groundwater
2,in12,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,3592.00000,,,Commercial,Aasu,,,,,,,0,,,Commercial,2022,,2022-01-31,2022-01-01,Consumptive Use_Monthly_Commercial_Groundwater
3,in13,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,511265.00000,,,Domestic,Aasu,,,,,,,0,,,Domestic,2022,,2022-01-31,2022-01-01,Consumptive Use_Monthly_Domestic_Groundwater
4,in14,ASIssps_M1,,Monthly,Consumptive Use,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.30227,-170.75793,,,POU,Aasu,Aasu,,Village (aggregation of individual water meter...,AS,,3323.00000,,,Commercial,Aasu,,,,,,,0,,,Commercial,2022,,2022-02-28,2022-02-01,Consumptive Use_Monthly_Commercial_Groundwater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4482,in14482,ASIssps_M1,,Monthly,Withdrawal,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.25476,-170.66857,,,POD,Vatia_Well_180,Vatia_Well_180,,Withdrawal,AS,,937000.00000,,,Production,Vatia,,,,,,,0,,,Production,2023,,2023-11-30,2023-11-01,Withdrawal_Monthly_Production_Groundwater
4483,in14483,ASIssps_M1,,Monthly,Withdrawal,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.25476,-170.66857,,,POD,Vatia_Well_180,Vatia_Well_180,,Withdrawal,AS,,869000.00000,,,Production,Vatia,,,,,,,0,,,Production,2023,,2023-12-31,2023-12-01,Withdrawal_Monthly_Production_Groundwater
4484,in14484,ASIssps_M1,,Monthly,Withdrawal,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.25476,-170.66857,,,POD,Vatia_Well_180,Vatia_Well_180,,Withdrawal,AS,,865000.00000,,,Production,Vatia,,,,,,,0,,,Production,2024,,2024-01-31,2024-01-01,Withdrawal_Monthly_Production_Groundwater
4485,in14485,ASIssps_M1,,Monthly,Withdrawal,ASIssps_OR1,,,Fresh,Unspecified,wadeId1,Groundwater,,Unspecified,,4326,,,,-14.25476,-170.66857,,,POD,Vatia_Well_180,Vatia_Well_180,,Withdrawal,AS,,904000.00000,,,Production,Vatia,,,,,,,0,,,Production,2024,,2024-02-29,2024-02-01,Withdrawal_Monthly_Production_Groundwater


In [36]:
# Export the output dataframe as a zipped CSV (index column not written)
outdf.to_csv(
    'RawInputData/Pssps_Main.zip',
    compression={'method': 'zip', 'archive_name': 'Pssps_Main.csv'},
    index=False,
)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.