# Create WaDE Uploader csv Files for Water Supply Site Time Series data for WaDE
- Last Updated: 01/19/2024
- Purpose: To create necessary processed WaDE Uploader csv files for reservoir and observation site information.

In [1]:
# Needed Libraries / Modules
import sys
import os
import numpy as np
import pandas as pd

## Custom Libraries
sys.path.append("../../5_CustomFunctions/MappingFunctions")
import CreateWaterSourcesFile
import CreateSitesFile
import CreateSiteSpecificAmounts_factFile
import RemoveUnusedRecordsFile

## Input Data (make changes here)
 - create and add variables specific to this project here

In [2]:
# ---- working directory ----
# The process working directory is switched to this folder, so the relative
# mainInputFile path below resolves against it.
workingDirString = "G:/Shared drives/WaDE Data/WaDE Data Folder/Montana/WaterSupply_SiteSpecific"  # set working directory folder string here
os.chdir(workingDirString)
print(f"The working Directory is: {workingDirString}")

# ---- Inputs ----
varST = "MT" # source organization or state abbreviation
varUUIDType = "wsss" # UUID data type abbreviation
mainInputFile = "RawinputData/Pwsss_Main.zip" # use processed zip file

The working Directory is: G:/Shared drives/WaDE Data/WaDE Data Folder/Montana/WaterSupply_SiteSpecific


In [3]:
# ---- Load the zipped main input csv and report its in-memory size ----
df = pd.read_csv(mainInputFile, compression='zip')
rawMemoryMB = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MB
print(f"Total memory: {rawMemoryMB:.2f} MB")

Total memory: 18534.24 MB


In [4]:
# ---- Optimize data types of mainInputFile csv file ----
def optimize_dataframe(df):
    for col in df.select_dtypes(include='int'):
        if (df[col] >= 0).all():
            df[col] = pd.to_numeric(df[col], downcast='unsigned')
        else:
            df[col] = pd.to_numeric(df[col], downcast='integer')

    for col in df.select_dtypes(include='float'):
        df[col] = pd.to_numeric(df[col], downcast='float')

    for col in df.select_dtypes(include='object'):
        num_unique_values = df[col].nunique()
        num_total_values = len(df[col])
        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype('category')

    return df

# Tighten column dtypes, then report the resulting in-memory size.
df = optimize_dataframe(df)
optimizedMemoryMB = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MB
print(f"Total memory: {optimizedMemoryMB:.2f} MB")

Total memory: 2239.62 MB


## Create CSV Input Files

In [5]:
# ---- Method (methods.csv) ----
# Simple info, create by hand and save in ProcessedInputData folder.

In [6]:
# ---- Variable (variables.csv) ----
# Simple info, create by hand and save in ProcessedInputData folder.

In [7]:
# ---- Organization (organizations.csv) ----
# Simple info, create by hand and save in ProcessedInputData folder.

In [8]:
# ---- Water Sources (watersources.csv) ----
# Passes the working directory, organization abbreviation, UUID type and the
# dtype-optimized dataframe to the custom CreateWaterSourcesFile module.
# The step-by-step log shown under this cell is printed by that function.
print("Creating Water Sources input csv...")
print("############################################################################")
CreateWaterSourcesFile.CreateWaterSourcesInputFunction(workingDirString, varST, varUUIDType, df)

Creating Water Sources input csv...
############################################################################
Setting inputs...
Populating dataframe...
Geometry
GNISFeatureNameCV
WaterQualityIndicatorCV
WaterSourceName
WaterSourceNativeID
WaterSourceTypeCV
Adding Data Assessment UUID
Resetting Index
GroupBy outdf duplicates based on key fields...
Error checking each field. Purging bad inputs.
Length of outdf DataFrame:  2
Length of dfpurge DataFrame:  0
Assign WaterSourceUUID
Cleaning export for correct data types...
Exporting dataframe...
Done


  outdf = outdf.groupby('WaterSourceNativeID').agg(lambda x: ','.join([str(elem) for elem in (list(set(x))) if elem != ""])).replace(np.nan, "").reset_index()


In [9]:
# ---- Sites (sites.csv) ----
# Passes the working directory, organization abbreviation, UUID type and the
# dtype-optimized dataframe to the custom CreateSitesFile module.
# The step-by-step log shown under this cell is printed by that function.
print("Creating Sites input csv...")
print("############################################################################")
CreateSitesFile.CreateSitesInputFunction(workingDirString, varST, varUUIDType, df)

Creating Sites input csv...
############################################################################
Setting inputs...
...no geometry data to worry about.
Populating dataframe...
WaterSourceUUIDs
OverlayUUIDs
CoordinateAccuracy
CoordinateMethodCV
County
EPSGCodeCV
Geometry
GNISCodeCV
HUC12
HUC8
Latitude
Longitude
NHDNetworkStatusCV
NHDProductCV
PODorPOUSite
SiteName
SiteNativeID
SitePoint
SiteTypeCV
StateCV
USGSSiteID
Adding Data Assessment UUID
Resetting Index
GroupBy outdf duplicates based on key fields...


  outdf = outdf.groupby('SiteNativeID').agg(lambda x: ','.join([str(elem) for elem in (list(set(x))) if elem!=''])).replace(np.nan, "").reset_index()


Error checking each field. Purging bad inputs.
Length of outdf DataFrame:  281
Length of dfpurge DataFrame:  0
Assign SiteUUID
Cleaning export for correct data types...
Exporting dataframe...
Done


In [10]:
# ---- Site Specific Amounts (sitespecificamounts.csv) ----
# Unlike the water source and site cells, this call takes only the working
# directory and the dataframe (no varST / varUUIDType arguments).
# NOTE(review): the recorded run kept 619263 rows and purged 6568153 --
# confirm that purge ratio is expected for this dataset.
print("Creating Site Specific Amounts input csv...")
print("############################################################################")
CreateSiteSpecificAmounts_factFile.CreateSiteSpecificAmounts_factsInputFunction(workingDirString, df)

Creating Site Specific Amounts input csv...
############################################################################
Setting inputs...
Populating dataframe outdf...
MethodUUID
VariableSpecificUUID
OrganizationUUID
WaterSourceUUID
SiteUUID
Amount
AllocationCropDutyAmount
AssociatedNativeAllocationIDs
BeneficialUseCategory
CommunityWaterSupplySystem
CropTypeCV
CustomerTypeCV
DataPublicationDate
DataPublicationDOI
Geometry
IrrigatedAcreage
IrrigationMethodCV
PopulationServed
PowerGeneratedGWh
PowerType
PrimaryUseCategory
ReportYearCV
SDWISIdentifier
TimeframeEnd
TimeframeStart
Adding Data Assessment UUID
Resetting Index
Error checking each field. Purging bad inputs.


  selectionVar = (dfx['Amount'].replace("", 0).fillna(0).astype(float) <= 0.0)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)
  dfy = pd.concat([dfy, mask]).reset_index(drop=True)


Length of outdf DataFrame:  619263
Length of dfpurge DataFrame:  6568153
Cleaning export for correct data types...
Exporting dataframe...
... 6568153  records removed.
Done


In [11]:
# ---- Remove unused records ----
# Only the working directory is passed in; the function's log starts with
# "Reading input csv...", so it presumably re-reads the csv files written by
# the cells above rather than using the in-memory dataframe -- TODO confirm.
print("Remove unused Water Sources and Sites records not found within sitespecificamountsinput.csv...")
print("############################################################################")
RemoveUnusedRecordsFile.RemoveUnusedSiteSpecificAmountsRecordsFileFunction(workingDirString)

Remove unused Water Sources and Sites records not found within sitespecificamountsinput.csv...
############################################################################
Reading input csv...
Length of dfs before removing sites:  281


  dfspurge = pd.concat(frames).reset_index(drop=True)


Length of dfs after removing sites:  265
Length of dfws before removing water sources:  2
Length of dfws after removing water sources:  2
Export Files - watersource.csv, watersource_missing.csv, sites.csv, sites_missing.csv, sitespecificamounts.csv
Done
