## Create podsitetopousiterelationships.csv file
Last Updated: 11/15/2022

Purpose: To create UT site specific public supply site POD and POU relation information and populate dataframe for WaDEQA 2.0.

Notes:    
- read in main processed input file, sites, and sitespecificamounts.
- separate out sites from POU to POD
- attach time info from sitespecificamounts to both POU and POD site dataframes, then attach linking element from main processed input fill.
- Attach POU dataframe to POD datafame if a common linking element.

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Utah/SS_PublicSupplyWaterUse"
os.chdir(workingDir)

### Input Files

In [3]:
# Read in main processed input file
fileInput = "RawInputData/P_MasterUTSiteSpecific.csv"
dfmp = pd.read_csv(fileInput)
print(len(dfmp))
dfmp.head(1)

585719


  dfmp = pd.read_csv(fileInput)


Unnamed: 0,in_VariableCV,in_VariableSpecificCV,in_Amount,in_PopulationServed,in_BenUse,in_ReportYearCV,in_TimeframeEnd,in_TimeframeStart,in_WaterSourceTypeCV,linkKey,in_CoordinateMethodCV,in_County,in_Latitude,in_Longitude,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SiteTypeCV,monthCheck,in_WaterSourceNativeID
0,Delivered Water Use,Delivered Water Use_Annual_Domestic_Unspecified,4724846.0,28,Domestic,2021,2021-12-31,2021-01-01,Unspecified,11843,,,0.0,0.0,,,,,,WaDEUT_WS1


In [4]:
# Read in sites file
fileInput = "ProcessedInputData/Sites.csv"
dfs = pd.read_csv(fileInput)
print(len(dfs))
dfs.head(1)

4432


Unnamed: 0,SiteUUID,RegulatoryOverlayUUIDs,WaterSourceUUIDs,CoordinateAccuracy,CoordinateMethodCV,County,EPSGCodeCV,Geometry,GNISCodeCV,HUC12,HUC8,Latitude,Longitude,NHDNetworkStatusCV,NHDProductCV,PODorPOUSite,SiteName,SiteNativeID,SitePoint,SiteTypeCV,StateCV,USGSSiteID
0,UTssps_S1,,UTssps_WS3,,Representation Node,,4326,,,,,40.533785,-111.831387,,,POD,Well A (middle well),POD100000001,,Well,UT,


In [5]:
# Read in sitespecificamounts file
fileInput = "ProcessedInputData/sitespecificamounts.csv"
dfsa = pd.read_csv(fileInput)
dfsa = dfsa.assign(SiteUUID=dfsa['SiteUUID'].str.split(',')).explode('SiteUUID').reset_index(drop=True)
print(len(dfsa))
dfsa.head(1)

386961


Unnamed: 0,MethodUUID,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,Amount,AllocationCropDutyAmount,AssociatedNativeAllocationIDs,BeneficialUseCategory,CommunityWaterSupplySystem,CropTypeCV,CustomerTypeCV,DataPublicationDate,DataPublicationDOI,Geometry,IrrigatedAcreage,IrrigationMethodCV,PopulationServed,PowerGeneratedGWh,PowerType,PrimaryUseCategory,ReportYearCV,SDWISIdentifier,TimeframeEnd,TimeframeStart
0,UTssps_M1,UTssps_O1,UTssps_S4214,UTssps_V3,UTssps_WS1,1228800.0,,,Domestic,,,Public,11/15/2022,,,,,14,,,Unspecified,2021,,2021-12-31,2021-01-01


### Extract Data

In [6]:
# extract unique in_SiteNativeID and linkKey from main processed input file
dfmp2 = dfmp[['in_SiteNativeID', 'linkKey']]
dfmp2 = dfmp2.dropna().drop_duplicates().reset_index(drop=True)
print(len(dfmp2))
dfmp2.head(2)

4848


Unnamed: 0,in_SiteNativeID,linkKey
0,POU11830,11830
1,POU11815,11815


In [7]:
#create POU site file
dfspou = dfs[['SiteUUID', 'SiteNativeID', 'PODorPOUSite']]
dfspou = dfspou[dfspou['PODorPOUSite'] == 'POU']
dfspou = dfspou.dropna().drop_duplicates().reset_index(drop=True)
print(len(dfspou))
dfspou.head(2)

566


Unnamed: 0,SiteUUID,SiteNativeID,PODorPOUSite
0,UTssps_S3867,POU1000,POU
1,UTssps_S3868,POU1001,POU


In [8]:
#create POD site file
dfspod = dfs[['SiteUUID', 'SiteNativeID', 'PODorPOUSite']]
dfspod = dfspod[dfspod['PODorPOUSite'] == 'POD']
dfspou = dfspou.dropna().drop_duplicates().reset_index(drop=True)
print(len(dfspou))
dfspou.head(2)

566


Unnamed: 0,SiteUUID,SiteNativeID,PODorPOUSite
0,UTssps_S3867,POU1000,POU
1,UTssps_S3868,POU1001,POU


In [9]:
# extract useable site info with StartDate & EndDate from sitespecificamounts file
dfsa2 = dfsa[['SiteUUID', 'TimeframeStart', 'TimeframeEnd']]
dfsa2 = dfsa2.dropna().drop_duplicates().reset_index(drop=True)
print(len(dfsa2))
dfsa2.head(2)

372278


Unnamed: 0,SiteUUID,TimeframeStart,TimeframeEnd
0,UTssps_S4214,2021-01-01,2021-12-31
1,UTssps_S4211,2021-01-01,2021-12-31


In [10]:
# left join POU sites to sitespecificamounts to main processed input file
dfmergeU = pd.DataFrame(index=dfspou.index)

dfmergeU = pd.merge(dfspou, dfsa2, on='SiteUUID', how='left')
dfmergeU = pd.merge(dfmergeU, dfmp2, left_on='SiteNativeID', right_on='in_SiteNativeID', how='left')
dfmergeU = dfmergeU.drop_duplicates().reset_index(drop=True)
print(len(dfmergeU))
dfmergeU.head(2)

10741


Unnamed: 0,SiteUUID,SiteNativeID,PODorPOUSite,TimeframeStart,TimeframeEnd,in_SiteNativeID,linkKey
0,UTssps_S3867,POU1000,POU,2002-01-01,2002-12-31,POU1000,1000
1,UTssps_S3867,POU1000,POU,2012-01-01,2012-12-31,POU1000,1000


In [11]:
# left join POD sites to sitespecificamounts to main processed input file
dfmergeD = pd.DataFrame(index=dfspod.index)

dfmergeD = pd.merge(dfspod, dfsa2, on='SiteUUID', how='left')
dfmergeD = pd.merge(dfmergeD, dfmp2, left_on='SiteNativeID', right_on='in_SiteNativeID', how='left')
dfmergeD = dfmergeD.drop_duplicates().reset_index(drop=True)
print(len(dfmergeD))
dfmergeD.head(2)

362035


Unnamed: 0,SiteUUID,SiteNativeID,PODorPOUSite,TimeframeStart,TimeframeEnd,in_SiteNativeID,linkKey
0,UTssps_S1,POD100000001,POD,2021-04-01,2021-04-30,POD100000001,10000
1,UTssps_S1,POD100000001,POD,2018-04-01,2018-04-30,POD100000001,10000


In [12]:
# Left Join POD file with POU file with matching linking elements. Drop NaN rows
dfmergeDU = pd.DataFrame(index=dfmergeD.index)

dfmergeDU = pd.merge(dfmergeD, dfmergeU, on='linkKey', how='left')
dfmergeDU = dfmergeDU.dropna(subset=['SiteUUID_y']).drop_duplicates().reset_index(drop=True)
print(len(dfmergeDU))
dfmergeDU.head(2)

6504538


Unnamed: 0,SiteUUID_x,SiteNativeID_x,PODorPOUSite_x,TimeframeStart_x,TimeframeEnd_x,in_SiteNativeID_x,linkKey,SiteUUID_y,SiteNativeID_y,PODorPOUSite_y,TimeframeStart_y,TimeframeEnd_y,in_SiteNativeID_y
0,UTssps_S4,POD10000001,POD,2015-01-01,2015-01-31,POD10000001,1000,UTssps_S3867,POU1000,POU,2002-01-01,2002-12-31,POU1000
1,UTssps_S4,POD10000001,POD,2015-01-01,2015-01-31,POD10000001,1000,UTssps_S3867,POU1000,POU,2012-01-01,2012-12-31,POU1000


### Create Output Dataframe

In [13]:
columnslist = ["PODSiteUUID", "POUSiteUUID", "StartDate", "EndDate"]
outdf = pd.DataFrame(index=dfmergeDU.index, columns=columnslist)  # The output dataframe
print(len(outdf))
outdf.head(1)

6504538


Unnamed: 0,PODSiteUUID,POUSiteUUID,StartDate,EndDate
0,,,,


In [14]:
outdf['PODSiteUUID'] = dfmergeDU['SiteUUID_x']
outdf['POUSiteUUID'] = dfmergeDU['SiteUUID_y']
outdf['StartDate'] = dfmergeDU['TimeframeStart_x']
outdf['EndDate'] = dfmergeDU['TimeframeEnd_x']
outdf = outdf.dropna().drop_duplicates().reset_index(drop=True).sort_values(by=['PODSiteUUID', 'POUSiteUUID', 'StartDate'])
print(len(outdf))
outdf.head()

251562


Unnamed: 0,PODSiteUUID,POUSiteUUID,StartDate,EndDate
21040,UTssps_S100,UTssps_S3886,1968-01-01,1968-12-31
21043,UTssps_S100,UTssps_S3886,1971-01-01,1971-12-31
21037,UTssps_S100,UTssps_S3886,1972-01-01,1972-12-31
21036,UTssps_S100,UTssps_S3886,1973-01-01,1973-12-31
21042,UTssps_S100,UTssps_S3886,1974-01-01,1974-12-31


In [15]:
# The working output DataFrame for WaDE 2.0 input.
if not outdf.empty:
    outdf.to_csv('ProcessedInputData/podsitetopousiterelationships.csv', index=False)