# Pre-processing Texas Commission on Environmental Quality (TCEQ) Site-Specific Diversion & Withdrawal Site Data for WaDE Upload
- Purpose: To pre-process the raw input data into one main file for simple DataFrame creation and extraction.

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
# set working directory folder string here; every relative path below is resolved against it
workingDirString = "G:/Shared drives/WaDE Data/Texas/WaterAllocation_WaterUse_TCEQ"
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Texas/WaterAllocation_WaterUse_TCEQ


## Data Input 1 - timeseries WaterUse
- clean up beneficial use values
- explode / separate out non-timeseries info & re-attach timeseries info with the specific month value

In [3]:
# Input File - WaterUse
fileInput = "RawInputData/WaterUse.zip"
dfin1 = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only when the column is missing: stamp each record with "in1" + its row index and
# write the stamped table back over the input zip, so later reads already carry the ids.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    zipOptions = dict(method='zip', archive_name='WaterUse.csv')
    dfin1.to_csv(fileInput, compression=zipOptions, index=False)

print(len(dfin1))
dfin1.head(1)

80244


Unnamed: 0,OBJECTID,Water Right ID,Owner,Use,Year,JAN_DIV,FEB_DIV,MAR_DIV,APR_DIV,MAY_DIV,JUN_DIV,JUL_DIV,AUG_DIV,SEPT_DIV,OCT_DIV,NOV_DIV,DEC_DIV,TOTAL,WaDEUUID
0,60750362,C1009,LUMINANT GENERATION COMPANY LLC,INDUSTRIAL,2018,0.0,5.0,20.0,28.0,20.0,14.0,22.0,18.0,9.0,7.0,17.0,11.0,171.0,in10


In [None]:
# clean beneficial use info
# - trim, collapse double spaces, title-case
# - turn the " And " and ". " separators into ", "
# regex=False is passed explicitly: on pandas < 2.0 the default is regex=True, where the
# pattern ". " means "any character followed by a space" and would corrupt every
# multi-word value (e.g. "Wildlife Management" -> "Wildlif, Management").
useClean = dfin1['Use'].str.strip().str.replace("  ", " ", regex=False).str.title()
useClean = useClean.str.replace(" And ", ", ", regex=False).str.strip()
dfin1['Use'] = useClean.str.replace(". ", ", ", regex=False).str.strip()

def fixBenUse(val):
    """Return the corrected form of a known-bad beneficial use value.

    The input is converted to a string and stripped. Values listed in the
    correction table are swapped for their fixed spelling; any other value is
    returned stripped but otherwise unchanged.
    """
    corrections = {
        "Domestic And Livestock": "Domestic, Livestock",
        "Domestic And Livestock & Livestock": "Domestic, Livestock",
        "Non-Consumptive": "Non Consumptive",
        "Instraem": "Instream",
        "Wilflife Management": "Wildlife Management",
        "Watwe Quality": "Water Quality",
        "Minng": "Mining",
        "Muncipal": "Municipal",
    }
    cleaned = str(val).strip()
    return corrections.get(cleaned, cleaned)

# fixBenUse only needs the 'Use' value, so map it over that one column instead of
# building a row object per record with DataFrame.apply(axis=1); this also keeps the
# assignment valid when the frame is empty.
dfin1['Use'] = dfin1['Use'].map(fixBenUse)

# print the distinct values, quoted and comma-terminated, for a quick visual check
for useValue in dfin1['Use'].sort_values().unique():
    print(f'"{useValue}",')

In [None]:
# temp dataframe of non-timeseries info (the monthly *_DIV columns are handled separately)
nonTimeseriesCols = ["OBJECTID", "Water Right ID", "Owner", "Use", "Year"]
dfin1_b = dfin1[nonTimeseriesCols]
print(len(dfin1_b))
dfin1_b.head(1)

In [None]:
# extract timeseries data / month values, attach to non-timeseries info
# Result: dfin1_c holds one copy of dfin1_b per calendar month (12 in total), each with
# that month's amount plus MM/DD/YYYY start and end strings for the month.

divColList = ["JAN_DIV", "FEB_DIV", "MAR_DIV", "APR_DIV", "MAY_DIV", "JUN_DIV", "JUL_DIV",
              "AUG_DIV", "SEPT_DIV", "OCT_DIV", "NOV_DIV", "DEC_DIV"] # list of column names with Amount values
monthNumList = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] # list of month num values
lastDayMonNumLisdt = ["31", "28", "31", "30", "31", "30", "31", "31", "30", "31", "30", "31"] # last day of each month in a non-leap year

# February ends on the 29th in leap years; years that cannot be read as numbers fall back to the 28th
yearNum = pd.to_numeric(dfin1_b['Year'], errors='coerce')
isLeapYear = (yearNum % 4 == 0) & ((yearNum % 100 != 0) | (yearNum % 400 == 0))
yearStr = dfin1_b['Year'].astype(str)

# Iterate over all twelve months (the previous range(11) stopped after November and
# dropped every DEC_DIV value). Frames are collected and concatenated once at the end
# rather than growing dfin1_c inside the loop.
monthFrames = []
for divCol, monthNum, lastDay in zip(divColList, monthNumList, lastDayMonNumLisdt):
    if monthNum == "02":
        endDay = pd.Series(np.where(isLeapYear, "29", lastDay), index=dfin1_b.index)
    else:
        endDay = lastDay

    dftemp = dfin1_b.copy()
    dftemp['in_Amount'] = dfin1[divCol]
    dftemp['in_TimeframeEnd'] = monthNum + "/" + endDay + "/" + yearStr
    dftemp['in_TimeframeStart'] = monthNum + "/" + "01" + "/" + yearStr
    monthFrames.append(dftemp)

dfin1_c = pd.concat(monthFrames)

print(len(dfin1_c))
dfin1_c.head(1)

## Data Input 2 - owner info
- remove special characters
- group by "Water Right ID"

In [4]:
# Dataframe creation - owners
# Folder name spelled "RawInputData", matching the WaterUse input and the final export;
# the previous "RawinputData" only resolved on case-insensitive filesystems.
ownerInput = "RawInputData/WaterRightOwner.zip"
dfowner = pd.read_csv(ownerInput).replace(np.nan, "")
dfowner = dfowner.rename(columns={"Water Right ID": "WaterRightID"})  # attribute-friendly name used by later cells

print(len(dfowner))
dfowner.head(1)

11325


Unnamed: 0,OBJECTID,WaterRightID,Owner
0,17294228,C925,"SOZA, JOSE JR"


In [None]:
# Clean Owner info.  Remove special characters. Change to title format.
def cleanOwnerDataFunc(Val):
    """Return an owner name with special characters removed, in title case.

    The value is converted to a string and stripped, every character in the set
    $ ' " @ & . ; , / ) ( - is deleted, and the remainder is title-cased and
    stripped again. Inner runs of spaces left behind by a deleted character are
    not collapsed.
    """
    Val = str(Val).strip()
    # Raw string: the former non-raw pattern contained "\)", an invalid string escape
    # that raises a SyntaxWarning on recent Python versions. The character set is unchanged.
    Val = re.sub(r"""[$'"@&.;,/)(-]""", "", Val).title().strip()
    return Val

# cleanOwnerDataFunc only needs the 'Owner' value, so map it over that one column
# instead of a row-wise DataFrame.apply(axis=1); also valid on an empty frame.
dfowner['Owner'] = dfowner['Owner'].map(cleanOwnerDataFunc)

# print the distinct cleaned names, quoted and comma-terminated, for a quick visual check
for ownerName in dfowner['Owner'].sort_values().unique():
    print(f'"{ownerName}",')

In [None]:
# group owner info by WR_ID
def _joinUniqueNonBlank(values):
    """Comma-join the distinct non-blank values of one group, in first-seen order.

    dict.fromkeys de-duplicates while keeping insertion order. The previous
    list(set(...)) produced an order that can change from run to run for strings,
    which made the joined owner text non-reproducible.
    """
    return ','.join(dict.fromkeys(str(elem) for elem in values if elem != ""))

# errors='ignore' lets the cell be re-run after 'OBJECTID' has already been dropped
dfowner = dfowner.drop(columns=['OBJECTID'], errors='ignore') # drop unused 'OBJECTID' columns
dfowner = dfowner.groupby('WaterRightID').agg(_joinUniqueNonBlank).replace(np.nan, "").reset_index()
print(len(dfowner))
dfowner.head(1)

## Data Input 3 - WaterRightsAsSinglePoints
- clean up Type
- attach owner info to sites
- merge site/owner info to the beneficial use timeseries records

In [5]:
# Input File - WaterRightsAsSinglePoints
# Folder name spelled "RawInputData", matching the WaterUse input and the final export;
# the previous "RawinputData" only resolved on case-insensitive filesystems.
fileInput = "RawInputData/WaterRightsAsSinglePoints.zip"
dfinPOD = pd.read_csv(fileInput).replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only when the column is missing: stamp each record with "in3" + its row index and
# write the stamped table back over the same input zip.
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "in3" + dfinPOD.index.astype(str)
    dfinPOD.to_csv(fileInput, compression=dict(method='zip', archive_name='WaterRightsAsSinglePoints.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head(1)

15542


Unnamed: 0,OBJECTID,TCEQ ID,Type,Verified,Latitude,Longitude,Location Method,Location Accuracy,Reference,Location Date,Location Organization,Datum,Water Right ID,Water Right Type and Number,SHAPE,WaDEUUID
0,14591275,10503942001,Diversion Point,2,32.79544,-95.2061,DRG,12,OTHER,8/25/2008,TCEQ,NAD83,P3942,WRPERM3942,Point,in30


In [None]:
# clean TYPE info: trim the ends and turn double spaces into single spaces
typeTrimmed = dfinPOD['Type'].str.strip()
dfinPOD['Type'] = typeTrimmed.str.replace("  ", " ")

def fixTypeFunc(val):
    """Return the site type with the known "Reservior" misspelling corrected.

    The input is converted to a string and stripped; "On-channel Reservior"
    becomes "On-channel Reservoir" and every other value is returned stripped.
    """
    cleaned = str(val).strip()
    return "On-channel Reservoir" if cleaned == "On-channel Reservior" else cleaned

# fixTypeFunc only needs the 'Type' value, so map it over that one column instead of
# a row-wise DataFrame.apply(axis=1); also valid on an empty frame.
dfinPOD['Type'] = dfinPOD['Type'].map(fixTypeFunc)

# print the distinct site types, quoted and comma-terminated, for a quick visual check
for siteType in dfinPOD['Type'].sort_values().unique():
    print(f'"{siteType}",')

In [None]:
# attach owner info to Site info

# Look-up dictionary built from the owner dataframe: WaterRightID -> Owner
OwnerDict = dict(zip(dfowner['WaterRightID'], dfowner['Owner'].values))

def retrieveOwner(val):
    """Return the owner text recorded in OwnerDict for a water right id.

    Blank or null ids, and ids that are not present in OwnerDict, give "".
    The id is converted to a string and stripped before the look-up.
    """
    if val == "" or pd.isnull(val):
        return ""
    # dict.get supplies the "" default for unknown ids. The previous bare `except:`
    # also hid unrelated failures (for example OwnerDict not having been built yet).
    return OwnerDict.get(str(val).strip(), "")

# retrieveOwner only needs the 'Water Right ID' value, so map it over that one column
# instead of a row-wise DataFrame.apply(axis=1); also valid on an empty frame.
dfinPOD['in_AllocationOwner'] = dfinPOD['Water Right ID'].map(retrieveOwner)
dfinPOD['in_AllocationOwner'].unique()

In [None]:
# merge site/owner info to ben use
# Left join keeps every timeseries record; columns present in both frames get the
# pandas default _x (timeseries side) / _y (site side) suffixes.
# NOTE: this reassigns dfin1_c, so re-running the cell merges a second time.

dfin1_c = pd.merge(dfin1_c, dfinPOD, how='left', on='Water Right ID')
print(len(dfin1_c))
dfin1_c.head(1)

In [None]:
# create output POD dataframe
# Maps the merged timeseries/site frame (dfin1_c) onto "in_*" output columns.
# Columns set to "" are intentionally left blank here; some are filled in by later cells.
df = pd.DataFrame()

# Data Assessment UUID
# First column assigned: it gives df its row index, so the constant values below broadcast to every row.
df['WaDEUUID'] = dfin1_c['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "TCEQwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "TCEQwr_V1" # for wr records portion only
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "TCEQwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE Blank" # need this for auto fill below
df['in_WaterSourceNativeID'] = "" # auto fill in below
df['in_WaterSourceTypeCV'] = "Surface Water" # need this for auto fill below

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = ""
df['in_County'] = ""
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # NOTE(review): already set under WaterSource Info above; this repeat is redundant
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1_c['Latitude']
df['in_Longitude'] = dfin1_c['Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
# "wade" + the site-side OBJECTID (suffix _y from the merge); blank/missing ids become 0, i.e. "wade0"
df['in_SiteNativeID'] = "wade" + dfin1_c['OBJECTID_y'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1_c['Type']
df['in_StateCV'] = "TX"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = "" #empty
df['in_AllocationLegalStatusCV'] = ""
df['in_AllocationNativeID'] =  dfin1_c['Water Right ID']
df['in_AllocationOwner'] = dfin1_c['in_AllocationOwner']
df['in_AllocationPriorityDate'] = "" #empty
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = "" #empty
df['in_BeneficialUseCategory'] = dfin1_c['Use']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 1 # we want this data exempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
#df['in_WaterAllocationNativeURL'] = "https://gisweb.tceq.texas.gov/WRRetrieveRights/?ID=" + dfin1_c['Water Right ID'].replace("", 0).fillna(0).str.strip().astype(str)
# Build the TCEQ look-up URL from the "Water Right Type and Number" text.
# Missing ids (NaN from the left merge, or "") now give a blank URL. The previous
# .replace("", 0).fillna(0).str.strip() chain turned them into the number 0, which
# .str.strip() maps to NaN, so those rows ended up with a URL ending in "ID=nan".
wrTypeNumber = dfin1_c['Water Right Type and Number'].fillna("").astype(str).str.strip()
df['in_WaterAllocationNativeURL'] = ("https://gisweb.tceq.texas.gov/WRRetrieveRights/?ID=" + wrTypeNumber).where(wrTypeNumber != "", "")


# Site VariableAmounts Info
# The monthly amount and its timeframe strings come from the month-explode step;
# the allocation id is repeated here as the associated native allocation id.
df['in_Amount'] = dfin1_c['in_Amount']
df['in_AssociatedNativeAllocationIDs'] = dfin1_c['Water Right ID']
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = ""
df['in_ReportYearCV'] = dfin1_c['Year']
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1_c['in_TimeframeEnd']
df['in_TimeframeStart'] = dfin1_c['in_TimeframeStart']
# The fields below are shared with earlier sections and are already assigned there:
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

# Snapshot of the mapped frame with exact duplicate rows removed and a fresh 0..n-1 index
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

## Data Input 2
- site info
- timeseries info

In [None]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [None]:
# Concatenate dataframes
# Stack every per-input output frame, drop exact duplicate rows, renumber the index,
# and turn any remaining NaN into a blank string.
frames = [outdf1] # list all out dataframes here
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

## Clean Data / data types

In [None]:
# # updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# # ----------------------------------------------------------------------------------------------------

# def createWaterSourceTypeCV(inWST):
#     inWST = str(inWST).strip()
    
#     if inWST == "":
#         outString = "WaDE Blank"
#     elif inWST == "Ground Water":
#         outString = "Groundwater"
#     else:
#         outString =  inWST
      
#     return outString

# outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: createWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
# outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Return a name with special characters removed, in title case.

    The value is converted to a string, every character in the set
    $ @ & . ; / ) , ( - is deleted, the result is title-cased, double spaces are
    replaced by single spaces (one pass), and the ends are stripped.
    """
    Val = str(Val)
    # Raw string: the former non-raw pattern contained "\)", an invalid string escape
    # that raises a SyntaxWarning on recent Python versions. The character set is unchanged.
    Val = re.sub(r"[$@&.;/),(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [None]:
# Only this one column is needed, so map the cleaner over it instead of a row-wise
# DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

In [None]:
# Only this one column is needed, so map the cleaner over it instead of a row-wise
# DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

In [None]:
# Only this one column is needed, so map the cleaner over it instead of a row-wise
# DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

In [None]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return the stripped string form of val, or "" for blank and "nan" values.

    Because the value is passed through str() first, a float NaN arrives as the
    text "nan" and is blanked, while None arrives as the text "None" and is kept.
    """
    text = str(val).strip()
    return "" if text in ("", "nan") else text

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

In [None]:
# Only this one column is needed, so map ensureEmptyString over it instead of a
# row-wise DataFrame.apply(axis=1); also valid on an empty frame.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# Ensure Latitude entry is numeric; 0 and non-numeric values both become blank strings (flagged for removal)
latitudeNum = pd.to_numeric(outdf['in_Latitude'], errors='coerce')
outdf['in_Latitude'] = latitudeNum.replace(0, "").fillna("")
outdf['in_Latitude'].unique()

In [None]:
# Ensure Longitude entry is numeric; 0 and non-numeric values both become blank strings (flagged for removal)
longitudeNum = pd.to_numeric(outdf['in_Longitude'], errors='coerce')
outdf['in_Longitude'] = longitudeNum.replace(0, "").fillna("")
outdf['in_Longitude'].unique()

In [None]:
# Fixing in_AllocationFlow_CFS datatype: numeric where possible, blank string for 0 / non-numeric
flowNum = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
outdf['in_AllocationFlow_CFS'] = flowNum.replace(0, "").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

In [None]:
# Fixing in_AllocationVolume_AF datatype: numeric where possible, blank string for 0 / non-numeric
volumeNum = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
outdf['in_AllocationVolume_AF'] = volumeNum.replace(0, "").fillna("")
outdf['in_AllocationVolume_AF'].unique()

In [None]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
amountNum = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2)
outdf['in_Amount'] = amountNum.replace(0, "").fillna("")
outdf['in_Amount'].unique()

In [None]:
# Ensure PopulationServed entry is a whole number or blank.
# Values are coerced to numeric, rounded and cast to int (missing -> 0); the final
# replace then turns every 0 into a blank string, so no 0 entries remain in the output.
populationNum = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round()
populationInt = populationNum.replace("", 0).fillna(0).astype(int)
outdf['in_PopulationServed'] = populationInt.replace(0, "").fillna("")
outdf['in_PopulationServed'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure:
# parse to datetime, then round-trip through an MM/DD/YYYY string back to datetime
priorityDate = pd.to_datetime(outdf['in_AllocationPriorityDate'])
priorityDateText = priorityDate.dt.strftime('%m/%d/%Y')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(priorityDateText)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert TimeframeEnd to a datetime column (displayed as YYYY-MM-DD).
# Values that cannot be parsed are coerced to NaT and stay NaT. The previous
# .fillna("") turned the series into object dtype whenever a NaT was present, and the
# .dt.strftime call on the next line then raised AttributeError.
timeframeEnd = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
# Round-trip through an MM/DD/YYYY string back to datetime
outdf['in_TimeframeEnd'] = pd.to_datetime(timeframeEnd.dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

In [None]:
# Convert TimeframeStart to a datetime column (displayed as YYYY-MM-DD).
# Values that cannot be parsed are coerced to NaT and stay NaT. The previous
# .fillna("") turned the series into object dtype whenever a NaT was present, and the
# .dt.strftime call on the next line then raised AttributeError.
timeframeStart = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
# Round-trip through an MM/DD/YYYY string back to datetime
outdf['in_TimeframeStart'] = pd.to_datetime(timeframeStart.dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

In [None]:
# extract year out
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf['in_ReportYearCV'], utc=True)
# outdf['in_ReportYearCV'] = pd.to_datetime(outdf["in_ReportYearCV"].dt.strftime('%m/%d/%Y'))
# outdf['in_ReportYearCV'] = outdf['in_ReportYearCV'].dt.year
outdf['in_ReportYearCV'].unique()

In [None]:
# Assign Primary Use Category
# Uses the project's custom AssignPrimaryUseCategoryFile module, made importable by
# appending its folder to sys.path.
# NOTE(review): the folder is an absolute path on one user's machine, so this cell fails
# elsewhere; consider a path relative to the repository or a configurable setting.
# NOTE(review): these imports sit mid-notebook rather than in the imports cell at the top.

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

# Derive the primary use category from each row's beneficial use text
outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

In [None]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Return "<variable>_<interval unit>_<primary use>_<water source type>".

    Each argument is converted to a string and stripped; the primary use part is
    additionally title-cased. Blank parts are kept, so underscores may be adjacent.
    """
    parts = [
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    ]
    return "_".join(parts)

# Build the variable-specific CV for every row from its four component columns
def _variableSpecificFromRow(row):
    return createVariableSpecificCV(row['in_VariableCV'],
                                    row['in_AggregationIntervalUnitCV'],
                                    row['in_PrimaryUseCategory'],
                                    row['in_WaterSourceTypeCV'])

outdf['in_VariableSpecificCV'] = outdf.apply(_variableSpecificFromRow, axis=1)
outdf['in_VariableSpecificCV'].unique()

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water sources.
def assignIdValueFunc(colRowValue):
    """Return the custom id text: "wadeId" followed by the string form of the given value."""
    return "wadeId" + str(colRowValue)

# Distinct (water source name, water source type) combinations, as stripped strings
dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct combinations 1..n (same index as dfTempID) and turn each number into a "wadeId<n>" id
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Look-up key is name + type concatenated with no separator; IdDict maps that key to the generated id.
# NOTE(review): without a separator, two different name/type pairs could in principle produce the same key.
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing id when one is present, otherwise look one up in IdDict.

    checkVal is converted to a string and stripped; if it is not blank it is
    returned as is. Otherwise the key str(valA).strip() + str(valB).strip() is
    looked up in IdDict (a missing key raises KeyError).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    return IdDict[str(valA).strip() + str(valB).strip()]

# Fill blank water source native ids from the look-up; rows that already carry an id keep it
def _waterSourceIdFromRow(row):
    return retrieveIdValueFunc(row['in_WaterSourceNativeID'],
                               row['in_WaterSourceName'], row['in_WaterSourceTypeCV'])

outdf['in_WaterSourceNativeID'] = outdf.apply(_waterSourceIdFromRow, axis=1)
outdf['in_WaterSourceNativeID'].unique()

In [None]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
# NOTE: this redefines the function of the same name from the water source cell with identical behaviour.
def assignIdValueFunc(colRowValue):
    """Return the custom id text: "wadeId" followed by the string form of the given value."""
    return "wadeId" + str(colRowValue)

# Distinct (latitude, longitude, site name, site type) combinations, as stripped strings
dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

# Number the distinct combinations 1..n (same index as dfTempID) and turn each number into a "wadeId<n>" id
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
# Look-up key is the four values concatenated with no separator; IdDict (overwriting the
# water source dictionary of the same name) maps that key to the generated id.
# NOTE(review): without a separator, different value combinations could in principle produce the same key.
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
# NOTE: this redefines the three-argument function of the same name from the water source cell.
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing id when one is present, otherwise look one up in IdDict.

    checkVal is converted to a string and stripped; if it is not blank it is
    returned as is. Otherwise the stripped string forms of valA..valD are
    concatenated and that key is looked up in IdDict (a missing key raises KeyError).
    """
    existingId = str(checkVal).strip()
    if existingId != "":
        return existingId
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

# Fill blank site native ids from the look-up; rows that already carry an id keep it
def _siteIdFromRow(row):
    return retrieveIdValueFunc(row['in_SiteNativeID'],
                               row['in_Latitude'], row['in_Longitude'],
                               row['in_SiteName'], row['in_SiteTypeCV'])

outdf['in_SiteNativeID'] = outdf.apply(_siteIdFromRow, axis=1)
outdf['in_SiteNativeID'].unique()

## Export Outputs

In [None]:
outdf.info()

In [None]:
outdf

In [None]:
# Export the output dataframe
# Written as a zip archive containing a single CSV (Pssdw_Main.csv), without the index column.
# The path is relative to the working directory set at the top of the notebook.
outdf.to_csv('RawInputData/Pssdw_Main.zip', compression=dict(method='zip', archive_name='Pssdw_Main.csv'), index=False)  # The output, save as a zip

In [None]:
# df = outdf.copy()
# df['in_Amount'] = df['in_Amount'].astype(str).astype(float)
# df.head(1)

In [None]:
# filtered_df = df[df.groupby(['in_AssociatedNativeAllocationIDs', 'in_VariableSpecificCV', 'in_ReportYearCV'])['in_Amount'].sum().reset_index() > 0]
# printlen(filtered_df)
# filtered_df.head(1)