# Pre-processing Texas TCEQ Allocation data for WaDEQA upload.
Date Updated: 03/28/2023
Purpose:  To pre-process the Texas data into one master file for simple DataFrame creation and extraction

In [None]:
#Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
# Notebook display settings:
#   - show up to 999 columns when a DataFrame is rendered
#   - render floats with five fixed decimals instead of scientific notation
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
#Working Directory
# The process-wide current directory is switched to this folder, so the
# relative file names used for reading and writing below resolve against it.
workingDir = "G:/Shared drives/WaDE Data/Texas/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Assign Owner Name
- create dictionary of owner names
- will use a dictionary to assign owner name for output main file.

In [None]:
#Dataframe creation - owners
# Only the two columns needed for the owner lookup are loaded; the file is
# decoded as ISO-8859-1.
ownerInput = "WaterRightOwner.csv"
df_owner = pd.read_csv(ownerInput, usecols=['Water Right ID', 'Owner'], encoding="ISO-8859-1")
df_owner.head()  # preview of the first rows

In [None]:
# Clean Owner info.  Remove special characters

import re

def cleanOwnerDataFunc(Val):
    """Strip punctuation from an owner name.

    Removes every occurrence of the characters ``$ @ & . ; , / ) ( -`` and
    then trims leading/trailing whitespace.

    Args:
        Val: The raw owner value. Null values (None / NaN) are accepted.

    Returns:
        The cleaned string, or an empty string when ``Val`` is null.
    """
    # A missing owner would make re.sub raise TypeError; treat it as blank.
    if pd.isnull(Val):
        return ""
    # Raw string: avoids the invalid "\)" escape-sequence warning while
    # matching exactly the same character set as before.
    return re.sub(r"[$@&.;,/\)(-]", "", str(Val)).strip()

# Run the cleaner over every value of the Owner column, then list the
# distinct cleaned names for a visual check.
df_owner['Owner'] = df_owner['Owner'].apply(cleanOwnerDataFunc)
df_owner['Owner'].unique()

In [None]:
# Merge owners, assign ID
# Handles the issue of multiple owners per Water Right ID

def retrieveNames(df):
    """Collapse multiple owner rows into one row per Water Right ID.

    Args:
        df: DataFrame with a 'Water Right ID' column and an 'Owner' column.

    Returns:
        DataFrame indexed by 'Water Right ID' (in order of first appearance)
        with two columns: 'Water Right ID' and 'owners', where 'owners' is
        every owner of that ID joined with ', ' in row order. Null owners are
        skipped, and rows whose Water Right ID is null form no group.
    """
    def _joinOwners(owners):
        # Nulls cannot be joined as text; drop them instead of raising.
        return ', '.join(owners.dropna().astype(str))

    # A single grouped pass replaces re-scanning the whole frame once per ID.
    # sort=False keeps the IDs in the order they first occur.
    joined = df.groupby('Water Right ID', sort=False)['Owner'].agg(_joinOwners)

    outdf = pd.DataFrame(
        {
            'Water Right ID': joined.index.to_numpy(),
            'owners': joined.to_numpy(),
        },
        index=joined.index,  # index keeps the name 'Water Right ID'
    )
    return outdf

# One row per Water Right ID, with all of its owners joined into one string.
df_owners = retrieveNames(df_owner)
# Duplicate the ID under a space-free column name; the owner lookup
# dictionary further down reads it as df_owners.WaterRightID.
df_owners['WaterRightID'] = df_owners['Water Right ID']
df_owners.head()

## Assign Ben Use
- create dictionary of ben use
- will use a dictionary to assign ben use for output main file.

In [None]:
#Dataframe creation - ben use
# Only the ID and the use columns are loaded from the water use file.
useInput = "WaterUse.csv"
df_use = pd.read_csv(useInput, usecols=['Water Right ID', 'Use'])
df_use.head()  # preview of the first rows

In [None]:
def retrieveUses(df):
    """Collapse multiple use rows into one row per Water Right ID.

    Args:
        df: DataFrame with a 'Water Right ID' column and a 'Use' column.

    Returns:
        DataFrame indexed by 'Water Right ID' (in order of first appearance)
        with two columns: 'Water Right ID' and 'uses', where 'uses' is the
        distinct uses of that ID joined with ',' in order of first
        appearance. Null uses are skipped, and rows whose Water Right ID is
        null form no group.
    """
    def _joinUniqueUses(uses):
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return ','.join(dict.fromkeys(uses.dropna().astype(str)))

    # A single grouped pass replaces re-scanning the whole frame once per ID.
    # sort=False keeps the IDs in the order they first occur.
    joined = df.groupby('Water Right ID', sort=False)['Use'].agg(_joinUniqueUses)

    outdf = pd.DataFrame(
        {
            'Water Right ID': joined.index.to_numpy(),
            'uses': joined.to_numpy(),
        },
        index=joined.index,  # index keeps the name 'Water Right ID'
    )
    return outdf

# One row per Water Right ID, with its distinct uses joined into one string.
df_uses = retrieveUses(df_use)
# Duplicate the ID under a space-free column name; the beneficial use lookup
# dictionary further down reads it as df_uses.WaterRightID.
df_uses['WaterRightID'] = df_uses['Water Right ID']
df_uses.head()

## Water Right Points and Output file
- main output

In [None]:
# Input File
fileInput = "WaterRightPoint.csv"
df = pd.read_csv(fileInput)

# WaDE UUID tracker for data assessment.
# On the first run the raw file has no WaDEUUID column: build one from the row
# position and write it back to the same input file so the IDs stay the same
# on later runs.  The write uses fileInput (not a second hard-coded name) so
# the file that is read and the file that is updated can never drift apart.
if 'WaDEUUID' not in df.columns:
    df['WaDEUUID'] = "txWR" + df.index.astype(str)
    df.to_csv(fileInput, index=False)

print(len(df))
df.head()

In [None]:
# Print the dtype of every column.  The option context lifts the row and
# column display limits for this print only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.dtypes)

In [None]:
# Assigning owner name to output file

# Lookup dictionary
# Water Right ID -> joined owner string.
# Keys are normalised with str(...).strip() because retrieveOwner looks IDs up
# as str(val).strip(); keeping the raw CSV dtype (e.g. integers) would make
# every lookup miss and silently fall back to "WaDE Unspecified".
OwnerDict = {
    str(rightID).strip(): owners
    for rightID, owners in zip(df_owners['WaterRightID'], df_owners['owners'])
}

def retrieveOwner(val, lookup=None):
    """Return the owner string recorded for a water right ID.

    Args:
        val: The water right ID. It is converted with ``str(val).strip()``
            before the lookup.
        lookup: Optional mapping of ID string -> owner string. Defaults to
            the module-level ``OwnerDict``.

    Returns:
        The mapped owner string, or "WaDE Unspecified" when ``val`` is
        blank/null or has no entry in the mapping.
    """
    if val == '' or pd.isnull(val):
        return "WaDE Unspecified"
    if lookup is None:
        lookup = OwnerDict
    # dict.get replaces the bare except, which would also have hidden
    # unrelated errors (e.g. a misspelled name) as "WaDE Unspecified".
    return lookup.get(str(val).strip(), "WaDE Unspecified")

# Resolve the owner string for each row from its WR_ID, then display the frame.
df['in_AllocationOwner'] = df['WR_ID'].apply(retrieveOwner)
df

In [None]:
# Assigning ben use to output file

# Lookup dictionary
# Water Right ID -> joined beneficial use string.
# Keys are normalised with str(...).strip() because retrieveBenUse looks IDs
# up as str(val).strip(); keeping the raw CSV dtype (e.g. integers) would make
# every lookup miss and silently fall back to "WaDE Unspecified".
BenuseDict = {
    str(rightID).strip(): uses
    for rightID, uses in zip(df_uses['WaterRightID'], df_uses['uses'])
}

def retrieveBenUse(val, lookup=None):
    """Return the beneficial use string recorded for a water right ID.

    Args:
        val: The water right ID. It is converted with ``str(val).strip()``
            before the lookup.
        lookup: Optional mapping of ID string -> use string. Defaults to the
            module-level ``BenuseDict``.

    Returns:
        The mapped use string, or "WaDE Unspecified" when ``val`` is
        blank/null or has no entry in the mapping.
    """
    if val == '' or pd.isnull(val):
        return "WaDE Unspecified"
    if lookup is None:
        lookup = BenuseDict
    # dict.get replaces the bare except, which would also have hidden
    # unrelated errors (e.g. a misspelled name) as "WaDE Unspecified".
    return lookup.get(str(val).strip(), "WaDE Unspecified")

# Resolve the beneficial use string for each row from its WR_ID, then display
# the frame.
df['in_BeneficialUseCategory'] = df['WR_ID'].apply(retrieveBenUse)
df

In [None]:
# Tx projection = EPSG:4269.  WGS84 projection used by WaDE 2.0 = epsg:4326.

# NOTE(review): `transform` is imported here but not called anywhere in the
# visible code -- confirm before relying on it or removing it.
from pyproj import Transformer, transform # for transforming coordinates to a new projection
# Built once from the two EPSG codes and reused by assignLat / assignLong below.
transformer = Transformer.from_proj(4269, 4326)  

def assignLat(colrowValueLat, colrowValueLong):
    """Return the first (latitude) component of the transformed point.

    The pair is passed to the module-level ``transformer`` in
    (latitude, longitude) order.
    """
    converted = transformer.transform(colrowValueLat, colrowValueLong)
    return converted[0]

def assignLong(colrowValueLat, colrowValueLong):
    """Return the second (longitude) component of the transformed point.

    The pair is passed to the module-level ``transformer`` in
    (latitude, longitude) order.
    """
    converted = transformer.transform(colrowValueLat, colrowValueLong)
    return converted[1]



# Transform every point in one vectorized call.  The row-by-row version ran
# the transformer twice per record (once to keep the latitude, once to keep
# the longitude); transform() accepts whole arrays and returns both outputs.
# Inputs are passed in the same (latitude, longitude) order as before.
df['in_Latitude'], df['in_Longitude'] = transformer.transform(
    df['LAT_DD'].to_numpy(), df['LONG_DD'].to_numpy()
)
df.head()

In [None]:
#fixing Spelling issues in TCEQ TYPE field
# Known misspellings / spacing variants of TYPE values -> corrected value.
TYPEdict = {
    "Dischrage Point": "Discharge Point",
    "Dishcharge Point": "Discharge Point",
    "IBT -  Diversion Point": "IBT - Diversion Point",
    "On-channel  Reservoir": "On-channel Reservoir",
    "On-channel Reservior": "On-channel Reservoir",
    "GW -  Release Point": "GW - Release Point",
}


def updateTYPE(colrowValue):
    """Return the corrected spelling of a TYPE value.

    Args:
        colrowValue: The raw TYPE value.

    Returns:
        The replacement from ``TYPEdict`` when the value is a known
        misspelling; otherwise the value unchanged (including blank and
        null values). The value is matched exactly as given -- no
        whitespace is trimmed.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return colrowValue
    # dict.get replaces the bare except, which hid every kind of error.
    return TYPEdict.get(colrowValue, colrowValue)

# Overwrite TYPE with its spelling-corrected value.
df['TYPE'] = df['TYPE'].apply(updateTYPE)

In [None]:
# Creating the output Dataframe for PODs.

# All output columns, listed in their final order.  Constant values are
# repeated on every row; Series are taken from df, which shares the index.
dfPOD = pd.DataFrame(
    {
        # Data Assessment UUID
        'WaDEUUID': df['WaDEUUID'],

        # Water Source
        "in_WaterSourceName": "WaDE Unspecified",
        "in_WaterSourceTypeCV": "WaDE Unspecified",

        # Site
        "in_CoordinateAccuracy": "WaDE Unspecified",
        "in_CoordinateMethodCV": "Digitized",
        'in_HUC12': "",
        'in_HUC8': "",
        'in_County': "",
        "in_Latitude": df['in_Latitude'],
        "in_Longitude": df['in_Longitude'],
        "in_PODorPOUSite": "POD",
        "in_SiteName": "WaDE Unspecified",
        "in_SiteNativeID": "POD" + df['TCEQ_ID'].astype(str),
        "in_SiteTypeCV": df['TYPE'],
        "in_StateCV": "TX",

        # Allocation
        "in_AllocationFlow_CFS": "",
        "in_AllocationVolume_AF": "",
        'in_AllocationLegalStatusCV': "WaDE Unspecified",
        "in_AllocationNativeID": df['WR_ID'],
        'in_AllocationOwner': df['in_AllocationOwner'],
        'in_AllocationPriorityDate': "",
        'in_AllocationTimeframeEnd': "",
        'in_AllocationTimeframeStart': "",
        'in_AllocationTypeCV': "WaDE Unspecified",
        "in_BeneficialUseCategory": df['in_BeneficialUseCategory'],
        'in_CommunityWaterSupplySystem': "",
        'in_ExemptOfVolumeFlowPriority': "1",  # we want this data exempt
        "in_IrrigatedAcreage": "",
        "in_IrrigationMethodCV": "",
        "in_WaterAllocationNativeURL": "",
    },
    index=df.index,
)

# Remove fully identical rows and renumber from zero.
dfPOD = dfPOD.drop_duplicates()
dfPOD = dfPOD.reset_index(drop=True)
print(len(dfPOD))
dfPOD.head(1)

## Inspect and Clean Data

In [None]:
%%time

# Creating WaDE Custom water source native ID for easy water source identification
# Change 'outstring' name to be state specific.
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source ID: 'WaDETX_WS' followed by str(colrowValue)."""
    return f"WaDETX_WS{colrowValue!s}"

# Distinct (water source name, water source type) pairs found in dfPOD.
dfWaterSourceNativeID = pd.DataFrame({
    'in_WaterSourceName': dfPOD['in_WaterSourceName'],
    'in_WaterSourceTypeCV': dfPOD['in_WaterSourceTypeCV'],
}).drop_duplicates()

# Number the distinct pairs 1..N (sharing their index) and turn each number
# into its custom native ID.
dftemp = pd.DataFrame(
    {"Count": range(1, len(dfWaterSourceNativeID.index) + 1)},
    index=dfWaterSourceNativeID.index,
)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp['Count'].apply(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source ID for a (name, type) pair.

    Args:
        A: Water source name, compared against 'in_WaterSourceName'.
        B: Water source type, compared against 'in_WaterSourceTypeCV'.

    Returns:
        The 'in_WaterSourceNativeID' of the first row of
        ``dfWaterSourceNativeID`` matching both values, or '' when both
        inputs are blank, both are null, or no row matches.
    """
    # Nothing to look up when both parts are blank or both are null.
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    typeMatches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    hits = dfWaterSourceNativeID.loc[nameMatches & typeMatches, 'in_WaterSourceNativeID']

    return '' if hits.empty else hits.iloc[0]

# Attach the custom water source ID to every row from its (name, type) pair,
# then list the distinct IDs for a visual check.
dfPOD['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(dfPOD['in_WaterSourceName'], dfPOD['in_WaterSourceTypeCV'])
]
dfPOD['in_WaterSourceNativeID'].unique()

## Export Data

In [None]:
# Export the output dataframe as a zipped CSV.
# archive_name is set explicitly so the member inside the archive is a .csv;
# without it pandas derives the member name from the zip path itself.
dfPOD.to_csv(
    'P_TexasWRP.zip',
    index=False,
    compression={"method": "zip", "archive_name": "P_TexasWRP.csv"},
)