# Preprocessing Arizona Allocation data for WaDEQA upload.
Date Updated: 06/18/2021

Purpose:  To preprocess the Arizona data into one master file for simple DataFrame creation and extraction

Notes:
- Doing an inner join on sites and available water right records, to cut down on unmatched elements.
- SW data has both POD and POU data.

In [None]:
# Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

## Groundwater POD Data

In [None]:
# Working directory and input files for the groundwater section.
# NOTE: the path is machine-specific; os.chdir makes the relative CSV names
# below resolve against it when they are passed to pd.read_csv.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/WaterAllocation/RawInputData/Groundwater"
os.chdir(workingDir)

# CSV file names (relative to workingDir) for the well registry and GWSI sites.
WellRegistry_Input = "WELLS_wellRegistry_input.csv"
GWSI_Input = "GWSI_SITES_input.csv"

In [None]:
# Build the groundwater DataFrames and join sites to water-right records.

# GWSI site / well locations; the join key is cast to text.
df_GWSI = pd.read_csv(GWSI_Input)
df_GWSI['REG_ID'] = df_GWSI['REG_ID'].astype(str)

# Well registry (water record) data; the join key is cast to text as well.
# NOTE(review): astype(str) turns a float key such as 1.0 into "1.0" --
# confirm both key columns are read with the same dtype so they can match.
df_WR = pd.read_csv(WellRegistry_Input)
df_WR['REGISTRY_I'] = df_WR['REGISTRY_I'].astype(str)

# Inner join keeps only sites that have a matching registry record.
df_GWSI_WR = pd.merge(
    df_GWSI,
    df_WR,
    how='inner',
    left_on='REG_ID',
    right_on='REGISTRY_I',
)
# Blank out missing values, then drop exact duplicate rows and renumber.
df_GWSI_WR = df_GWSI_WR.replace(np.nan, "")
df_GWSI_WR = df_GWSI_WR.drop_duplicates()
df_GWSI_WR = df_GWSI_WR.reset_index(drop=True)

print(len(df_GWSI_WR))
df_GWSI_WR

In [None]:
# Create the (empty) output dataframe for groundwater.
# columnslist names every "in_*" output column; dfground shares the row index
# of df_GWSI_WR so the column assignments that follow align row-for-row.
columnslist = [
    "in_ApplicableResourceTypeCV",
    
    # Water source columns
    'in_WaterSourceTypeCV',
    "in_WaterSourceName",
    
    # Site columns
    "in_County",
    "in_Latitude",
    "in_Longitude",
    "in_PODorPOUSite",
    "in_SiteName",
    "in_SiteNativeID",
    "in_SiteTypeCV",
    
    # Allocation columns
    "in_AllocationCommunityWaterSupplySystem",
    "in_AllocationFlow_CFS",
    "in_AllocationNativeID",
    "in_AllocationOwner",
    "in_AllocationPriorityDate",
    "in_AllocationTimeframeEnd",
    "in_AllocationTimeframeStart",
    "in_AllocationVolume_AF",
    "in_AllocationLegalStatusCV",
    "in_BeneficialUseCategory",
    "in_ExemptOfVolumeFlowPriority"
]

dfground = pd.DataFrame(columns=columnslist, index=df_GWSI_WR.index)

In [None]:
#############################################################################################
# Populate dfground: constants are broadcast to every row, source columns are
# copied from df_GWSI_WR by index alignment.

#Method
dfground['in_ApplicableResourceTypeCV'] = "Groundwater"

#WaterSource
dfground['in_WaterSourceTypeCV'] = "Groundwater"
dfground['in_WaterSourceName'] = "Unspecified"
                                    
#Site
dfground['in_County'] = df_GWSI_WR['COUNTY']
dfground['in_Latitude'] = df_GWSI_WR['DD_LAT']
dfground['in_Longitude'] = df_GWSI_WR['DD_LONG']
dfground['in_PODorPOUSite'] = "POD"
dfground['in_SiteName'] = "Unspecified"
dfground['in_SiteNativeID'] = df_GWSI_WR['SITE_ID']
dfground['in_SiteTypeCV'] = "Well"

#AllocationAmount_fact
# NOTE(review): AMA is copied into the community-water-supply column -- confirm this mapping.
dfground['in_AllocationCommunityWaterSupplySystem'] = df_GWSI_WR['AMA']
# NOTE(review): PUMPRATE is copied unchanged into a CFS column -- confirm the source units.
dfground['in_AllocationFlow_CFS'] = df_GWSI_WR['PUMPRATE']
dfground['in_AllocationNativeID'] = df_GWSI_WR['REGISTRY_I']
dfground['in_AllocationOwner'] = df_GWSI_WR['OWNER_NAME']
dfground['in_AllocationPriorityDate'] = ""
dfground['in_AllocationTimeframeEnd'] = "12/31"
dfground['in_AllocationTimeframeStart'] = "01/01"
dfground['in_AllocationLegalStatusCV'] = df_GWSI_WR['WELL_TYPE_']
dfground['in_AllocationVolume_AF'] = ""
dfground['in_BeneficialUseCategory'] = df_GWSI_WR['WATER_USE']
# Every groundwater record is flagged exempt (1); priority date and volume are left blank above.
dfground['in_ExemptOfVolumeFlowPriority'] = 1

# Remove rows that are identical across all output columns and renumber.
dfground = dfground.drop_duplicates().reset_index(drop=True)
print(len(dfground))
dfground

## Surface Water Data (POD & POU) 

In [None]:
# Working directory and input files for the surface water section.
# NOTE: the path is machine-specific; os.chdir makes the relative CSV names
# below resolve against it.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/WaterAllocation/RawInputData/Surface_Water"
os.chdir(workingDir)

# Surface Water Query by Watershed water record inputs (one CSV per watershed).
csv_file_list = [
    "SW QUERY BY SURFACE WATERSHEDS_AGUA FRIA RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_BILL WILLIAMS RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_COLORADO RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_LITTLE COLORADO RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_LOWER GILA RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_RIO YAQUI.csv",
    "SW QUERY BY SURFACE WATERSHEDS_SALT RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_SAN PEDRO RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_SAN SIMON RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_SANTA CRUZ RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_UPPER GILA RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_VERDE RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_VIRGIN RIVER.csv",
    "SW QUERY BY SURFACE WATERSHEDS_WHITE WATER DRAW.csv",
    "SW QUERY BY SURFACE WATERSHEDS_WILLCOX PLAYA.csv"]

In [None]:
# Load every per-watershed CSV and stack them into one DataFrame.
list_of_dataframes = [pd.read_csv(filename) for filename in csv_file_list]

# Stack, blank out missing values, drop exact duplicate rows, renumber.
df_SWSHED = pd.concat(list_of_dataframes)
df_SWSHED = df_SWSHED.replace(np.nan, "")
df_SWSHED = df_SWSHED.drop_duplicates().reset_index(drop=True)

print(len(df_SWSHED))
df_SWSHED.head(5)

In [None]:
# Dataframes creation - Surface water sites.
# Read relative to the working directory set with os.chdir above.
SWR_fillings_input = "SWR_fillings_input.csv"
df_SWRfill = pd.read_csv(SWR_fillings_input)

# Quick size / content check.
print(len(df_SWRfill))
df_SWRfill.head(3)

In [None]:
# Restructure df_SWSHED:
# - Rows whose WATER USE is "ANNUAL USE" are kept as df_SWshed_AU.
# - All remaining rows are grouped per 'REG. NO'; within each group the
#   distinct values of every column are joined with commas.
# - BenUseDict maps 'REG. NO' -> joined WATER USE string, so the beneficial
#   uses can be retrieved for the ANNUAL USE rows.

df_SWshed_AU = df_SWSHED.loc[df_SWSHED["WATER USE"] == "ANNUAL USE"].reset_index()
df_SWshed_Ben = df_SWSHED.loc[df_SWSHED["WATER USE"] != "ANNUAL USE"].reset_index()

# dict.fromkeys de-duplicates while keeping first-seen order, so the joined
# string is the same on every run (iteration order of a set of strings can
# change between interpreter runs because of hash randomization).
df_SWshed_Ben = df_SWshed_Ben.groupby('REG. NO').agg(lambda x: ','.join(str(elem) for elem in dict.fromkeys(x))).replace(np.nan, '').reset_index()
df_SWshed_Ben['BenUse'] = df_SWshed_Ben['WATER USE']
df_SWshed_Ben['REGNO'] = df_SWshed_Ben['REG. NO']

BenUseDict = pd.Series(df_SWshed_Ben.BenUse.values, index = df_SWshed_Ben.REGNO).to_dict()
def retrieveBenUse(colrowValue):
    """Return the joined beneficial-use string for a registration number.

    Looks `colrowValue` up in the module-level `BenUseDict`.

    Args:
        colrowValue: a 'REG. NO' cell value.

    Returns:
        The mapped string, or '' when the value is blank, null, or has no
        entry in `BenUseDict`.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    # dict.get handles the "no entry" case explicitly instead of hiding
    # every possible error behind a bare except.
    return BenUseDict.get(colrowValue, '')
# Attach the joined beneficial uses to each ANNUAL USE row via its 'REG. NO'.
df_SWshed_AU['BenUse'] = df_SWshed_AU.apply(lambda row: retrieveBenUse(row['REG. NO']), axis=1)
df_SWshed_AU.head(3)

In [None]:
# Merge together: left join keeps every df_SWRfill row and adds the matching
# ANNUAL USE record (FILE_NO == 'REG. NO') where one exists.
df_SWFill_SWshed = pd.merge(df_SWRfill, df_SWshed_AU, left_on='FILE_NO', right_on='REG. NO', how='left')

In [None]:
# Creating long and lat values from data.
# Need to convert from UTM 12N to WGS 84.
# I believe AZ is considered WGS 84 / UTM zone 12N - EPSG:32612.
# NOTE(review): the source columns are named *_UTMNAD83 while the projection
# below uses the WGS84 ellipsoid -- confirm the datum difference is acceptable.

from pyproj import Proj
myProj = Proj(proj='utm',zone=12, ellps='WGS84', preserve_units=False)
# inverse=True converts projected x/y into longitude/latitude (in that order).
long, lat = myProj(df_SWFill_SWshed['X_UTMNAD83'].values, df_SWFill_SWshed['Y_UTMNAD83'].values, inverse=True)
df_SWFill_SWshed['in_Latitude'] = lat
df_SWFill_SWshed['in_Longitude'] = long
df_SWFill_SWshed = df_SWFill_SWshed.replace(np.nan, '')  # Replaces NaN values with blank.
df_SWFill_SWshed.head(3)

In [None]:
# Creating AllocationFlow_CFS.
# Need to split string to value and units
# Need to convert based on string

def CreateFlow_CFS(val):
    """Extract a flow in CFS from a QUANTITY string.

    Only strings containing "Cubic Feet Per Second" are treated as flows;
    the leading token is parsed as the numeric value.

    Args:
        val: raw QUANTITY cell, e.g. "1.5 Cubic Feet Per Second".

    Returns:
        The flow as a float, or "" when the value is blank, null, not a
        string, in another unit, or has a non-numeric leading token.
    """
    # Non-strings (None, NaN, numbers) and other units carry no flow.
    if not isinstance(val, str) or "Cubic Feet Per Second" not in val:
        return ""
    amount_text = val.strip().partition(" ")[0]
    try:
        return float(amount_text)
    except ValueError:
        # Leading token is not a number.
        return ""

# Derive the flow column from the raw QUANTITY text, row by row.
df_SWFill_SWshed['in_AllocationFlow_CFS'] = df_SWFill_SWshed.apply(lambda row: CreateFlow_CFS(row['QUANTITY']), axis=1)
df_SWFill_SWshed.head(3)

In [None]:
# Creating AllocationVolume_AF.
# Need to split string to value and units
# Need to convert value based on unit,

# "Acre-Feet Per Annum",
# "Acre-Feet",
# "Acre-Feet Total",
# "ACRES",
# "CFT - Cubic Feet Total",
# "Feet",
# "Gallons",
# "Gallons Per Annum",
# "Miners Inches Per Annum",
# "MIT - Miners Inches Total",

# Divisors / multipliers carried over from the original conversion code.
_CUBIC_FEET_PER_AF = 43559.9
_GALLONS_PER_AF = 325851
_AF_PER_MINERS_INCH = 0.055214457974269576

# Units matched exactly against the text following the numeric value,
# mapped to the factor that multiplies the value to give acre-feet.
# Exact matching keeps "Cubic Feet Per Second" (a flow) from being picked up
# by the "Feet" rule.
# NOTE(review): "Feet" is converted as cubic feet, as the original code
# intended -- confirm against the source data.
_EXACT_UNIT_TO_AF = {
    "Feet": 1 / _CUBIC_FEET_PER_AF,
    "Gallons": 1 / _GALLONS_PER_AF,
    "Gallons Per Annum": 1 / _GALLONS_PER_AF,
    "Miners Inches Per Annum": _AF_PER_MINERS_INCH,
    "MIT - Miners Inches Total": _AF_PER_MINERS_INCH,
}


def CreateVolume_AF(val):
    """Convert a QUANTITY string ("<number> <unit>") to a volume in acre-feet.

    Args:
        val: raw QUANTITY cell, e.g. "12.5 Acre-Feet Per Annum".

    Returns:
        The volume as a float, or "" when the value is blank, null, not a
        string, has a non-numeric leading token, or uses an unrecognised
        unit (including flow units such as "Cubic Feet Per Second").
    """
    if not isinstance(val, str) or val == "":
        return ""

    amount_text, _, unit = val.strip().partition(" ")
    unit = unit.strip()
    try:
        amount = float(amount_text)
    except ValueError:
        return ""

    # Any "Acre-Feet ..." variant is already in acre-feet.
    # NOTE(review): "ACRES" values are passed through unchanged, as before.
    if "Acre-Feet" in val or "ACRES" in val:
        return amount
    if "CFT - Cubic Feet Total" in val:
        return amount / _CUBIC_FEET_PER_AF

    factor = _EXACT_UNIT_TO_AF.get(unit)
    if factor is None:
        return ""
    return amount * factor

# Derive the volume column from the raw QUANTITY text, row by row.
df_SWFill_SWshed['in_AllocationVolume_AF'] = df_SWFill_SWshed.apply(lambda row: CreateVolume_AF(row['QUANTITY']), axis=1)
df_SWFill_SWshed.head(3)

In [None]:
# Create the (empty) output dataframe for surface water.
# The column list mirrors the groundwater one, including
# "in_AllocationVolume_AF", so both frames share the same layout.
columnslist = [
    "in_ApplicableResourceTypeCV",
    'in_WaterSourceTypeCV',
    "in_WaterSourceName",
    "in_County",
    "in_Latitude",
    "in_Longitude",
    "in_PODorPOUSite",
    "in_SiteName",
    "in_SiteNativeID",
    "in_SiteTypeCV",
    "in_AllocationCommunityWaterSupplySystem",
    "in_AllocationFlow_CFS",
    "in_AllocationNativeID",
    "in_AllocationOwner",
    "in_AllocationPriorityDate",
    "in_AllocationTimeframeEnd",
    "in_AllocationTimeframeStart",
    "in_AllocationVolume_AF",
    "in_AllocationLegalStatusCV",
    "in_BeneficialUseCategory",
    "in_ExemptOfVolumeFlowPriority"
]

# Same row index as df_SWFill_SWshed so column assignments align row-for-row.
df_Surface = pd.DataFrame(columns=columnslist, index=df_SWFill_SWshed.index)

In [None]:
# For creating WaterSourceName
def assignWaterSourceName(colrowValue):
    """Return the water source name with surrounding whitespace removed.

    Blank ('') or null input yields "Unspecified"; anything else is
    converted to text and stripped.
    """
    is_blank = colrowValue == '' or pd.isnull(colrowValue)
    return "Unspecified" if is_blank else str(colrowValue).strip()

# For creating SiteNativeID
def assignSiteNativeID(colrowValue):
    """Return the site native ID as stripped text.

    Args:
        colrowValue: a scalar cell value (e.g. the CADASTRAL column).

    Returns:
        The stripped text, or "Unspecified" when the value is null, empty,
        or whitespace only.
    """
    # Test for null BEFORE converting to text: str(None) / str(nan) would
    # otherwise produce the literal IDs "None" / "nan".
    if pd.isnull(colrowValue):
        return "Unspecified"
    strVal = str(colrowValue).strip()
    return strVal if strVal else "Unspecified"

#############################################################################################
# Populate df_Surface: constants are broadcast to every row, source columns
# are copied from df_SWFill_SWshed by index alignment.

#Method
df_Surface['in_ApplicableResourceTypeCV'] = "Surface Water"

#WaterSource
df_Surface['in_WaterSourceTypeCV'] = "Surface Water"
df_Surface['in_WaterSourceName'] = df_SWFill_SWshed.apply(lambda row: assignWaterSourceName(row['WATERSOURC']), axis=1)
                                    
#Site
df_Surface['in_County'] = df_SWFill_SWshed['COUNTY']
df_Surface['in_Latitude'] = df_SWFill_SWshed['in_Latitude']
df_Surface['in_Longitude'] = df_SWFill_SWshed['in_Longitude']
df_Surface['in_PODorPOUSite'] = df_SWFill_SWshed['POU_POD']
df_Surface['in_SiteName'] = "Unspecified"
# The CADASTRAL value is used as the site identifier.
df_Surface['in_SiteNativeID'] = df_SWFill_SWshed.apply(lambda row: assignSiteNativeID(row['CADASTRAL']), axis=1)
df_Surface['in_SiteTypeCV'] = "Unspecified"

#AllocationAmount_fact
df_Surface['in_AllocationCommunityWaterSupplySystem'] = "Unspecified"
df_Surface['in_AllocationFlow_CFS'] = df_SWFill_SWshed['in_AllocationFlow_CFS']
# STATUS_x is the merge-suffixed STATUS column.
# NOTE(review): presumably the one from df_SWRfill (left side of the merge) -- confirm.
df_Surface['in_AllocationLegalStatusCV'] =df_SWFill_SWshed['STATUS_x']
df_Surface['in_AllocationVolume_AF'] = df_SWFill_SWshed['in_AllocationVolume_AF']
df_Surface['in_AllocationNativeID'] = df_SWFill_SWshed['FILE_NO']
df_Surface['in_AllocationOwner'] = df_SWFill_SWshed['HLDRNAME']
df_Surface['in_AllocationTimeframeEnd'] = '12/31'
df_Surface['in_AllocationTimeframeStart'] = '01/01'
df_Surface['in_AllocationPriorityDate'] = df_SWFill_SWshed['PRIOR_DATE']
df_Surface['in_BeneficialUseCategory'] = df_SWFill_SWshed['BenUse']
# Surface water records are flagged as not exempt (0).
df_Surface['in_ExemptOfVolumeFlowPriority'] = 0

# Remove rows that are identical across all output columns and renumber.
df_Surface = df_Surface.drop_duplicates().reset_index(drop=True)
print(len(df_Surface))
df_Surface

# Concatenate ground with surface

In [None]:
# Concatenate groundwater and surface water rows into one output frame.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based labels and dfout ends up with duplicate index values,
# which makes later index-aligned column assignments fragile.
frames = [dfground, df_Surface]
dfout = pd.concat(frames, ignore_index=True)
print(len(dfout))
dfout

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a WaDE water source native ID: "WaDEAZwr_WS" + text of the value."""
    return "WaDEAZwr_WS{}".format(str(colrowValue))

# Lookup table of the distinct (water source name, water source type) pairs in dfout.
dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = dfout['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = dfout['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

# Number the distinct pairs 1..n (on the same index) and turn each number into an ID string.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom water source native ID for a name/type pair.

    A is the water source name and B the water source type; both are
    compared against the module-level `dfWaterSourceNativeID` table.
    Returns '' when both inputs are blank, both are null, or no row matches;
    otherwise the ID of the first matching row.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    name_matches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    type_matches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    hits = dfWaterSourceNativeID.loc[name_matches & type_matches, 'in_WaterSourceNativeID']

    # First match wins; an empty selection means the pair is unknown.
    return '' if hits.empty else hits.iloc[0]

# Stamp every output row with the ID of its (name, type) water source pair.
dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
dfout

#  Export Outputs

In [None]:
# Print the dtype of every output column without pandas truncating the listing.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(dfout.dtypes)

In [None]:
# Exporting to finished file.
# Working directory for the output file.
# NOTE: the path is machine-specific.
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Arizona/WaterAllocation/RawInputData"
os.chdir(workingDir)

# index=False: the DataFrame index is not written to the CSV.
dfout.to_csv('P_ArizonaMaster.csv', index=False)  # The output