# Preprocessing Arizona Allocation data for WaDEQA upload.
Date Updated: 03/01/2023

Purpose:  To preprocess the Arizona data into one main file for simple DataFrame creation and extraction.

Notes:
- TODO: replace this placeholder with notes on data sources and known caveats.

In [1]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
# The process working directory is changed to this folder, so every relative
# path used by the cells below resolves against it.
# NOTE(review): absolute shared-drive path - presumably has to exist on the
# machine that runs the notebook; confirm before running elsewhere.
workingDir = "G:/Shared drives/WaDE Data/Arizona/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Groundwater Data (POD)

In [3]:
# Input file - Well_Registry.csv (zipped)
fileInput = "Groundwater/Well_Registry.zip"
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that the
# default parser raises on this read.
dfgw = pd.read_csv(fileInput, compression='zip', low_memory=False)

# Tag every raw record with a WaDEUUID the first time the file is processed and
# write the result back to the same input file, so later runs find the column
# already present and skip this step.
if 'WaDEUUID' not in dfgw:
    dfgw['WaDEUUID'] = "azGW" + dfgw.index.astype(str)
    dfgw.to_csv(fileInput, compression='zip', index=False)

print(len(dfgw))
dfgw.head(3)

229051


  dfgw = pd.read_csv(fileInput, compression='zip')


Unnamed: 0,OID_,OBJECTID,PROGRAM,REGISTRY_I,OWNER_NAME,RGR_PUMP_D,WELLTYPE,WELL_TYPE_,DLIC_NUM,APPROVED,INSTALLED,WELL_DEPTH,WATER_LEVE,CASING_DEP,CASING_DIA,CASING_TYP,PUMP_TYPE,PUMP_POWER,PUMPRATE,TESTEDRATE,DRAW_DOWN,COMPLETION,DRILL_LOG,WELL_CANCE,CADASTRAL,COUNTY,WATERSHED,BASIN_NAME,SUBBASIN_N,AMA,QUAD_CODE,WHOLE_TOWN,HALF_TOWNS,NORTHSOUTH,WHOLE_RANG,HALF_RANGE,EASTWEST,SECTION,QUARTER_16,QACRE160DI,QUARTER_40,QACRE40DIR,QUARTER_10,QACRE10DIR,UTM_X_METE,UTM_Y_METE,APPLICATIO,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,ZIP4,WATER_USE,latitude,longitude,WaDEUUID
0,1,1,55,60000,ARIZONA SONORAN COPPER COMPANY (USA) INC,NO,NON-EXEMPT,NON-EXEMPT,0,,,1790,257,60,20,OPEN HOLE IN AQUIFER,NO PUMP CODE LISTED,NO POWER CODE LISTED,0,0,0,,,N,D05005035ABA,PINAL,SANTA CRUZ RIVER,PINAL AMA,ELOY,PINAL,D,5,0,S,5,0,E,35,A,NE,B,NW,A,NE,423912.1,3646244.0,1/1/1900 0:00:00,ATTN: TRAVIS SNYDER,850 W. ELLIOT RD. STE 106,TEMPE,AZ,85284,,DEWATERING,32.95186,-111.81407,azGW0
1,2,2,55,60001,ROUTE 14 INVESTMENT PARTNERS LLC,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,1/24/2003 0:00:00,7/2/1984 0:00:00,1000,520,1000,12,STEEL - PERFORATED OR SLOTTED CASING,SUBMERSIBLE,ELECTRIC MOTOR 16 - 100 HP,200,400,75,,,,B05002008CCB,MARICOPA,AGUA FRIA RIVER,PHOENIX AMA,WEST SALT RIVER VALLEY,PHOENIX,B,5,0,N,2,0,W,8,C,SW,C,SW,B,NW,361297.1,3739323.0,2/25/1985 0:00:00,ATTN: MARK REPANICH,33040 N 203RD AVE,WITTMANN,AZ,85361,,INDUSTRIAL,33.78498,-112.49814,azGW1
2,3,3,55,82721,"KEYES,G",NO,NON-EXEMPT,NON-EXEMPT,63,,,0,0,0,0,NO CASING CODE LISTED,NO PUMP CODE LISTED,NO POWER CODE LISTED,0,0,0,,,Y,B02009022000,MARICOPA,LOWER GILA RIVER,HARQUAHALA INA,HARQUAHALA,HARQUAHALA INA,B,2,0,N,9,0,W,22,0,,0,,0,,298366.4,3708927.0,10/15/1979 0:00:00,7765 E FOOTHILL DR S,,PARADISE VALLEY,AZ,85253,,DOMESTIC,33.5009,-113.17061,azGW2


In [None]:
# Summarise the raw well registry: column names, non-null counts and dtypes.
dfgw.info()

In [None]:
# Create the groundwater output dataframe (all records are POD wells).
# Columns are added one by one; the order of the assignments is the column
# order of the output. Scalar values are broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
# First column assigned, so df takes its row index from dfgw here.
df['WaDEUUID'] = dfgw['WaDEUUID']

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE_Unspecified"
df['in_WaterSourceNativeID'] = "" # left blank here; a custom ID is filled in later in the notebook
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = "WaDE_Unspecified"
df['in_CoordinateMethodCV'] = "WaDE_Unspecified"
df['in_County'] = dfgw['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = "" # column already exists (WaterSource Info); this re-assigns the same blank value
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfgw['latitude']
df['in_Longitude'] = dfgw['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = "WaDE_Unspecified"
# NOTE(review): astype(str) turns a missing CADASTRAL value into the literal
# text 'nan', which the later blank/null clean-up will not catch - confirm
# whether CADASTRAL can be missing.
df['in_SiteNativeID'] = dfgw['CADASTRAL'].astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Well" # these should all be well records
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_MethodUUID'] = "AZwr_M1" # for groundwater
df['in_OrganizationUUID'] = "AZwr_O1"
df['in_SiteUUID'] = "" # ???
df['in_VariableSpecificUUID'] =  "AZwr_V1" # for CFS
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDateID'] = ""
# NOTE(review): PUMPRATE is copied without any unit conversion into the CFS
# field - confirm the unit of PUMPRATE in the source registry.
df['in_AllocationFlow_CFS'] = dfgw['PUMPRATE'].astype(float)
df['in_AllocationLegalStatusCV'] = "WaDE_Unspecified"
df['in_AllocationNativeID'] =  dfgw['REGISTRY_I']
df['in_AllocationOwner'] = dfgw['OWNER_NAME']
df['in_AllocationPriorityDate'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = dfgw['WELL_TYPE_']
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfgw['WATER_USE']
df['in_CommunityWaterSupplySystem'] = dfgw['AMA'] # NOTE(review): AMA column used as the community water supply system - confirm
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # all these gw records should be considered exempt for us.
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

# Keep an independent copy under a dedicated name; 'df' is reused further down.
dfgwOut = df.copy()
print(len(dfgwOut))
dfgwOut.head()

## Surface Water Data (POD & POU) 

In [3]:
# Input files - Surface Water Query by Watershed water records

# One zipped CSV export per surface watershed.
csv_file_list = [
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/AGUA FRIA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/BILL WILLIAMS RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/COLORADO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/LITTLE COLORADO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/LOWER GILA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/RIO YAQUI.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SALT RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SAN PEDRO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SAN SIMON RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SANTA CRUZ RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/UPPER GILA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/VERDE RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/VIRGIN RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/WHITE WATER DRAW.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/WILLCOX PLAYA.zip"]

# Read every watershed export into its own dataframe.
list_of_dataframes = [pd.read_csv(filename, compression='zip') for filename in csv_file_list]

# Stack all watersheds, blank out NaN, then remove exact duplicate rows.
dfsw = pd.concat(list_of_dataframes)
dfsw = dfsw.replace(np.nan, "")
dfsw = dfsw.drop_duplicates()
dfsw = dfsw.reset_index(drop=True)

# 'LEGAL' is not needed. Rows that differed only in 'LEGAL' become duplicates
# once it is gone, so de-duplicate and re-index a second time.
dfsw = dfsw.drop(columns=['LEGAL'])
dfsw = dfsw.drop_duplicates().reset_index(drop=True)

# Tag each combined record with a WaDEUUID and save the combined table.
if 'WaDEUUID' not in dfsw:
    dfsw['WaDEUUID'] = "azSW" + dfsw.index.astype(str)
    dfsw.to_csv('Surface Water/SW_QUERY_COMBINED.zip', compression='zip', index=False)

print(len(dfsw))
dfsw.head(3)

256140


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-25474.0,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0
1,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-25474.0,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,STOCK,1.5 Acre-Feet Per Annum,azSW1
2,"ANDERWALD, GERALDINE W",32,"CROWN KING RT MAYER, AZ 86333",36-66830.0,ACTIVE - ACTIVE,,,6/13/1978 12:00:00 AM,TURKEY CREEK,11/7/1906,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,ANNUAL USE,.15 Acre-Feet Per Annum,azSW2


In [None]:
# fixing 'REG. NO' format to match 'FILENO' in FILINGS shp file.

def fixREGNO(val):
    """Pad a 'REG. NO' value so it can be matched against the filings 'FILENO'.

    The value is expected to look like ``<prefix>-<number>.<suffix>``
    (for example ``36-25474.0``). Two adjustments are applied:

    1. Zeros are inserted after the '-' so that a <number> of 2 to 5
       characters becomes 6 characters long.
    2. For a <suffix> of 1 or 2 characters, '.' is replaced by '.00' + suffix
       or '.0' + suffix respectively.

    Parameters
    ----------
    val : the raw 'REG. NO' cell. Non-string values are converted with str().

    Returns
    -------
    The padded string. A value without a '-' separator (a blank cell, for
    instance) is returned unchanged, and a value without a '.' separator gets
    only the first adjustment, instead of raising IndexError.
    """
    text = str(val)

    ### first fix
    # Nothing to pad when the '-' separator is missing.
    if '-' not in text:
        return val

    # Length of the part between '-' and '.' decides how many zeros to insert.
    numberPart = text.strip().split('-', 1)[1].split('.', 1)[0]
    zeroPadding = {2: "0000", 3: "000", 4: "00", 5: "0"}.get(len(numberPart))
    if zeroPadding is not None:
        text = text.replace("-", "-" + zeroPadding)

    ### second fix
    # Nothing to pad when the '.' separator is missing.
    if '.' not in text:
        return text

    # Length of the part after the first '.' decides the inserted text.
    suffix = text.strip().split('.', 1)[1]
    # NOTE(review): the original suffix characters stay in place after the
    # inserted text, so '.0' becomes '.0000' but '.5' would become '.0055'.
    # Kept as-is; confirm the intended format against FILENO.
    if len(suffix) == 1:
        text = text.replace(".", ".00" + suffix)
    elif len(suffix) == 2:
        text = text.replace(".", ".0" + suffix)

    return text

# Pad every registration number, then list the distinct results in sorted
# order for a visual check.
dfsw['REG. NO'] = dfsw['REG. NO'].map(fixREGNO)
exList = sorted(dfsw['REG. NO'].unique().tolist())
for regNo in exList:
    print(regNo)

In [None]:
# Input files - filings layers (POD and POU), read as zipped CSV tables.
# NOTE(review): the two file names are spelled differently ('FilingPOD' vs
# 'FillingPOU') - confirm both match the files on disk.

# POD layer
fileName = "Surface Water/shapefile/FilingPOD.zip"
df_PODfill = pd.read_csv(fileName, compression='zip')

# POU layer
fileName = "Surface Water/shapefile/FillingPOU.zip"
df_POUfill = pd.read_csv(fileName, compression='zip')

# Stack both layers, drop exact duplicates, re-index, and blank out NaN.
frames = [df_PODfill, df_POUfill] # add dataframes here
df_fill = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(df_fill))
df_fill.head(1)

In [None]:
# Merge Surface Water Query by Watershed water records with the filings layer.
# Left join: every query record is kept and filings attributes are attached
# where 'REG. NO' equals 'FILENO'.
# NOTE(review): if a FILENO occurs more than once in df_fill, the matching
# query record is repeated once per filings row (compare the printed length
# with the length before the merge).
dfsw = pd.merge(dfsw, df_fill, left_on='REG. NO', right_on='FILENO', how='left')
print(len(dfsw))
dfsw.head(1)

In [None]:
# Split 'QUANTITY' into 'Amount' and 'Unit' on the two-space separator.
# n=1 limits the split to the first separator, so at most two columns come back
# even when the unit text itself contains a double space; without it a third
# column would make the two-column assignment raise.
dfsw[['Amount', 'Unit']] = dfsw.QUANTITY.str.split("  ", n=1, expand=True)
# Make sure the amount is numeric; blank or non-numeric amounts become 0.
dfsw['Amount'] = pd.to_numeric(dfsw['Amount'], errors='coerce').fillna(0).astype(float)
dfsw.head(1)

In [None]:
# 03/02/2023
# temp fix - remove records with these 'Units'
# NOTE(review): 'None' here is the text "None"; isin() does not match real
# missing values (None / NaN), so records whose 'Unit' is missing are kept -
# confirm whether they were meant to be dropped as well.
dropList = ['ACRES',
            'Amount Required for Maintenance',
            'Feet',
            'MIT - Miners Inches Total',
            'Miners Inches Per Annum', 
            'XX - Unknown Code at Load time',
            'None',
            '',
            " "]

# Keep only the records whose unit is not in the drop list.
dfsw = dfsw[~dfsw['Unit'].isin(dropList)]
print(len(dfsw))
dfsw.head(1)

In [None]:
# Need to update to reflect groundwater values too
# CFS = V1, AF = V2

# Create VariableSpecificCv value
def createVariableSpecificCv(unit):
    """Return the VariableSpecificUUID that corresponds to a quantity unit.

    Parameters
    ----------
    unit : the unit text taken from the 'Unit' column.

    Returns
    -------
    "AZwr_V1" (CFS) for "Cubic Feet Per Second", "Acre-Feet Per Annum" and
    "Gallons Per Annum"; "AZwr_V2" (AF) for every other value.
    """
    # A single membership test replaces the former chain of independent `if`
    # statements, whose trailing `else` belonged only to the last `if` and so
    # overwrote the result for the first two units with "AZwr_V2".
    flowUnits = ("Cubic Feet Per Second", "Acre-Feet Per Annum", "Gallons Per Annum")
    if unit in flowUnits:
        outString = "AZwr_V1"
    else:
        outString = "AZwr_V2"

    return outString

# Derive the variable UUID from each record's unit and list the distinct results.
dfsw['in_VariableSpecificUUID'] = dfsw['Unit'].map(createVariableSpecificCv)
dfsw['in_VariableSpecificUUID'].unique()

In [None]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    """Convert a flow amount to cubic feet per second (CFS).

    Parameters
    ----------
    val : numeric amount.
    unit : unit text for `val`.

    Returns
    -------
    `val` unchanged for "Cubic Feet Per Second", `val / 723.968` for
    "Acre-Feet Per Annum", `val / 235905662.34` for "Gallons Per Annum",
    and 0.0 for any other unit.
    """
    # elif chain: previously these were independent `if` statements, so the
    # final `else` reset the CFS and acre-feet-per-annum results to 0.0.
    if unit == "Cubic Feet Per Second":
        CFS_Value = val
    elif unit == "Acre-Feet Per Annum":
        CFS_Value = val / (723.968)
    elif unit == "Gallons Per Annum":
        CFS_Value = val / (235905662.34)
    else:
        CFS_Value = 0.0
    return CFS_Value

# Convert each (amount, unit) pair to CFS and list the distinct results.
dfsw['CFS_Value'] = [
    convertFlowFunc(amount, unit)
    for amount, unit in zip(dfsw['Amount'], dfsw['Unit'])
]
dfsw['CFS_Value'].unique()

In [None]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    """Convert a volume amount to acre-feet (AF).

    Parameters
    ----------
    val : numeric amount.
    unit : unit text for `val`.

    Returns
    -------
    `val` unchanged for 'Acre-Feet' and 'Acre-Feet Total', `val / 43559.9`
    for "CFT - Cubic Feet Total", `val / 325850.943` for 'Gallons', and 0.0
    for any other unit.
    """
    # elif chain: previously these were independent `if` statements, so the
    # final `else` reset every result except 'Gallons' to 0.0.
    if unit == 'Acre-Feet':
        AF_Value = val
    elif unit == 'Acre-Feet Total':
        AF_Value = val
    elif unit == "CFT - Cubic Feet Total":
        AF_Value = val / (43559.9)
    elif unit == 'Gallons':
        AF_Value = val / (325850.943)
    else:
        AF_Value = 0.0
    return AF_Value

# Convert each (amount, unit) pair to acre-feet and list the distinct results.
dfsw['AF_Value'] = [
    convertVolumeFunc(amount, unit)
    for amount, unit in zip(dfsw['Amount'], dfsw['Unit'])
]
dfsw['AF_Value'].unique()

In [None]:
# Creating long and lat values from data.  
# Need to convert from UTM 12N to WGS 84.
# I believe AZ is considered WGS 84 / UTM zone 12N - EPSG:32612.
# NOTE(review): the source columns are named '..._UTMNAD83' while the projection
# below is defined on the WGS84 ellipsoid - confirm the datum of the input.

from pyproj import Proj
myProj = Proj(proj='utm',zone=12, ellps='WGS84', preserve_units=False)
# inverse=True requests the inverse transformation: projected x / y in, longitude / latitude out.
long, lat = myProj(dfsw['X_UTMNAD83'].values, dfsw['Y_UTMNAD83'].values, inverse=True)
dfsw['in_Latitude'] = lat
dfsw['in_Longitude'] = long
dfsw = dfsw.replace(np.nan, '')  # Replaces NaN values with blank.
dfsw.head(1)

In [None]:
# Create the surface water output dataframe (POD and POU records).
# Columns are added one by one; the order of the assignments is the column
# order of the output. Scalar values are broadcast to every row.
df = pd.DataFrame()

# Data Assessment UUID
# First column assigned, so df takes its row index from dfsw here.
df['WaDEUUID'] = dfsw['WaDEUUID']

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfsw['WATERSOURC']
df['in_WaterSourceNativeID'] = "" # left blank here; a custom ID is filled in later in the notebook
df['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = "WaDE_Unspecified"
df['in_CoordinateMethodCV'] = "WaDE_Unspecified"
df['in_County'] = dfsw['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = "" # column already exists (WaterSource Info); this re-assigns the same blank value
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfsw['in_Latitude']
df['in_Longitude'] = dfsw['in_Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfsw['POU_POD'] # NOTE(review): presumably the filings-layer column, not the query's 'POU/POD' - confirm
df['in_SiteName'] = "WaDE_Unspecified"
df['in_SiteNativeID'] = dfsw['CADASTRAL'].astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE_Unspecified"
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_MethodUUID'] = "AZwr_M2" # for surface water
df['in_OrganizationUUID'] = "AZwr_O1"
df['in_SiteUUID'] = "" # ???
df['in_VariableSpecificUUID'] =  dfsw['in_VariableSpecificUUID']
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDateID'] = ""
df['in_AllocationFlow_CFS'] = dfsw['CFS_Value'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfsw['STATUS_x'] # presumably the query-side STATUS, suffixed '_x' by the merge - confirm
df['in_AllocationNativeID'] =  dfsw['REG. NO']
df['in_AllocationOwner'] = dfsw['NAME']
df['in_AllocationPriorityDate'] = dfsw['PRIOR DATE']
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = "WaDE_Unspecified"
df['in_AllocationVolume_AF'] = dfsw['AF_Value'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfsw['WATER USE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "0"
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

# Keep an independent copy under a dedicated name; 'df' is a reused scratch name.
dfswOut = df.copy()
print(len(dfswOut))
dfswOut.head()

## Concatenate GW with SW dataframes

In [None]:
# Stack the groundwater and surface water outputs, drop exact duplicate rows,
# renumber the index, and blank out NaN.
frames = [dfgwOut, dfswOut] # add dataframes here
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

## Clean output dataframes

In [None]:
# clean up owner data
import re

def cleanOwnerDataFunc(Val):
    """Remove punctuation from an owner name and trim surrounding whitespace.

    The characters $ @ & . ; , / ) ( and - are deleted.

    Parameters
    ----------
    Val : the owner value. Non-string values are converted with str().

    Returns
    -------
    The cleaned text, or "" when `Val` is null (None / NaN).
    """
    # A null owner has no text to clean; re.sub would raise TypeError on it.
    if pd.isnull(Val):
        return ""
    # Raw string keeps the same pattern while avoiding the invalid-escape
    # warning that a plain string literal produces for the escaped ')'.
    Val = re.sub(r"[$@&.;,/\)(-]", "", str(Val)).strip()
    return Val

# Clean every owner name and preview the result.
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(cleanOwnerDataFunc)
outdf.head(3)

In [None]:
# For filling in WaDE_Unspecified when null

def assignBeneficialUseCategory(val):
    """Return a cleaned beneficial use value, or "WaDE_Unspecified" when missing.

    Parameters
    ----------
    val : the raw beneficial use value.

    Returns
    -------
    `val` as text with surrounding whitespace and trailing commas removed;
    "WaDE_Unspecified" when `val` is null or blank after cleaning.
    """
    # The null test has to run before str(): str(None) / str(nan) yield the
    # non-empty texts "None" / "nan", which would otherwise pass as real values.
    if pd.isnull(val):
        return "WaDE_Unspecified"
    val = str(val).strip().rstrip(',')
    if val == "":
        outString = "WaDE_Unspecified"
    else:
        outString = val
    return outString

# Fill in missing beneficial use values and list the distinct results.
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(assignBeneficialUseCategory)
outdf['in_BeneficialUseCategory'].unique()

In [None]:
# WaterSourceName, For filling in WaDE_Unspecified when null

def assignWaterSourceName(val):
    """Return a trimmed water source name, or "WaDE_Unspecified" when missing.

    Parameters
    ----------
    val : the raw water source name.

    Returns
    -------
    `val` as text with surrounding whitespace removed; "WaDE_Unspecified"
    when `val` is null or blank.
    """
    # The null test has to run before str(): str(None) / str(nan) yield the
    # non-empty texts "None" / "nan", which would otherwise pass as real values.
    if pd.isnull(val):
        return "WaDE_Unspecified"
    val = str(val).strip()
    if val == "":
        outString = "WaDE_Unspecified"
    else:
        outString = val
    return outString

# Fill in missing water source names and list the distinct results.
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(assignWaterSourceName)
outdf['in_WaterSourceName'].unique()

In [None]:
# SiteNativeID, For filling in WaDE_Unspecified when null

def assignSiteNativeID(val):
    """Return a trimmed site native ID, or "WaDE_Unspecified" when missing.

    Parameters
    ----------
    val : the raw site native ID.

    Returns
    -------
    `val` as text with surrounding whitespace removed; "WaDE_Unspecified"
    when `val` is null or blank.
    """
    # The null test has to run before str(): str(None) / str(nan) yield the
    # non-empty texts "None" / "nan", which would otherwise pass as real values.
    if pd.isnull(val):
        return "WaDE_Unspecified"
    val = str(val).strip()
    if val == "":
        outString = "WaDE_Unspecified"
    else:
        outString = val
    return outString

# Fill in missing site native IDs and list the distinct results.
outdf['in_SiteNativeID'] = outdf['in_SiteNativeID'].map(assignSiteNativeID)
outdf['in_SiteNativeID'].unique()

In [None]:
# SiteName, For filling in WaDE_Unspecified when null

def assignSiteName(val):
    """Return a trimmed site name, or "WaDE_Unspecified" when missing.

    Parameters
    ----------
    val : the raw site name.

    Returns
    -------
    `val` as text with surrounding whitespace removed; "WaDE_Unspecified"
    when `val` is null or blank.
    """
    # The null test has to run before str(): str(None) / str(nan) yield the
    # non-empty texts "None" / "nan", which would otherwise pass as real values.
    if pd.isnull(val):
        return "WaDE_Unspecified"
    val = str(val).strip()
    if val == "":
        outString = "WaDE_Unspecified"
    else:
        outString = val
    return outString

# Fill in missing site names and list the distinct results.
outdf['in_SiteName'] = outdf['in_SiteName'].map(assignSiteName)
outdf['in_SiteName'].unique()

In [None]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
def formatDateString(inString1):
    """Format a date value as 'MM/DD/YYYY' text.

    Parameters
    ----------
    inString1 : the raw date value (text or anything str() can render).

    Returns
    -------
    The date formatted with '%m/%d/%Y', or "" when the value is null, blank,
    or cannot be parsed as a date.
    """
    # Null test first: after str() a null would read "None" / "nan" and could
    # no longer be recognised as missing.
    if pd.isnull(inString1):
        return ""
    inString = str(inString1).strip()
    if inString == "":
        return ""
    try:
        valndf = pd.to_datetime(inString).date().strftime('%m/%d/%Y')
    except Exception:
        # Best effort by design: an unparseable date is exported as blank.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        valndf = ""
    return valndf

# Reformat every priority date and list the distinct results.
outdf['in_AllocationPriorityDate'] = outdf['in_AllocationPriorityDate'].map(formatDateString)
outdf['in_AllocationPriorityDate'].unique()

In [None]:
# Convert in_Latitude & in_Longitude to numeric
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna(0).astype(float)
print(outdf['in_Latitude'].unique())
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna(0).astype(float)
print(outdf['in_Longitude'].unique())

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Build a custom water source native ID from a running number.

    Returns the text "WaDECA_WS" followed by str(colrowValue).
    """
    # NOTE(review): the prefix contains "CA" although this notebook processes
    # Arizona data - confirm the prefix is intended before changing any IDs.
    return "WaDECA_WS" + str(colrowValue)

# Every distinct (name, type) pair counts as one water source.
dfWaterSourceNativeID = outdf[['in_WaterSourceName', 'in_WaterSourceTypeCV']].drop_duplicates()

# Number the distinct water sources 1..N on the same index and turn each
# number into a custom native ID.
dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp["Count"].map(assignWaterSourceNativeID)

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    """Look up the custom native ID for a water source name / type pair.

    Parameters
    ----------
    A : water source name.
    B : water source type.

    Returns
    -------
    The 'in_WaterSourceNativeID' of the first row in dfWaterSourceNativeID
    whose name equals A and whose type equals B; '' when both inputs are
    blank, both are null, or no row matches.
    """
    # Nothing to look up when both inputs are blank or both are null.
    if A == '' and B == '':
        return ''
    if pd.isnull(A) and pd.isnull(B):
        return ''

    nameMatches = dfWaterSourceNativeID['in_WaterSourceName'] == A
    typeMatches = dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B
    matchedIDs = dfWaterSourceNativeID.loc[nameMatches & typeMatches, 'in_WaterSourceNativeID']

    # First match wins; an empty selection means the pair is unknown.
    return '' if matchedIDs.empty else matchedIDs.iloc[0]

# Attach the custom native ID to every record and list the distinct results.
outdf['in_WaterSourceNativeID'] = [
    retrieveWaterSourceNativeID(sourceName, sourceType)
    for sourceName, sourceType in zip(outdf['in_WaterSourceName'], outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

## Review and Export

In [None]:
# Check the dtype of every output column before exporting.
outdf.dtypes

In [None]:
# Display the combined output frame for a final visual review.
outdf

In [None]:
# Exporting to Finished File
# Relative path; the row index is not written.
outdf.to_csv('Pwr_AZMain.csv', index=False)  # The output