# Preprocessing Arizona Allocation data for WaDEQA upload.
Purpose:  To preprocess the Arizona data into one mail file for simple DataFrame creation and extraction.

In [1]:
#Needed Libararies

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visulizaiton
import matplotlib.pyplot as plt
import seaborn as sns

# API retrieval
import requests
import json

# Cleanup
import re
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x) # suppress scientific notation in Pandas

In [2]:
# Working Directory
workingDir = "G:/Shared drives/WaDE Data/Arizona/WaterAllocation/RawInputData"
os.chdir(workingDir)

## Groundwater Data (POD)

In [3]:
# input File - Well_Registry.csv
fileInput = "Groundwater/Well_Registry.zip"
dfgw = pd.read_csv(fileInput, compression='zip')

if 'WaDEUUID' not in dfgw:
    dfgw['WaDEUUID'] = "azGW" + dfgw.index.astype(str)
    dfgw.to_csv('Groundwater/Well_Registry.zip', compression='zip', index=False)

print(len(dfgw))
dfgw.head()

229051


  dfgw = pd.read_csv(fileInput, compression='zip')


Unnamed: 0,OID_,OBJECTID,PROGRAM,REGISTRY_I,OWNER_NAME,RGR_PUMP_D,WELLTYPE,WELL_TYPE_,DLIC_NUM,APPROVED,INSTALLED,WELL_DEPTH,WATER_LEVE,CASING_DEP,CASING_DIA,CASING_TYP,PUMP_TYPE,PUMP_POWER,PUMPRATE,TESTEDRATE,DRAW_DOWN,COMPLETION,DRILL_LOG,WELL_CANCE,CADASTRAL,COUNTY,WATERSHED,BASIN_NAME,SUBBASIN_N,AMA,QUAD_CODE,WHOLE_TOWN,HALF_TOWNS,NORTHSOUTH,WHOLE_RANG,HALF_RANGE,EASTWEST,SECTION,QUARTER_16,QACRE160DI,QUARTER_40,QACRE40DIR,QUARTER_10,QACRE10DIR,UTM_X_METE,UTM_Y_METE,APPLICATIO,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,ZIP4,WATER_USE,latitude,longitude,WaDEUUID
0,1,1,55,60000,ARIZONA SONORAN COPPER COMPANY (USA) INC,NO,NON-EXEMPT,NON-EXEMPT,0,,,1790,257,60,20,OPEN HOLE IN AQUIFER,NO PUMP CODE LISTED,NO POWER CODE LISTED,0,0,0,,,N,D05005035ABA,PINAL,SANTA CRUZ RIVER,PINAL AMA,ELOY,PINAL,D,5,0,S,5,0,E,35,A,NE,B,NW,A,NE,423912.1,3646244.0,1/1/1900 0:00:00,ATTN: TRAVIS SNYDER,850 W. ELLIOT RD. STE 106,TEMPE,AZ,85284,,DEWATERING,32.95186,-111.81407,azGW0
1,2,2,55,60001,ROUTE 14 INVESTMENT PARTNERS LLC,YES,NON-EXEMPT - NON-SERVICE,NON-EXEMPT,611,1/24/2003 0:00:00,7/2/1984 0:00:00,1000,520,1000,12,STEEL - PERFORATED OR SLOTTED CASING,SUBMERSIBLE,ELECTRIC MOTOR 16 - 100 HP,200,400,75,,,,B05002008CCB,MARICOPA,AGUA FRIA RIVER,PHOENIX AMA,WEST SALT RIVER VALLEY,PHOENIX,B,5,0,N,2,0,W,8,C,SW,C,SW,B,NW,361297.1,3739323.0,2/25/1985 0:00:00,ATTN: MARK REPANICH,33040 N 203RD AVE,WITTMANN,AZ,85361,,INDUSTRIAL,33.78498,-112.49814,azGW1
2,3,3,55,82721,"KEYES,G",NO,NON-EXEMPT,NON-EXEMPT,63,,,0,0,0,0,NO CASING CODE LISTED,NO PUMP CODE LISTED,NO POWER CODE LISTED,0,0,0,,,Y,B02009022000,MARICOPA,LOWER GILA RIVER,HARQUAHALA INA,HARQUAHALA,HARQUAHALA INA,B,2,0,N,9,0,W,22,0,,0,,0,,298366.4,3708927.0,10/15/1979 0:00:00,7765 E FOOTHILL DR S,,PARADISE VALLEY,AZ,85253,,DOMESTIC,33.5009,-113.17061,azGW2
3,4,4,55,83226,TOLBERT COLEMAN,NO,EXEMPT,EXEMPT,211,,,162,162,162,8,STEEL - PERFORATED OR SLOTTED CASING,SUBMERSIBLE,ELECTRIC MOTOR 1 - 5 HP,25,25,0,X,X,,D13025021BB0,COCHISE,WILLCOX PLAYA,WILLCOX,WILLCOX,OUTSIDE OF AMA OR INA,D,13,0,S,25,0,E,21,B,NW,B,NW,0,,612466.5,3573775.0,1/7/1980 0:00:00,BOX 446,,WILLCOX,AZ,85643,,DOMESTIC,32.29513,-109.80552,azGW3
4,5,5,55,83578,MYRON D CLARK,NO,NON-EXEMPT,NON-EXEMPT,27,2/29/1980 0:00:00,4/29/1980 0:00:00,101,12,101,16,STEEL - PERFORATED OR SLOTTED CASING,TURBINE,ELECTRIC MOTOR 16 - 100 HP,15,15,20,X,X,,D21013017CDD,SANTA CRUZ,SANTA CRUZ RIVER,SANTA CRUZ AMA,SANTA CRUZ AMA,SANTA CRUZ,D,21,0,S,13,0,E,17,C,SW,D,SE,D,SE,496526.6,3495617.0,2/15/1980 0:00:00,STAR RT #253,,TUMACACORI,AZ,85640,,DOMESTIC,31.59564,-111.03662,azGW4


In [4]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfgw['WaDEUUID']

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = "WaDE_Unspecified"
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = "WaDE_Unspecified"
df['in_CoordinateMethodCV'] = "WaDE_Unspecified"
df['in_County'] = dfgw['COUNTY'].str.title()
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfgw['latitude']
df['in_Longitude'] = dfgw['longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = "WaDE_Unspecified"
df['in_SiteNativeID'] = dfgw['CADASTRAL'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Well" # these should all be well records
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_MethodUUID'] = "AZwr_M1" # for goundwater
df['in_OrganizationUUID'] = "AZwr_O1"
df['in_SiteUUID'] = "" # ???
df['in_VariableSpecificUUID'] =  "AZwr_V1" # for CFS
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfgw['PUMPRATE'].astype(float)
df['in_AllocationLegalStatusCV'] = "WaDE_Unspecified"
df['in_AllocationNativeID'] =  dfgw['REGISTRY_I'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_AllocationOwner'] = dfgw['OWNER_NAME']
df['in_AllocationPriorityDate'] = ""
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
#df['in_AllocationTypeCV'] = dfgw['WELL_TYPE_'] # skip for now
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfgw['WATER_USE'].str.title()
df['in_CommunityWaterSupplySystem'] = dfgw['AMA']
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 1 # all these gw records should be considered exempt for us.
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

dfgwOut = df.copy()
print(len(dfgwOut))
dfgwOut.head()

229051


Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_MethodUUID,in_OrganizationUUID,in_SiteUUID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azGW0,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Pinal,4326,,,,32.95186,-111.81407,,,POD,WaDE_Unspecified,D05005035ABA,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,0.0,WaDE_Unspecified,60000,ARIZONA SONORAN COPPER COMPANY (USA) INC,,,12/31,01/01,,,Dewatering,PINAL,,,,,1,,,,,,,,,,
1,azGW1,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Maricopa,4326,,,,33.78498,-112.49814,,,POD,WaDE_Unspecified,B05002008CCB,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,200.0,WaDE_Unspecified,60001,ROUTE 14 INVESTMENT PARTNERS LLC,,,12/31,01/01,,,Industrial,PHOENIX,,,,,1,,,,,,,,,,
2,azGW2,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Maricopa,4326,,,,33.5009,-113.17061,,,POD,WaDE_Unspecified,B02009022000,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,0.0,WaDE_Unspecified,82721,"KEYES,G",,,12/31,01/01,,,Domestic,HARQUAHALA INA,,,,,1,,,,,,,,,,
3,azGW3,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.29513,-109.80552,,,POD,WaDE_Unspecified,D13025021BB0,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,25.0,WaDE_Unspecified,83226,TOLBERT COLEMAN,,,12/31,01/01,,,Domestic,OUTSIDE OF AMA OR INA,,,,,1,,,,,,,,,,
4,azGW4,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Santa Cruz,4326,,,,31.59564,-111.03662,,,POD,WaDE_Unspecified,D21013017CDD,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,15.0,WaDE_Unspecified,83578,MYRON D CLARK,,,12/31,01/01,,,Domestic,SANTA CRUZ,,,,,1,,,,,,,,,,


## Surface Water Data (POD & POU) 

In [5]:
#Input files - Surface Water Query by Watershed water records

# Surface Water Query by Watershed water record inputs.
csv_file_list = [
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/AGUA FRIA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/BILL WILLIAMS RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/COLORADO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/LITTLE COLORADO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/LOWER GILA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/RIO YAQUI.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SALT RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SAN PEDRO RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SAN SIMON RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/SANTA CRUZ RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/UPPER GILA RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/VERDE RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/VIRGIN RIVER.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/WHITE WATER DRAW.zip",
    "Surface Water/SW QUERY BY SURFACE WATERSHEDS/WILLCOX PLAYA.zip"]


list_of_dataframes = []
for filename in csv_file_list:
    list_of_dataframes.append(pd.read_csv(filename, compression='zip'))

dfsw = pd.concat(list_of_dataframes).replace(np.nan, "").drop_duplicates().reset_index(drop=True)
dfsw = dfsw.drop(['LEGAL'], axis=1).drop_duplicates().reset_index(drop=True) # drop 'LEGAL', not needed.

if 'WaDEUUID' not in dfsw:
    dfsw['WaDEUUID'] = "azSW" + dfsw.index.astype(str)
    dfsw.to_csv('Surface Water/SW_QUERY_COMBINED.zip', compression='zip', index=False)

print(len(dfsw))
dfsw.head(3)

256140


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-25474.0,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0
1,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-25474.0,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,STOCK,1.5 Acre-Feet Per Annum,azSW1
2,"ANDERWALD, GERALDINE W",32,"CROWN KING RT MAYER, AZ 86333",36-66830.0,ACTIVE - ACTIVE,,,6/13/1978 12:00:00 AM,TURKEY CREEK,11/7/1906,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,ANNUAL USE,.15 Acre-Feet Per Annum,azSW2


In [6]:
# fixing 'REG. NO' format to match 'FILNO' in FILINGS shp file.

def fixREGNO(val):
   
    ### first fix
    # Create testVal to search for length, split on '-' & '.'
    testVal = str(val).strip()
    sep1 = '-'
    testVal = testVal.split(sep1, 1)[1]
    sep2 = '.'
    testVal = testVal.split(sep2, 1)[0]
    
    # inerst new text into 'val' based on 'testVal' length.
    if len(testVal) == 2:
        val = val.replace("-", "-0000")
    if len(testVal) == 3:
        val = val.replace("-", "-000")
    if len(testVal) == 4:
        val = val.replace("-", "-00")
    if len(testVal) == 5:
        val = val.replace("-", "-0")
        
        
    ### second fix
    # Create testVal to search for length, split on '.' at the end
    testVal = str(val).strip()
    sep1 = '.'
    testVal = testVal.split(sep1, 1)[1]
    
    # inerst new text into 'val' based on 'testVal' length.
    if len(testVal) == 1:
        val = val.replace(".", ".00" + testVal)
    if len(testVal) == 2:
        val = val.replace(".", ".0" + testVal)   
    
    return val

dfsw['REG. NO'] = dfsw.apply(lambda row: fixREGNO(row['REG. NO']), axis=1)
exList = dfsw['REG. NO'].unique().tolist()
exList.sort()
for x in exList:
    print(x)

33-000011.0000
33-000015.0022
33-000015.0033
33-000016.0000
33-000018.0000
33-000020.0000
33-000024.0000
33-009085.0000
33-011511.0000
33-012062.0011
33-023130.0011
33-023131.0011
33-023177.0000
33-023280.0000
33-023281.0000
33-023301.0000
33-023302.0000
33-023303.0000
33-023304.0000
33-023305.0000
33-023306.0000
33-023308.0000
33-023437.0000
33-023537.0033
33-023604.0011
33-023835.0000
33-024013.0044
33-024200.0011
33-024622.0000
33-024664.0011
33-024696.0022
33-024874.0011
33-024959.0011
33-025233.0000
33-025279.0011
33-025458.0011
33-025459.0011
33-025625.0000
33-025880.0011
33-025881.0011
33-025882.0011
33-026063.0022
33-026063.0044
33-026064.0011
33-026494.0011
33-026495.0011
33-026878.0000
33-026879.0000
33-026880.0000
33-026979.0000
33-027157.0000
33-027158.0000
33-027159.0000
33-027160.0000
33-027162.0000
33-027163.0000
33-027164.0000
33-027165.0000
33-027166.0000
33-027167.0000
33-027168.0000
33-027169.0000
33-027170.0000
33-027171.0000
33-027172.0000
33-027173.0000
33-027174.

In [7]:
#Input files - Fillings Layers.shp

#POD layer
fileName = "Surface Water/shapefile/FilingPOD.zip"
df_PODfill = pd.read_csv(fileName, compression='zip')

#POU layer
fileName = "Surface Water/shapefile/FillingPOU.zip"
df_POUfill = pd.read_csv(fileName, compression='zip')

# Concatenate dataframes
frames = [df_PODfill, df_POUfill] # add dataframes here
df_fill = pd.concat(frames)
df_fill = df_fill.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(df_fill))
df_fill.head(1)

  df_PODfill = pd.read_csv(fileName, compression='zip')
  df_POUfill = pd.read_csv(fileName, compression='zip')


228637


Unnamed: 0,OID_,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS,WATERSOURC,FILE_DATE,PRIOR_DATE,CONST_DT,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNITS1,H20_AMT_3,H20_UNIT_1,H20_AMT_4,H20_UNIT_2,H20_AMT_5,H20_UNIT_3,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83
0,0,33,1,0,33-000001.0000,,,,,,POD,INACTIVE - CANCELLED,,,,,NEW,,E ONE,,,,,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,,,,A0104,A010401,A010401,8.0,0.0,0.0


In [8]:
# Merge Surface Water Query by Watershed water records with Filling.
dfsw = pd.merge(dfsw, df_fill, left_on='REG. NO', right_on='FILENO', how='left')
print(len(dfsw))
dfsw.head(1)

530978


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS_x,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID,OID_,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS_y,WATERSOURC,FILE_DATE,PRIOR_DATE,CONST_DT,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNITS1,H20_AMT_3,H20_UNIT_1,H20_AMT_4,H20_UNIT_2,H20_AMT_5,H20_UNIT_3,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0,27812.0,36,25474.0,0,36-025474.0000,,,,,,POD,ACTIVE - ACTIVE,GROUNDWATER SUB FLOW,6/29/1979 0:00,6/5/1977 0:00,,,,"ALLISON, PHYLLIS H",BLUE HILLS RT,,DEWEY,86327,,1.5,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,AGUA FRIA RIVER,,,A130010,A13001005,A13001005BD0,1.0,382128.1748,3822502.638


In [9]:
# Split 'QUANTITY' into 'Amount' and 'UNIT'
dfsw[['Amount', 'Unit']] = dfsw.QUANTITY.str.split("  ", expand = True)
dfsw['Amount'] = pd.to_numeric(dfsw['Amount'], errors='coerce').fillna(0).astype(float) # make sure this is numeric.
dfsw.head(1)

Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS_x,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID,OID_,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS_y,WATERSOURC,FILE_DATE,PRIOR_DATE,CONST_DT,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNITS1,H20_AMT_3,H20_UNIT_1,H20_AMT_4,H20_UNIT_2,H20_AMT_5,H20_UNIT_3,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,Amount,Unit
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0,27812.0,36,25474.0,0,36-025474.0000,,,,,,POD,ACTIVE - ACTIVE,GROUNDWATER SUB FLOW,6/29/1979 0:00,6/5/1977 0:00,,,,"ALLISON, PHYLLIS H",BLUE HILLS RT,,DEWEY,86327,,1.5,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,AGUA FRIA RIVER,,,A130010,A13001005,A13001005BD0,1.0,382128.1748,3822502.638,0.0,


In [10]:
# 03/02/2023
# temp fix - remove recods with these 'Units'
dropList = ['ACRES',
            'Amount Required for Maintenance',
            'Feet',
            'MIT - Miners Inches Total',
            'Miners Inches Per Annum', 
            'XX - Unknown Code at Load time',
            'None',
            '',
            " "]

dfsw = dfsw[~dfsw['Unit'].isin(dropList)]
print(len(dfsw))
dfsw.head(1)

530242


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS_x,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID,OID_,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS_y,WATERSOURC,FILE_DATE,PRIOR_DATE,CONST_DT,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNITS1,H20_AMT_3,H20_UNIT_1,H20_AMT_4,H20_UNIT_2,H20_AMT_5,H20_UNIT_3,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,Amount,Unit
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0,27812.0,36,25474.0,0,36-025474.0000,,,,,,POD,ACTIVE - ACTIVE,GROUNDWATER SUB FLOW,6/29/1979 0:00,6/5/1977 0:00,,,,"ALLISON, PHYLLIS H",BLUE HILLS RT,,DEWEY,86327,,1.5,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,AGUA FRIA RIVER,,,A130010,A13001005,A13001005BD0,1.0,382128.1748,3822502.638,0.0,


In [11]:
# Need to update to reflect groundwater values too
# CFS = V1, AF = V2

# Create VariableSpecificCv value
def createVariableSpecificCv(unit):
    outString = ""
    if unit == "Cubic Feet Per Second":
        outString = "AZwr_V1"
    if unit == "Acre-Feet Per Annum":
        outString = "AZwr_V1"
    if unit == "Gallons Per Annum":
        outString = "AZwr_V1"
    else:
        outString = "AZwr_V2"

    return(outString)

dfsw['in_VariableSpecificUUID'] = dfsw.apply(lambda row: createVariableSpecificCv(row['Unit']), axis=1)
dfsw['in_VariableSpecificUUID'].unique()

array(['AZwr_V2', 'AZwr_V1'], dtype=object)

In [12]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    CFS_Value = None
    if unit == "Cubic Feet Per Second":
        CFS_Value = val
    if unit == "Acre-Feet Per Annum":
        CFS_Value = val / (723.968)
    if unit == "Gallons Per Annum":
        CFS_Value = val / (235905662.34)
    else:
        CFS_Value = 0.0
    return(CFS_Value)

dfsw['CFS_Value'] = dfsw.apply(lambda row: convertFlowFunc(row['Amount'], row['Unit']), axis=1)
dfsw['CFS_Value'].unique()

array([0.00000000e+00, 2.49940291e-02, 9.53771087e-03, ...,
       9.09516109e-05, 9.09261770e-06, 1.24991319e-02])

In [13]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    AF_Value = None
    if unit == 'Acre-Feet':
        AF_Value = val
    if unit == 'Acre-Feet Total':
        AF_Value = val
    if unit == "CFT - Cubic Feet Total":
        AF_Value = val / (43559.9)
    if unit == 'Gallons':
        AF_Value = val / (325850.943)
    else:
        AF_Value = 0.0
    return(AF_Value)

dfsw['AF_Value'] = dfsw.apply(lambda row: convertVolumeFunc(row['Amount'], row['Unit']), axis=1)
dfsw['AF_Value'].unique()

array([0.00000000e+00, 1.53444393e-04, 6.13777570e-05, 1.53444393e-05,
       9.20666355e-06, 3.06888785e-05, 2.00000035e+01, 3.83610981e-03,
       1.10479963e-01, 4.03251864e+01, 1.37992542e+00, 9.28031686e-02,
       1.53444393e-03, 3.06888785e-03, 9.20666355e-04, 6.13777570e-04,
       4.60333178e-03, 5.60072033e-01, 1.34417288e+00, 1.38099953e+00,
       2.62451289e-02, 1.51909949e-02, 3.60594322e-02, 3.01687020e-02,
       1.76767940e-01, 5.74495806e-02, 7.58015299e-02, 9.20666355e-01,
       2.25804472e+00, 4.43577050e-01, 5.84009358e-02, 3.97727865e-04,
       9.20666355e-02, 6.13777570e-02, 3.06888785e-02, 2.68834576e+00,
       2.68834576e-01, 1.22755514e-01, 1.50000180e+00, 3.06888785e+01,
       1.99477710e+02, 6.75155327e-01, 3.37577664e-01, 6.13777570e-01,
       3.68266542e-01, 4.90623101e+00, 1.10998605e-01, 5.24166045e-02,
       3.92050423e+00, 3.06888785e-04, 5.03942074e+00, 3.06888785e-01,
       1.22755514e-02, 8.06503727e-03, 1.32575955e+00, 3.62128766e-01,
      

In [14]:
# Creating long and lat values from data.  
# Need to convert from UTM 12N to WGS 84.
# I believe AZ is consiered WGS 84 / UTM zone 12N - EPSG:32612.

from pyproj import Proj
myProj = Proj(proj='utm',zone=12, ellps='WGS84', preserve_units=False)
long, lat = myProj(dfsw['X_UTMNAD83'].values, dfsw['Y_UTMNAD83'].values, inverse=True)
dfsw['in_Latitude'] = lat
dfsw['in_Longitude'] = long
dfsw = dfsw.replace(np.nan, '')  # Replaces NaN values with blank.
dfsw.head(1)

Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS_x,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,POU/POD,WATER USE,QUANTITY,WaDEUUID,OID_,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS_y,WATERSOURC,FILE_DATE,PRIOR_DATE,CONST_DT,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNITS1,H20_AMT_3,H20_UNIT_1,H20_AMT_4,H20_UNIT_2,H20_AMT_5,H20_UNIT_3,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,Amount,Unit,in_VariableSpecificUUID,CFS_Value,AF_Value,in_Latitude,in_Longitude
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,POD,DOMESTIC,,azSW0,27812.0,36,25474.0,0,36-025474.0000,,,,,,POD,ACTIVE - ACTIVE,GROUNDWATER SUB FLOW,6/29/1979 0:00,6/5/1977 0:00,,,,"ALLISON, PHYLLIS H",BLUE HILLS RT,,DEWEY,86327,,1.5,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,AGUA FRIA RIVER,,,A130010,A13001005,A13001005BD0,1.0,382128.1748,3822502.638,0.0,,AZwr_V2,0.0,0.0,34.53748,-112.2845


In [15]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfsw['WaDEUUID']

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfsw['WATERSOURC'].str.title()
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = "WaDE_Unspecified"
df['in_CoordinateMethodCV'] = "WaDE_Unspecified"
df['in_County'] = dfsw['COUNTY'].str.title()
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfsw['in_Latitude']
df['in_Longitude'] = dfsw['in_Longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfsw['POU_POD']
df['in_SiteName'] = "WaDE_Unspecified"
df['in_SiteNativeID'] = dfsw['CADASTRAL'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE_Unspecified"
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_MethodUUID'] = "AZwr_M2" # for surface water
df['in_OrganizationUUID'] = "AZwr_O1"
df['in_SiteUUID'] = "" # ???
df['in_VariableSpecificUUID'] =  dfsw['in_VariableSpecificUUID']
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfsw['CFS_Value'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfsw['STATUS_x'].str.title()
df['in_AllocationNativeID'] =  dfsw['REG. NO'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfsw['NAME']
df['in_AllocationPriorityDate'] = dfsw['PRIOR DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = "WaDE_Unspecified"
df['in_AllocationVolume_AF'] = dfsw['AF_Value'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfsw['WATER USE'].str.title()
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

dfswOut = df.copy()
print(len(dfswOut))
dfswOut.head()

530242


Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_MethodUUID,in_OrganizationUUID,in_SiteUUID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azSW0,,,,Groundwater Sub Flow,,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Yavapai,4326,,,,34.53748,-112.2845,,,POD,WaDE_Unspecified,A13001005BD0,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.0,Active - Active,36-025474.0000,"ALLISON, PHYLLIS H",6/5/1977,,12/31,01/01,WaDE_Unspecified,0.0,Domestic,,,,,,0,,,,,,,,,,
1,azSW1,,,,Groundwater Sub Flow,,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Yavapai,4326,,,,34.53748,-112.2845,,,POD,WaDE_Unspecified,A13001005BD0,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.0,Active - Active,36-025474.0000,"ALLISON, PHYLLIS H",6/5/1977,,12/31,01/01,WaDE_Unspecified,0.0,Stock,,,,,,0,,,,,,,,,,
2,azSW2,,,,Turkey Creek,,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Yavapai,4326,,,,34.23952,-112.20735,,,POD,WaDE_Unspecified,A10001001AC0,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.0,Active - Active,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,WaDE_Unspecified,0.0,Annual Use,,,,,,0,,,,,,,,,,
3,azSW3,,,,Turkey Creek,,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Yavapai,4326,,,,34.23952,-112.20735,,,POD,WaDE_Unspecified,A10001001AC0,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.0,Active - Active,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,WaDE_Unspecified,0.0,Domestic,,,,,,0,,,,,,,,,,
4,azSW4,,,,Turkey Creek,,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Yavapai,4326,,,,34.23952,-112.20735,,,POD,WaDE_Unspecified,A10001001AC0,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.0,Active - Active,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,WaDE_Unspecified,0.0,Irrigation,,,,,,0,,,,,,,,,,


## Concatenate GW with SW dataframes

In [16]:
frames = [dfgwOut, dfswOut] # add dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

759165


## Clean output dataframes

In [17]:
# Clean owner name up
def cleanOwnerDataFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;,/\)(-]", "", Val).title().strip()
    return Val
outdf['in_AllocationOwner'] = outdf.apply(lambda row: cleanOwnerDataFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Arizona Sonoran Copper Company Usa Inc',
       'Route 14 Investment Partners Llc', 'Keyesg', ...,
       'Cimarron Ranch Land Llc', 'Roll Ranch Prtshp',
       'Ray A And Guadalupe Rogers'], dtype=object)

In [18]:
# Fixing empty string names

def fixEmptyString(val):
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = "WaDE Unspecified"
    else:
        outString = val
    return outString

In [19]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['WaDE_Unspecified', 'Groundwater Sub Flow', 'Turkey Creek', ...,
       'Waughtal', 'Main Channel Of High', 'Wilson Dam'], dtype=object)

In [20]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater', 'Surface Water'], dtype=object)

In [21]:
outdf['in_County'] = outdf.apply(lambda row: fixEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Pinal', 'Maricopa', 'Cochise', 'Santa Cruz', 'Pima', 'Greenlee',
       'Navajo', 'Mohave', 'Yavapai', 'Apache', 'Coconino', 'La Paz',
       'Gila', 'Yuma', 'Graham', 'WaDE Unspecified'], dtype=object)

In [22]:
outdf['in_SiteName'] = outdf.apply(lambda row: fixEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['WaDE_Unspecified'], dtype=object)

In [23]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: fixEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Well', 'WaDE_Unspecified'], dtype=object)

In [24]:
outdf['in_AllocationLegalStatusCV'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationLegalStatusCV']), axis=1)
outdf['in_AllocationLegalStatusCV'].unique()

array(['WaDE_Unspecified', 'Active - Active', 'Active - Full Assignment',
       'Active - Amended', 'Active - Partial Assignment',
       'Active - Instream Flow', 'Active - Conditional Full T&S',
       'Active - Full T&S', 'Active - Modified', 'Active - Partial T&S',
       'Active - Conditional Partial T&S'], dtype=object)

In [25]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: fixEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Arizona Sonoran Copper Company Usa Inc',
       'Route 14 Investment Partners Llc', 'Keyesg', ...,
       'Cimarron Ranch Land Llc', 'Roll Ranch Prtshp',
       'Ray A And Guadalupe Rogers'], dtype=object)

In [26]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: fixEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Dewatering', 'Industrial', 'Domestic', 'Irrigation',
       'Utility (Water Co)', 'WaDE Unspecified', 'Municipal Uses',
       'Stock', 'Commercial', 'Other - Mineral Explore', 'Mining',
       'No Water Use, Domestic', 'Domestic, Irrigation', 'Recreation',
       'Utility (Water Co), Monitoring', 'Municipal Uses, Domestic',
       'Monitoring', 'Stock, Domestic', 'Drainage',
       'Irrigation, Municipal Uses', 'No Water Use', 'Domestic, Stock',
       'Industrial, Domestic', 'Test', 'Irrigation, Domestic',
       'Irrigation, Domestic, Stock', 'Utility (Water Co), No Water Use',
       'Municipal Uses, Recovery', 'Irrigation, Industrial',
       'Industrial, Irrigation', 'Stock, Irrigation',
       'Monitoring, No Water Use', 'Test, No Water Use',
       'Utility (Water Co), Industrial, Domestic', 'Irrigation, Stock',
       'Domestic, Industrial', 'Domestic, Irrigation, Commercial',
       'Test, Monitoring', 'Domestic, Municipal Uses',
       'Utility (Water Co), Municipal 

In [27]:
# in_Latitude & in_Longitude
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').fillna(0)
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').fillna(0)
outdf.head(1)

Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_MethodUUID,in_OrganizationUUID,in_SiteUUID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azGW0,,,,WaDE_Unspecified,,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Pinal,4326,,,,32.95186,-111.81407,,,POD,WaDE_Unspecified,D05005035ABA,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,0.0,WaDE_Unspecified,60000,Arizona Sonoran Copper Company Usa Inc,,,12/31,01/01,,,Dewatering,PINAL,,,,,1,,,,,,,,,,


In [28]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array([                          'NaT', '1977-06-05T00:00:00.000000000',
       '1906-11-07T00:00:00.000000000', ...,
       '1954-04-12T00:00:00.000000000', '1940-03-09T00:00:00.000000000',
       '2019-08-12T00:00:00.000000000'], dtype='datetime64[ns]')

In [29]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.00000000e+00, 2.00000000e+02, 2.50000000e+01, ...,
       9.09516109e-05, 9.09261770e-06, 1.24991319e-02])

In [30]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([0.00000000e+00, 1.53444393e-04, 6.13777570e-05, 1.53444393e-05,
       9.20666355e-06, 3.06888785e-05, 2.00000035e+01, 3.83610981e-03,
       1.10479963e-01, 4.03251864e+01, 1.37992542e+00, 9.28031686e-02,
       1.53444393e-03, 3.06888785e-03, 9.20666355e-04, 6.13777570e-04,
       4.60333178e-03, 5.60072033e-01, 1.34417288e+00, 1.38099953e+00,
       2.62451289e-02, 1.51909949e-02, 3.60594322e-02, 3.01687020e-02,
       1.76767940e-01, 5.74495806e-02, 7.58015299e-02, 9.20666355e-01,
       2.25804472e+00, 4.43577050e-01, 5.84009358e-02, 3.97727865e-04,
       9.20666355e-02, 6.13777570e-02, 3.06888785e-02, 2.68834576e+00,
       2.68834576e-01, 1.22755514e-01, 1.50000180e+00, 3.06888785e+01,
       1.99477710e+02, 6.75155327e-01, 3.37577664e-01, 6.13777570e-01,
       3.68266542e-01, 4.90623101e+00, 1.10998605e-01, 5.24166045e-02,
       3.92050423e+00, 3.06888785e-04, 5.03942074e+00, 3.06888785e-01,
       1.22755514e-02, 8.06503727e-03, 1.32575955e+00, 3.62128766e-01,
      

In [31]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceName'] == A) & 
                                       (dfWaterSourceNativeID['in_WaterSourceTypeCV'] == B), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID15172', 'wadeID15173',
       'wadeID15174'], dtype=object)

## Review and Export

In [32]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 759165 entries, 0 to 759164
Data columns (total 66 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   WaDEUUID                                      759165 non-null  object        
 1   in_Geometry                                   759165 non-null  object        
 2   in_GNISFeatureNameCV                          759165 non-null  object        
 3   in_WaterQualityIndicatorCV                    759165 non-null  object        
 4   in_WaterSourceName                            759165 non-null  object        
 5   in_WaterSourceNativeID                        759165 non-null  object        
 6   in_WaterSourceTypeCV                          759165 non-null  object        
 7   in_RegulatoryOverlayUUIDs                     759165 non-null  object        
 8   in_WaterSourceUUID                            759165 n

In [33]:
outdf

Unnamed: 0,WaDEUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_MethodUUID,in_OrganizationUUID,in_SiteUUID,in_VariableSpecificUUID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azGW0,,,,WaDE_Unspecified,wadeID1,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Pinal,4326,,,,32.95186,-111.81407,,,POD,WaDE_Unspecified,D05005035ABA,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,0.00000,WaDE_Unspecified,60000,Arizona Sonoran Copper Company Usa Inc,NaT,,12/31,01/01,,0.00000,Dewatering,PINAL,,,,,1,,,,,,,,,,
1,azGW1,,,,WaDE_Unspecified,wadeID1,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Maricopa,4326,,,,33.78498,-112.49814,,,POD,WaDE_Unspecified,B05002008CCB,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,200.00000,WaDE_Unspecified,60001,Route 14 Investment Partners Llc,NaT,,12/31,01/01,,0.00000,Industrial,PHOENIX,,,,,1,,,,,,,,,,
2,azGW2,,,,WaDE_Unspecified,wadeID1,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Maricopa,4326,,,,33.50090,-113.17061,,,POD,WaDE_Unspecified,B02009022000,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,0.00000,WaDE_Unspecified,82721,Keyesg,NaT,,12/31,01/01,,0.00000,Domestic,HARQUAHALA INA,,,,,1,,,,,,,,,,
3,azGW3,,,,WaDE_Unspecified,wadeID1,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.29513,-109.80552,,,POD,WaDE_Unspecified,D13025021BB0,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,25.00000,WaDE_Unspecified,83226,Tolbert Coleman,NaT,,12/31,01/01,,0.00000,Domestic,OUTSIDE OF AMA OR INA,,,,,1,,,,,,,,,,
4,azGW4,,,,WaDE_Unspecified,wadeID1,Groundwater,,,WaDE_Unspecified,WaDE_Unspecified,Santa Cruz,4326,,,,31.59564,-111.03662,,,POD,WaDE_Unspecified,D21013017CDD,,Well,AZ,,AZwr_M1,AZwr_O1,,AZwr_V1,,,,,,,,,15.00000,WaDE_Unspecified,83578,Myron D Clark,NaT,,12/31,01/01,,0.00000,Domestic,SANTA CRUZ,,,,,1,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
759160,azSW256137,,,,Ephemeral,wadeID5684,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.14044,-109.77458,,,POU,WaDE_Unspecified,D15025010DDA,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.00000,Active - Active,38-096723.0000,Blm Safford,2019-08-12,,12/31,01/01,WaDE_Unspecified,0.00000,Wildlife,,,,,,0,,,,,,,,,,
759161,azSW256138,,,,Ephemeral,wadeID5684,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.14044,-109.77458,,,POD,WaDE_Unspecified,D15025010DDA,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.00000,Active - Active,38-096723.0000,Blm Safford,2019-08-12,,12/31,01/01,WaDE_Unspecified,0.00000,Stock,,,,,,0,,,,,,,,,,
759162,azSW256138,,,,Ephemeral,wadeID5684,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.14044,-109.77458,,,POU,WaDE_Unspecified,D15025010DDA,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.00000,Active - Active,38-096723.0000,Blm Safford,2019-08-12,,12/31,01/01,WaDE_Unspecified,0.00000,Stock,,,,,,0,,,,,,,,,,
759163,azSW256139,,,,Ephemeral,wadeID5684,Surface Water,,,WaDE_Unspecified,WaDE_Unspecified,Cochise,4326,,,,32.14044,-109.77458,,,POD,WaDE_Unspecified,D15025010DDA,,WaDE_Unspecified,AZ,,AZwr_M2,AZwr_O1,,AZwr_V2,,,,,,,,,0.00000,Active - Active,38-096723.0000,Blm Safford,2019-08-12,,12/31,01/01,WaDE_Unspecified,0.00000,Wildlife,,,,,,0,,,,,,,,,,


In [34]:
# Export the output dataframe
outdf.to_csv('Pwr_azMain.zip', index=False, compression="zip")  # The output, save as a zip
#dfPoUshape.to_csv('P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.