# Pre-processing (state / organization Name) Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/Arizona/WaterAllocation" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/Arizona/WaterAllocation


## Input Data

In [3]:
# Input File - all SW QUERY BY SURFACE WATERSHEDS csv files

# use glob to get all the csv files in the folder 
import glob 
path = "RawInputData/SW QUERY BY SURFACE WATERSHEDS/"
csv_files = glob.glob(os.path.join(path, "*.zip"))

# loop over the list of csv files
dfin1 = pd.DataFrame()
for f in csv_files:  
    # read the csv file
    dftemp = pd.read_csv(f).replace(np.nan, "")
    dfin1 = pd.concat([dfin1, dftemp]) 

dfin1['WaDEUUID'] = "azSW" + dfin1.index.astype(str)

dfin1.to_csv('RawInputData/ALL_SWQUERYBYSURFACEWATERSHEDS.zip', compression=dict(method='zip', archive_name='ALL_SWQUERYBYSURFACEWATERSHEDS.csv'), index=False)

print(len(dfin1))
dfin1.head(1)

360539


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,LEGAL,POU/POD,WATER USE,QUANTITY,WaDEUUID
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,SE NW 5 13N 1E,POD,DOMESTIC,,azSW0


In [4]:
# Input File - Filing_POD shp file, for sw
inputFile = "RawInputData/Filing_POD.zip"
df_FPOD = gpd.read_file(inputFile).replace(np.nan, "")

df_FPOD['geometry'] = df_FPOD['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
df_FPOD["wade_lattitude"] = df_FPOD.centroid.y.round(5)
df_FPOD["wade_longitude"] = df_FPOD.centroid.x.round(5)
print(len(df_FPOD))
df_FPOD.head(1)


  df_FPOD["wade_lattitude"] = df_FPOD.centroid.y.round(5)
  result = self._values.round(decimals)

  df_FPOD["wade_longitude"] = df_FPOD.centroid.x.round(5)


99451


  result = self._values.round(decimals)


Unnamed: 0,OBJECTID,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS,WATERSOURC,FILE_DATE,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNIT_1,H20_AMT_3,H20_UNIT_2,H20_AMT_4,H20_UNIT_3,H20_AMT_5,H20_UNIT_4,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,PRIOR_DT,CONST_DT,geometry,wade_lattitude,wade_longitude
0,1,33,15,3,33-000015.0003,593,2,9,,,POD,ACTIVE - PARTIAL T&S,GRANITE AND WILLOW CREEK,1922-02-07,INPLACE,,"PRESCOTT, CITY OF",2415 E CAMELBACK STE 700,,PHOENIX,85016,,4826.26,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,VERDE RIVER,,,B140020,B14002001,B14002001DC0,109,369969.9863,3831411.1684,2/7/1922,,POINT (-112.41833 34.61633),34.61633,-112.41833


In [5]:
# Input File - Filing_POU shp file, for sw
inputFile = "RawInputData/Filing_POU.zip"
df_FPOU = gpd.read_file(inputFile).replace(np.nan, "")

df_FPOU = df_FPOU[df_FPOU['X_UTMNAD83'] != 0.00000].reset_index(drop=True)

df_FPOU['geometry'] = df_FPOU['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
df_FPOU["wade_lattitude"] = df_FPOU.centroid.y.round(5)
df_FPOU["wade_longitude"] = df_FPOU.centroid.x.round(5)
print(len(df_FPOU))
df_FPOU.head(1)


  df_FPOU["wade_lattitude"] = df_FPOU.centroid.y.round(5)


129896



  df_FPOU["wade_longitude"] = df_FPOU.centroid.x.round(5)


Unnamed: 0,OBJECTID,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS,WATERSOURC,FILE_DATE,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNIT_1,H20_AMT_3,H20_UNIT_2,H20_AMT_4,H20_UNIT_3,H20_AMT_5,H20_UNIT_4,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,PRIOR_DT,CONST_DT,geometry,wade_lattitude,wade_longitude
0,66,33,13,0,33-000013.0000,,,,,,POU,INACTIVE - WITHDRAWN,GILA RIVER,1922-01-28,,,BUCKEYE IRRIGATION DISTRICT,205 ROOSEVELT AVE,,BUCKEYE,85326,,180949.0,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.01,,0.0,,0.0,,LOWER GILA RIVER,,,B010010,B01001028,B01001028CC0,2,372315.1197,3695771.0358,1/28/1922,,POINT (-112.37293 33.39366),33.39366,-112.37293


In [6]:
# merge POD and POU shp files together for single dataframe
df_fill = pd.concat([df_FPOD, df_FPOU])
df_fill = df_fill.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(df_fill))
df_fill.head(1)

229347


Unnamed: 0,OBJECTID,PROGRAM,APPNO,CONVNO,FILENO,CERTNO,CERTSUFX,PERMITNO,PERMITSUF,PARENTAPP,POU_POD,STATUS,WATERSOURC,FILE_DATE,USESTAT,RSRVNAME,APPNAME,ADDRESS1,ADDRESS2,CITY,ZIP,ZIP_SUFX,H20_AMT_1,H20_UNITS_,H20_AMT_2,H20_UNIT_1,H20_AMT_3,H20_UNIT_2,H20_AMT_4,H20_UNIT_3,H20_AMT_5,H20_UNIT_4,USE_AMT_1,USE_FOR_1,USE_AMT_2,USE_FOR_2,USE_AMT_3,USE_FOR_3,WS_DESCR,ST_CODE,ST_DESCR,TOWNHOOK,SECTIONHOO,CADASTRAL,LOCQNTY,X_UTMNAD83,Y_UTMNAD83,PRIOR_DT,CONST_DT,geometry,wade_lattitude,wade_longitude
0,1,33,15,3,33-000015.0003,593,2,9,,,POD,ACTIVE - PARTIAL T&S,GRANITE AND WILLOW CREEK,1922-02-07,INPLACE,,"PRESCOTT, CITY OF",2415 E CAMELBACK STE 700,,PHOENIX,85016,,4826.26,Acre-Feet Per Annum,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,VERDE RIVER,,,B140020,B14002001,B14002001DC0,109,369969.9863,3831411.1684,2/7/1922,,POINT (-112.41833 34.61633),34.61633,-112.41833


In [7]:
# Input File - Well_Registry, for gw
inputFile = "RawInputData/Well_Registry.zip"
dfin2 = gpd.read_file(inputFile).replace(np.nan, "")

dfin2['geometry'] = dfin2['geometry'].to_crs(epsg=4326) # Realign Geometry Projection
dfin2["wade_lattitude"] = dfin2.centroid.y.round(5)
dfin2["wade_longitude"] = dfin2.centroid.x.round(5)

dfin2['WaDEUUID'] = "azGW" + dfin2.index.astype(str)

print(len(dfin2))
dfin2.head(1)


  dfin2["wade_lattitude"] = dfin2.centroid.y.round(5)

  dfin2["wade_longitude"] = dfin2.centroid.x.round(5)


232746


Unnamed: 0,OBJECTID,PROGRAM,REGISTRY_I,OWNER_NAME,RGR_PUMP_D,WELLTYPE,WELL_TYPE_,DLIC_NUM,APPROVED,INSTALLED,WELL_DEPTH,WATER_LEVE,CASING_DEP,CASING_DIA,CASING_TYP,PUMP_TYPE,PUMP_POWER,PUMPRATE,TESTEDRATE,DRAW_DOWN,COMPLETION,DRILL_LOG,WELL_CANCE,CADASTRAL,COUNTY,WATERSHED,BASIN_NAME,SUBBASIN_N,AMA,QUAD_CODE,WHOLE_TOWN,HALF_TOWNS,NORTHSOUTH,WHOLE_RANG,HALF_RANGE,EASTWEST,SECTION,QUARTER_16,QACRE160DI,QUARTER_40,QACRE40DIR,QUARTER_10,QACRE10DIR,UTM_X_METE,UTM_Y_METE,APPLICATIO,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,ZIP4,WATER_USE,SITE_USE,geometry,wade_lattitude,wade_longitude,WaDEUUID
0,1,55,60000,ARIZONA SONORAN COPPER COMPANY (USA) INC,NO,NON-EXEMPT - WITHDRAWAL PERMIT,NON-EXEMPT,0,,,1790,257,60,20,OPEN HOLE IN AQUIFER,NO PUMP CODE LISTED,NO POWER CODE LISTED,0,0,0,,,N,D05005035ABA,PINAL,SANTA CRUZ RIVER,PINAL AMA,ELOY,PINAL,D,5,0,S,5,0,E,35,A,NE,B,NW,A,NE,423912.1,3646244.0,1900-01-01,ATTN: TRAVIS SNYDER,850 W. ELLIOT RD. STE 106,TEMPE,AZ,85284,,DEWATERING,WATER PRODUCTION,POINT (-111.81407 32.95186),32.95186,-111.81407,azGW0


## Surface Water Data (POD & POU)

In [8]:
# Merge Surface Water Query by Watershed water records with Filling.
dfin1 = pd.merge(dfin1, df_fill[['FILENO', 'CADASTRAL', 'wade_lattitude', 'wade_longitude', 'POU_POD']], left_on='REG. NO', right_on='FILENO', how='left')
print(len(dfin1))
dfin1.head(1)

13937284


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,LEGAL,POU/POD,WATER USE,QUANTITY,WaDEUUID,FILENO,CADASTRAL,wade_lattitude,wade_longitude,POU_POD
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,SE NW 5 13N 1E,POD,DOMESTIC,,azSW0,36-025474.0000,A13001005BD0,34.53749,-112.28451,POD


In [9]:
# fixing 'REG. NO' format to match 'FILNO' in FILINGS shp file.

def fixREGNO(val):
   
    ### first fix
    # Create testVal to search for length, split on '-' & '.'
    testVal = str(val).strip()
    sep1 = '-'
    testVal = testVal.split(sep1, 1)[1]
    sep2 = '.'
    testVal = testVal.split(sep2, 1)[0]
    
    # inerst new text into 'val' based on 'testVal' length.
    if len(testVal) == 2:
        val = val.replace("-", "-0000")
    if len(testVal) == 3:
        val = val.replace("-", "-000")
    if len(testVal) == 4:
        val = val.replace("-", "-00")
    if len(testVal) == 5:
        val = val.replace("-", "-0")
        
        
    ### second fix
    # Create testVal to search for length, split on '.' at the end
    testVal = str(val).strip()
    sep1 = '.'
    testVal = testVal.split(sep1, 1)[1]
    
    # inerst new text into 'val' based on 'testVal' length.
    if len(testVal) == 1:
        val = val.replace(".", ".00" + testVal)
    if len(testVal) == 2:
        val = val.replace(".", ".0" + testVal)   
    
    return val

dfin1['REG. NO'] = dfin1.apply(lambda row: fixREGNO(row['REG. NO']), axis=1)
exList = dfin1['REG. NO'].unique().tolist()
exList.sort()
for x in exList:
    print(x)

33-000007.0000
33-000009.0000
33-000011.0000
33-000015.0002
33-000015.0003
33-000016.0000
33-000018.0000
33-000020.0000
33-000024.0000
33-009085.0000
33-011511.0000
33-023130.0001
33-023131.0001
33-023177.0000
33-023280.0000
33-023281.0000
33-023301.0000
33-023302.0000
33-023303.0000
33-023304.0000
33-023305.0000
33-023306.0000
33-023308.0000
33-023437.0000
33-023537.0003
33-023604.0001
33-023835.0000
33-024013.0004
33-024200.0001
33-024622.0000
33-024664.0001
33-024696.0002
33-024874.0001
33-024959.0001
33-025233.0000
33-025279.0001
33-025458.0001
33-025459.0001
33-025625.0000
33-025880.0001
33-025881.0001
33-025882.0001
33-026063.0002
33-026063.0005
33-026064.0001
33-026494.0001
33-026495.0001
33-026878.0000
33-026879.0000
33-026880.0000
33-026979.0000
33-027157.0000
33-027158.0000
33-027159.0000
33-027160.0000
33-027162.0000
33-027163.0000
33-027164.0000
33-027165.0000
33-027166.0000
33-027167.0000
33-027168.0000
33-027169.0000
33-027170.0000
33-027171.0000
33-027172.0000
33-027173.

In [10]:
# Split 'QUANTITY' into 'Amount' and 'UNIT'
dfin1[['Amount', 'Unit']] = dfin1.QUANTITY.str.split("  ", expand = True)
dfin1['Amount'] = pd.to_numeric(dfin1['Amount'], errors='coerce').fillna(0).astype(float) # make sure this is numeric.
dfin1.head(1)

Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,LEGAL,POU/POD,WATER USE,QUANTITY,WaDEUUID,FILENO,CADASTRAL,wade_lattitude,wade_longitude,POU_POD,Amount,Unit
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,SE NW 5 13N 1E,POD,DOMESTIC,,azSW0,36-025474.0000,A13001005BD0,34.53749,-112.28451,POD,0.0,


In [11]:
# temp fix - remove recods with these 'Units'
# AZ not provding us with metadata for these.
dropList = ['ACRES',
            'Amount Required for Maintenance',
            'Feet',
            'MIT - Miners Inches Total',
            'Miners Inches Per Annum', 
            'XX - Unknown Code at Load time',
            'None',
            '',
            " "]

dfin1 = dfin1[~dfin1['Unit'].isin(dropList)]
print(len(dfin1))
dfin1.head(1)

13931773


Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,LEGAL,POU/POD,WATER USE,QUANTITY,WaDEUUID,FILENO,CADASTRAL,wade_lattitude,wade_longitude,POU_POD,Amount,Unit
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,SE NW 5 13N 1E,POD,DOMESTIC,,azSW0,36-025474.0000,A13001005BD0,34.53749,-112.28451,POD,0.0,


In [12]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    CFS_Value = None
    if unit == "Cubic Feet Per Second":
        CFS_Value = val
    if unit == "Acre-Feet Per Annum":
        CFS_Value = val / (723.968)
    if unit == "Gallons Per Annum":
        CFS_Value = val / (235905662.34)
    else:
        CFS_Value = 0.0
    return(CFS_Value)

dfin1['CFS_Value'] = dfin1.apply(lambda row: convertFlowFunc(row['Amount'], row['Unit']), axis=1)
dfin1['CFS_Value'].unique()

array([0.00000000e+00, 1.27169478e-05, 2.49940296e-01, ...,
       1.06822362e-04, 2.79772852e-06, 6.99432130e-06])

In [13]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    AF_Value = None
    if unit == 'Acre-Feet':
        AF_Value = val
    if unit == 'Acre-Feet Total':
        AF_Value = val
    if unit == "CFT - Cubic Feet Total":
        AF_Value = val / (43559.9)
    if unit == 'Gallons':
        AF_Value = val / (325850.943)
    else:
        AF_Value = 0.0
    return(AF_Value)

dfin1['AF_Value'] = dfin1.apply(lambda row: convertVolumeFunc(row['Amount'], row['Unit']), axis=1)
dfin1['AF_Value'].unique()

array([0.00000000e+00, 1.53444393e-04, 6.13777570e-05, 1.53444393e-05,
       9.20666355e-06, 3.06888785e-05, 2.00000035e+01, 3.83610981e-03,
       1.10479963e-01, 4.03251864e+01, 5.37669151e-01, 1.37992542e+00,
       9.28031686e-02, 1.53444393e-03, 3.06888785e-03, 9.20666355e-04,
       6.13777570e-04, 4.60333178e-03, 5.60072033e-01, 1.34417288e+00,
       1.38099953e+00, 2.62451289e-02, 1.51909949e-02, 3.60594322e-02,
       3.01687020e-02, 1.76767940e-01, 5.74495806e-02, 7.58015299e-02,
       9.20666355e-01, 2.25804472e+00, 4.43577050e-01, 3.97727865e-04,
       5.84009358e-02, 3.94290711e-02, 1.10998605e-01, 5.24166045e-02,
       4.90623101e+00, 3.92050423e+00, 3.06888785e-04, 5.03942074e+00,
       9.20666355e-02, 6.13777570e-02, 3.06888785e-02, 2.68834576e+00,
       2.68834576e-01, 1.22755514e-01, 1.50000180e+00, 1.99477710e+02,
       3.06888785e+01, 2.96838177e-01, 6.75155327e-01, 3.37577664e-01,
       6.13777570e-01, 3.68266542e-01, 3.06888785e-01, 1.12321295e-01,
      

In [14]:
dfin1.head(1)

Unnamed: 0,NAME,ART_WS_IDNO_FKFLD,ADDRESS,REG. NO,STATUS,PERMIT NO,CERT. NO,FILE DATE,SOURCE,PRIOR DATE,COUNTY,OWNER TYPE,WATERSHED,LEGAL,POU/POD,WATER USE,QUANTITY,WaDEUUID,FILENO,CADASTRAL,wade_lattitude,wade_longitude,POU_POD,Amount,Unit,CFS_Value,AF_Value
0,"ALLISON, PHYLLIS H",32,"BLUE HILLS RT DEWEY, AZ 86327",36-025474.0000,ACTIVE - ACTIVE,,,6/29/1979 12:00:00 AM,GROUNDWATER SUB FLOW,6/5/1977,YAVAPAI,PRIVATE,AGUA FRIA RIVER,SE NW 5 13N 1E,POD,DOMESTIC,,azSW0,36-025474.0000,A13001005BD0,34.53749,-112.28451,POD,0.0,,0.0,0.0


In [15]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "AZwr_M2" # for surface water

# Variable Info
df['in_VariableSpecificUUID'] = "AZwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "AZwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = "Surface Water"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "WaDE Blank"
df['in_County'] = dfin1['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin1['wade_lattitude']
df['in_Longitude'] = dfin1['wade_longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = dfin1['POU_POD']
df['in_SiteName'] = ""
df['in_SiteNativeID'] = dfin1['POU_POD'].str.strip() + dfin1['CADASTRAL'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = ""
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['CFS_Value'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['STATUS']
df['in_AllocationNativeID'] =  dfin1['REG. NO'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin1['NAME']
df['in_AllocationPriorityDate'] = dfin1['PRIOR DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = "Prior Appropriation"
df['in_AllocationVolume_AF'] = dfin1['AF_Value'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['WATER USE'].str.title()
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 0 # we want these sw records to be as normal
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = ""

dfswOut = df.copy()
dfswOut = dfswOut.drop_duplicates().reset_index(drop=True)
print(len(dfswOut))
dfswOut.head()

13930565


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azSW0,AZwr_M2,AZwr_V1,AZwr_O1,,,,,,Surface Water,,WaDE Blank,YAVAPAI,4326,,,,34.53749,-112.28451,,,POD,,PODA13001005BD0,,,AZ,,,,,,,,,,0.0,ACTIVE - ACTIVE,36-025474.0000,"ALLISON, PHYLLIS H",6/5/1977,,12/31,01/01,Prior Appropriation,0.0,Domestic,,,,,,0,,,,,,,,,,
1,azSW1,AZwr_M2,AZwr_V1,AZwr_O1,,,,,,Surface Water,,WaDE Blank,YAVAPAI,4326,,,,34.53749,-112.28451,,,POD,,PODA13001005BD0,,,AZ,,,,,,,,,,0.0,ACTIVE - ACTIVE,36-025474.0000,"ALLISON, PHYLLIS H",6/5/1977,,12/31,01/01,Prior Appropriation,0.0,Stock,,,,,,0,,,,,,,,,,
2,azSW2,AZwr_M2,AZwr_V1,AZwr_O1,,,,,,Surface Water,,WaDE Blank,YAVAPAI,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,0.0,ACTIVE - ACTIVE,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,Prior Appropriation,0.0,Annual Use,,,,,,0,,,,,,,,,,
3,azSW3,AZwr_M2,AZwr_V1,AZwr_O1,,,,,,Surface Water,,WaDE Blank,YAVAPAI,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,0.0,ACTIVE - ACTIVE,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,Prior Appropriation,0.0,Domestic,,,,,,0,,,,,,,,,,
4,azSW4,AZwr_M2,AZwr_V1,AZwr_O1,,,,,,Surface Water,,WaDE Blank,YAVAPAI,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,0.0,ACTIVE - ACTIVE,36-066830.0000,"ANDERWALD, GERALDINE W",11/7/1906,,12/31,01/01,Prior Appropriation,0.0,Irrigation,,,,,,0,,,,,,,,,,


## Groundwater Data (POD)

In [16]:
# create WaDE Registration Number	
dfin2['wade_RegistrationN'] = dfin2['PROGRAM'].astype(str) + "-" + dfin2['REGISTRY_I'].astype(str)

exList = dfin2['wade_RegistrationN'].unique().tolist()
exList.sort()
for x in exList:
    print(x)

55-060000
55-060001
55-082721
55-083226
55-083578
55-083716
55-084067
55-084442
55-084455
55-084460
55-084580
55-084598
55-084614
55-084626
55-084630
55-084633
55-084644
55-084658
55-084659
55-084660
55-084661
55-084662
55-084664
55-084665
55-084676
55-084693
55-084701
55-084717
55-084718
55-084723
55-084734
55-084735
55-084741
55-084742
55-084747
55-084772
55-084773
55-084784
55-084785
55-084786
55-084787
55-084788
55-084806
55-084808
55-084809
55-084810
55-084811
55-084812
55-084814
55-084815
55-084816
55-084817
55-084818
55-084819
55-084820
55-084821
55-084822
55-084823
55-084824
55-084825
55-084826
55-084827
55-084828
55-084862
55-084923
55-084926
55-084945
55-084946
55-084947
55-084948
55-084949
55-084950
55-084951
55-084952
55-084953
55-084954
55-084955
55-084956
55-084957
55-084958
55-084959
55-084961
55-084962
55-084963
55-084964
55-084965
55-084966
55-084967
55-084968
55-084969
55-084970
55-084971
55-084973
55-084974
55-084975
55-084976
55-084977
55-084978
55-084979
55-084981


In [17]:
# AZwr Groundwater PUMPRATE is in GPM, need to convert to CFS
# 448.8 CFS = 1 GPM

# Clean owner name up
def ConvertGPMToCFSFunc(Val):
    Val = Val / 448.8 
    return Val

dfin2['PUMPRATE'] = dfin2.apply(lambda row: ConvertGPMToCFSFunc(row['PUMPRATE']), axis=1)
dfin2['PUMPRATE'].unique()

array([0.        , 0.4456328 , 0.0557041 , ..., 9.53654189, 3.72994652,
       0.66622103])

In [18]:
# in_AllocationTypeCV
# Groundwater outside AMAs has no laws governing them. Inside AMAs is both reasonable use and safe yield for the aquifer/prior appropriation.
# - AZ groundwater rights outside of the AMA AllocationType = Reasonable Use
# - AZ groundwater  rights inside of the AMA AllocationType = Reasonable Use and Prior Appropriation
                                                       
def ConvertGPMToCFSFunc(val):
    val = str(val).strip()
    if val == "OUTSIDE OF AMA OR INA":
        outString = "Reasonable Use"
    else:
        outString = "Reasonable Use and Prior Appropriation"
    return outString

dfin2['in_AllocationTypeCV'] = dfin2.apply(lambda row: ConvertGPMToCFSFunc(row['AMA']), axis=1)
dfin2['in_AllocationTypeCV'].unique()

array(['Reasonable Use and Prior Appropriation', 'Reasonable Use'],
      dtype=object)

In [19]:
dfin2.head(1)

Unnamed: 0,OBJECTID,PROGRAM,REGISTRY_I,OWNER_NAME,RGR_PUMP_D,WELLTYPE,WELL_TYPE_,DLIC_NUM,APPROVED,INSTALLED,WELL_DEPTH,WATER_LEVE,CASING_DEP,CASING_DIA,CASING_TYP,PUMP_TYPE,PUMP_POWER,PUMPRATE,TESTEDRATE,DRAW_DOWN,COMPLETION,DRILL_LOG,WELL_CANCE,CADASTRAL,COUNTY,WATERSHED,BASIN_NAME,SUBBASIN_N,AMA,QUAD_CODE,WHOLE_TOWN,HALF_TOWNS,NORTHSOUTH,WHOLE_RANG,HALF_RANGE,EASTWEST,SECTION,QUARTER_16,QACRE160DI,QUARTER_40,QACRE40DIR,QUARTER_10,QACRE10DIR,UTM_X_METE,UTM_Y_METE,APPLICATIO,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,ZIP4,WATER_USE,SITE_USE,geometry,wade_lattitude,wade_longitude,WaDEUUID,wade_RegistrationN,in_AllocationTypeCV
0,1,55,60000,ARIZONA SONORAN COPPER COMPANY (USA) INC,NO,NON-EXEMPT - WITHDRAWAL PERMIT,NON-EXEMPT,0,,,1790,257,60,20,OPEN HOLE IN AQUIFER,NO PUMP CODE LISTED,NO POWER CODE LISTED,0.0,0,0,,,N,D05005035ABA,PINAL,SANTA CRUZ RIVER,PINAL AMA,ELOY,PINAL,D,5,0,S,5,0,E,35,A,NE,B,NW,A,NE,423912.1,3646244.0,1900-01-01,ATTN: TRAVIS SNYDER,850 W. ELLIOT RD. STE 106,TEMPE,AZ,85284,,DEWATERING,WATER PRODUCTION,POINT (-111.81407 32.95186),32.95186,-111.81407,azGW0,55-060000,Reasonable Use and Prior Appropriation


In [20]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "AZwr_M1" # for goundwater

# Variable Info
df['in_VariableSpecificUUID'] =  "AZwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "AZwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = "WaDE Blank"
df['in_County'] = dfin2['COUNTY']
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfin2['wade_lattitude']
df['in_Longitude'] = dfin2['wade_longitude']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = ""
df['in_SiteNativeID'] = "POD" + dfin2['CADASTRAL'].replace("", 0).fillna(0).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Well" # these should all be well records
df['in_StateCV'] = "AZ"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin2['PUMPRATE'].astype(float)
df['in_AllocationLegalStatusCV'] = ""
df['in_AllocationNativeID'] =  dfin2['wade_RegistrationN'] # see above for creation
df['in_AllocationOwner'] = dfin2['OWNER_NAME']
df['in_AllocationPriorityDate'] = ""
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = "12/31"
df['in_AllocationTimeframeStart'] = "01/01"
df['in_AllocationTypeCV'] = dfin2['in_AllocationTypeCV']
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfin2['WATER_USE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = 1 # all these gw records should be considered exempt for us.
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://app.azwater.gov/WellRegistry/Detail.aspx?RegID=" + dfin2['REGISTRY_I'].replace("", 0).fillna(0).astype(int).astype(str)


dfgwOut = df.copy()
dfgwOut = dfgwOut.drop_duplicates().reset_index(drop=True)
print(len(dfgwOut))
dfgwOut.head()

232746


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azGW0,AZwr_M1,AZwr_V1,AZwr_O1,,,,,,Groundwater,,WaDE Blank,PINAL,4326,,,,32.95186,-111.81407,,,POD,,PODD05005035ABA,,Well,AZ,,,,,,,,,,0.0,,55-060000,ARIZONA SONORAN COPPER COMPANY (USA) INC,,,12/31,01/01,Reasonable Use and Prior Appropriation,,DEWATERING,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
1,azGW1,AZwr_M1,AZwr_V1,AZwr_O1,,,,,,Groundwater,,WaDE Blank,MARICOPA,4326,,,,33.78498,-112.49814,,,POD,,PODB05002008CCB,,Well,AZ,,,,,,,,,,0.44563,,55-060001,ROUTE 14 INVESTMENT PARTNERS LLC,,,12/31,01/01,Reasonable Use and Prior Appropriation,,INDUSTRIAL,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
2,azGW2,AZwr_M1,AZwr_V1,AZwr_O1,,,,,,Groundwater,,WaDE Blank,MARICOPA,4326,,,,33.5009,-113.17061,,,POD,,PODB02009022000,,Well,AZ,,,,,,,,,,0.0,,55-082721,"KEYES,G",,,12/31,01/01,Reasonable Use and Prior Appropriation,,DOMESTIC,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
3,azGW3,AZwr_M1,AZwr_V1,AZwr_O1,,,,,,Groundwater,,WaDE Blank,COCHISE,4326,,,,32.29513,-109.80552,,,POD,,PODD13025021BB0,,Well,AZ,,,,,,,,,,0.0557,,55-083226,TOLBERT COLEMAN,,,12/31,01/01,Reasonable Use,,DOMESTIC,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
4,azGW4,AZwr_M1,AZwr_V1,AZwr_O1,,,,,,Groundwater,,WaDE Blank,SANTA CRUZ,4326,,,,31.59564,-111.03662,,,POD,,PODD21013017CDD,,Well,AZ,,,,,,,,,,0.03342,,55-083578,MYRON D CLARK,,,12/31,01/01,Reasonable Use and Prior Appropriation,,DOMESTIC,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...


## Concatenate POD and POU Data.  Make needed changes

In [21]:
# Concatenate dataframes
frames = [dfswOut, dfgwOut]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

14163311


## Clean Data / data types

In [22]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip().rstrip(',')
    return Val

In [23]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [24]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Yavapai', 'Navajo', 'Maricopa', '', 'Greenlee', 'Coconino',
       'Gila', 'Pinal', 'Santa Cruz', 'Cochise', 'Graham', 'Mohave',
       'Apache', 'Pima', 'La Paz', 'Yuma'], dtype=object)

In [25]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [26]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Allison, Phyllis H', 'Anderwald, Geraldine W', 'Andrews, John H',
       ..., "John And Ginger O'Brien",
       'Paradise Valley Apostolic Church Assoc', 'Ramiro Mikala Ruiz'],
      dtype=object)

In [27]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [28]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [29]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Groundwater'], dtype=object)

In [30]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['', 'Well'], dtype=object)

In [31]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array([''], dtype=object)

In [32]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['Allison, Phyllis H', 'Anderwald, Geraldine W', 'Andrews, John H',
       ..., "John And Ginger O'Brien",
       'Paradise Valley Apostolic Church Assoc', 'Ramiro Mikala Ruiz'],
      dtype=object)

In [33]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['',
 'All',
 'Annual Use',
 'COMMERCIAL',
 'Commercial',
 'Cultivation Of Fish',
 'DEWATERING',
 'DOMESTIC',
 'DRAINAGE',
 'Domestic',
 'Fish Farming',
 'INDUSTRIAL',
 'IRRIGATION',
 'Industrial',
 'Irrigation',
 'MINING',
 'MONITORING',
 'MUNICIPAL USES',
 'Mining',
 'Municipal',
 'NO USE CODE ON NOI',
 'NO WATER USE',
 'OTHER - MINERAL EXPLORE',
 'OTHER - PRODUCTION',
 'Other',
 'Power',
 'RECOVERY',
 'RECREATION',
 'REMEDIATION',
 'RESERVED',
 'Recreation',
 'STOCK',
 'SUBDIVISION',
 'Stock',
 'TEST',
 'UNKNOWN',
 'UTILITY (WATER CO)',
 'Wildlife']

In [34]:
# Ensure Latitude entry is either numireic or a 0
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([34.53749, 34.23953, 34.51586, ..., 32.88997, 32.88999, 32.89002],
      dtype=object)

In [35]:
# Ensure Longitude entry is either numireic or a 0
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-112.28451, -112.20736, -112.29361, ..., -112.72966, -112.72322,
       -112.76655], dtype=object)

In [36]:
# Changing datatype of Priority Date to date fields entry
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
['1977-06-05 00:00:00', '1906-11-07 00:00:00', '1906-06-04 00:00:00',
                 'NaT', '2004-07-19 00:00:00', '1883-12-31 00:00:00',
 '1886-12-31 00:00:00', '1986-03-11 00:00:00', '1919-01-01 00:00:00',
 '1985-07-26 00:00:00',
 ...
 '1955-09-28 00:00:00', '1956-06-21 00:00:00', '1946-09-16 00:00:00',
 '2007-04-23 00:00:00', '1939-07-15 00:00:00', '1966-07-15 00:00:00',
 '1977-03-23 00:00:00', '1978-11-01 00:00:00', '1915-03-24 00:00:00',
 '1944-05-15 00:00:00']
Length: 8331, dtype: datetime64[ns]

In [37]:
# Ensure Flow entry is either numireic or a 0
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array(['', 0.25, 0.05, 0.02, 0.01, 0.21, 0.31, 0.15, 0.3, 0.1, 0.85, 0.08,
       0.4, 0.07, 0.22, 0.03, 0.04, 0.11, 2.54, 0.53, 0.58, 0.09, 0.33,
       0.5, 0.06, 0.18, 0.29, 0.17, 0.2, 1.0, 0.24, 2.5, 0.12, 0.13, 0.34,
       0.28, 0.75, 0.93, 3.0, 0.55, 0.41, 0.38, 1.55, 4.24, 2.67, 2.23,
       8.91, 2.45, 1.86, 0.44, 1.27, 0.19, 1.11, 1.56, 0.51, 0.26, 0.45,
       4.46, 44.56, 0.77, 2.0, 0.87, 1.25, 1.67, 2.01, 0.67, 2.51, 11.14,
       0.96, 0.36, 5.35, 3.34, 6.69, 1.45, 1.34, 3.56, 0.37, 1.78, 0.74,
       5.01, 0.62, 3.82, 0.16, 0.39, 0.46, 3.57, 0.23, 1.03, 0.89, 3.7,
       0.56, 4.01, 2.9, 1.43, 2.04, 5.09, 5.79, 4.67, 3.12, 6.68, 7.13,
       4.9, 7.43, 8.6, 1.19, 0.14, 0.65, 2.25, 1.38, 2.06, 0.52, 0.8,
       1.17, 0.47, 2.79, 8.47, 2.7, 1.93, 0.94, 0.78, 3.74, 2.05, 1.98,
       0.69, 9.36, 0.57, 0.72, 2.12, 4.68, 4.06, 0.61, 1.89, 10.03, 1.1,
       1.06, 1.2, 0.49, 5.4, 1.09, 3.18, 1.57, 1.02, 3.9, 0.9, 0.42, 4.95,
       11.59, 0.79, 1.23, 5.57, 13.59, 0.7, 4.07, 2.

In [38]:
# Ensure Volume entry is either numireic or a 0
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array(['', 20.0, 0.11, 40.33, 0.54, 1.38, 0.09, 0.56, 1.34, 0.03, 0.02,
       0.04, 0.18, 0.06, 0.08, 0.92, 2.26, 0.44, 0.05, 4.91, 3.92, 5.04,
       2.69, 0.27, 0.12, 1.5, 199.48, 30.69, 0.3, 0.68, 0.34, 0.61, 0.37,
       0.31, 0.01, 0.36, 1.33, 1.0, 9.68, 0.25, 0.46, 15.34, 0.4, 0.17,
       0.28, 13.81, 4.42, 16.88, 82.86, 997.39, 1150.83, 5.97, 11.2, 1.53,
       5.52, 14.42, 22.49, 48.1, 0.84, 1.1, 7.26, 8.0, 4.0, 0.23, 37.1,
       35.63, 0.77, 0.15, 3.61, 690.5, 5.89, 79.79, 2.76, 9.21, 2.0, 0.55],
      dtype=object)

In [39]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wId1', 'wId2'], dtype=object)

In [40]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['PODA13001005BD0', 'PODA10001001AC0', 'POUA13001007DD0', ...,
       'PODC06005024DBB', 'PODC06005024CCD', 'PODB08005027CDB'],
      dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this ADWR, we don't want water rights that are considered: "INACTIVE - WITHDRAWN",
                       "INACTIVE - CONSOLIDATED",
                       "INACTIVE - AMENDED",
                       "INACTIVE - CANCELLED",
                       "INACTIVE - REJECTED",
                       "INACTIVE - PARTIAL T&S",
                       "INACTIVE - RELINQUISHED",
                       "INACTIVE - FULL T&S",
                       "INACTIVE - INACTIVE",
                       "INACTIVE - FULL ASSIGNMENT",
                       "INACTIVE - PARTIAL ASSIGNMENT"

In [41]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["INACTIVE - WITHDRAWN",
                       "INACTIVE - CONSOLIDATED",
                       "INACTIVE - AMENDED",
                       "INACTIVE - CANCELLED",
                       "INACTIVE - REJECTED",
                       "INACTIVE - PARTIAL T&S",
                       "INACTIVE - RELINQUISHED",
                       "INACTIVE - FULL T&S",
                       "INACTIVE - INACTIVE",
                       "INACTIVE - FULL ASSIGNMENT",
                       "INACTIVE - PARTIAL ASSIGNMENT"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

14163311


array(['ACTIVE - ACTIVE', 'ACTIVE - FULL ASSIGNMENT',
       'ACTIVE - PARTIAL ASSIGNMENT', 'ACTIVE - AMENDED',
       'ACTIVE - INSTREAM FLOW', 'ACTIVE - CONDITIONAL FULL T&S',
       'ACTIVE - FULL T&S', 'ACTIVE - MODIFIED', 'ACTIVE - PARTIAL T&S',
       'ACTIVE - CONDITIONAL PARTIAL T&S', ''], dtype=object)

## Shapefile Data
- For attaching geometry to POU csv inputs.

In [42]:
# N/A, all data in POU are considered points for AZwr

## Export Data

In [43]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14163311 entries, 0 to 14163310
Data columns (total 63 columns):
 #   Column                                        Dtype         
---  ------                                        -----         
 0   WaDEUUID                                      object        
 1   in_MethodUUID                                 object        
 2   in_VariableSpecificUUID                       object        
 3   in_OrganizationUUID                           object        
 4   in_Geometry                                   object        
 5   in_GNISFeatureNameCV                          object        
 6   in_WaterQualityIndicatorCV                    object        
 7   in_WaterSourceName                            object        
 8   in_WaterSourceNativeID                        object        
 9   in_WaterSourceTypeCV                          object        
 10  in_CoordinateAccuracy                         object        
 11  in_CoordinateMethodCV 

In [44]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,azSW0,AZwr_M2,AZwr_V1,AZwr_O1,,,,,wId1,Surface Water,,WaDE Blank,Yavapai,4326,,,,34.53749,-112.28451,,,POD,,PODA13001005BD0,,,AZ,,,,,,,,,,,ACTIVE - ACTIVE,36-025474.0000,"Allison, Phyllis H",1977-06-05,,12/31,01/01,Prior Appropriation,,Domestic,,,,,,0,,,,,,,,,,
1,azSW1,AZwr_M2,AZwr_V1,AZwr_O1,,,,,wId1,Surface Water,,WaDE Blank,Yavapai,4326,,,,34.53749,-112.28451,,,POD,,PODA13001005BD0,,,AZ,,,,,,,,,,,ACTIVE - ACTIVE,36-025474.0000,"Allison, Phyllis H",1977-06-05,,12/31,01/01,Prior Appropriation,,Stock,,,,,,0,,,,,,,,,,
2,azSW2,AZwr_M2,AZwr_V1,AZwr_O1,,,,,wId1,Surface Water,,WaDE Blank,Yavapai,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,,ACTIVE - ACTIVE,36-066830.0000,"Anderwald, Geraldine W",1906-11-07,,12/31,01/01,Prior Appropriation,,Annual Use,,,,,,0,,,,,,,,,,
3,azSW3,AZwr_M2,AZwr_V1,AZwr_O1,,,,,wId1,Surface Water,,WaDE Blank,Yavapai,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,,ACTIVE - ACTIVE,36-066830.0000,"Anderwald, Geraldine W",1906-11-07,,12/31,01/01,Prior Appropriation,,Domestic,,,,,,0,,,,,,,,,,
4,azSW4,AZwr_M2,AZwr_V1,AZwr_O1,,,,,wId1,Surface Water,,WaDE Blank,Yavapai,4326,,,,34.23953,-112.20736,,,POD,,PODA10001001AC0,,,AZ,,,,,,,,,,,ACTIVE - ACTIVE,36-066830.0000,"Anderwald, Geraldine W",1906-11-07,,12/31,01/01,Prior Appropriation,,Irrigation,,,,,,0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14163306,azGW232741,AZwr_M1,AZwr_V1,AZwr_O1,,,,,wId2,Groundwater,,WaDE Blank,Pinal,4326,,,,32.90118,-111.88631,,,POD,,PODD06004013000,,Well,AZ,,,,,,,,,,,,55-930437,Mesa Cobre Holdings,NaT,,12/31,01/01,Reasonable Use and Prior Appropriation,,NO WATER USE,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
14163307,azGW232742,AZwr_M1,AZwr_V1,AZwr_O1,,,,,wId2,Groundwater,,WaDE Blank,Maricopa,4326,,,,32.88451,-112.72967,,,POD,,PODC06005024CCD,,Well,AZ,,,,,,,,,,,,55-930438,United States Air Force,NaT,,12/31,01/01,Reasonable Use,,MONITORING,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
14163308,azGW232743,AZwr_M1,AZwr_V1,AZwr_O1,,,,,wId2,Groundwater,,WaDE Blank,Maricopa,4326,,,,33.71541,-112.08843,,,POD,,PODA04003006DCA,,Well,AZ,,,,,,,,,,,,55-930439,Paradise Valley Apostolic Church Assoc,NaT,,12/31,01/01,Reasonable Use and Prior Appropriation,,DOMESTIC,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...
14163309,azGW232744,AZwr_M1,AZwr_V1,AZwr_O1,,,,,wId2,Groundwater,,WaDE Blank,Maricopa,4326,,,,33.42722,-112.36557,,,POD,,PODB01001016DBB,,Well,AZ,,,,,,,,,,,,55-930440,The Goodyear Tire And Rubber Company,NaT,,12/31,01/01,Reasonable Use and Prior Appropriation,,MONITORING,,,,,,1,,,,,,,,,,https://app.azwater.gov/WellRegistry/Detail.as...


In [45]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_azMain.zip', compression=dict(method='zip', archive_name='Pwr_azMain.csv'), index=False)  # The output, save as a zip
#dfPoUshape.to_csv('RawInputData/P_Geometry.zip', compression=dict(method='zip', archive_name='P_Geometry.csv'), index=False)  # The output geometry.

In [46]:
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_County'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['',
 'Apache',
 'Cochise',
 'Coconino',
 'Gila',
 'Graham',
 'Greenlee',
 'La Paz',
 'Maricopa',
 'Mohave',
 'Navajo',
 'Pima',
 'Pinal',
 'Santa Cruz',
 'Yavapai',
 'Yuma']