# Pre-processing California Department of Water Resources Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory
print(os.getcwd()) # see the current working directory

# set working directory, if need be
workingDir = "G:/Shared drives/WaDE Data/California/WaterAllocation_CADWR" # file location
os.chdir(workingDir)
print(os.getcwd())

C:\Users\rjame\Documents\WSWC Documents\MappingStatesDataToWaDE2.0\California\WaterAllocation_CADWR
G:\Shared drives\WaDE Data\California\WaterAllocation_CADWR


## Point of Diversion Data

In [3]:
# Input File - wellcompletionreports
FI_PoD = "RawInputData/WellCompletionReports.zip"
# dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")
dfinPOD = pd.read_csv(FI_PoD).replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "cnra" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('RawInputData/WellCompletionReports.zip', compression=dict(method='zip', archive_name='WellCompletionReports.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head()

  dfinPOD = pd.read_csv(FI_PoD).replace(np.nan, "")


1074973


Unnamed: 0,_id,WCRNUMBER,LEGACYLOGNUMBER,REGIONOFFICE,COUNTYNAME,LOCALPERMITAGENCY,PERMITDATE,PERMITNUMBER,OWNERASSIGNEDWELLNUMBER,WELLLOCATION,CITY,PLANNEDUSEFORMERUSE,DRILLERNAME,DRILLERLICENSENUMBER,RECORDTYPE,DECIMALLATITUDE,DECIMALLONGITUDE,METHODOFDETERMINATIONLL,LLACCURACY,HORIZONTALDATUM,GROUNDSURFACEELEVATION,ELEVATIONACCURACY,ELEVATIONDETERMINATIONMETHOD,VERTICALDATUM,TOWNSHIP,RANGE,SECTION,BASELINEMERIDIAN,APN,DATEWORKENDED,WORKFLOWSTATUS,RECEIVEDDATE,TOTALDRILLDEPTH,TOTALCOMPLETEDDEPTH,TOPOFPERFORATEDINTERVAL,BOTTOMOFPERFORATEDINTERVAL,CASINGDIAMETER,DRILLINGMETHOD,FLUID,STATICWATERLEVEL,TOTALDRAWDOWN,TESTTYPE,PUMPTESTLENGTH,WELLYIELD,WELLYIELDUNITOFMEASURE,OTHEROBSERVATIONS,WaDEUUID
0,1,WCR1997-004295,525475,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,NEWARK,Monitoring,"GREGG DRILLING & TESTING, GREGG DRILLING & TES...",485165,WellCompletion/New/Production or Monitoring/NA,37.5279,-122.015,Derived from TRS,Centroid of Section,,,,,,05S,01W,6.0,Mount Diablo,,1997-04-17T00:00:00,,,,15.0,,,,,,,,,,,,,cnra0
1,2,WCR1997-004359,525154,DWR North Central Region Office,Alameda,"Alameda County Public Works Agency, Water Reso...",,,,,OAKLAND,Unknown,MITCHELL DRILLING MITCHELL DRILLING,672617,WellCompletion/Destruction/NA/NA,,,,,,,,,,,,,Mount Diablo,,1997-10-15T00:00:00,,,,,,,,,,,,,,,,,cnra1
2,3,WCR1996-001972,449388,DWR North Central Region Office,Alameda,Alameda County Water District,,,,35735 CEDAR BLVD,NEWARK,Unknown,MARTELL WATER SYSTEMS INC MARTELL WATER SYSTEM...,510952,WellCompletion/Destruction/NA/NA,37.5567,-122.033,Derived from TRS,Centroid of Section,,,,,,04S,02W,25.0,Mount Diablo,,1996-08-24T00:00:00,,,,,,,,,,,,,,,,,cnra2
3,4,WCR1996-001974,449390,DWR North Central Region Office,Alameda,"Alameda County Public Works Agency, Water Reso...",,,,,OAKLAND,Unknown,MARTELL WATER SYSTEMS INC MARTELL WATER SYSTEM...,510952,WellCompletion/Destruction/NA/NA,,,,,,,,,,,,,Mount Diablo,,1996-09-06T00:00:00,,,,,,,8.0,,,,,,,,,,cnra3
4,5,WCR1996-000021,01-567E,DWR North Central Region Office,Alameda,Zone 7 Water Agency - Alameda County Flood Con...,,,,,LIVERMORE,Unknown,WOODWARD DRILLING COMPANY WOODWARD DRILLING CO...,710079,WellCompletion/Destruction/NA/NA,37.6582,-121.779,Derived from TRS,Centroid of Section,,,,,,03S,02E,20.0,Mount Diablo,,1996-04-03T00:00:00,,,,,,,,,,,,,,,,,cnra4


In [4]:
# Input File - wellreportpdflinks

lCsv = "RawInputData/WellCompletionReportPDFLinks.zip"# to match wrcn to state well number
linkCsv = pd.read_csv(lCsv, encoding = "ISO-8859-1").replace(np.nan, "")

# merge with wellcompletionreports
dfinPOD2 = pd.merge(dfinPOD, linkCsv, left_on='WCRNUMBER', right_on='WCRNumber',how='left')
dfinPOD2 = dfinPOD2.drop_duplicates().replace(np.nan, "").replace("nan,nan", "").reset_index(drop=True)
print(len(dfinPOD2))
dfinPOD2.head()

1142777


Unnamed: 0,_id,WCRNUMBER,LEGACYLOGNUMBER,REGIONOFFICE,COUNTYNAME,LOCALPERMITAGENCY,PERMITDATE,PERMITNUMBER,OWNERASSIGNEDWELLNUMBER,WELLLOCATION,CITY,PLANNEDUSEFORMERUSE,DRILLERNAME,DRILLERLICENSENUMBER,RECORDTYPE,DECIMALLATITUDE,DECIMALLONGITUDE,METHODOFDETERMINATIONLL,LLACCURACY,HORIZONTALDATUM,GROUNDSURFACEELEVATION,ELEVATIONACCURACY,ELEVATIONDETERMINATIONMETHOD,VERTICALDATUM,TOWNSHIP,RANGE,SECTION,BASELINEMERIDIAN,APN,DATEWORKENDED,WORKFLOWSTATUS,RECEIVEDDATE,TOTALDRILLDEPTH,TOTALCOMPLETEDDEPTH,TOPOFPERFORATEDINTERVAL,BOTTOMOFPERFORATEDINTERVAL,CASINGDIAMETER,DRILLINGMETHOD,FLUID,STATICWATERLEVEL,TOTALDRAWDOWN,TESTTYPE,PUMPTESTLENGTH,WELLYIELD,WELLYIELDUNITOFMEASURE,OTHEROBSERVATIONS,WaDEUUID,ï»¿_id,WCRNumber,WCRLink
0,1,WCR1997-004295,525475,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,NEWARK,Monitoring,"GREGG DRILLING & TESTING, GREGG DRILLING & TES...",485165,WellCompletion/New/Production or Monitoring/NA,37.5279,-122.015,Derived from TRS,Centroid of Section,,,,,,05S,01W,6.0,Mount Diablo,,1997-04-17T00:00:00,,,,15.0,,,,,,,,,,,,,cnra0,642370.0,WCR1997-004295,https://cadwr.app.box.com/v/WellCompletionRepo...
1,2,WCR1997-004359,525154,DWR North Central Region Office,Alameda,"Alameda County Public Works Agency, Water Reso...",,,,,OAKLAND,Unknown,MITCHELL DRILLING MITCHELL DRILLING,672617,WellCompletion/Destruction/NA/NA,,,,,,,,,,,,,Mount Diablo,,1997-10-15T00:00:00,,,,,,,,,,,,,,,,,cnra1,,,
2,3,WCR1996-001972,449388,DWR North Central Region Office,Alameda,Alameda County Water District,,,,35735 CEDAR BLVD,NEWARK,Unknown,MARTELL WATER SYSTEMS INC MARTELL WATER SYSTEM...,510952,WellCompletion/Destruction/NA/NA,37.5567,-122.033,Derived from TRS,Centroid of Section,,,,,,04S,02W,25.0,Mount Diablo,,1996-08-24T00:00:00,,,,,,,,,,,,,,,,,cnra2,630570.0,WCR1996-001972,https://cadwr.app.box.com/v/WellCompletionRepo...
3,3,WCR1996-001972,449388,DWR North Central Region Office,Alameda,Alameda County Water District,,,,35735 CEDAR BLVD,NEWARK,Unknown,MARTELL WATER SYSTEMS INC MARTELL WATER SYSTEM...,510952,WellCompletion/Destruction/NA/NA,37.5567,-122.033,Derived from TRS,Centroid of Section,,,,,,04S,02W,25.0,Mount Diablo,,1996-08-24T00:00:00,,,,,,,,,,,,,,,,,cnra2,630571.0,WCR1996-001972,https://cadwr.app.box.com/v/WellCompletionRepo...
4,4,WCR1996-001974,449390,DWR North Central Region Office,Alameda,"Alameda County Public Works Agency, Water Reso...",,,,,OAKLAND,Unknown,MARTELL WATER SYSTEMS INC MARTELL WATER SYSTEM...,510952,WellCompletion/Destruction/NA/NA,,,,,,,,,,,,,Mount Diablo,,1996-09-06T00:00:00,,,,,,,8.0,,,,,,,,,,cnra3,,,


In [5]:
#removing these records
dfinPOD2 = dfinPOD2[dfinPOD2['RECORDTYPE'] != 'WellCompletion/Destruction/NA/NA']
dfinPOD2 = dfinPOD2[dfinPOD2['RECORDTYPE'] != 'WellCompletion/Drill and Destroy/NA/NA']
print(len(dfinPOD2))

953914


In [6]:
# create output POD dataframe
df = pd.DataFrame()


# Data Assessment UUID
df['WaDEUUID'] = dfinPOD2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CADWRwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "CADWRwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "CADWRwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provdied
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = dfinPOD2['LLACCURACY']
df['in_CoordinateMethodCV'] = "Digitized"
df['in_County'] = dfinPOD2['COUNTYNAME']
df['in_EPSGCodeCV'] = "4326"
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD2['DECIMALLATITUDE']
df['in_Longitude'] = dfinPOD2['DECIMALLONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = dfinPOD2['OWNERASSIGNEDWELLNUMBER']
df['in_SiteNativeID'] = ""
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Well"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = ""
df['in_AllocationLegalStatusCV'] = ""
df['in_AllocationNativeID'] =  dfinPOD2['WCRNUMBER']
df['in_AllocationOwner'] = "" 
df['in_AllocationPriorityDate'] = ""
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = "Correlative Rights"
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOD2['PLANNEDUSEFORMERUSE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = dfinPOD2['LEGACYLOGNUMBER']
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOD2['WCRLink']

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.tail()

953519


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
953514,cnra1074968,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,,Groundwater,>50 FT,Digitized,,4326,,,,36.38122,-119.29016,,,POD,,,,Well,CA,,,,,,,,,,,,WCR2017-009452,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,E0371068,,,,,,
953515,cnra1074969,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,2275,,,Well,CA,,,,,,,,,,,,WCR2011-005405,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,e0146791,,,,,,
953516,cnra1074970,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,,Groundwater,Unknown,Digitized,,4326,,,,38.31187,-121.82957,,,POD,PZ-8,,,Well,CA,,,,,,,,,,,,WCR2017-011186,,,,,,Correlative Rights,,Monitoring,,,,,,1,,,,E0353767,,,,,,
953517,cnra1074971,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,1,,,Well,CA,,,,,,,,,,,,WCR2018-012226,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,E0369132,,,,,,
953518,cnra1074972,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,,Groundwater,Unknown,Digitized,,4326,,,,37.53622,-121.93128,,,POD,W035,,,Well,CA,,,,,,,,,,,,WCR2016-017626,,,,,,Correlative Rights,,Unknown,,,,,,1,,,,E0314231,,,,,,


In [7]:
# Concatenate dataframes
frames = [outPOD]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

953519


## Clean Data / data types

In [8]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).replace("  ", " ").strip().rstrip(',')
    return Val

In [9]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [10]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['Alameda', 'Alpine', 'Amador', 'Butte', 'Calaveras', 'Colusa',
       'Contra Costa', 'Del Norte', 'El Dorado', 'Fresno', 'Glenn',
       'Humboldt', 'Imperial', 'Inyo', 'Kern', 'Kings', 'Lake', 'Lassen',
       'Los Angeles', 'Madera', 'Marin', 'Mariposa', 'Mendocino',
       'Merced', 'Modoc', 'Mono', 'Monterey', 'Napa', 'Nevada', 'Orange',
       'Placer', 'Plumas', 'Riverside', 'Sacramento', 'San Benito',
       'San Bernardino', 'San Diego', 'San Francisco', 'San Joaquin',
       'San Luis Obispo', 'San Mateo', 'Santa Barbara', 'Santa Clara',
       'Santa Cruz', 'Shasta', 'Sierra', 'Siskiyou', 'Solano', 'Sonoma',
       'Stanislaus', 'Sutter', 'Tehama', 'Trinity', 'Tulare', 'Tuolumne',
       'Ventura', 'Yolo', 'Yuba', 'Unknown', ''], dtype=object)

In [11]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'CPT04', 'MW3D', ..., 'JOHN TOSTE JR', '2275', 'W035'],
      dtype=object)

In [12]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [13]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [14]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [15]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater'], dtype=object)

In [16]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Well'], dtype=object)

In [17]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'CPT04', 'MW3D', ..., 'JOHN TOSTE JR', '2275', 'W035'],
      dtype=object)

In [18]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [19]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['',
 'Cathodic Protection',
 'Dewatering',
 'Injection',
 'Monitoring',
 'Other',
 'Remediation',
 'Sparging',
 'Test Well',
 'Unknown',
 'Vapor Extraction',
 'Water Supply Domestic',
 'Water Supply Industrial',
 'Water Supply Irrigation - Agriculture',
 'Water Supply Irrigation - Landscape',
 'Water Supply Public',
 'Water Supply Stock or Animal Watering']

In [20]:
# Ensure Latitude entry is either numireic or a 0
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([37.5279, 37.5421, '', ..., 36.381223, 38.311869, 37.536222],
      dtype=object)

In [21]:
# Ensure Longitude entry is either numireic or a 0
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-122.015, -121.997, '', ..., -119.290159, -121.82957, -121.93128],
      dtype=object)

In [22]:
# # Changing datatype of Priority Date to date fields entry
# outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
# outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
# outdf['in_AllocationPriorityDate'].unique()

In [23]:
# Ensure Flow entry is either numireic or a 0
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([''], dtype=object)

In [24]:
# Ensure Volume entry is either numireic or a 0
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array([''], dtype=object)

In [25]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1'], dtype=object)

In [26]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId298978',
       'wadeId298979', 'wadeId298980'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: "Inactive", "Revoked", "Pending", "Cancelled", "Rejected", "Closed"

In [27]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Inactive", "Revoked", "Pending", "Cancelled", "Rejected", "Closed"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

953519


array([''], dtype=object)

## Export Data

In [28]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953519 entries, 0 to 953518
Data columns (total 63 columns):
 #   Column                                        Non-Null Count   Dtype 
---  ------                                        --------------   ----- 
 0   WaDEUUID                                      953519 non-null  object
 1   in_MethodUUID                                 953519 non-null  object
 2   in_VariableSpecificUUID                       953519 non-null  object
 3   in_OrganizationUUID                           953519 non-null  object
 4   in_Geometry                                   953519 non-null  object
 5   in_GNISFeatureNameCV                          953519 non-null  object
 6   in_WaterQualityIndicatorCV                    953519 non-null  object
 7   in_WaterSourceName                            953519 non-null  object
 8   in_WaterSourceNativeID                        953519 non-null  object
 9   in_WaterSourceTypeCV                          953519 non-nu

In [29]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,cnra0,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.52790,-122.01500,,,POD,,wadeId1,,Well,CA,,,,,,,,,,,,WCR1997-004295,,,,,,Correlative Rights,,Monitoring,,,,,,1,,,,525475,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
1,cnra5,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.54210,-121.99700,,,POD,,wadeId2,,Well,CA,,,,,,,,,,,,WCR1996-004199,,,,,,Correlative Rights,,Monitoring,,,,,,1,,,,525776,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
2,cnra6,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,,Digitized,Alameda,4326,,,,,,,,POD,,wadeId3,,Well,CA,,,,,,,,,,,,WCR1996-004573,,,,,,Correlative Rights,,Vapor Extraction,,,,,,1,,,,525127,,,,,,
3,cnra9,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,,Digitized,Alameda,4326,,,,,,,,POD,,wadeId3,,Well,CA,,,,,,,,,,,,WCR1996-003330,,,,,,Correlative Rights,,Vapor Extraction,,,,,,1,,,,477187,,,,,,
4,cnra10,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.74510,-122.19900,,,POD,,wadeId4,,Well,CA,,,,,,,,,,,,WCR1992-006082,,,,,,Correlative Rights,,Monitoring,,,,,,1,,,,413618A,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
953514,cnra1074968,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,>50 FT,Digitized,,4326,,,,36.38122,-119.29016,,,POD,,wadeId298977,,Well,CA,,,,,,,,,,,,WCR2017-009452,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,E0371068,,,,,,
953515,cnra1074969,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,,Digitized,,4326,,,,,,,,POD,2275,wadeId298978,,Well,CA,,,,,,,,,,,,WCR2011-005405,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,e0146791,,,,,,
953516,cnra1074970,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,Unknown,Digitized,,4326,,,,38.31187,-121.82957,,,POD,PZ8,wadeId298979,,Well,CA,,,,,,,,,,,,WCR2017-011186,,,,,,Correlative Rights,,Monitoring,,,,,,1,,,,E0353767,,,,,,
953517,cnra1074971,CADWRwr_M1,CADWRwr_V1,CADWRwr_O1,,,Fresh,,wadeId1,Groundwater,,Digitized,,4326,,,,,,,,POD,1,wadeId1169,,Well,CA,,,,,,,,,,,,WCR2018-012226,,,,,,Correlative Rights,,Water Supply Domestic,,,,,,1,,,,E0369132,,,,,,


In [30]:
# Export the output dataframe
# change output name / abbreviation to match native state provdier and wade data type 
outdf.to_csv('RawInputData/Pwr_caMain.zip', compression=dict(method='zip', archive_name='Pwr_caMain.csv'), index=False)  # The output, save as a zip