# Pre-processing California Natural Resources Agency Allocation data for WaDE upload.
- Purpose:  To pre-process the data into one master file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# Working Directory
print(os.getcwd()) # see the current working directory

# set working directory, if need be
workingDir = "G:\Shared drives\WaDE Data\California\WaterAllocation_CNRA" # file location
os.chdir(workingDir)
print(os.getcwd())

C:\Users\rjame\Documents\WSWC Documents\MappingStatesDataToWaDE2.0\California\WaterAllocation_CNRA
G:\Shared drives\WaDE Data\California\WaterAllocation_CNRA


## Point of Diversion Data

In [3]:
# Input File - wellcompletionreports
FI_PoD = "RawInputData/wellcompletionreports.zip"
dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfinPOD:
    dfinPOD['WaDEUUID'] = "cnra" + dfinPOD.index.astype(str)
    dfinPOD.to_csv('RawInputData/wellcompletionreports.zip', compression=dict(method='zip', archive_name='wellcompletionreports.csv'), index=False)

print(len(dfinPOD))
dfinPOD.head()

  dfinPOD = pd.read_csv(FI_PoD, encoding = "ISO-8859-1").replace(np.nan, "")


1067251


Unnamed: 0,WCRNUMBER,LEGACYLOGNUMBER,REGIONOFFICE,COUNTYNAME,LOCALPERMITAGENCY,PERMITDATE,PERMITNUMBER,OWNERASSIGNEDWELLNUMBER,WELLLOCATION,CITY,PLANNEDUSEFORMERUSE,DRILLERNAME,DRILLERLICENSENUMBER,RECORDTYPE,DECIMALLATITUDE,DECIMALLONGITUDE,METHODOFDETERMINATIONLL,LLACCURACY,HORIZONTALDATUM,GROUNDSURFACEELEVATION,ELEVATIONACCURACY,ELEVATIONDETERMINATIONMETHOD,VERTICALDATUM,TOWNSHIP,RANGE,SECTION,BASELINEMERIDIAN,APN,DATEWORKENDED,WORKFLOWSTATUS,RECEIVEDDATE,TOTALDRILLDEPTH,TOTALCOMPLETEDDEPTH,TOPOFPERFORATEDINTERVAL,BOTTOMOFPERFORATEDINTERVAL,CASINGDIAMETER,DRILLINGMETHOD,FLUID,STATICWATERLEVEL,TOTALDRAWDOWN,TESTTYPE,PUMPTESTLENGTH,WELLYIELD,WELLYIELDUNITOFMEASURE,OTHEROBSERVATIONS,WaDEUUID
0,WCR1986-000560,01-553O,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,FREMONT,Monitoring,"KLEINFELDER, J.H. & ASSOC KLEINFELDER, J.H. & ...",552198,WellCompletion/New/Production or Monitoring/NA,37.51314,-121.99661,Derived from TRS,Centroid of Section,,,,,,05S,01W,8,Mount Diablo,,4/24/1986,,,,30.0,,,12.0,,,,,,,,,,cnra0
1,WCR1986-000350,01-247F,DWR North Central Region Office,Alameda,Zone 7 Water Agency - Alameda County Flood Con...,,,,,LAWRENCE LAB,Monitoring,P.C. EXPLORATION CORP. P.C. EXPLORATION CORP.,265556,WellCompletion/New/Production or Monitoring/NA,37.68707,-121.7058,Derived from TRS,Centroid of Section,,,,,,03S,02E,12,Mount Diablo,,4/29/1986,,,,206.0,,,5.0,,,,,,,,,,cnra1
2,WCR1986-001334,179484,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,FREMONT,Unknown,"DE LUCCHI WELL AND PUMP, DE LUCCHI WELL AND P...",394454,WellCompletion/Destruction/NA/NA,37.55672,-122.05109,Derived from TRS,Centroid of Section,,,,,,04S,02W,26,Mount Diablo,,7/1/1986,,,,,,,,,,,,,,,,,cnra2
3,WCR1986-001708,169091,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,FREMONT,Monitoring,DATUM EXPLORATION DATUM EXPLORATION,CONV,WellCompletion/New/Production or Monitoring/NA,37.54215,-121.99658,Derived from TRS,Centroid of Section,,,,,,04S,01W,32,Mount Diablo,,9/24/1986,,,,46.0,,,2.0,,,,,,,,,,cnra3
4,WCR1986-006177,195563,DWR North Central Region Office,Alameda,Alameda County Water District,,,,,FREMONT,Monitoring,HEW DRILLING COMPANY HEW DRILLING COMPANY,604987,WellCompletion/New/Production or Monitoring/NA,37.55662,-121.99654,Derived from TRS,Centroid of Section,,,,,,04S,01W,29,Mount Diablo,,9/10/1986,,,,60.0,,,2.0,,,,,,,,,,cnra4


In [4]:
# drop these columns, as we are not extracting information from them.

dfinPOD.drop(['REGIONOFFICE','LOCALPERMITAGENCY','PERMITDATE','PERMITNUMBER','CITY','DRILLERNAME','DRILLERLICENSENUMBER','METHODOFDETERMINATIONLL','HORIZONTALDATUM','GROUNDSURFACEELEVATION','ELEVATIONACCURACY','ELEVATIONDETERMINATIONMETHOD','VERTICALDATUM','TOWNSHIP','RANGE','SECTION','BASELINEMERIDIAN','APN','RECEIVEDDATE','TOTALDRILLDEPTH','TOTALCOMPLETEDDEPTH','TOPOFPERFORATEDINTERVAL','BOTTOMOFPERFORATEDINTERVAL','CASINGDIAMETER','DRILLINGMETHOD','FLUID','STATICWATERLEVEL','TOTALDRAWDOWN','TESTTYPE','PUMPTESTLENGTH','OTHEROBSERVATIONS'], axis =1, inplace=True)
print(len(dfinPOD))
dfinPOD.head(1)

1067251


Unnamed: 0,WCRNUMBER,LEGACYLOGNUMBER,COUNTYNAME,OWNERASSIGNEDWELLNUMBER,WELLLOCATION,PLANNEDUSEFORMERUSE,RECORDTYPE,DECIMALLATITUDE,DECIMALLONGITUDE,LLACCURACY,DATEWORKENDED,WORKFLOWSTATUS,WELLYIELD,WELLYIELDUNITOFMEASURE,WaDEUUID
0,WCR1986-000560,01-553O,Alameda,,,Monitoring,WellCompletion/New/Production or Monitoring/NA,37.51314,-121.99661,Centroid of Section,4/24/1986,,,,cnra0


In [5]:
# Input File - wellreportpdflinks

lCsv = "RawInputData/wellreportpdflinks.zip"# to match wrcn to state well number
linkCsv = pd.read_csv(lCsv, encoding = "ISO-8859-1").replace(np.nan, "")

# merge with wellcompletionreports
dfinPOD2 = pd.merge(dfinPOD, linkCsv, left_on='WCRNUMBER', right_on='WCRNumber',how='left')
dfinPOD2 = dfinPOD2.drop_duplicates().replace(np.nan, "").replace("nan,nan", "").reset_index(drop=True)
print(len(dfinPOD2))
dfinPOD2.head()

1134203


Unnamed: 0,WCRNUMBER,LEGACYLOGNUMBER,COUNTYNAME,OWNERASSIGNEDWELLNUMBER,WELLLOCATION,PLANNEDUSEFORMERUSE,RECORDTYPE,DECIMALLATITUDE,DECIMALLONGITUDE,LLACCURACY,DATEWORKENDED,WORKFLOWSTATUS,WELLYIELD,WELLYIELDUNITOFMEASURE,WaDEUUID,WCRNumber,WCRLink
0,WCR1986-000560,01-553O,Alameda,,,Monitoring,WellCompletion/New/Production or Monitoring/NA,37.51314,-121.99661,Centroid of Section,4/24/1986,,,,cnra0,,
1,WCR1986-000350,01-247F,Alameda,,,Monitoring,WellCompletion/New/Production or Monitoring/NA,37.68707,-121.7058,Centroid of Section,4/29/1986,,,,cnra1,,
2,WCR1986-001334,179484,Alameda,,,Unknown,WellCompletion/Destruction/NA/NA,37.55672,-122.05109,Centroid of Section,7/1/1986,,,,cnra2,WCR1986-001334,https://cadwr.app.box.com/v/WellCompletionRepo...
3,WCR1986-001708,169091,Alameda,,,Monitoring,WellCompletion/New/Production or Monitoring/NA,37.54215,-121.99658,Centroid of Section,9/24/1986,,,,cnra3,WCR1986-001708,https://cadwr.app.box.com/v/WellCompletionRepo...
4,WCR1986-006177,195563,Alameda,,,Monitoring,WellCompletion/New/Production or Monitoring/NA,37.55662,-121.99654,Centroid of Section,9/10/1986,,,,cnra4,WCR1986-006177,https://cadwr.app.box.com/v/WellCompletionRepo...


In [6]:
#removing these records
dfinPOD2 = dfinPOD2[dfinPOD2['RECORDTYPE'] != 'WellCompletion/Destruction/NA/NA']
dfinPOD2 = dfinPOD2[dfinPOD2['RECORDTYPE'] != 'WellCompletion/Drill and Destroy/NA/NA']
print(len(dfinPOD2))

949573


In [7]:
# Calculating CFS from GPM
def cfsCalculation(A):
    if(A != ''):
        return str(float(A) * 0.0026757275)
    return ''
dfinPOD2['WELLYIELD_CFS'] = dfinPOD2.apply(lambda row: cfsCalculation(row['WELLYIELD']), axis=1)
dfinPOD2['WELLYIELD_CFS'].unique()

array(['', '0.013378637499999999', '0.0', ..., '2.7265663225',
       '1.8542791575', '267.974109125'], dtype=object)

In [8]:
# create output POD dataframe
df = pd.DataFrame()


# Data Assessment UUID
df['WaDEUUID'] = dfinPOD2['WaDEUUID']

# Method Info
df['in_MethodUUID'] = "CNRAwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = "CNRAwr_V1"

# Organization Info
df['in_OrganizationUUID'] = "CNRAwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = "Fresh"
df['in_WaterSourceName'] = ""
df['in_WaterSourceNativeID'] = "" # auto fill in below if not provdied
df['in_WaterSourceTypeCV'] = "Groundwater"

# Site Info
df['in_CoordinateAccuracy'] = dfinPOD2['LLACCURACY']
df['in_CoordinateMethodCV'] = "Digitized"
df['in_County'] = dfinPOD2['COUNTYNAME']
df['in_EPSGCodeCV'] = "4326"
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = ""
df['in_HUC8'] = ""
df['in_Latitude'] = dfinPOD2['DECIMALLATITUDE']
df['in_Longitude'] = dfinPOD2['DECIMALLONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"  # "Point of Diversion"
df['in_SiteName'] = dfinPOD2['OWNERASSIGNEDWELLNUMBER']
df['in_SiteNativeID'] = ""
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "Well"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = ""
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfinPOD2['WELLYIELD_CFS']
df['in_AllocationLegalStatusCV'] =""
df['in_AllocationNativeID'] =  dfinPOD2['WCRNUMBER']
df['in_AllocationOwner'] = "" 
df['in_AllocationPriorityDate'] = ""
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = ""
df['in_AllocationTimeframeStart'] = ""
df['in_AllocationTypeCV'] = ""
df['in_AllocationVolume_AF'] = ""
df['in_BeneficialUseCategory'] = dfinPOD2['PLANNEDUSEFORMERUSE']
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # either a 1 or 0
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = ""
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = dfinPOD2['LEGACYLOGNUMBER']
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = ""
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = dfinPOD2['WCRLink']

outPOD = df.copy()
outPOD = outPOD.drop_duplicates().reset_index(drop=True)
print(len(outPOD))
outPOD.tail()

949573


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
949568,cnra1067246,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,,,,Well,CA,,,,,,,,,,,,WCR1927-000425,,,,,,,,Unknown,,,,,,1,,,,C1220N,,,,,,
949569,cnra1067247,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,,,,Well,CA,,,,,,,,,,,,WCR2001-011202,,,,,,,,Unknown,,,,,,1,,,,,,,,,,
949570,cnra1067248,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,,,,Well,CA,,,,,,,,,,,,WCR2001-011229,,,,,,,,Unknown,,,,,,1,,,,,,,,,,
949571,cnra1067249,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,,,,Well,CA,,,,,,,,,,,,WCR2001-011971,,,,,,,,Unknown,,,,,,1,,,,157,,,,,,
949572,cnra1067250,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,,Groundwater,,Digitized,,4326,,,,,,,,POD,,,,Well,CA,,,,,,,,,,,,WCR2001-010946,,,,,,,,Unknown,,,,,,1,,,,272,,,,,,


In [9]:
# drop duplicates
outdf = outPOD
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

949573


## Clean Data / data types

In [10]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [11]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [12]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'Pmb1', 'Mw7', ..., 'Placerdalby #4', 'Cdm35',
       'Indiana Shop Well #1'], dtype=object)

In [13]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [14]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [15]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array([''], dtype=object)

In [16]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Groundwater'], dtype=object)

In [17]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Well'], dtype=object)

In [18]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['', 'Pmb1', 'Mw7', ..., 'Placerdalby #4', 'Cdm35',
       'Indiana Shop Well #1'], dtype=object)

In [19]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array([''], dtype=object)

In [20]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Monitoring', 'Water Supply Domestic',
       'Water Supply Irrigation - Agriculture', 'Unknown',
       'Vapor Extraction', 'Cathodic Protection',
       'Water Supply Irrigation - Landscape', 'Test Well',
       'Water Supply Industrial', 'Remediation', 'Injection',
       'Water Supply Public', 'Other', 'Dewatering', 'Sparging',
       'Water Supply Stock or Animal Watering', ''], dtype=object)

In [21]:
# Ensure Latitude entry is either numireic or blank, no 0 entries
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace("",0).fillna(0)
outdf['in_Latitude'].unique()

array([37.51314 , 37.68707 , 37.54215 , ..., 38.205556, 38.088611,
       38.20422 ])

In [22]:
# Ensure Longitude entry is either numireic or blank, no 0 entries
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace("",0).fillna(0)
outdf['in_Longitude'].unique()

array([-121.99661 , -121.7058  , -121.99658 , ..., -122.931754,
       -121.931054, -122.943056])

In [23]:
# Changing datatype of Priority Date to date fields entry
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors = 'coerce')
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf["in_AllocationPriorityDate"].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

array(['NaT'], dtype='datetime64[ns]')

In [24]:
# Ensure Flow entry is either numireic or blank, no 0 entries
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace("",0).fillna(0)
outdf['in_AllocationFlow_CFS'].unique()

array([0.00000000e+00, 1.33786375e-02, 5.35145500e-02, ...,
       2.72656632e+00, 1.85427916e+00, 2.67974109e+02])

In [25]:
# Ensure Volume entry is either numireic or blank, no 0 entries
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace("",0).fillna(0)
outdf['in_AllocationVolume_AF'].unique()

array([0.])

In [26]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceName'] = outdf['in_WaterSourceName']
dfWaterSourceNativeID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)
dfWaterSourceNativeID['linkKey'] = dfWaterSourceNativeID['in_WaterSourceName'].astype(str) + dfWaterSourceNativeID['in_WaterSourceTypeCV'].astype(str)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
WaterSourceNativeIDdict = pd.Series(dfWaterSourceNativeID.in_WaterSourceNativeID.values, index=dfWaterSourceNativeID.linkKey.astype(str)).to_dict()
def retrieveWaterSourceNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = WaterSourceNativeIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveWaterSourceNativeID( row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeID1'], dtype=object)

In [27]:
# Creating WaDE Custom site native ID for easy site source identification
# ----------------------------------------------------------------------------------------------------

# Create temp site dataframe of unique water source.
def assignSiteNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "wadeID" + string1
    return outstring

dfSiteNativeID = pd.DataFrame()
dfSiteNativeID['in_Latitude'] = outdf['in_Latitude']
dfSiteNativeID['in_Longitude'] = outdf['in_Longitude']
dfSiteNativeID = dfSiteNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfSiteNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfSiteNativeID['in_SiteNativeID'] = dftemp.apply(lambda row: assignSiteNativeID(row['Count']), axis=1)
dfSiteNativeID['linkKey'] = dfSiteNativeID['in_Latitude'].astype(str) + dfSiteNativeID['in_Longitude'].astype(str)

# ----------------------------------------------------------------------------------------------------

#Retreive WaDE Custom site  native ID
SiteNativeIDdict = pd.Series(dfSiteNativeID.in_SiteNativeID.values, index=dfSiteNativeID.linkKey.astype(str)).to_dict()
def retrieveSiteNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        colrowValue = str(A).strip() + str(B).strip()
        try:
            outList = SiteNativeIDdict[colrowValue]
        except:
            outList = ''
    return outList

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveSiteNativeID( row['in_Latitude'], row['in_Longitude']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['wadeID1', 'wadeID2', 'wadeID3', ..., 'wadeID175359',
       'wadeID175360', 'wadeID175361'], dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For this {state name / organization}, we don't want water rights that are considered: "Inactive", "Revoked", "Pending", "Cancelled", "Rejected", "Closed"

In [28]:
# drop non-active AllocationLegalStatusCV values specific to that state.

# drop the list
dropLegalStatusList = ["Inactive", "Revoked", "Pending", "Cancelled", "Rejected", "Closed"] # enter string entries here

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(len(outdf))
outdf['in_AllocationLegalStatusCV'].unique()

949573


array([''], dtype=object)

## Export Data

In [29]:
outdf.info

<bound method DataFrame.info of            WaDEUUID in_MethodUUID in_VariableSpecificUUID in_OrganizationUUID  \
0             cnra0     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
1             cnra1     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
2             cnra3     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
3             cnra4     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
4             cnra5     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
...             ...           ...                     ...                 ...   
949568  cnra1067246     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
949569  cnra1067247     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
949570  cnra1067248     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
949571  cnra1067249     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1   
949572  cnra1067250     CNRAwr_M1               CNRAwr_V1           CNRAwr_O1

In [30]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL
0,cnra0,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.51314,-121.99661,,,POD,,wadeID1,,Well,CA,,,,,,,,,,0.00000,,WCR1986-000560,,NaT,,,,,0.00000,Monitoring,,,,,,1,,,,01-553O,,,,,,
1,cnra1,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.68707,-121.70580,,,POD,,wadeID2,,Well,CA,,,,,,,,,,0.00000,,WCR1986-000350,,NaT,,,,,0.00000,Monitoring,,,,,,1,,,,01-247F,,,,,,
2,cnra3,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.54215,-121.99658,,,POD,,wadeID3,,Well,CA,,,,,,,,,,0.00000,,WCR1986-001708,,NaT,,,,,0.00000,Monitoring,,,,,,1,,,,169091,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
3,cnra4,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.55662,-121.99654,,,POD,,wadeID4,,Well,CA,,,,,,,,,,0.00000,,WCR1986-006177,,NaT,,,,,0.00000,Monitoring,,,,,,1,,,,195563,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
4,cnra5,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,Centroid of Section,Digitized,Alameda,4326,,,,37.65812,-121.79657,,,POD,,wadeID5,,Well,CA,,,,,,,,,,0.00000,,WCR1986-006946,,NaT,,,,,0.00000,Water Supply Domestic,,,,,,1,,,,239327,,,,,,https://cadwr.app.box.com/v/WellCompletionRepo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949568,cnra1067246,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,0.00000,0.00000,,,POD,,wadeID8,,Well,CA,,,,,,,,,,0.00000,,WCR1927-000425,,NaT,,,,,0.00000,Unknown,,,,,,1,,,,C1220N,,,,,,
949569,cnra1067247,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,0.00000,0.00000,,,POD,,wadeID8,,Well,CA,,,,,,,,,,0.00000,,WCR2001-011202,,NaT,,,,,0.00000,Unknown,,,,,,1,,,,,,,,,,
949570,cnra1067248,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,0.00000,0.00000,,,POD,,wadeID8,,Well,CA,,,,,,,,,,0.00000,,WCR2001-011229,,NaT,,,,,0.00000,Unknown,,,,,,1,,,,,,,,,,
949571,cnra1067249,CNRAwr_M1,CNRAwr_V1,CNRAwr_O1,,,Fresh,,wadeID1,Groundwater,,Digitized,,4326,,,,0.00000,0.00000,,,POD,,wadeID8,,Well,CA,,,,,,,,,,0.00000,,WCR2001-011971,,NaT,,,,,0.00000,Unknown,,,,,,1,,,,157,,,,,,


In [31]:
# Export the output dataframe
# change output name / abbreviation to match native state provdier and wade data type 
outdf.to_csv('RawInputData/Pwr_caMain.zip', compression=dict(method='zip', archive_name='Pwr_caMain.csv'), index=False)  # The output, save as a zip