# Pre-processing California State Water Resource Control Board Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # suppress scientific notation in Pandas

In [2]:
# ---- working directory ----
workingDirString = "G:/Shared drives/WaDE Data/California/WaterAllocation_WaterUse_CSWRCB" # set working directory folder string here
os.chdir(workingDirString)
print(f'The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/California/WaterAllocation_WaterUse_CSWRCB


## Data Input

In [3]:
# Input File #1 - ewrims_flat_file_pod
fileInput = "RawInputData/water_right/ewrims_flat_file_pod.zip"
dfin1 = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin1:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv("RawInputData/ewrims_flat_file_pod.zip", compression=dict(method='zip', archive_name='ewrims_flat_file_pod.csv'), index=False)

print(len(dfin1))
dfin1.head()

  dfin1 = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")


64873


Unnamed: 0,ï»¿POD_ID,POD_NUMBER,POD_STATUS,SOURCE_TYPE,POD_NAME,POD_TYPE,DIVERSION_WORKS_STATUS,STREAM_CLASSIFICATION,DIRECT_DIV_AMOUNT,DIRECT_DIVERSION_RATE,DIRECT_DIV_RATE_UNIT,STORAGE_AMOUNT,DIVERSION_RATE_TO_OFF_STREAM,OFF_STO_DIV_RATE_UNIT,POD_LAST_UPDATE_DATE,POD_COUNT,APPL_ID,OBJECTID,POD_NUMBER_GIS,HAS_OPOD,APPL_POD,POD_ID_GIS,COUNTY,PARCEL_NUMBER,SP_ZONE,DIVERSION_SITE_NAME,NORTH_COORD,EAST_COORD,LATITUDE,LONGITUDE,QUARTER_QUARTER,QUARTER,SECTION_CLASSIFIER,SECTION_NUMBER,TOWNSHIP_NUMBER,TOWNSHIP_DIRECTION,RANGE_NUMBER,RANGE_DIRECTION,MERIDIAN,LOCATION_METHOD,SPECIAL_USE_AREA,SOURCE_NAME,TRIB_DESC,WATERSHED,HUC_12_NUMBER,HUC_12_NAME,HUC_8_NUMBER,HUC_8_NAME,QUAD_MAP_NAME,QUAD_MAP_NUMBER,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,RECEIPT_DATE,REJECTION_DATE,APPLICATION_RECD_DATE,APPLICATION_ACCEPTANCE_DATE,PROJECT_TYPE,RECORD_SUMMARY,INCOMPLETE_STATEMENT,NUMBER_OF_PROTESTS,AGENT_NAME,AGENT_ENTITY_TYPE,APPLICATION_PRIMARY_OWNER,PRIMARY_OWNER_ENTITY_TYPE,SUB_TYPE,INI_REPORTED_DIV_AMOUNT,INI_REPORTED_DIV_UNIT,FACE_VALUE_AMOUNT,FACE_VALUE_UNITS,FEE_DUE,FEE_RECEIVED,APPL_FEE_AMOUNT,APPL_FEE_AMT_RECD,MAX_DD_APPL,MAX_DD_UNITS,MAX_DD_ANN,MAX_STORAGE,MAX_TAKEN_FROM_SOURCE,YEAR_DIVERSION_COMMENCED,MAX_BENEFICIALLY_USED,SUPPLEMENTAL_STATEMENT_CYCLE,TYPE_OF_DIVERSION_FACILITY,QUANTITY_OF_WATER_DIVERTED,QOW_DIVERTED_UNIT,QUANTITY_MEASUREMENT_YEAR,MAX_RATE_OF_DIVERSION,MAX_RATE_OF_DIV_UNIT,RECENT_WATER_USE_MIN,WATER_USE_MIN_UNIT,RECENT_WATER_USE_MAX,WATER_USE_MAX_UNIT,REQUEST_FOR_REVOCATION_RECD,NUM_COMMENTS,NUM_ATTACHMENTS,LAST_UPDATE_DATE,STATE_WELL_NUMBER,DRILLED_WELL_YEAR,SURFACE_WATER_DIVERSIONS,DEPTH_OF_WELL,RELATIONSHIP_TYPE,PARTY_ID,EFFECTIVE_FROM_DATE,EFFECTIVE_TO_DATE,PRIMARY_OWNER_NAME,PRIMARY_OWNER_ENTITY_TYPE_P,OFFICIAL_MAIL_RECEIVER,COUNT_NPO_OR_OTHER,CURRENT_STATUS,EFFECTIVE_DATE,UPDATE_DATETIME,USE_CODE,USE_STATUS,NUMBER_OF_RESIDENCES,SEPERATELY_OWNED,USE_POPULATION,USE_POPULATION_PEOPLE,ESTIMATED_USE_PER_PERSON,USE_POPULATION_STOCK,TYPE_OF_STOCK,AREA_FOR_INCI_IRRIGATION,USE_NET_ACREAGE,USE_GROSS_ACREAGE,USE_DIRECT_DIV_ANNUAL_AMOUNT,USE_DIRECT_DIVERSION_RATE,USE_DIRECT_DIV_RATE_UNITS,POU_DEVELOPMENT_STATUS,DIRECT_DIV_SEASON_START,DIRECT_DIV_SEASON_END,USE_STORAGE_AMOUNT,STORAGE_SEASON_START,STORAGE_SEASON_END,SEASON_DIRECT_DIV_RATE,SEASON_STORAGE_AMOUNT,SEASON_DIRECT_DIV_AA,DIRECT_DIV_SEASON_STATUS,COLLECTION_SEASON_STATUS,USE_COUNT,PERMIT_PERMIT_ID,PERMIT_ORIGINAL_ISSUE_DATE,COMPLETE_CONSTRUCTION_DATE,COMPLETE_APPLIC_WATER_DATE,LICENSE_LICENSE_ID,LICENSE_ORIGINAL_ISSUE_DATE,WATER_RIGHT_DESCRIPTION,PROGRAM_UNIT,LICENSE_REQUEST_TYPE,LICENSE_REQUESTED_DATE,INSPECTION_DATE,REPORT_DATE,OFFER_SENT_DATE,ACCEPTED_OFFER_DATE,PETITION_ID,PETITION_TYPE,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,WaDEUUID
0,60497,1.0,Inactive,Surface,COMPOUND STOCK POND,Point of Direct Diversion,Existing,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,97404.0,1.0,N,T032025_01,60497,San Diego,,6.0,COMPOUND STOCK POND,1814680.0,6549776.0,32.6456,-116.2886,NE,SW,,32.0,17.0,S,7.0,E,San Bernardino,GIS_NE,,UNNAMED STREAM,BOUNDARY CREEK,ANZA BORREGO,181000000000.0,Boundary Creek,18100202.0,Carrizo Creek,LIVE OAK SPRINGS,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,569 EAST COUNTY BOULEVARD LLC,Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1-Jan,31-Dec,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in10
1,60498,2.0,Inactive,Surface,LAKE DOMINGO,Point of Direct Diversion,Proposed,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,95308.0,2.0,N,T032025_02,60498,San Diego,,6.0,LAKE DOMINGO,1803326.0,6554711.0,32.6144,-116.2726,SE,SW,,9.0,18.0,S,7.0,E,San Bernardino,GIS_NE,,LAKE DOMINGO,BOUNDARY CREEK,ANZA BORREGO,181000000000.0,Boundary Creek,18100202.0,Carrizo Creek,TIERRA DEL SOL,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,569 EAST COUNTY BOULEVARD LLC,Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1-Jan,31-Dec,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in11
2,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.891,6474612.148,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12
3,17603,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58414.0,1.0,N,A000016_01,17603,Los Angeles,,5.0,,2032842.891,6474612.148,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,KERR SPRING NO 2,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in13
4,29028,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,57787.0,1.0,N,A000016_01,29028,Los Angeles,005-370-43-00,5.0,,2033642.802,6474614.806,34.58017,-118.28921,SE,NE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,KERR SPRING NO 1,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in14


In [4]:
# Input File #2 - water-rights-water-use-reported-short
fileInput = "RawInputData/water_use/water-rights-water-use-reported-short.zip"
dfin2 = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")

# WaDE UUID tracker for data assessment
if 'WaDEUUID' not in dfin2:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv("RawInputData/water-rights-water-use-reported-short.zip", compression=dict(method='zip', archive_name='water-rights-water-use-reported-short.csv'), index=False)

print(len(dfin2))
dfin2.head()

11731056


Unnamed: 0,ï»¿MONTH NAME,MONTH FORMATTED,WATER_RIGHT_ID,APPL_ID,YEAR,MONTH,AMOUNT,DIVERSION_TYPE,WaDEUUID
0,January,1/1/2001,1307,A006599,2001,1,0.0,Combined (Direct + Storage),in20
1,January,1/1/2006,1393,A006944,2006,1,0.0,Combined (Direct + Storage),in21
2,January,1/1/2007,1299,A006563,2007,1,0.0,Combined (Direct + Storage),in22
3,January,1/1/2007,1307,A006599,2007,1,0.0,Combined (Direct + Storage),in23
4,January,1/1/2007,1308,A006603,2007,1,0.0,Combined (Direct + Storage),in24


In [5]:
#POD_TYPE fix. 
# we only want the first listed POD_TYPE value

def firstPOD_TYPEFunc(val):
    val = str(val).strip()
    if "," in val:
        outString = val.split(",")
        # outString = val[0]
        outString = ''.join(outString)
    else:
        outString = val
    return outString

dfin1['POD_TYPE'] = dfin1.apply(lambda row: firstPOD_TYPEFunc(row['POD_TYPE']), axis=1)
dfin1['POD_TYPE'].unique()

array(['Point of Direct Diversion', 'Point of Storage - Unspecified',
       'Point of Onstream Storage',
       'Point of Diversion to Underground Storage',
       'Point of Diversion to Offstream Storage', 'Point of Rediversion',
       'Movable Point of Diversion', 'Movable Point of Rediversion', ''],
      dtype=object)

In [6]:
# Clean data a little
dfin1 = dfin1.fillna("") # remove nan values
dfin1['USE_DIRECT_DIVERSION_RATE'] = pd.to_numeric(dfin1['USE_DIRECT_DIVERSION_RATE'], errors='coerce').fillna(0) # make sure this is numeric.

In [7]:
# Create VariableSpecificCv value
def createVariableSpecificUUID(unit):
    outString = ""
    if unit == "Cubic Feet per Second":
        outString = "CSWRCBwr_V1"
    if unit == "Gallons per Day":
        outString = "CSWRCBwr_V1"
    if unit == "Acre-feet per Year":
        outString = "CSWRCBwr_V1"
    if unit == "Gallons per Minute":
        outString = "CSWRCBwr_V1"
    if unit == 'Acre-feet':
        outString = "CSWRCBwr_V2"
    if unit == 'Gallons':
        outString = "CSWRCBwr_V2"
    else:
        outString = "CSWRCBwr_V1"

    return(outString)

dfin1['in_VariableSpecificUUID'] = dfin1.apply(lambda row: createVariableSpecificUUID(row['USE_DIRECT_DIV_RATE_UNITS']), axis=1)
dfin1['in_VariableSpecificUUID'].unique()

array(['CSWRCBwr_V1', 'CSWRCBwr_V2'], dtype=object)

In [8]:
# convert all flow values to CFS
def convertFlowFunc(val, unit):
    CFS_Value = None
    if unit == "Cubic Feet per Second":
        CFS_Value = val
    if unit == "Gallons per Day":
        CFS_Value = val / (646316.883)
    if unit == "Acre-feet per Year":
        CFS_Value = val / (723.968)
    if unit == "Gallons per Minute":
        CFS_Value = val / (448.83117)
    return(CFS_Value)

dfin1['in_AllocationFlow_CFS'] = dfin1.apply(lambda row: convertFlowFunc(row['USE_DIRECT_DIVERSION_RATE'], row['USE_DIRECT_DIV_RATE_UNITS']), axis=1)
dfin1['in_AllocationFlow_CFS'].unique()

array([1.50000000e-01,            nan, 2.50000000e+00, ...,
       2.35946180e-05, 5.34722221e-01, 5.05758101e-01])

In [9]:
# convert all volume values to AF
def convertVolumeFunc(val, unit):
    AF_Value = None
    if unit == 'Acre-feet':
        AF_Value = val
    if unit == 'Gallons':
        AF_Value = val / (325850.943)
    return(AF_Value)

dfin1['in_AllocationVolume_AF'] = dfin1.apply(lambda row: convertVolumeFunc(row['USE_DIRECT_DIVERSION_RATE'], row['USE_DIRECT_DIV_RATE_UNITS']), axis=1)
dfin1['in_AllocationVolume_AF'].unique()

array([      nan, 0.000e+00, 1.300e-01, 3.650e+02, 2.062e+01])

In [10]:
# remove speicial characters from SUB_TYPE
def cleanupSubTypeFunc(val):
    val = str(val).strip()
    val = val.rstrip('_') # remove trailing "_"
    val = val.rstrip(',') # remove trailing commas
    if val == "" or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        val = val.replace("," ," ")
        outString = "" + val
    return outString

dfin1['wade_SUB_TYPE'] = dfin1.apply(lambda row: cleanupSubTypeFunc(row['SUB_TYPE']), axis=1)
dfin1['wade_SUB_TYPE'].unique()

array(['', 'RIPERIAN', 'RIPERIAN PRE1914', 'PRE1914', 'OTHER',
       'PRE1914 OTHER', 'COURTADJ', 'RIPERIAN PRE1914 OTHER',
       'RIPERIAN PRE1914 COURTADJ OTHER', 'RIPERIAN PRE1914 COURTADJ',
       'PRE1914 COURTADJ', 'RIPERIAN OTHER', 'RIPERIAN COURTADJ',
       'PRE1914 PENDING', 'PENDING', 'RIPERIAN PENDING', 'COURTADJ OTHER',
       'PENDING OTHER', 'RIPERIAN COURTADJ OTHER',
       'RIPERIAN PRE1914 PENDING', 'PRE1914 COURTADJ OTHER',
       'RIPERIAN PENDING OTHER'], dtype=object)

In [11]:
#left merge sites to water use
dfin1 = dfin1.merge(dfin2, left_on='APPLICATION_NUMBER', right_on='APPL_ID', how='left')
print(len(dfin1))
dfin1.head()

14198199


Unnamed: 0,ï»¿POD_ID,POD_NUMBER,POD_STATUS,SOURCE_TYPE,POD_NAME,POD_TYPE,DIVERSION_WORKS_STATUS,STREAM_CLASSIFICATION,DIRECT_DIV_AMOUNT,DIRECT_DIVERSION_RATE,DIRECT_DIV_RATE_UNIT,STORAGE_AMOUNT,DIVERSION_RATE_TO_OFF_STREAM,OFF_STO_DIV_RATE_UNIT,POD_LAST_UPDATE_DATE,POD_COUNT,APPL_ID_x,OBJECTID,POD_NUMBER_GIS,HAS_OPOD,APPL_POD,POD_ID_GIS,COUNTY,PARCEL_NUMBER,SP_ZONE,DIVERSION_SITE_NAME,NORTH_COORD,EAST_COORD,LATITUDE,LONGITUDE,QUARTER_QUARTER,QUARTER,SECTION_CLASSIFIER,SECTION_NUMBER,TOWNSHIP_NUMBER,TOWNSHIP_DIRECTION,RANGE_NUMBER,RANGE_DIRECTION,MERIDIAN,LOCATION_METHOD,SPECIAL_USE_AREA,SOURCE_NAME,TRIB_DESC,WATERSHED,HUC_12_NUMBER,HUC_12_NAME,HUC_8_NUMBER,HUC_8_NAME,QUAD_MAP_NAME,QUAD_MAP_NUMBER,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,RECEIPT_DATE,REJECTION_DATE,APPLICATION_RECD_DATE,APPLICATION_ACCEPTANCE_DATE,PROJECT_TYPE,RECORD_SUMMARY,INCOMPLETE_STATEMENT,NUMBER_OF_PROTESTS,AGENT_NAME,AGENT_ENTITY_TYPE,APPLICATION_PRIMARY_OWNER,PRIMARY_OWNER_ENTITY_TYPE,SUB_TYPE,INI_REPORTED_DIV_AMOUNT,INI_REPORTED_DIV_UNIT,FACE_VALUE_AMOUNT,FACE_VALUE_UNITS,FEE_DUE,FEE_RECEIVED,APPL_FEE_AMOUNT,APPL_FEE_AMT_RECD,MAX_DD_APPL,MAX_DD_UNITS,MAX_DD_ANN,MAX_STORAGE,MAX_TAKEN_FROM_SOURCE,YEAR_DIVERSION_COMMENCED,MAX_BENEFICIALLY_USED,SUPPLEMENTAL_STATEMENT_CYCLE,TYPE_OF_DIVERSION_FACILITY,QUANTITY_OF_WATER_DIVERTED,QOW_DIVERTED_UNIT,QUANTITY_MEASUREMENT_YEAR,MAX_RATE_OF_DIVERSION,MAX_RATE_OF_DIV_UNIT,RECENT_WATER_USE_MIN,WATER_USE_MIN_UNIT,RECENT_WATER_USE_MAX,WATER_USE_MAX_UNIT,REQUEST_FOR_REVOCATION_RECD,NUM_COMMENTS,NUM_ATTACHMENTS,LAST_UPDATE_DATE,STATE_WELL_NUMBER,DRILLED_WELL_YEAR,SURFACE_WATER_DIVERSIONS,DEPTH_OF_WELL,RELATIONSHIP_TYPE,PARTY_ID,EFFECTIVE_FROM_DATE,EFFECTIVE_TO_DATE,PRIMARY_OWNER_NAME,PRIMARY_OWNER_ENTITY_TYPE_P,OFFICIAL_MAIL_RECEIVER,COUNT_NPO_OR_OTHER,CURRENT_STATUS,EFFECTIVE_DATE,UPDATE_DATETIME,USE_CODE,USE_STATUS,NUMBER_OF_RESIDENCES,SEPERATELY_OWNED,USE_POPULATION,USE_POPULATION_PEOPLE,ESTIMATED_USE_PER_PERSON,USE_POPULATION_STOCK,TYPE_OF_STOCK,AREA_FOR_INCI_IRRIGATION,USE_NET_ACREAGE,USE_GROSS_ACREAGE,USE_DIRECT_DIV_ANNUAL_AMOUNT,USE_DIRECT_DIVERSION_RATE,USE_DIRECT_DIV_RATE_UNITS,POU_DEVELOPMENT_STATUS,DIRECT_DIV_SEASON_START,DIRECT_DIV_SEASON_END,USE_STORAGE_AMOUNT,STORAGE_SEASON_START,STORAGE_SEASON_END,SEASON_DIRECT_DIV_RATE,SEASON_STORAGE_AMOUNT,SEASON_DIRECT_DIV_AA,DIRECT_DIV_SEASON_STATUS,COLLECTION_SEASON_STATUS,USE_COUNT,PERMIT_PERMIT_ID,PERMIT_ORIGINAL_ISSUE_DATE,COMPLETE_CONSTRUCTION_DATE,COMPLETE_APPLIC_WATER_DATE,LICENSE_LICENSE_ID,LICENSE_ORIGINAL_ISSUE_DATE,WATER_RIGHT_DESCRIPTION,PROGRAM_UNIT,LICENSE_REQUEST_TYPE,LICENSE_REQUESTED_DATE,INSPECTION_DATE,REPORT_DATE,OFFER_SENT_DATE,ACCEPTED_OFFER_DATE,PETITION_ID,PETITION_TYPE,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,WaDEUUID_x,in_VariableSpecificUUID,in_AllocationFlow_CFS,in_AllocationVolume_AF,wade_SUB_TYPE,ï»¿MONTH NAME,MONTH FORMATTED,WATER_RIGHT_ID,APPL_ID_y,YEAR,MONTH,AMOUNT,DIVERSION_TYPE,WaDEUUID_y
0,60497,1.0,Inactive,Surface,COMPOUND STOCK POND,Point of Direct Diversion,Existing,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,97404.0,1.0,N,T032025_01,60497,San Diego,,6.0,COMPOUND STOCK POND,1814680.0,6549776.0,32.6456,-116.2886,NE,SW,,32.0,17.0,S,7.0,E,San Bernardino,GIS_NE,,UNNAMED STREAM,BOUNDARY CREEK,ANZA BORREGO,181000000000.0,Boundary Creek,18100202.0,Carrizo Creek,LIVE OAK SPRINGS,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,569 EAST COUNTY BOULEVARD LLC,Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1-Jan,31-Dec,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in10,CSWRCBwr_V1,0.15,,,,,,,,,,,
1,60498,2.0,Inactive,Surface,LAKE DOMINGO,Point of Direct Diversion,Proposed,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,95308.0,2.0,N,T032025_02,60498,San Diego,,6.0,LAKE DOMINGO,1803326.0,6554711.0,32.6144,-116.2726,SE,SW,,9.0,18.0,S,7.0,E,San Bernardino,GIS_NE,,LAKE DOMINGO,BOUNDARY CREEK,ANZA BORREGO,181000000000.0,Boundary Creek,18100202.0,Carrizo Creek,TIERRA DEL SOL,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,569 EAST COUNTY BOULEVARD LLC,Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1-Jan,31-Dec,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in11,CSWRCBwr_V1,0.15,,,,,,,,,,,
2,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.891,6474612.148,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,CSWRCBwr_V1,,,,January,1/1/2011,2.0,A000016,2011.0,1.0,0.0,STORAGE,in211244013
3,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.891,6474612.148,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,CSWRCBwr_V1,,,,January,1/1/2011,2.0,A000016,2011.0,1.0,0.0,DIRECT,in211244014
4,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.891,6474612.148,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,181000000000.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1-Jan,31-Dec,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,CSWRCBwr_V1,,,,January,1/1/2011,2.0,A000016,2011.0,1.0,0.0,USE,in211244015


In [12]:
# # convert units to WaDE Approproiate values (CFS or AF)
# def convertAmountToUnitFunc(val, unit):
#     outValue = None
#     if unit == "Cubic Feet per Second":
#         outValue = val
#     if unit == "Gallons per Day":
#         outValue = val / (646316.883)
#     if unit == "Acre-feet per Year":
#         outValue = val / (723.968)
#     if unit == "Gallons per Minute":
#         outValue = val / (448.83117)
#     if unit == 'Acre-feet':
#         outValue = val
#     if unit == 'Gallons':
#         outValue = val / (325850.943)
#     return(outValue)

# dfin1['in_Amount'] = dfin1.apply(lambda row: convertAmountToUnitFunc(row['USE_DIRECT_DIVERSION_RATE'], row['USE_DIRECT_DIV_RATE_UNITS']), axis=1)
# dfin1['in_Amount'].unique()

In [13]:
# create output POD dataframe
df = pd.DataFrame()

# Data Assessment UUID
df['WaDEUUID'] = dfin1['WaDEUUID_x']

# Method Info
df['in_MethodUUID'] = "CSWRCBwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = dfin1['in_VariableSpecificUUID'] # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "CSWRCBwr_O1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfin1['SOURCE_NAME'].str.title()
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = dfin1['SOURCE_TYPE'].str.title()

# Site Info
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfin1['LOCATION_METHOD']
df['in_County'] = dfin1['COUNTY'].str.title()
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""
df['in_GNISCodeCV'] = ""
df['in_HUC12'] = dfin1['HUC_12_NUMBER'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_HUC8'] = dfin1['HUC_8_NUMBER'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_Latitude'] = dfin1['LATITUDE']
df['in_Longitude'] = dfin1['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfin1['POD_NAME'].str.title()
df['in_SiteNativeID'] = dfin1['ï»¿POD_ID'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = dfin1['POD_TYPE'].astype(str).str.title()
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
df['in_AllocationApplicationDate'] = dfin1['APPLICATION_RECD_DATE']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['in_AllocationFlow_CFS'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['WATER_RIGHT_STATUS'].str.title()
df['in_AllocationNativeID'] =  dfin1['APPLICATION_NUMBER'].replace("", 0).fillna(0).astype(str)
df['in_AllocationOwner'] = dfin1['PRIMARY_OWNER_NAME']
df['in_AllocationPriorityDate'] = dfin1['PRIORITY_DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfin1['DIRECT_DIV_SEASON_END']
df['in_AllocationTimeframeStart'] = dfin1['DIRECT_DIV_SEASON_START']
df['in_AllocationTypeCV'] = dfin1['WATER_RIGHT_TYPE'].astype(str) + " " + dfin1['wade_SUB_TYPE'].astype(str)
df['in_AllocationTypeCV'] = df['in_AllocationTypeCV'].astype(str).str.strip()
df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['USE_CODE'].str.title()
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # we want this data to be exempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = "" # temp fix, leave blank for now
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = "" # temp fix, leave blank for now
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
df['in_WaterAllocationNativeURL'] = "https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/EWServlet?Redirect_Page=EWPublicAppSummary.jsp&Purpose=getEwrimsPublicSummary&wrWaterRightID=" + dfin1['WR_WATER_RIGHT_ID'].replace("", 0).fillna(0).astype(int).astype(str)


# Site VariableAmounts Info
df['in_Amount'] = dfin1['AMOUNT']
df['in_AssociatedNativeAllocationIDs'] = dfin1['APPLICATION_NUMBER']
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = "" # see below
df['in_ReportYearCV'] = dfin1['YEAR'].replace("", 0).fillna(0).astype(int).astype(str)
df['in_SDWISIdentifier'] = ""
df['in_TimeframeEnd'] = dfin1['MONTH FORMATTED']
df['in_TimeframeStart'] = dfin1['MONTH FORMATTED']
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

7739363


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,in10,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Unnamed Stream,,Surface,,,,GIS_NE,San Diego,4326,,-2147483648,18100202,32.6456,-116.2886,,,POD,Compound Stock Pond,60497,,Point Of Direct Diversion,CA,,4/18/2013,,,,,,,,0.15,Cancelled,T032025,569 EAST COUNTY BOULEVARD LLC,,,31-Dec,1-Jan,Temporary Permit,,Dust Control,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,T032025,,,0,,,
1,in11,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Lake Domingo,,Surface,,,,GIS_NE,San Diego,4326,,-2147483648,18100202,32.6144,-116.2726,,,POD,Lake Domingo,60498,,Point Of Direct Diversion,CA,,4/18/2013,,,,,,,,0.15,Cancelled,T032025,569 EAST COUNTY BOULEVARD LLC,,,31-Dec,1-Jan,Temporary Permit,,Dust Control,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,T032025,,,0,,,
2,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,,,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2011,,1/1/2011,1/1/2011
3,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,,,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2011,,2/1/2011,2/1/2011
4,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,,,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2011,,3/1/2011,3/1/2011


## Concatenate POD and POU Data.  Make needed changes

In [14]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [15]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here
outdf = pd.concat(frames)
outdf = outdf.drop_duplicates().reset_index(drop=True).replace(np.nan, "")
print(len(outdf))

7739363


## Clean Data / data types

In [16]:
# updating in_WaterSourceTypeCV to be more machine readable / WaDE specific
# ----------------------------------------------------------------------------------------------------

def createWaterSourceTypeCV(inWST):
    inWST = str(inWST).strip()
    
    if inWST == "":
        outString = "Unspecified"
    elif inWST == "Subsurfacer":
        outString = "Groundwater"
    else:
        outString =  "Surface Water"
      
    return outString

outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: createWaterSourceTypeCV(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Unspecified'], dtype=object)

In [17]:
# Fill empty ben use values

def fillEmptyBenUseFunc(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = "Unspecified"
    else:
        outString = val
    return outString
    
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: fillEmptyBenUseFunc(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: fillEmptyBenUseFunc(row['in_PrimaryUseCategory']), axis=1)
outdf['in_BeneficialUseCategory'].unique()

array(['Dust Control', 'Domestic', 'Irrigation', 'Power', 'Municipal',
       'Fish And Wildlife Preservation And Enhancement', 'Stockwatering',
       'Industrial', 'Mining', 'Recreational', 'Fire Protection',
       'Incidental Power', 'Frost Protection', 'Aquaculture',
       'Snow Making', 'Milling', 'Heat Control', 'Other', 'Unspecified',
       'Aesthetic', 'Water Quality'], dtype=object)

In [18]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
def formatDateString(inString1):
    inString = str(inString1).strip()
    try:
        if inString == "" or pd.isnull(inString):
            valndf = ""
        else:
            valD = pd.to_datetime(inString)
            valnDd = valD.date()
            valndf = valnDd.strftime('%m/%d/%Y')
    except:
        valndf = ""
    return valndf

outdf['in_AllocationPriorityDate'] = outdf.apply(lambda row: formatDateString(row['in_AllocationPriorityDate']), axis=1)
outdf['in_AllocationPriorityDate'].unique()

array(['', '07/22/1915', '04/17/1916', ..., '06/23/2023', '03/02/2022',
       '09/09/2021'], dtype=object)

In [19]:
# Clean name entries of spcial characters
def removeSpecialCharsFunc(Val):
    Val = str(Val)
    Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()
    return Val

In [20]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Unnamed Stream', 'Lake Domingo', 'Icy Spring No 1', ...,
       'Unnamed Stream, Tributary To Rowes Creek, Thence Outlet Creek',
       'Unnamed Stream, Tributary To Larabee Creek, Thence Eel River',
       'Broaddus Creek'], dtype=object)

In [21]:
outdf['in_County'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['San Diego', 'Los Angeles', 'Glenn', 'Madera', 'Lake', 'Colusa',
       'Mono', 'Lassen', 'Tuolumne', 'Riverside', 'San Bernardino',
       'Inyo', 'Modoc', 'Sacramento', 'El Dorado', 'Trinity', 'Fresno',
       'San Luis Obispo', 'Santa Clara', 'Shasta', 'Tulare', 'Kern',
       'San Joaquin', 'Tehama', 'Kings', 'Stanislaus', 'Siskiyou',
       'Contra Costa', 'Butte', 'Yuba', 'Sierra', 'Napa', 'Sutter',
       'Placer', 'Ventura', 'Yolo', 'Alpine', 'Plumas', 'Santa Barbara',
       'Mendocino', 'Sonoma', 'Calaveras', 'Mariposa', 'Merced',
       'Humboldt', 'Nevada', 'San Benito', 'Solano', 'Santa Cruz',
       'Monterey', 'Marin', 'Amador', 'Alameda', 'San Mateo', '',
       'Orange', 'Imperial', 'Del Norte', 'San Francisco'], dtype=object)

In [22]:
outdf['in_SiteName'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Compound Stock Pond', 'Lake Domingo', '', ..., 'Highway Pump',
       'Money Pit', 'Kurfeld'], dtype=object)

In [23]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: removeSpecialCharsFunc(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['569 East County Boulevard Llc', 'Lilac Hills Estates Lp',
       'Glenncolusa Irrigation District', ..., 'Overland Road Llc',
       'Harugama Llc', 'Richard Casarotti'], dtype=object)

In [24]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    val = str(val).strip()
    if val == "" or val == " " or val == "nan" or pd.isnull(val):
        outString = ""
    else:
        outString = val
    return outString

In [25]:
outdf['in_WaterSourceName'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceName']), axis=1)
outdf['in_WaterSourceName'].unique()

array(['Unnamed Stream', 'Lake Domingo', 'Icy Spring No 1', ...,
       'Unnamed Stream, Tributary To Rowes Creek, Thence Outlet Creek',
       'Unnamed Stream, Tributary To Larabee Creek, Thence Eel River',
       'Broaddus Creek'], dtype=object)

In [26]:
outdf['in_WaterSourceTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface Water', 'Unspecified'], dtype=object)

In [27]:
outdf['in_SiteTypeCV'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteTypeCV']), axis=1)
outdf['in_SiteTypeCV'].unique()

array(['Point Of Direct Diversion', 'Point Of Storage - Unspecified',
       'Point Of Onstream Storage',
       'Point Of Diversion To Underground Storage',
       'Point Of Diversion To Offstream Storage', 'Point Of Rediversion',
       'Movable Point Of Diversion', 'Movable Point Of Rediversion', ''],
      dtype=object)

In [28]:
outdf['in_SiteName'] = outdf.apply(lambda row: ensureEmptyString(row['in_SiteName']), axis=1)
outdf['in_SiteName'].unique()

array(['Compound Stock Pond', 'Lake Domingo', '', ..., 'Highway Pump',
       'Money Pit', 'Kurfeld'], dtype=object)

In [29]:
outdf['in_County'] = outdf.apply(lambda row: ensureEmptyString(row['in_County']), axis=1)
outdf['in_County'].unique()

array(['San Diego', 'Los Angeles', 'Glenn', 'Madera', 'Lake', 'Colusa',
       'Mono', 'Lassen', 'Tuolumne', 'Riverside', 'San Bernardino',
       'Inyo', 'Modoc', 'Sacramento', 'El Dorado', 'Trinity', 'Fresno',
       'San Luis Obispo', 'Santa Clara', 'Shasta', 'Tulare', 'Kern',
       'San Joaquin', 'Tehama', 'Kings', 'Stanislaus', 'Siskiyou',
       'Contra Costa', 'Butte', 'Yuba', 'Sierra', 'Napa', 'Sutter',
       'Placer', 'Ventura', 'Yolo', 'Alpine', 'Plumas', 'Santa Barbara',
       'Mendocino', 'Sonoma', 'Calaveras', 'Mariposa', 'Merced',
       'Humboldt', 'Nevada', 'San Benito', 'Solano', 'Santa Cruz',
       'Monterey', 'Marin', 'Amador', 'Alameda', 'San Mateo', '',
       'Orange', 'Imperial', 'Del Norte', 'San Francisco'], dtype=object)

In [30]:
outdf['in_AllocationOwner'] = outdf.apply(lambda row: ensureEmptyString(row['in_AllocationOwner']), axis=1)
outdf['in_AllocationOwner'].unique()

array(['569 East County Boulevard Llc', 'Lilac Hills Estates Lp',
       'Glenncolusa Irrigation District', ..., 'Overland Road Llc',
       'Harugama Llc', 'Richard Casarotti'], dtype=object)

In [31]:
outdf['in_BeneficialUseCategory'] = outdf.apply(lambda row: ensureEmptyString(row['in_BeneficialUseCategory']), axis=1)
uniqueList = list(set([i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')]))
uniqueList.sort()
uniqueList

['Aesthetic',
 'Aquaculture',
 'Domestic',
 'Dust Control',
 'Fire Protection',
 'Fish And Wildlife Preservation And Enhancement',
 'Frost Protection',
 'Heat Control',
 'Incidental Power',
 'Industrial',
 'Irrigation',
 'Milling',
 'Mining',
 'Municipal',
 'Other',
 'Power',
 'Recreational',
 'Snow Making',
 'Stockwatering',
 'Unspecified',
 'Water Quality']

In [32]:
# Ensure Latitude entry is numireic, replace '0' values for removal
outdf['in_Latitude'] = pd.to_numeric(outdf['in_Latitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Latitude'].unique()

array([32.6456, 32.6144, 34.57797187, ..., 39.399486, 39.39983869,
       39.215055], dtype=object)

In [33]:
# Ensure Longitude entry is numireic, replace '0' values for removal
outdf['in_Longitude'] = pd.to_numeric(outdf['in_Longitude'], errors='coerce').replace(0,"").fillna("")
outdf['in_Longitude'].unique()

array([-116.2886, -116.2726, -118.2892096, ..., -123.390005, -123.3898763,
       -123.312955], dtype=object)

In [34]:
# Fixing in_AllocationFlow_CFS datatype
outdf['in_AllocationFlow_CFS'] = pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationFlow_CFS'].unique()

array([0.15, '', 2.5, ..., 2.359461799411124e-05, 0.5347222208297164,
       0.5057581005347734], dtype=object)

In [35]:
# Fixing in_AllocationVolume_AF datatype
outdf['in_AllocationVolume_AF'] = pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce').replace(0,"").fillna("")
outdf['in_AllocationVolume_AF'].unique()

array(['', 0.13, 365.0, 20.62], dtype=object)

In [36]:
# Ensure Amount entry is either numireic or blank, no 0 entries
outdf['in_Amount'] = pd.to_numeric(outdf['in_Amount'], errors='coerce').round(2).replace(0,"").fillna("")
outdf['in_Amount'].unique()

array(['', 3270.0, 6758.0, ..., 141834.0, 411.82, 354.06], dtype=object)

In [37]:
# Ensure PopulationServed entry is numireic WITH 0 entries (no blank strings)
outdf['in_PopulationServed'] = pd.to_numeric(outdf['in_PopulationServed'], errors='coerce').round().replace("",0).fillna(0).astype(int).replace(0,"").fillna("")
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [38]:
#Update datatype of Priority Date to fit WaDE 2.0 structure
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'])
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'].dt.strftime('%m/%d/%Y'))
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
[                'NaT', '1915-07-22 00:00:00', '1916-04-17 00:00:00',
 '1916-04-28 00:00:00', '1917-01-03 00:00:00', '1917-07-12 00:00:00',
 '1918-12-31 00:00:00', '1919-02-26 00:00:00', '1919-10-10 00:00:00',
 '1920-02-11 00:00:00',
 ...
 '2020-10-01 00:00:00', '2023-05-10 00:00:00', '2021-06-07 00:00:00',
 '2021-12-10 00:00:00', '2022-04-05 00:00:00', '2021-04-19 00:00:00',
 '2022-05-20 00:00:00', '2023-06-23 00:00:00', '2022-03-02 00:00:00',
 '2021-09-09 00:00:00']
Length: 1868, dtype: datetime64[ns]

In [39]:
# Convert TimeframeEnd to YYYY-MM-DD format.
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeEnd'] = pd.to_datetime(outdf["in_TimeframeEnd"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
[                'NaT', '2011-01-01 00:00:00', '2011-02-01 00:00:00',
 '2011-03-01 00:00:00', '2011-04-01 00:00:00', '2011-05-01 00:00:00',
 '2011-06-01 00:00:00', '2011-07-01 00:00:00', '2011-08-01 00:00:00',
 '2011-09-01 00:00:00',
 ...
 '1974-12-01 00:00:00', '1977-12-01 00:00:00', '1978-12-01 00:00:00',
 '1979-12-01 00:00:00', '1980-12-01 00:00:00', '1981-12-01 00:00:00',
 '1982-12-01 00:00:00', '1983-12-01 00:00:00', '1984-12-01 00:00:00',
 '1985-12-01 00:00:00']
Length: 709, dtype: datetime64[ns]

In [40]:
# Convert TimeframeStart to YYYY-MM-DD format.
outdf['in_TimeframeStart'] = pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors = 'coerce').fillna("")
outdf['in_TimeframeStart'] = pd.to_datetime(outdf["in_TimeframeStart"].dt.strftime('%m/%d/%Y'))
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
[                'NaT', '2011-01-01 00:00:00', '2011-02-01 00:00:00',
 '2011-03-01 00:00:00', '2011-04-01 00:00:00', '2011-05-01 00:00:00',
 '2011-06-01 00:00:00', '2011-07-01 00:00:00', '2011-08-01 00:00:00',
 '2011-09-01 00:00:00',
 ...
 '1974-12-01 00:00:00', '1977-12-01 00:00:00', '1978-12-01 00:00:00',
 '1979-12-01 00:00:00', '1980-12-01 00:00:00', '1981-12-01 00:00:00',
 '1982-12-01 00:00:00', '1983-12-01 00:00:00', '1984-12-01 00:00:00',
 '1985-12-01 00:00:00']
Length: 709, dtype: datetime64[ns]

In [41]:
# extract year out
outdf['in_ReportYearCV'].unique()

array(['0', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2008', '2009', '2010', '2021', '2022',
       '2007', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
       '2003', '2004', '2005', '2006', '1975', '1930', '1993', '1994',
       '1995', '1991', '1992', '1986', '1987', '1988', '1989', '1990',
       '1915', '1963', '1913', '1959', '1906', '1916', '1976', '1971',
       '1972', '1973', '1974', '1977', '1978', '1979', '1980', '1981',
       '1982', '1983', '1984', '1985'], dtype=object)

In [42]:
# Assign Primary Use Category

import sys
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

outdf['in_PrimaryUseCategory'] = outdf.apply(lambda row: AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory(row['in_BeneficialUseCategory']), axis=1)
outdf['in_PrimaryUseCategory'].unique()

array(['Other', 'Domestic', 'Agriculture Irrigation', 'Hydroelectric',
       'Public Supply', 'In-stream Flow', 'Livestock',
       'Commercial/Industrial', 'Mining', 'Recreation', 'Fire',
       'Unspecified', 'Aquaculture', 'Snow', 'Thermoelectric Cooling',
       'Municipal Irrigation'], dtype=object)

In [43]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    inV = str(inV).strip()
    inAIU = str(inAIU).strip()
    inPU = str(inPU).strip().title()
    inWST = str(inWST).strip()
    outString = inV + "_" + inAIU + "_" + inPU + "_" + inWST
    return outString

outdf['in_VariableSpecificCV'] = outdf.apply(lambda row: createVariableSpecificCV(row['in_VariableCV'], 
                                                                                  row['in_AggregationIntervalUnitCV'],
                                                                                  row['in_PrimaryUseCategory'],
                                                                                  row['in_WaterSourceTypeCV']), axis=1)
outdf['in_VariableSpecificCV'].unique()

array(['Water Use_Monthly_Other_Surface Water',
       'Water Use_Monthly_Domestic_Unspecified',
       'Water Use_Monthly_Agriculture Irrigation_Unspecified',
       'Water Use_Monthly_Hydroelectric_Unspecified',
       'Water Use_Monthly_Domestic_Surface Water',
       'Water Use_Monthly_Public Supply_Unspecified',
       'Water Use_Monthly_In-Stream Flow_Unspecified',
       'Water Use_Monthly_Agriculture Irrigation_Surface Water',
       'Water Use_Monthly_Livestock_Unspecified',
       'Water Use_Monthly_Public Supply_Surface Water',
       'Water Use_Monthly_Commercial/Industrial_Surface Water',
       'Water Use_Monthly_Mining_Unspecified',
       'Water Use_Monthly_Recreation_Unspecified',
       'Water Use_Monthly_Commercial/Industrial_Unspecified',
       'Water Use_Monthly_Livestock_Surface Water',
       'Water Use_Monthly_Hydroelectric_Surface Water',
       'Water Use_Monthly_Fire_Unspecified',
       'Water Use_Monthly_Unspecified_Unspecified',
       'Water Use_Monthly_

In [44]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_WaterSourceName'] = outdf['in_WaterSourceName'].astype(str).str.strip()
dfTempID['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'].astype(str) + dfTempID['in_WaterSourceTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_WaterSourceNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_WaterSourceNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_WaterSourceNativeID'], 
                                                                              row['in_WaterSourceName'], row['in_WaterSourceTypeCV']), axis=1)
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId9701', 'wadeId9702',
       'wadeId9703'], dtype=object)

In [45]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    string1 = str(colRowValue)
    outstring = "wadeId" + string1
    return outstring

dfTempID = pd.DataFrame()
dfTempID['in_Latitude'] = outdf['in_Latitude'].astype(str).str.strip()
dfTempID['in_Longitude'] = outdf['in_Longitude'].astype(str).str.strip()
dfTempID['in_SiteName'] = outdf['in_SiteName'].astype(str).str.strip()
dfTempID['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].astype(str).str.strip()
dfTempID = dfTempID.drop_duplicates()

dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount.apply(lambda row: assignIdValueFunc(row['Count']), axis=1)
dfTempID['linkKey'] = dfTempID['in_Latitude'].astype(str) + dfTempID['in_Longitude'].astype(str) + dfTempID['in_SiteName'].astype(str)+ dfTempID['in_SiteTypeCV'].astype(str)
IdDict = pd.Series(dfTempID.in_SiteNativeID.values, index=dfTempID.linkKey.astype(str)).to_dict()
# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    checkVal = str(checkVal).strip()
    if checkVal == "":
        linkKeyVal = str(valA).strip() + str(valB).strip() + str(valC).strip() + str(valD).strip()
        outString = IdDict[linkKeyVal]
    else:
        outString = checkVal
    return outString

outdf['in_SiteNativeID'] = outdf.apply(lambda row: retrieveIdValueFunc(row['in_SiteNativeID'], 
                                                                       row['in_Latitude'], row['in_Longitude'],
                                                                       row['in_SiteName'], row['in_SiteTypeCV']), axis=1)
outdf['in_SiteNativeID'].unique()

array(['60497', '60498', '404', ..., '85187', '85188', '85189'],
      dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For CA, we don't want water rights that are considered: "Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"

In [46]:
# drop non-active AllocationLegalStatusCV values specific to that state.

print(f'length of df before removing non-active rights: ', len(outdf))

# drop list
dropLegalStatusList = ["Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"]

# drop rows from above list
outdf = outdf[outdf.in_AllocationLegalStatusCV.isin(dropLegalStatusList) == False].reset_index(drop=True)

print(f'length of df after removing non-active rights: ', len(outdf))
for x in outdf['in_AllocationLegalStatusCV'].sort_values().unique():
    print(f'"' + x + '",')

length of df before removing non-active rights:  7739363
length of df after removing non-active rights:  7545262
"",
"Active",
"Adjudicated",
"Certified",
"Claimed",
"Claimed - Local Oversight",
"Completed",
"Licensed",
"Permitted",
"Registered",


## Shapefile Data
- For attaching geometry to POU csv inputs.

In [47]:
# N/A

## Export Outputs

In [48]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7545262 entries, 0 to 7545261
Data columns (total 76 columns):
 #   Column                                        Dtype         
---  ------                                        -----         
 0   WaDEUUID                                      object        
 1   in_MethodUUID                                 object        
 2   in_VariableSpecificUUID                       object        
 3   in_AggregationIntervalUnitCV                  object        
 4   in_VariableCV                                 object        
 5   in_OrganizationUUID                           object        
 6   in_Geometry                                   object        
 7   in_GNISFeatureNameCV                          object        
 8   in_WaterQualityIndicatorCV                    object        
 9   in_WaterSourceName                            object        
 10  in_WaterSourceNativeID                        object        
 11  in_WaterSourceTypeCV    

In [49]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,wadeId3,Unspecified,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-01-01,2011-01-01,Water Use_Monthly_Domestic_Unspecified
1,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,wadeId3,Unspecified,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-02-01,2011-02-01,Water Use_Monthly_Domestic_Unspecified
2,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,wadeId3,Unspecified,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-03-01,2011-03-01,Water Use_Monthly_Domestic_Unspecified
3,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,wadeId3,Unspecified,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-04-01,2011-04-01,Water Use_Monthly_Domestic_Unspecified
4,in12,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Icy Spring No 1,wadeId3,Unspecified,,,,DD_NE,Los Angeles,4326,,-2147483648,18090206,34.57797,-118.28921,,,POD,,404,,Point Of Direct Diversion,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,31-Dec,1-Jan,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-05-01,2011-05-01,Water Use_Monthly_Domestic_Unspecified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7545257,in164868,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,Broaddus Creek,wadeId9703,Surface Water,,,,GIS_LL,Mendocino,4326,,-2147483648,18010103,39.39949,-123.39001,,,POD,Primary Pod,85123,,Point Of Diversion To Offstream Storage,CA,,3/2/2022,,,,,,,,0.02228,Registered,H511199,Richard Casarotti,2022-03-02,,31-Mar,1-Nov,Registration Cannabis,,Irrigation,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,H511199,,Agriculture Irrigation,0,,NaT,NaT,Water Use_Monthly_Agriculture Irrigation_Surfa...
7545258,in164869,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,,wadeId40,Surface Water,,,,GIS_CLICK,Mendocino,4326,,-2147483648,18010103,39.39984,-123.38988,,,POD,Money Pit,85124,,Point Of Diversion To Offstream Storage,CA,,3/2/2022,,,,,,,,0.02228,Registered,H511199,Richard Casarotti,2022-03-02,,31-Mar,1-Nov,Registration Cannabis,,Irrigation,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,H511199,,Agriculture Irrigation,0,,NaT,NaT,Water Use_Monthly_Agriculture Irrigation_Surfa...
7545259,in164870,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,,wadeId362,Unspecified,,,,,,4326,,0,0,,,,,POD,,85187,,,CA,,,,,,,,,,,Active,NJ000780,,NaT,,,,Non Jurisdictional,,Unspecified,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,NJ000780,,Unspecified,0,,NaT,NaT,Water Use_Monthly_Unspecified_Unspecified
7545260,in164871,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_O1,,,,,wadeId40,Surface Water,,,,,,4326,,0,0,,,,,POD,Kurfeld,85188,,Point Of Diversion To Offstream Storage,CA,,9/9/2021,,,,,,,,0.02228,Registered,H511005,David Kurzfeld,2021-09-09,,31-Mar,1-Nov,Registration Cannabis,,Fire Protection,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,H511005,,Fire,0,,NaT,NaT,Water Use_Monthly_Fire_Surface Water


In [50]:
# Export the output dataframe
outdf.to_csv('RawInputData/Pwr_wu_Main.zip', compression=dict(method='zip', archive_name='Pwr_wu_Main.csv'), index=False)  # The output, save as a zip