# Pre-processing Water Right and Time Series Water Use data for WaDE Upload
- Purpose:  To pre-process the data into one main file for simple DataFrame creation and extraction

In [1]:
# Needed Libraries / Modules

# ---- working with data ----
import os  # native operating system interaction
import numpy as np  # mathematical array manipulation
import pandas as pd  # data structure and data analysis
import geopandas as gpd  # geo-data structure and data analysis

# ---- visualization ----
import matplotlib.pyplot as plt  # plotting library
import seaborn as sns  # plotting library

# ---- API data retrieval ----
import requests  # http requests
import json  # JSON parse

# ---- Cleanup ----
import re  # string regular expression manipulation
from datetime import datetime  # date and time manipulation
pd.set_option('display.max_columns', 999)  # show up to 999 columns so wide DataFrames are not truncated in notebook output
pd.set_option('display.float_format', lambda x: '%.5f' % x)  # render floats with 5 fixed decimals (suppresses scientific notation)

In [2]:
# ---- working directory ----
# Relative paths used for reading/writing files later in the notebook resolve against this folder.
workingDirString = "G:/Shared drives/WaDE Data/California/WaterAllocation_WaterUse_CSWRCB" # set working directory folder string here
os.chdir(workingDirString)
print('The working Directory is:', workingDirString)

The working Directory is: G:/Shared drives/WaDE Data/California/WaterAllocation_WaterUse_CSWRCB


## Data Input

In [3]:
# Input File #1 - ewrims_flat_file_pod
# NOTE(review): the displayed header shows an "Unnamed: 0" column and a mojibake-prefixed POD_ID
# column name, which suggests the file carries a BOM that ISO-8859-1 does not strip -- confirm encoding.
fileInput = "RawInputData/water_right/ewrims_flat_file_pod.zip"
dfin1 = pd.read_csv(fileInput, encoding="ISO-8859-1")
dfin1 = dfin1.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only on the first run (column absent): tag each row as "in1<row index>" and write the frame
# back over the same zip so the identifiers stay stable on later runs.
# NOTE(review): this overwrites the raw input file in place.
if 'WaDEUUID' not in dfin1.columns:
    dfin1['WaDEUUID'] = "in1" + dfin1.index.astype(str)
    dfin1.to_csv(
        fileInput,
        compression={'method': 'zip', 'archive_name': 'ewrims_flat_file_pod.csv'},
        index=False,
    )

print(len(dfin1))
dfin1.head()

  dfin1 = pd.read_csv(fileInput, encoding = "ISO-8859-1").replace(np.nan, "")


65902


Unnamed: 0,Ã¯Â»Â¿POD_ID,POD_NUMBER,POD_STATUS,SOURCE_TYPE,POD_NAME,POD_TYPE,DIVERSION_WORKS_STATUS,STREAM_CLASSIFICATION,DIRECT_DIV_AMOUNT,DIRECT_DIVERSION_RATE,DIRECT_DIV_RATE_UNIT,STORAGE_AMOUNT,DIVERSION_RATE_TO_OFF_STREAM,OFF_STO_DIV_RATE_UNIT,POD_LAST_UPDATE_DATE,POD_COUNT,APPL_ID,OBJECTID,POD_NUMBER_GIS,HAS_OPOD,APPL_POD,POD_ID_GIS,COUNTY,PARCEL_NUMBER,SP_ZONE,DIVERSION_SITE_NAME,NORTH_COORD,EAST_COORD,LATITUDE,LONGITUDE,QUARTER_QUARTER,QUARTER,SECTION_CLASSIFIER,SECTION_NUMBER,TOWNSHIP_NUMBER,TOWNSHIP_DIRECTION,RANGE_NUMBER,RANGE_DIRECTION,MERIDIAN,LOCATION_METHOD,SPECIAL_USE_AREA,SOURCE_NAME,TRIB_DESC,WATERSHED,HUC_12_NUMBER,HUC_12_NAME,HUC_8_NUMBER,HUC_8_NAME,QUAD_MAP_NAME,QUAD_MAP_NUMBER,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,RECEIPT_DATE,REJECTION_DATE,APPLICATION_RECD_DATE,APPLICATION_ACCEPTANCE_DATE,PROJECT_TYPE,RECORD_SUMMARY,INCOMPLETE_STATEMENT,NUMBER_OF_PROTESTS,AGENT_NAME,AGENT_ENTITY_TYPE,APPLICATION_PRIMARY_OWNER,PRIMARY_OWNER_ENTITY_TYPE,SUB_TYPE,INI_REPORTED_DIV_AMOUNT,INI_REPORTED_DIV_UNIT,FACE_VALUE_AMOUNT,FACE_VALUE_UNITS,FEE_DUE,FEE_RECEIVED,APPL_FEE_AMOUNT,APPL_FEE_AMT_RECD,MAX_DD_APPL,MAX_DD_UNITS,MAX_DD_ANN,MAX_STORAGE,MAX_TAKEN_FROM_SOURCE,YEAR_DIVERSION_COMMENCED,MAX_BENEFICIALLY_USED,SUPPLEMENTAL_STATEMENT_CYCLE,TYPE_OF_DIVERSION_FACILITY,QUANTITY_OF_WATER_DIVERTED,QOW_DIVERTED_UNIT,QUANTITY_MEASUREMENT_YEAR,MAX_RATE_OF_DIVERSION,MAX_RATE_OF_DIV_UNIT,RECENT_WATER_USE_MIN,WATER_USE_MIN_UNIT,RECENT_WATER_USE_MAX,WATER_USE_MAX_UNIT,REQUEST_FOR_REVOCATION_RECD,NUM_COMMENTS,NUM_ATTACHMENTS,LAST_UPDATE_DATE,STATE_WELL_NUMBER,DRILLED_WELL_YEAR,SURFACE_WATER_DIVERSIONS,DEPTH_OF_WELL,RELATIONSHIP_TYPE,PARTY_ID,EFFECTIVE_FROM_DATE,EFFECTIVE_TO_DATE,PRIMARY_OWNER_NAME,PRIMARY_OWNER_ENTITY_TYPE_P,OFFICIAL_MAIL_RECEIVER,COUNT_NPO_OR_OTHER,CURRENT_STATUS,EFFECTIVE_DATE,UPDATE_DATETIME,USE_CODE,USE_STATUS,NUMBER_OF_RESIDENCES,SEPERATE
LY_OWNED,USE_POPULATION,USE_POPULATION_PEOPLE,ESTIMATED_USE_PER_PERSON,USE_POPULATION_STOCK,TYPE_OF_STOCK,AREA_FOR_INCI_IRRIGATION,USE_NET_ACREAGE,USE_GROSS_ACREAGE,USE_DIRECT_DIV_ANNUAL_AMOUNT,USE_DIRECT_DIVERSION_RATE,USE_DIRECT_DIV_RATE_UNITS,POU_DEVELOPMENT_STATUS,DIRECT_DIV_SEASON_START,DIRECT_DIV_SEASON_END,USE_STORAGE_AMOUNT,STORAGE_SEASON_START,STORAGE_SEASON_END,SEASON_DIRECT_DIV_RATE,SEASON_STORAGE_AMOUNT,SEASON_DIRECT_DIV_AA,DIRECT_DIV_SEASON_STATUS,COLLECTION_SEASON_STATUS,USE_COUNT,PERMIT_PERMIT_ID,PERMIT_ORIGINAL_ISSUE_DATE,COMPLETE_CONSTRUCTION_DATE,COMPLETE_APPLIC_WATER_DATE,LICENSE_LICENSE_ID,LICENSE_ORIGINAL_ISSUE_DATE,WATER_RIGHT_DESCRIPTION,PROGRAM_UNIT,LICENSE_REQUEST_TYPE,LICENSE_REQUESTED_DATE,INSPECTION_DATE,REPORT_DATE,OFFER_SENT_DATE,ACCEPTED_OFFER_DATE,PETITION_ID,PETITION_TYPE,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,WaDEUUID
0,60497,1.0,Inactive,Surface,COMPOUND STOCK POND,Point of Direct Diversion,Existing,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,97404.0,1.0,N,T032025_01,60497,San Diego,,6.0,COMPOUND STOCK POND,1814680.0,6549776.0,32.6456,-116.2886,NE,SW,,32.0,17.0,S,7.0,E,San Bernardino,GIS_NE,,UNNAMED STREAM,BOUNDARY CREEK,ANZA BORREGO,181002020203.0,Boundary Creek,18100202.0,Carrizo Creek,LIVE OAK SPRINGS,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1/1,12/31,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in10
1,60498,2.0,Inactive,Surface,LAKE DOMINGO,Point of Direct Diversion,Proposed,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,95308.0,2.0,N,T032025_02,60498,San Diego,,6.0,LAKE DOMINGO,1803326.0,6554711.0,32.6144,-116.2726,SE,SW,,9.0,18.0,S,7.0,E,San Bernardino,GIS_NE,,LAKE DOMINGO,BOUNDARY CREEK,ANZA BORREGO,181002020203.0,Boundary Creek,18100202.0,Carrizo Creek,TIERRA DEL SOL,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1/1,12/31,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in11
2,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.89063,6474612.14777,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12
3,17603,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58414.0,1.0,N,A000016_01,17603,Los Angeles,,5.0,,2032842.89063,6474612.14777,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,KERR SPRING NO 2,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in13
4,29028,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,57787.0,1.0,N,A000016_01,29028,Los Angeles,005-370-43-00,5.0,,2033642.80241,6474614.80623,34.58017,-118.28921,SE,NE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,KERR SPRING NO 1,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in14


In [4]:
# Input File #2 - water-rights-water-use-reported-short
fileInput = "RawInputData/water_use/water-rights-water-use-reported-short.zip"
dfin2 = pd.read_csv(fileInput, encoding="ISO-8859-1")
dfin2 = dfin2.replace(np.nan, "")

# WaDE UUID tracker for data assessment.
# Only on the first run (column absent): tag each row as "in2<row index>" and write the frame
# back over the same zip so the identifiers stay stable on later runs.
# NOTE(review): this overwrites the raw input file in place.
if 'WaDEUUID' not in dfin2.columns:
    dfin2['WaDEUUID'] = "in2" + dfin2.index.astype(str)
    dfin2.to_csv(
        fileInput,
        compression={'method': 'zip', 'archive_name': 'water-rights-water-use-reported-short.csv'},
        index=False,
    )

print(len(dfin2))
dfin2.head()

12467037


Unnamed: 0,Ã¯Â»Â¿MONTH NAME,MONTH FORMATTED,WATER_RIGHT_ID,APPL_ID,YEAR,MONTH,AMOUNT,DIVERSION_TYPE,WaDEUUID
0,January,1/1/2014,156,A001060,2014,1,0.0,STORAGE,in20
1,January,1/1/2014,156,A001060,2014,1,0.0,DIRECT,in21
2,February,2/1/2014,156,A001060,2014,2,0.0,STORAGE,in22
3,February,2/1/2014,156,A001060,2014,2,0.0,DIRECT,in23
4,February,2/1/2014,156,A001060,2014,2,0.0,USE,in24


In [5]:
# Issue: POD_TYPE can hold too many comma-separated values.  Leaving this fix disabled for now.

# #POD_TYPE fix
# # we only want the first listed POD_TYPE value

# def firstPOD_TYPEFunc(val):
#     val = str(val).strip()
#     if "," in val:
#         outString = val.split(",")
#         # outString = val[0]
#         outString = ''.join(outString)
#     else:
#         outString = val
#     return outString

# dfin1['POD_TYPE'] = dfin1.apply(lambda row: firstPOD_TYPEFunc(row['POD_TYPE']), axis=1)
# dfin1['POD_TYPE'].value_counts()

In [6]:
# Clean data a little

# Blank out any remaining NaN values.
dfin1 = dfin1.fillna("")

# Force the rate column to numeric; anything that cannot be parsed (including blanks) becomes 0.
dfin1['USE_DIRECT_DIVERSION_RATE'] = (
    pd.to_numeric(dfin1['USE_DIRECT_DIVERSION_RATE'], errors='coerce')
    .fillna(0)
)

In [7]:
# WaterSourceTypeCV fix
# Some rows have a blank SOURCE_TYPE even though other rows with the same SOURCE_NAME carry one.
# Build a SOURCE_NAME -> SOURCE_TYPE lookup (wDict) from the rows where the type is present.
# If a name occurs with more than one type, the pair that comes last in the frame wins.

dfTempW = (
    dfin1[['SOURCE_NAME', 'SOURCE_TYPE']]
    .assign(
        SOURCE_NAME=lambda frame: frame['SOURCE_NAME'].astype(str).str.strip(),
        SOURCE_TYPE=lambda frame: frame['SOURCE_TYPE'].astype(str).str.strip(),
    )
    .drop_duplicates()
    .loc[lambda frame: frame['SOURCE_TYPE'] != ""]
)
wDict = dict(zip(dfTempW['SOURCE_NAME'], dfTempW['SOURCE_TYPE']))

# Fill in a blank water source type using the type recorded for the same source name
def fillinWaterSourceTypeWithNameFunc(valName, valType):
    """Return the water source type for a record, back-filling blanks by source name.

    Parameters
    ----------
    valName : source name; converted to str and stripped before the lookup.
    valType : source type; converted to str and stripped.

    Returns
    -------
    str
        The stripped ``valType`` when it is not blank; otherwise the type stored in the
        module-level ``wDict`` lookup for ``valName``; otherwise ``"WaDE Blank"``.
    """
    valName = str(valName).strip()
    valType = str(valType).strip()
    if valType != "":
        return valType
    # dict.get covers the "name not in lookup" case without a bare except, so unrelated
    # errors (e.g. wDict never having been built) are no longer silently swallowed.
    return wDict.get(valName, "WaDE Blank")

# Row by row: keep SOURCE_TYPE when present, otherwise look it up by SOURCE_NAME.
dfin1['in_WaterSourceTypeCV'] = dfin1.apply(
    lambda rec: fillinWaterSourceTypeWithNameFunc(rec['SOURCE_NAME'], rec['SOURCE_TYPE']),
    axis=1,
)
dfin1['in_WaterSourceTypeCV'].value_counts()

in_WaterSourceTypeCV
Surface       35757
Subsurface    19290
WaDE Blank    10855
Name: count, dtype: int64

In [8]:
# Create VariableSpecificCV value
def createVariableSpecificUUID(unit):
    """Map a diversion unit string to its VariableSpecific UUID.

    Volume units ('Acre-feet', 'Gallons') map to "CSWRCBwr_V2". Every other value
    (the flow units "Cubic Feet per Second", "Gallons per Day", "Acre-feet per Year",
    "Gallons per Minute", as well as blank or unrecognised units) maps to "CSWRCBwr_V1".

    The previous implementation used independent ``if`` statements with a trailing
    ``else`` attached only to the 'Gallons' test, so 'Acre-feet' was first set to V2
    and then overwritten with V1.
    """
    volumeUnits = ('Acre-feet', 'Gallons')
    if unit in volumeUnits:
        outString = "CSWRCBwr_V2"
    else:
        outString = "CSWRCBwr_V1"

    return(outString)

# Only one column is read, so map over that column instead of building a row object per record.
dfin1['in_VariableSpecificUUID'] = dfin1['USE_DIRECT_DIV_RATE_UNITS'].map(createVariableSpecificUUID)
dfin1['in_VariableSpecificUUID'].unique()

array(['CSWRCBwr_V1', 'CSWRCBwr_V2'], dtype=object)

In [9]:
# convert all flow values to CFS

def convertFlowFunc(val, unit):
    """Convert a flow value to cubic feet per second based on its unit label.

    Returns ``val`` unchanged for "Cubic Feet per Second", ``val`` divided by the
    matching factor for "Gallons per Day", "Acre-feet per Year" and
    "Gallons per Minute", and ``None`` for any other unit.
    """
    if unit == "Cubic Feet per Second":
        return val

    # Amount of each unit that corresponds to 1 CFS.
    perCfs = {
        "Gallons per Day": 646316.883,
        "Acre-feet per Year": 723.968,
        "Gallons per Minute": 448.83117,
    }
    if unit in perCfs:
        return val / perCfs[unit]
    return None

# Rows whose unit is not a flow unit get no value (shown as nan in the output).
dfin1['in_AllocationFlow_CFS'] = dfin1.apply(
    lambda rec: convertFlowFunc(rec['USE_DIRECT_DIVERSION_RATE'], rec['USE_DIRECT_DIV_RATE_UNITS']),
    axis=1,
)
dfin1['in_AllocationFlow_CFS'].unique()

array([1.50000000e-01,            nan, 2.50000000e+00, ...,
       2.00500000e+03, 1.60000000e+03, 2.29918178e+01])

In [10]:
# convert all volume values to AF

def convertVolumeFunc(val, unit):
    """Convert a volume value to acre-feet based on its unit label.

    Returns ``val`` unchanged for 'Acre-feet', ``val / 325850.943`` for 'Gallons',
    and ``None`` for any other unit.
    """
    if unit == 'Acre-feet':
        return val
    if unit == 'Gallons':
        # NOTE(review): the commonly cited factor is ~325,851.43 gallons per acre-foot -- confirm this constant.
        return val / (325850.943)
    return None

# Uses the same rate/unit columns as the flow conversion; only rows with a volume unit get a value.
dfin1['in_AllocationVolume_AF'] = dfin1.apply(
    lambda rec: convertVolumeFunc(rec['USE_DIRECT_DIVERSION_RATE'], rec['USE_DIRECT_DIV_RATE_UNITS']),
    axis=1,
)
dfin1['in_AllocationVolume_AF'].unique()

array([      nan, 0.000e+00, 1.300e-01, 3.650e+02, 2.062e+01])

In [11]:
# remove special characters from SUB_TYPE

def cleanupSubTypeFunc(val):
    """Normalise a SUB_TYPE value into a space-separated string.

    The value is converted to str and stripped, trailing underscores and then
    trailing commas are removed, and remaining commas become single spaces.
    Blank values and the literal text "nan" yield an empty string.
    """
    cleaned = str(val).strip().rstrip('_').rstrip(',')
    if cleaned in ("", "nan"):
        return ""
    return cleaned.replace(",", " ")

# Only SUB_TYPE is read, so map over that column instead of building a row object per record.
dfin1['wade_SUB_TYPE'] = dfin1['SUB_TYPE'].map(cleanupSubTypeFunc)
dfin1['wade_SUB_TYPE'].unique()

array(['', 'RIPARIAN', 'RIPARIAN PRE1914', 'PRE1914', 'OTHER',
       'PRE1914 OTHER', 'COURTADJ', 'RIPARIAN PRE1914 OTHER',
       'RIPARIAN PRE1914 COURTADJ OTHER', 'RIPARIAN PRE1914 COURTADJ',
       'PRE1914 COURTADJ', 'RIPARIAN OTHER', 'RIPARIAN COURTADJ',
       'PRE1914 PENDING', 'PENDING', 'RIPARIAN PENDING', 'COURTADJ OTHER',
       'PENDING OTHER', 'RIPARIAN COURTADJ OTHER',
       'RIPARIAN PRE1914 PENDING', 'PRE1914 COURTADJ OTHER',
       'RIPARIAN PENDING OTHER'], dtype=object)

In [12]:
# left-join water right / sites data -to- water use data
# Both frames carry APPL_ID and WaDEUUID, so pandas suffixes them with _x (water right) and _y (water use).
# Every water-right row is repeated once per matching water-use record, so the row count grows sharply.
# NOTE(review): dfin1 is overwritten in place, so re-running this cell without reloading dfin1 joins the
# already-joined frame again.
dfin1 = dfin1.merge(
    dfin2[['APPL_ID', 'AMOUNT', 'YEAR', 'MONTH FORMATTED', 'WaDEUUID']],
    how='left',
    left_on='APPLICATION_NUMBER',
    right_on='APPL_ID',
)
dfin1 = dfin1.reset_index(drop=True)
print(len(dfin1))
dfin1.head()

15073676


Unnamed: 0,Ã¯Â»Â¿POD_ID,POD_NUMBER,POD_STATUS,SOURCE_TYPE,POD_NAME,POD_TYPE,DIVERSION_WORKS_STATUS,STREAM_CLASSIFICATION,DIRECT_DIV_AMOUNT,DIRECT_DIVERSION_RATE,DIRECT_DIV_RATE_UNIT,STORAGE_AMOUNT,DIVERSION_RATE_TO_OFF_STREAM,OFF_STO_DIV_RATE_UNIT,POD_LAST_UPDATE_DATE,POD_COUNT,APPL_ID_x,OBJECTID,POD_NUMBER_GIS,HAS_OPOD,APPL_POD,POD_ID_GIS,COUNTY,PARCEL_NUMBER,SP_ZONE,DIVERSION_SITE_NAME,NORTH_COORD,EAST_COORD,LATITUDE,LONGITUDE,QUARTER_QUARTER,QUARTER,SECTION_CLASSIFIER,SECTION_NUMBER,TOWNSHIP_NUMBER,TOWNSHIP_DIRECTION,RANGE_NUMBER,RANGE_DIRECTION,MERIDIAN,LOCATION_METHOD,SPECIAL_USE_AREA,SOURCE_NAME,TRIB_DESC,WATERSHED,HUC_12_NUMBER,HUC_12_NAME,HUC_8_NUMBER,HUC_8_NAME,QUAD_MAP_NAME,QUAD_MAP_NUMBER,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,RECEIPT_DATE,REJECTION_DATE,APPLICATION_RECD_DATE,APPLICATION_ACCEPTANCE_DATE,PROJECT_TYPE,RECORD_SUMMARY,INCOMPLETE_STATEMENT,NUMBER_OF_PROTESTS,AGENT_NAME,AGENT_ENTITY_TYPE,APPLICATION_PRIMARY_OWNER,PRIMARY_OWNER_ENTITY_TYPE,SUB_TYPE,INI_REPORTED_DIV_AMOUNT,INI_REPORTED_DIV_UNIT,FACE_VALUE_AMOUNT,FACE_VALUE_UNITS,FEE_DUE,FEE_RECEIVED,APPL_FEE_AMOUNT,APPL_FEE_AMT_RECD,MAX_DD_APPL,MAX_DD_UNITS,MAX_DD_ANN,MAX_STORAGE,MAX_TAKEN_FROM_SOURCE,YEAR_DIVERSION_COMMENCED,MAX_BENEFICIALLY_USED,SUPPLEMENTAL_STATEMENT_CYCLE,TYPE_OF_DIVERSION_FACILITY,QUANTITY_OF_WATER_DIVERTED,QOW_DIVERTED_UNIT,QUANTITY_MEASUREMENT_YEAR,MAX_RATE_OF_DIVERSION,MAX_RATE_OF_DIV_UNIT,RECENT_WATER_USE_MIN,WATER_USE_MIN_UNIT,RECENT_WATER_USE_MAX,WATER_USE_MAX_UNIT,REQUEST_FOR_REVOCATION_RECD,NUM_COMMENTS,NUM_ATTACHMENTS,LAST_UPDATE_DATE,STATE_WELL_NUMBER,DRILLED_WELL_YEAR,SURFACE_WATER_DIVERSIONS,DEPTH_OF_WELL,RELATIONSHIP_TYPE,PARTY_ID,EFFECTIVE_FROM_DATE,EFFECTIVE_TO_DATE,PRIMARY_OWNER_NAME,PRIMARY_OWNER_ENTITY_TYPE_P,OFFICIAL_MAIL_RECEIVER,COUNT_NPO_OR_OTHER,CURRENT_STATUS,EFFECTIVE_DATE,UPDATE_DATETIME,USE_CODE,USE_STATUS,NUMBER_OF_RESIDENCES,SEPERA
TELY_OWNED,USE_POPULATION,USE_POPULATION_PEOPLE,ESTIMATED_USE_PER_PERSON,USE_POPULATION_STOCK,TYPE_OF_STOCK,AREA_FOR_INCI_IRRIGATION,USE_NET_ACREAGE,USE_GROSS_ACREAGE,USE_DIRECT_DIV_ANNUAL_AMOUNT,USE_DIRECT_DIVERSION_RATE,USE_DIRECT_DIV_RATE_UNITS,POU_DEVELOPMENT_STATUS,DIRECT_DIV_SEASON_START,DIRECT_DIV_SEASON_END,USE_STORAGE_AMOUNT,STORAGE_SEASON_START,STORAGE_SEASON_END,SEASON_DIRECT_DIV_RATE,SEASON_STORAGE_AMOUNT,SEASON_DIRECT_DIV_AA,DIRECT_DIV_SEASON_STATUS,COLLECTION_SEASON_STATUS,USE_COUNT,PERMIT_PERMIT_ID,PERMIT_ORIGINAL_ISSUE_DATE,COMPLETE_CONSTRUCTION_DATE,COMPLETE_APPLIC_WATER_DATE,LICENSE_LICENSE_ID,LICENSE_ORIGINAL_ISSUE_DATE,WATER_RIGHT_DESCRIPTION,PROGRAM_UNIT,LICENSE_REQUEST_TYPE,LICENSE_REQUESTED_DATE,INSPECTION_DATE,REPORT_DATE,OFFER_SENT_DATE,ACCEPTED_OFFER_DATE,PETITION_ID,PETITION_TYPE,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,WaDEUUID_x,in_WaterSourceTypeCV,in_VariableSpecificUUID,in_AllocationFlow_CFS,in_AllocationVolume_AF,wade_SUB_TYPE,APPL_ID_y,AMOUNT,YEAR,MONTH FORMATTED,WaDEUUID_y
0,60497,1.0,Inactive,Surface,COMPOUND STOCK POND,Point of Direct Diversion,Existing,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,97404.0,1.0,N,T032025_01,60497,San Diego,,6.0,COMPOUND STOCK POND,1814680.0,6549776.0,32.6456,-116.2886,NE,SW,,32.0,17.0,S,7.0,E,San Bernardino,GIS_NE,,UNNAMED STREAM,BOUNDARY CREEK,ANZA BORREGO,181002020203.0,Boundary Creek,18100202.0,Carrizo Creek,LIVE OAK SPRINGS,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1/1,12/31,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in10,Surface,CSWRCBwr_V1,0.15,,,,,,,
1,60498,2.0,Inactive,Surface,LAKE DOMINGO,Point of Direct Diversion,Proposed,Unknown,48.0,0.15,Cubic Feet per Second,,,,,2,T032025,95308.0,2.0,N,T032025_02,60498,San Diego,,6.0,LAKE DOMINGO,1803326.0,6554711.0,32.6144,-116.2726,SE,SW,,9.0,18.0,S,7.0,E,San Bernardino,GIS_NE,,LAKE DOMINGO,BOUNDARY CREEK,ANZA BORREGO,181002020203.0,Boundary Creek,18100202.0,Carrizo Creek,TIERRA DEL SOL,,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,4/18/2013,,4/18/2013,4/18/2013,,Migrated data from old WRIMS system.,,0,,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,,,,48.0,Acre-feet per Year,1570.0,1570.0,1570.0,1570.0,0.15,Cubic Feet per Second,48.0,,0.0,,0.0,,,,,,,,,,,,N,1,1,,,,,0.0,Primary Owner,539608.0,4/18/2013,,"569 EAST COUNTY BOULEVARD, LLC",Limited Liability Company,Y,0,Cancelled,8/9/2013,,Dust Control,Requested when filed,,,0.0,,,,,,0.0,0.0,109.0,0.15,Cubic Feet per Second,Partial,1/1,12/31,0.0,,,0.15,0.0,109.0,Requested when filed,,1,,,,,,,,,,,,,,,,,,,,,,,,,,0,in11,Surface,CSWRCBwr_V1,0.15,,,,,,,
2,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.89063,6474612.14777,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,WaDE Blank,CSWRCBwr_V1,,,,A000016,0.0,2011.0,1/1/2011,in211979129
3,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.89063,6474612.14777,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,WaDE Blank,CSWRCBwr_V1,,,,A000016,0.0,2012.0,1/1/2012,in211979142
4,404,1.0,Active,,,Point of Direct Diversion,,,0.0,0.025,Cubic Feet per Second,0.0,,,,4,A000016,58413.0,1.0,N,A000016_01,404,Los Angeles,,5.0,,2032842.89063,6474612.14777,34.57797,-118.28921,NE,SE,,30.0,6.0,N,13.0,W,San Bernardino,DD_NE,,ICY SPRING NO 1,,ANTELOPE,180902061401.0,Headwaters Amargosa Creek,18090206.0,Antelope-Fremont Valleys,SLEEPY VALLEY,OO071,2,A000016,41.0,30.0,41.0,Appropriative,Licensed,A000016,,,,,,2/15/1915,,Migrated data from old WRIMS system.,,0,MATT CRAIG,Individual,LILAC HILLS ESTATES LP,Corporation,,,,18.1,Acre-feet per Year,10.0,10.0,10.0,10.0,0.025,Cubic Feet per Second,0.0,0.0,18.1,,0.0,,,,,,,,,,,,N,0,2,,,,,,Primary Owner,437566.0,7/21/2006,,LILAC HILLS ESTATES LP,Corporation,N,2,Licensed,2/19/1915,,Domestic,Migrated from old WRIMS data,,,0.0,,,,,,0.0,0.0,0.0,0.0,,,1/1,12/31,0.0,,,,0.0,,Migrated from old WRIMS data,Migrated from old WRIMS data,2,30.0,10/28/1915,,,41.0,9/25/1918,Migrated data from old WRIMS system.,,Permitee Request,,,,,,,,,,,,,,,,,0,in12,WaDE Blank,CSWRCBwr_V1,,,,A000016,0.0,2013.0,1/1/2013,in211979156


In [13]:
# create output POD dataframe
# The frame starts empty; the first Series assigned below gives it its row index, and the
# constant assignments that follow are broadcast to every row. Keep a Series assignment first.
df = pd.DataFrame()

# Data Assessment UUID
# Combined tracker: "<water right UUID>,<water use UUID>".
# NOTE(review): rows without a matching water-use record have no WaDEUUID_y, so the concatenation
# yields a missing value and the displayed output shows a blank WaDEUUID for them -- confirm intended.
df['WaDEUUID'] = dfin1['WaDEUUID_x'] + "," + dfin1['WaDEUUID_y']

# Method Info
df['in_MethodUUID'] = "CSWRCBwr_M1"

# Variable Info
df['in_VariableSpecificUUID'] = dfin1['in_VariableSpecificUUID'] # for wr records portion only, will create sa portion below
df['in_AggregationIntervalUnitCV'] = "Monthly"
df['in_VariableCV'] = "Water Use"

# Organization Info
df['in_OrganizationUUID'] = "CSWRCBwr_OR1"

# WaterSource Info
df['in_Geometry'] = ""
df['in_GNISFeatureNameCV'] = ""
df['in_WaterQualityIndicatorCV'] = ""
df['in_WaterSourceName'] = dfin1['SOURCE_NAME'].str.title()  # title-cased source name
df['in_WaterSourceNativeID'] = "" # create customID for temp solution
df['in_WaterSourceTypeCV'] = dfin1['in_WaterSourceTypeCV'] # see above

# Site Info
# One site per point of diversion; blank-string columns are placeholders.
df['in_RegulatoryOverlayUUIDs'] = ""
df['in_WaterSourceUUID'] = "" # ???
df['in_CoordinateAccuracy'] = ""
df['in_CoordinateMethodCV'] = dfin1['LOCATION_METHOD']
df['in_County'] = dfin1['COUNTY'].str.title()
df['in_EPSGCodeCV'] = 4326
df['in_Geometry'] = ""  # NOTE(review): in_Geometry is also assigned under WaterSource Info; this repeat looks redundant
df['in_GNISCodeCV'] = ""
# HUC codes: blanks become 0, then float -> int64 -> str drops the ".0" suffix (a blank therefore ends up as "0").
df['in_HUC12'] = dfin1['HUC_12_NUMBER'].replace("", 0).fillna(0).astype('int64').astype(str)
df['in_HUC8'] = dfin1['HUC_8_NUMBER'].replace("", 0).fillna(0).astype('int64').astype(str)
df['in_Latitude'] = dfin1['LATITUDE']
df['in_Longitude'] = dfin1['LONGITUDE']
df['in_NHDNetworkStatusCV'] = ""
df['in_NHDProductCV'] = ""
df['in_PODorPOUSite'] = "POD"
df['in_SiteName'] = dfin1['POD_NAME'].str.title().str.replace(",", "")  # title-case and drop commas
# NOTE(review): the garbled prefix in this column name looks like a byte-order mark mis-decoded when
# the file was read -- the key must match the loaded header exactly, so change both together if the encoding is fixed.
df['in_SiteNativeID'] = dfin1['Ã¯Â»Â¿POD_ID'].replace("", 0).fillna(0).astype('int64').astype(str)
df['in_SitePoint'] = ""
df['in_SiteTypeCV'] = "WaDE Blank"
df['in_StateCV'] = "CA"
df['in_USGSSiteID'] = ""

# AllocationAmount Info
# Water-right attributes; blank-string columns are placeholders.
df['in_AllocationApplicationDate'] = dfin1['APPLICATION_RECD_DATE']
df['in_AllocationAssociatedConsumptiveUseSiteIDs'] = ""
df['in_AllocationAssociatedWithdrawalSiteIDs'] = ""
df['in_AllocationBasisCV'] = ""
df['in_AllocationChangeApplicationIndicator'] = ""
df['in_AllocationCommunityWaterSupplySystem'] = ""
df['in_AllocationCropDutyAmount'] = ""
df['in_AllocationExpirationDate'] = ""
df['in_AllocationFlow_CFS'] = dfin1['in_AllocationFlow_CFS'].astype(float) # see above for conversion
df['in_AllocationLegalStatusCV'] = dfin1['WATER_RIGHT_STATUS'].str.title()
df['in_AllocationNativeID'] =  dfin1['APPLICATION_NUMBER'].replace("", 0).fillna(0).astype(str)  # blank application numbers become "0"
df['in_AllocationOwner'] = dfin1['PRIMARY_OWNER_NAME']
df['in_AllocationPriorityDate'] = dfin1['PRIORITY_DATE']
df['in_AllocationSDWISIdentifierCV'] = ""
df['in_AllocationTimeframeEnd'] = dfin1['DIRECT_DIV_SEASON_END']
df['in_AllocationTimeframeStart'] = dfin1['DIRECT_DIV_SEASON_START']
# Allocation type = water right type plus the cleaned sub-type; the strip removes the trailing space left when the sub-type is blank.
df['in_AllocationTypeCV'] = dfin1['WATER_RIGHT_TYPE'].astype(str) + " " + dfin1['wade_SUB_TYPE'].astype(str)
df['in_AllocationTypeCV'] = df['in_AllocationTypeCV'].astype(str).str.strip()
df['in_AllocationVolume_AF'] = dfin1['in_AllocationVolume_AF'].astype(float) # see above for conversion
df['in_BeneficialUseCategory'] = dfin1['USE_CODE'].str.title()
df['in_CommunityWaterSupplySystem'] = ""
df['in_CropTypeCV'] = ""
df['in_CustomerTypeCV'] = ""
df['in_DataPublicationDate'] = ""
df['in_DataPublicationDOI'] = ""
df['in_ExemptOfVolumeFlowPriority'] = "1" # we want this data to be exempt
df['in_GeneratedPowerCapacityMW'] = ""
df['in_IrrigatedAcreage'] = "" # temp fix, leave blank for now
df['in_IrrigationMethodCV'] = ""
df['in_LegacyAllocationIDs'] = ""
df['in_OwnerClassificationCV'] = ""
df['in_PopulationServed'] = "" # temp fix, leave blank for now
df['in_PowerType'] = ""
df['in_PrimaryBeneficialUseCategory'] = ""
df['in_SDWISIdentifierCV'] = ""
# Public eWRIMS summary page URL, keyed by the integer water right id (a blank id becomes 0).
df['in_WaterAllocationNativeURL'] = "https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/EWServlet?Redirect_Page=EWPublicAppSummary.jsp&Purpose=getEwrimsPublicSummary&wrWaterRightID=" + dfin1['WR_WATER_RIGHT_ID'].replace("", 0).fillna(0).astype(int).astype(str)


# Site VariableAmounts Info
# Reported water-use columns brought in by the left join; they are empty for water rights with no matching use record.
df['in_Amount'] = dfin1['AMOUNT']
df['in_AssociatedNativeAllocationIDs'] = dfin1['APPLICATION_NUMBER']
df['in_PowerGeneratedGWh'] = ""
df['in_PrimaryUseCategory'] = "" # see below
df['in_ReportYearCV'] = dfin1['YEAR'].replace("", 0).fillna(0).astype(int).astype(str)  # missing year becomes "0"
df['in_SDWISIdentifier'] = ""
# NOTE(review): start and end are both set to MONTH FORMATTED (the displayed values are first-of-month dates),
# so the end of the reporting period equals its start -- confirm intended.
df['in_TimeframeEnd'] = dfin1['MONTH FORMATTED']
df['in_TimeframeStart'] = dfin1['MONTH FORMATTED']
# df['in_AllocationCropDutyAmount'] = "" see above AllocationAmount Info
# df['in_BeneficialUseCategory'] = "" see above AllocationAmount Info
# df['in_CommunityWaterSupplySystem'] = "" see above AllocationAmount Info
# df['in_CropTypeCV'] = "" see above AllocationAmount Info
# df['in_CustomerTypeCV'] = "" see above AllocationAmount Info
# df['in_DataPublicationDate'] = "" see above AllocationAmount Info
# df['in_DataPublicationDOI'] = "" see above AllocationAmount Info
# df['in_Geometry'] = "" see above Site Info
# df['in_IrrigatedAcreage'] = "" see above AllocationAmount Info
# df['in_IrrigationMethodCV'] = "" see above AllocationAmount Info
# df['in_PopulationServed'] = "" see above AllocationAmount Info
# df['in_PowerType'] = "" see above AllocationAmount Info
# df['in_SDWISIdentifier'] = "" see above AllocationAmount Info

# Snapshot the assembled frame, drop exact duplicate rows and renumber the index.
outdf1 = df.copy()
outdf1 = outdf1.drop_duplicates().reset_index(drop=True)
print(len(outdf1))
outdf1.head()

  df['in_HUC12'] = dfin1['HUC_12_NUMBER'].replace("", 0).fillna(0).astype('int64').astype(str)
  df['in_HUC8'] = dfin1['HUC_8_NUMBER'].replace("", 0).fillna(0).astype('int64').astype(str)


15073668


Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart
0,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Unnamed Stream,,Surface,,,,GIS_NE,San Diego,4326,,181002020203,18100202,32.6456,-116.2886,,,POD,Compound Stock Pond,60497,,WaDE Blank,CA,,4/18/2013,,,,,,,,0.15,Cancelled,T032025,"569 EAST COUNTY BOULEVARD, LLC",,,12/31,1/1,Temporary Permit,,Dust Control,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,T032025,,,0,,,
1,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Lake Domingo,,Surface,,,,GIS_NE,San Diego,4326,,181002020203,18100202,32.6144,-116.2726,,,POD,Lake Domingo,60498,,WaDE Blank,CA,,4/18/2013,,,,,,,,0.15,Cancelled,T032025,"569 EAST COUNTY BOULEVARD, LLC",,,12/31,1/1,Temporary Permit,,Dust Control,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,T032025,,,0,,,
2,"in12,in211979129",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2011,,1/1/2011,1/1/2011
3,"in12,in211979142",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2012,,1/1/2012,1/1/2012
4,"in12,in211979156",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,LILAC HILLS ESTATES LP,,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,0.0,A000016,,,2013,,1/1/2013,1/1/2013


## Concatenate POD and POU Data.  Make needed changes

In [14]:
# etc etc,
# outdf2

## Concatenate DataFrames together

In [15]:
# Concatenate dataframes
frames = [outdf1]  # list all out dataframes here

# Stack the frames, drop exact duplicate rows, renumber the index and blank out NaN values.
outdf = (
    pd.concat(frames)
    .drop_duplicates()
    .reset_index(drop=True)
    .replace(np.nan, "")
)
print(len(outdf))

15073668


## Clean Data / data types

In [16]:
# Fill empty ben use values

def fillEmptyBenUseFunc(val):
    """Return the stripped text of ``val``, or "Unspecified" when it is blank.

    ``val`` is converted with ``str()`` and stripped first, so an empty or
    whitespace-only entry and the text "nan" all map to "Unspecified".
    """
    text = str(val).strip()
    # once stripped text, the value can only be "blank" as "" or the word "nan"
    return "Unspecified" if text in ("", "nan") else text
    
# Element-wise map over each column; the row-wise apply(axis=1) built a full
# row Series for every record just to read one value from it
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(fillEmptyBenUseFunc)
outdf['in_PrimaryUseCategory'] = outdf['in_PrimaryUseCategory'].map(fillEmptyBenUseFunc)
outdf['in_BeneficialUseCategory'].unique()

array(['Dust Control', 'Domestic', 'Irrigation', 'Power', 'Municipal',
       'Fish And Wildlife Preservation And Enhancement', 'Stockwatering',
       'Industrial', 'Mining', 'Recreational', 'Fire Protection',
       'Incidental Power', 'Frost Protection', 'Aquaculture',
       'Snow Making', 'Milling', 'Heat Control', 'Other', 'Unspecified',
       'Aesthetic', 'Water Quality'], dtype=object)

In [17]:
# Update datatype of Priority Date to fit WaDE 2.0 structure
def formatDateString(inString1):
    """Normalize a date-like entry to a ``MM/DD/YYYY`` string.

    The input is converted with ``str()`` and stripped. A blank entry, or one
    that pandas cannot turn into a valid calendar date, yields "".
    """
    inString = str(inString1).strip()
    if inString == "":
        return ""
    try:
        parsed = pd.to_datetime(inString)
        if pd.isnull(parsed):
            # text such as "nan" / "NaT" parses to NaT, which has no calendar date
            return ""
        return parsed.date().strftime('%m/%d/%Y')
    except Exception:
        # best effort: an unparseable or out-of-range entry becomes blank.
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit still propagate.
        return ""

# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_AllocationPriorityDate'] = outdf['in_AllocationPriorityDate'].map(formatDateString)
outdf['in_AllocationPriorityDate'].unique()

array(['', '07/22/1915', '04/17/1916', ..., '11/08/2023', '12/18/2023',
       '10/20/2020'], dtype=object)

In [18]:
# Clean name entries of special characters
def removeSpecialCharsFunc(Val):
    """Strip punctuation from a name and return it title-cased.

    ``Val`` is converted with ``str()``; the characters ``$ @ & . ; / ) (`` and
    ``-`` are removed, the text is title-cased, runs of spaces are collapsed to
    one space and surrounding whitespace is trimmed.
    """
    Val = str(Val)
    # raw string: "\)" in a normal string literal is an invalid escape sequence
    Val = re.sub(r"[$@&.;/\)(-]", "", Val).title()
    # collapse every run of spaces; one replace("  ", " ") pass leaves "a   b" as "a  b"
    Val = re.sub(r" {2,}", " ", Val).strip()
    return Val

  Val = re.sub("[$@&.;/\)(-]", "", Val).title().replace("  ", " ").strip()


In [19]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(removeSpecialCharsFunc)
outdf['in_WaterSourceName'].unique()

array(['Unnamed Stream', 'Lake Domingo', 'Icy Spring No 1', ...,
       'Unnamed Spring, Tributary To Unnamed Spring, Thence Eel River',
       'Unnamed Stream, Tributary To Unnamed Stream, Thence Deam Creek',
       'Clover Creek Tributary To Clover Creek Thence Gilbert Creek'],
      dtype=object)

In [20]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_County'] = outdf['in_County'].map(removeSpecialCharsFunc)
outdf['in_County'].unique()

array(['San Diego', 'Los Angeles', 'Glenn', 'Madera', 'Lake', 'Colusa',
       'Mono', 'Lassen', 'Tuolumne', 'Riverside', 'San Bernardino',
       'Inyo', 'Modoc', 'Sacramento', 'El Dorado', 'Trinity', 'Fresno',
       'San Luis Obispo', 'Santa Clara', 'Shasta', 'Tulare', 'Kern',
       'San Joaquin', 'Tehama', 'Kings', 'Stanislaus', 'Siskiyou',
       'Contra Costa', 'Butte', 'Yuba', 'Sierra', 'Napa', 'Sutter',
       'Placer', 'Ventura', 'Yolo', 'Alpine', 'Plumas', 'Santa Barbara',
       'Mendocino', 'Sonoma', 'Calaveras', 'Mariposa', 'Merced',
       'Humboldt', 'Nevada', 'San Benito', 'Solano', 'Santa Cruz',
       'Monterey', 'Marin', 'Amador', 'Alameda', 'San Mateo', '',
       'Orange', 'Imperial', 'Del Norte', 'San Francisco'], dtype=object)

In [21]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_SiteName'] = outdf['in_SiteName'].map(removeSpecialCharsFunc)
outdf['in_SiteName'].unique()

array(['Compound Stock Pond', 'Lake Domingo', '', ...,
       'La Cienega Well 1', 'S026212 Spring', 'Willow Springs'],
      dtype=object)

In [22]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(removeSpecialCharsFunc)
outdf['in_AllocationOwner'].unique()

array(['569 East County Boulevard, Llc', 'Lilac Hills Estates Lp',
       'Glenncolusa Irrigation District', ..., 'Eel River Dry Farms',
       'Seeba Creek Inc', 'Parker Briggs'], dtype=object)

In [23]:
# Ensure Empty String / remove string value of "nan"

def ensureEmptyString(val):
    """Return ``val`` as stripped text, with the text "nan" mapped to "".

    Blank and whitespace-only entries come back as "" as well, because the
    value is converted with ``str()`` and stripped before it is returned.
    """
    cleaned = str(val).strip()
    if cleaned == "nan":
        return ""
    return cleaned

In [24]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_WaterSourceName'] = outdf['in_WaterSourceName'].map(ensureEmptyString)
outdf['in_WaterSourceName'].unique()

array(['Unnamed Stream', 'Lake Domingo', 'Icy Spring No 1', ...,
       'Unnamed Spring, Tributary To Unnamed Spring, Thence Eel River',
       'Unnamed Stream, Tributary To Unnamed Stream, Thence Deam Creek',
       'Clover Creek Tributary To Clover Creek Thence Gilbert Creek'],
      dtype=object)

In [25]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_WaterSourceTypeCV'] = outdf['in_WaterSourceTypeCV'].map(ensureEmptyString)
outdf['in_WaterSourceTypeCV'].unique()

array(['Surface', 'WaDE Blank', 'Subsurface'], dtype=object)

In [26]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_SiteTypeCV'] = outdf['in_SiteTypeCV'].map(ensureEmptyString)
outdf['in_SiteTypeCV'].unique()

array(['WaDE Blank'], dtype=object)

In [27]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_SiteName'] = outdf['in_SiteName'].map(ensureEmptyString)
outdf['in_SiteName'].unique()

array(['Compound Stock Pond', 'Lake Domingo', '', ...,
       'La Cienega Well 1', 'S026212 Spring', 'Willow Springs'],
      dtype=object)

In [28]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_County'] = outdf['in_County'].map(ensureEmptyString)
outdf['in_County'].unique()

array(['San Diego', 'Los Angeles', 'Glenn', 'Madera', 'Lake', 'Colusa',
       'Mono', 'Lassen', 'Tuolumne', 'Riverside', 'San Bernardino',
       'Inyo', 'Modoc', 'Sacramento', 'El Dorado', 'Trinity', 'Fresno',
       'San Luis Obispo', 'Santa Clara', 'Shasta', 'Tulare', 'Kern',
       'San Joaquin', 'Tehama', 'Kings', 'Stanislaus', 'Siskiyou',
       'Contra Costa', 'Butte', 'Yuba', 'Sierra', 'Napa', 'Sutter',
       'Placer', 'Ventura', 'Yolo', 'Alpine', 'Plumas', 'Santa Barbara',
       'Mendocino', 'Sonoma', 'Calaveras', 'Mariposa', 'Merced',
       'Humboldt', 'Nevada', 'San Benito', 'Solano', 'Santa Cruz',
       'Monterey', 'Marin', 'Amador', 'Alameda', 'San Mateo', '',
       'Orange', 'Imperial', 'Del Norte', 'San Francisco'], dtype=object)

In [29]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_AllocationOwner'] = outdf['in_AllocationOwner'].map(ensureEmptyString)
outdf['in_AllocationOwner'].unique()

array(['569 East County Boulevard, Llc', 'Lilac Hills Estates Lp',
       'Glenncolusa Irrigation District', ..., 'Eel River Dry Farms',
       'Seeba Creek Inc', 'Parker Briggs'], dtype=object)

In [30]:
# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_BeneficialUseCategory'] = outdf['in_BeneficialUseCategory'].map(ensureEmptyString)
# entries may hold several comma separated uses: split them apart, strip each
# piece and list the distinct pieces in sorted order
uniqueList = sorted({i.strip() for i in ','.join(outdf['in_BeneficialUseCategory'].astype(str)).split(',')})
uniqueList

['Aesthetic',
 'Aquaculture',
 'Domestic',
 'Dust Control',
 'Fire Protection',
 'Fish And Wildlife Preservation And Enhancement',
 'Frost Protection',
 'Heat Control',
 'Incidental Power',
 'Industrial',
 'Irrigation',
 'Milling',
 'Mining',
 'Municipal',
 'Other',
 'Power',
 'Recreational',
 'Snow Making',
 'Stockwatering',
 'Unspecified',
 'Water Quality']

In [31]:
# Ensure Latitude entry is numeric; 0 and non-numeric values become blank strings for removal
outdf['in_Latitude'] = (
    pd.to_numeric(outdf['in_Latitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Latitude'].unique()

array([32.6456, 32.6144, 34.57797187, ..., 39.143222, 39.141755,
       39.141965], dtype=object)

In [32]:
# Ensure Longitude entry is numeric; 0 and non-numeric values become blank strings for removal
outdf['in_Longitude'] = (
    pd.to_numeric(outdf['in_Longitude'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_Longitude'].unique()

array([-116.2886, -116.2726, -118.28920963, ..., -122.859216, -122.858926,
       -122.858624], dtype=object)

In [33]:
# Fixing in_AllocationFlow_CFS datatype: numeric where possible, blank string for 0 / non-numeric entries
outdf['in_AllocationFlow_CFS'] = (
    pd.to_numeric(outdf['in_AllocationFlow_CFS'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationFlow_CFS'].unique()

array([0.15, '', 2.5, ..., 2005.0, 1600.0, 22.9918177767917], dtype=object)

In [34]:
# Fixing in_AllocationVolume_AF datatype: numeric where possible, blank string for 0 / non-numeric entries
outdf['in_AllocationVolume_AF'] = (
    pd.to_numeric(outdf['in_AllocationVolume_AF'], errors='coerce')
    .replace(0, "")
    .fillna("")
)
outdf['in_AllocationVolume_AF'].unique()

array(['', 0.13, 365.0, 20.62], dtype=object)

In [35]:
# Ensure Amount entry is either numeric (rounded to 2 decimals) or blank, no 0 entries
outdf['in_Amount'] = (
    pd.to_numeric(outdf['in_Amount'], errors='coerce')
    .round(2)
    .replace(0, "")
    .fillna("")
)
outdf['in_Amount'].unique()

array(['', 3270.0, 3273.0, ..., 141834.0, 411.82, 354.06], dtype=object)

In [36]:
# Ensure PopulationServed entry is a whole number; missing, non-numeric and 0 entries become blank strings
outdf['in_PopulationServed'] = (
    pd.to_numeric(outdf['in_PopulationServed'], errors='coerce')
    .round()
    .fillna(0)       # missing / non-numeric -> 0 so the integer cast cannot fail
    .astype(int)
    .replace(0, "")  # every 0 (including the filled ones) is blanked
)
outdf['in_PopulationServed'].unique()

array([''], dtype=object)

In [37]:
# Changing datatype of Priority Date to a datetime column; unparseable entries become NaT.
# normalize() sets the time-of-day to midnight directly, replacing the
# strftime -> to_datetime round trip through strings.
outdf['in_AllocationPriorityDate'] = pd.to_datetime(outdf['in_AllocationPriorityDate'], errors='coerce').dt.normalize()
outdf['in_AllocationPriorityDate'].unique()

<DatetimeArray>
[                'NaT', '1915-07-22 00:00:00', '1916-04-17 00:00:00',
 '1916-04-28 00:00:00', '1917-01-03 00:00:00', '1917-07-12 00:00:00',
 '1918-12-31 00:00:00', '1919-02-26 00:00:00', '1919-10-10 00:00:00',
 '1920-02-11 00:00:00',
 ...
 '2022-12-28 00:00:00', '2019-11-21 00:00:00', '2023-04-20 00:00:00',
 '2023-10-18 00:00:00', '2022-11-14 00:00:00', '2023-10-27 00:00:00',
 '2022-11-16 00:00:00', '2023-11-08 00:00:00', '2023-12-18 00:00:00',
 '2020-10-20 00:00:00']
Length: 1892, dtype: datetime64[ns]

In [38]:
# Convert TimeframeEnd to a timezone-naive datetime at midnight; unparseable entries become NaT.
# Entries are parsed as UTC, the timezone is then dropped and the time-of-day reset,
# replacing the strftime -> to_datetime round trip. The former fillna("") was dropped:
# the result still contained NaT, so it had no visible effect on the datetime column.
outdf['in_TimeframeEnd'] = (
    pd.to_datetime(outdf['in_TimeframeEnd'], utc=True, errors='coerce')
    .dt.tz_localize(None)
    .dt.normalize()
)
outdf['in_TimeframeEnd'].unique()

<DatetimeArray>
[                'NaT', '2011-01-01 00:00:00', '2012-01-01 00:00:00',
 '2013-01-01 00:00:00', '2014-01-01 00:00:00', '2015-01-01 00:00:00',
 '2016-01-01 00:00:00', '2017-01-01 00:00:00', '2018-01-01 00:00:00',
 '2019-01-01 00:00:00',
 ...
 '1974-06-01 00:00:00', '1977-06-01 00:00:00', '1978-06-01 00:00:00',
 '1979-06-01 00:00:00', '1980-06-01 00:00:00', '1981-06-01 00:00:00',
 '1982-06-01 00:00:00', '1983-06-01 00:00:00', '1984-06-01 00:00:00',
 '1985-06-01 00:00:00']
Length: 721, dtype: datetime64[ns]

In [39]:
# Convert TimeframeStart to a timezone-naive datetime at midnight; unparseable entries become NaT.
# Entries are parsed as UTC, the timezone is then dropped and the time-of-day reset,
# replacing the strftime -> to_datetime round trip. The former fillna("") was dropped:
# the result still contained NaT, so it had no visible effect on the datetime column.
outdf['in_TimeframeStart'] = (
    pd.to_datetime(outdf['in_TimeframeStart'], utc=True, errors='coerce')
    .dt.tz_localize(None)
    .dt.normalize()
)
outdf['in_TimeframeStart'].unique()

<DatetimeArray>
[                'NaT', '2011-01-01 00:00:00', '2012-01-01 00:00:00',
 '2013-01-01 00:00:00', '2014-01-01 00:00:00', '2015-01-01 00:00:00',
 '2016-01-01 00:00:00', '2017-01-01 00:00:00', '2018-01-01 00:00:00',
 '2019-01-01 00:00:00',
 ...
 '1974-06-01 00:00:00', '1977-06-01 00:00:00', '1978-06-01 00:00:00',
 '1979-06-01 00:00:00', '1980-06-01 00:00:00', '1981-06-01 00:00:00',
 '1982-06-01 00:00:00', '1983-06-01 00:00:00', '1984-06-01 00:00:00',
 '1985-06-01 00:00:00']
Length: 721, dtype: datetime64[ns]

In [40]:
# extract year out
outdf['in_ReportYearCV'].unique()

array(['0', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2008', '2009', '2010',
       '2023', '2007', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '1975', '1930', '1993',
       '1994', '1995', '1991', '1992', '1986', '1987', '1988', '1989',
       '1990', '1915', '1963', '1913', '1959', '1906', '1916', '1976',
       '1971', '1972', '1973', '1974', '1977', '1978', '1979', '1980',
       '1981', '1982', '1983', '1984', '1985'], dtype=object)

In [41]:
# Assign Primary Use Category

import sys
# NOTE(review): hard-coded, user-specific absolute path; the import below presumably
# fails on other machines unless the module is already on sys.path - consider a configurable path
sys.path.append("C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/5_CustomFunctions/AssignPrimaryUseCategory")
import AssignPrimaryUseCategoryFile # Use Custom import file

# element-wise map over the single column (the row-wise apply built a Series for every record)
outdf['in_PrimaryUseCategory'] = outdf['in_BeneficialUseCategory'].map(AssignPrimaryUseCategoryFile.retrievePrimaryUseCategory)
outdf['in_PrimaryUseCategory'].unique()

array(['Other', 'Domestic', 'Agriculture Irrigation', 'Hydroelectric',
       'Public Supply', 'In-stream Flow', 'Livestock',
       'Commercial/Industrial', 'Mining', 'Recreation', 'Fire',
       'Unspecified', 'Aquaculture', 'Snow', 'Thermoelectric Cooling',
       'Municipal Irrigation'], dtype=object)

In [42]:
# Creating WaDE Custom VariableSpecificCV
# ----------------------------------------------------------------------------------------------------
def createVariableSpecificCV(inV, inAIU, inPU, inWST):
    """Join the four inputs into one "inV_inAIU_inPU_inWST" label.

    Every piece is converted with ``str()`` and stripped; the third piece
    (``inPU``) is additionally title-cased before joining with underscores.
    """
    pieces = (
        str(inV).strip(),
        str(inAIU).strip(),
        str(inPU).strip().title(),
        str(inWST).strip(),
    )
    return "_".join(pieces)

# Build the label for every record by walking the four source columns in step;
# the row-wise apply(axis=1) built a full row Series for every record
outdf['in_VariableSpecificCV'] = [
    createVariableSpecificCV(inV, inAIU, inPU, inWST)
    for inV, inAIU, inPU, inWST in zip(outdf['in_VariableCV'],
                                       outdf['in_AggregationIntervalUnitCV'],
                                       outdf['in_PrimaryUseCategory'],
                                       outdf['in_WaterSourceTypeCV'])
]
outdf['in_VariableSpecificCV'].unique()

array(['Water Use_Monthly_Other_Surface',
       'Water Use_Monthly_Domestic_WaDE Blank',
       'Water Use_Monthly_Agriculture Irrigation_Surface',
       'Water Use_Monthly_Domestic_Surface',
       'Water Use_Monthly_Domestic_Subsurface',
       'Water Use_Monthly_Hydroelectric_Subsurface',
       'Water Use_Monthly_Hydroelectric_Surface',
       'Water Use_Monthly_Agriculture Irrigation_Subsurface',
       'Water Use_Monthly_Agriculture Irrigation_WaDE Blank',
       'Water Use_Monthly_Public Supply_Surface',
       'Water Use_Monthly_In-Stream Flow_WaDE Blank',
       'Water Use_Monthly_Livestock_WaDE Blank',
       'Water Use_Monthly_Commercial/Industrial_Surface',
       'Water Use_Monthly_Mining_WaDE Blank',
       'Water Use_Monthly_Hydroelectric_WaDE Blank',
       'Water Use_Monthly_Recreation_Surface',
       'Water Use_Monthly_In-Stream Flow_Surface',
       'Water Use_Monthly_Commercial/Industrial_Subsurface',
       'Water Use_Monthly_Livestock_Subsurface',
       'Water

In [43]:
# Creating WaDE Custom water source native ID for easy water source identification
# use unique WaterSourceName and WaterSourceType values
# ----------------------------------------------------------------------------------------------------

# Create temp in_WaterSourceNativeID dataframe of unique water source.
def assignIdValueFunc(colRowValue):
    """Return the custom ID text: "wadeId" followed by ``str(colRowValue)``."""
    return "wadeId" + str(colRowValue)

# One row per distinct (water source name, water source type) pair, first occurrence kept
dfTempID = pd.DataFrame({
    'in_WaterSourceName': outdf['in_WaterSourceName'].astype(str).str.strip(),
    'in_WaterSourceTypeCV': outdf['in_WaterSourceTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the distinct pairs 1..n and turn each number into a "wadeId<n>" value
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_WaterSourceNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# Lookup: name + type text (concatenated without a separator) -> generated ID
dfTempID['linkKey'] = dfTempID['in_WaterSourceName'] + dfTempID['in_WaterSourceTypeCV']
IdDict = dict(zip(dfTempID['linkKey'], dfTempID['in_WaterSourceNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveIdValueFunc(checkVal, valA, valB):
    """Return the existing ID when present, otherwise look one up in ``IdDict``.

    checkVal: current ID entry; its stripped text is returned when non-empty.
    valA, valB: values whose stripped text, concatenated, forms the ``IdDict`` key.
    Raises KeyError when that key is not in ``IdDict``.
    """
    existing = str(checkVal).strip()
    if existing != "":
        return existing
    return IdDict[str(valA).strip() + str(valB).strip()]

# Fill blank IDs record by record by walking the three columns in step;
# the row-wise apply(axis=1) built a full row Series for every record
outdf['in_WaterSourceNativeID'] = [
    retrieveIdValueFunc(nativeId, sourceName, sourceType)
    for nativeId, sourceName, sourceType in zip(outdf['in_WaterSourceNativeID'],
                                                outdf['in_WaterSourceName'],
                                                outdf['in_WaterSourceTypeCV'])
]
outdf['in_WaterSourceNativeID'].unique()

array(['wadeId1', 'wadeId2', 'wadeId3', ..., 'wadeId8868', 'wadeId8869',
       'wadeId8870'], dtype=object)

In [44]:
# Creating WaDE Custom site native ID for easy site identification
# use Unique Latitude, Longitude, SiteName and SiteTypeCV values
# ----------------------------------------------------------------------------------------------------

# Create temp in_SiteNativeID dataframe of unique sites.
def assignIdValueFunc(colRowValue):
    """Return the custom ID text: "wadeId" followed by ``str(colRowValue)``."""
    return "wadeId" + str(colRowValue)

# One row per distinct (latitude, longitude, site name, site type) combination, first occurrence kept
dfTempID = pd.DataFrame({
    'in_Latitude': outdf['in_Latitude'].astype(str).str.strip(),
    'in_Longitude': outdf['in_Longitude'].astype(str).str.strip(),
    'in_SiteName': outdf['in_SiteName'].astype(str).str.strip(),
    'in_SiteTypeCV': outdf['in_SiteTypeCV'].astype(str).str.strip(),
}).drop_duplicates()

# Number the distinct combinations 1..n and turn each number into a "wadeId<n>" value
dfTempCount = pd.DataFrame(index=dfTempID.index)
dfTempCount["Count"] = range(1, len(dfTempCount.index) + 1)
dfTempID['in_SiteNativeID'] = dfTempCount['Count'].map(assignIdValueFunc)

# Lookup: the four text values (concatenated without a separator) -> generated ID
dfTempID['linkKey'] = dfTempID['in_Latitude'] + dfTempID['in_Longitude'] + dfTempID['in_SiteName'] + dfTempID['in_SiteTypeCV']
IdDict = dict(zip(dfTempID['linkKey'], dfTempID['in_SiteNativeID']))
# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom site native ID
def retrieveIdValueFunc(checkVal, valA, valB, valC, valD):
    """Return the existing ID when present, otherwise look one up in ``IdDict``.

    checkVal: current ID entry; its stripped text is returned when non-empty.
    valA..valD: values whose stripped text, concatenated in order, forms the ``IdDict`` key.
    Raises KeyError when that key is not in ``IdDict``.
    """
    existing = str(checkVal).strip()
    if existing != "":
        return existing
    keyParts = (valA, valB, valC, valD)
    return IdDict["".join(str(part).strip() for part in keyParts)]

# Fill blank IDs record by record by walking the five columns in step;
# the row-wise apply(axis=1) built a full row Series for every record
outdf['in_SiteNativeID'] = [
    retrieveIdValueFunc(nativeId, lat, lon, siteName, siteType)
    for nativeId, lat, lon, siteName, siteType in zip(outdf['in_SiteNativeID'],
                                                      outdf['in_Latitude'],
                                                      outdf['in_Longitude'],
                                                      outdf['in_SiteName'],
                                                      outdf['in_SiteTypeCV'])
]
outdf['in_SiteNativeID'].unique()

array(['60497', '60498', '404', ..., '86252', '86253', '86254'],
      dtype=object)

## Drop non-Active AllocationLegalStatusCV Water Rights
- For CA, we don't want water rights that are considered: "Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"

In [45]:
# drop non-active AllocationLegalStatusCV values specific to that state.

print(f'length of df before removing non-active rights: ', len(outdf))

# legal status values whose rows are removed
dropLegalStatusList = ["Cancelled", "Closed", "Inactive", "Pending", "Rejected", "Revoked"]

# keep only the rows whose legal status is NOT in the list above
outdf = outdf[~outdf['in_AllocationLegalStatusCV'].isin(dropLegalStatusList)].reset_index(drop=True)

print(f'length of df after removing non-active rights: ', len(outdf))
# list the remaining status values, one quoted entry per line
for legalStatus in outdf['in_AllocationLegalStatusCV'].sort_values().unique():
    print(f'"' + legalStatus + '",')

length of df before removing non-active rights:  15073668
length of df after removing non-active rights:  14661640
"",
"Active",
"Adjudicated",
"Certified",
"Claimed",
"Claimed - Local Oversight",
"Completed",
"Licensed",
"Permitted",
"Registered",


## Shapefile Data
- For attaching geometry to POU csv inputs.

In [46]:
# N/A

## Export Outputs

In [47]:
outdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14661640 entries, 0 to 14661639
Data columns (total 76 columns):
 #   Column                                        Dtype         
---  ------                                        -----         
 0   WaDEUUID                                      object        
 1   in_MethodUUID                                 object        
 2   in_VariableSpecificUUID                       object        
 3   in_AggregationIntervalUnitCV                  object        
 4   in_VariableCV                                 object        
 5   in_OrganizationUUID                           object        
 6   in_Geometry                                   object        
 7   in_GNISFeatureNameCV                          object        
 8   in_WaterQualityIndicatorCV                    object        
 9   in_WaterSourceName                            object        
 10  in_WaterSourceNativeID                        object        
 11  in_WaterSourceTypeCV  

In [48]:
outdf

Unnamed: 0,WaDEUUID,in_MethodUUID,in_VariableSpecificUUID,in_AggregationIntervalUnitCV,in_VariableCV,in_OrganizationUUID,in_Geometry,in_GNISFeatureNameCV,in_WaterQualityIndicatorCV,in_WaterSourceName,in_WaterSourceNativeID,in_WaterSourceTypeCV,in_RegulatoryOverlayUUIDs,in_WaterSourceUUID,in_CoordinateAccuracy,in_CoordinateMethodCV,in_County,in_EPSGCodeCV,in_GNISCodeCV,in_HUC12,in_HUC8,in_Latitude,in_Longitude,in_NHDNetworkStatusCV,in_NHDProductCV,in_PODorPOUSite,in_SiteName,in_SiteNativeID,in_SitePoint,in_SiteTypeCV,in_StateCV,in_USGSSiteID,in_AllocationApplicationDate,in_AllocationAssociatedConsumptiveUseSiteIDs,in_AllocationAssociatedWithdrawalSiteIDs,in_AllocationBasisCV,in_AllocationChangeApplicationIndicator,in_AllocationCommunityWaterSupplySystem,in_AllocationCropDutyAmount,in_AllocationExpirationDate,in_AllocationFlow_CFS,in_AllocationLegalStatusCV,in_AllocationNativeID,in_AllocationOwner,in_AllocationPriorityDate,in_AllocationSDWISIdentifierCV,in_AllocationTimeframeEnd,in_AllocationTimeframeStart,in_AllocationTypeCV,in_AllocationVolume_AF,in_BeneficialUseCategory,in_CommunityWaterSupplySystem,in_CropTypeCV,in_CustomerTypeCV,in_DataPublicationDate,in_DataPublicationDOI,in_ExemptOfVolumeFlowPriority,in_GeneratedPowerCapacityMW,in_IrrigatedAcreage,in_IrrigationMethodCV,in_LegacyAllocationIDs,in_OwnerClassificationCV,in_PopulationServed,in_PowerType,in_PrimaryBeneficialUseCategory,in_SDWISIdentifierCV,in_WaterAllocationNativeURL,in_Amount,in_AssociatedNativeAllocationIDs,in_PowerGeneratedGWh,in_PrimaryUseCategory,in_ReportYearCV,in_SDWISIdentifier,in_TimeframeEnd,in_TimeframeStart,in_VariableSpecificCV
0,"in12,in211979129",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,wadeId3,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2011,,2011-01-01,2011-01-01,Water Use_Monthly_Domestic_WaDE Blank
1,"in12,in211979142",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,wadeId3,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2012,,2012-01-01,2012-01-01,Water Use_Monthly_Domestic_WaDE Blank
2,"in12,in211979156",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,wadeId3,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2013,,2013-01-01,2013-01-01,Water Use_Monthly_Domestic_WaDE Blank
3,"in12,in211979175",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,wadeId3,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2014,,2014-01-01,2014-01-01,Water Use_Monthly_Domestic_WaDE Blank
4,"in12,in211979275",CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Icy Spring No 1,wadeId3,WaDE Blank,,,,DD_NE,Los Angeles,4326,,180902061401,18090206,34.57797,-118.28921,,,POD,,404,,WaDE Blank,CA,,,,,,,,,,,Licensed,A000016,Lilac Hills Estates Lp,NaT,,12/31,1/1,Appropriative,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,A000016,,Domestic,2015,,2015-01-01,2015-01-01,Water Use_Monthly_Domestic_WaDE Blank
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14661635,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Unnamed Spring,wadeId3547,Surface,,,,GIS_LL,Humboldt,4326,,180101060407,18010106,40.27041,-123.83895,,,POD,Unnamed Spring,85832,,WaDE Blank,CA,,12/14/2022,,,,,,,,0.00668,Claimed,S028982,Avenue Of The Giants Farm Llc,NaT,,,,Statement of Div and Use RIPARIAN,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,S028982,,Domestic,0,,NaT,NaT,Water Use_Monthly_Domestic_Surface
14661636,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Bosworth Creek,wadeId7534,Surface,,,,GIS_LL,Humboldt,4326,,180101050601,18010105,40.35420,-123.72030,,,POD,Pod,85833,,WaDE Blank,CA,,9/12/2023,,,,,,,,0.02228,Claimed,S028983,Magdalena Anguelova,NaT,,,,Statement of Div and Use RIPARIAN,,Domestic,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,S028983,,Domestic,0,,NaT,NaT,Water Use_Monthly_Domestic_Surface
14661637,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,Unnamed Stream,wadeId1,Surface,,,,GIS_LL,Mendocino,4326,,180101050204,18010105,39.88714,-123.49899,,,POD,Pump,85834,,WaDE Blank,CA,,9/16/2023,,,,,,,,,Claimed,S028984,Jeremy Hodys,NaT,,,,Statement of Div and Use RIPARIAN,,Irrigation,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,S028984,,Agriculture Irrigation,0,,NaT,NaT,Water Use_Monthly_Agriculture Irrigation_Surface
14661638,,CSWRCBwr_M1,CSWRCBwr_V1,Monthly,Water Use,CSWRCBwr_OR1,,,,,wadeId351,Subsurface,,,,GIS_LL,Los Angeles,4326,,180701040300,18070104,34.04127,-118.37713,,,POD,La Cienega Well 1,85870,,WaDE Blank,CA,,9/19/2023,,,,,,,,13.36806,Active,G193679,City Of Beverly Hills Public Works Department,NaT,,,,Groundwater Recordation,,Municipal,,,,,,1,,,,,,,,,,https://ciwqs.waterboards.ca.gov/ciwqs/ewrims/...,,G193679,,Public Supply,0,,NaT,NaT,Water Use_Monthly_Public Supply_Subsurface


In [49]:
# Export the output dataframe
# Written as one CSV (Pwrwu_Main.csv) inside a zip archive, without the index column.
outdf.to_csv('RawInputData/Pwrwu_Main.zip', compression=dict(method='zip', archive_name='Pwrwu_Main.csv'), index=False)  # The output, save as a zip