In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse

In [2]:
# working directory
# The input/output file names used by this notebook are bare names, so they are
# resolved against this folder once the chdir below has run.
# NOTE(review): os.chdir with a relative path is not idempotent -- re-running this
# cell without restarting the kernel resolves "./ProcessedInputData" against the
# already-changed working directory.
working_dir = "./ProcessedInputData"
os.chdir(working_dir)

In [3]:
# Input files
# Excel workbook holding the flat file of water rights (read with pd.read_excel).
fileInput1 = "EWRIMS MASTER FLAT FILE DATA DICTIONARY DRAFT 1-17-20.xlsx" 
# water sources look up
# CSV read into df400; supplies the WaterSourceName -> WaterSourceUUID lookup.
inp_wtrsrs="watersources.csv"
# sites look up
# CSV read into df500; supplies the SiteNativeID -> SiteUUID lookup.
inp_sitdim = 'sites.csv'

#output: water allocation
# NOTE(review): not referenced in the cells visible here; presumably the final
# to_csv target -- confirm.
out_alloc = "waterallocations.csv"    #output

In [4]:
######## WaDE columns

#The following fields differ between the table here (edited by DPL) and the one on the schema website:
#http://schema.westernstateswater.org/tables/Input_AllocationAmounts_fact.html
"""
BeneficialUseCategory, PrimaryUseCategory, AllocationTimeframeStart, AllocationTimeframeEnd, " "
BeneficialUseCategoryCV, PrimaryUseCategoryCV, TimeframeStartDate,	TimeframeEndDate,	Geometry	
"""
# UUIDs: Add UUIDs for all dim tables
# OrganizationUUID, SiteUUID, VariableSpecificUUID, WaterSourceUUID, MethodUUID
# Column names of the output water-allocation table, in output order; the target
# DataFrame (outdf100) is created with exactly these columns.
columns = ["OrganizationUUID", "SiteUUID", "VariableSpecificUUID", "WaterSourceUUID", "MethodUUID", "PrimaryUseCategory",
           "BeneficialUseCategory", "AllocationNativeID", "AllocationTypeCV", "AllocationOwner",
           "AllocationApplicationDate", "AllocationPriorityDate", "AllocationLegalStatusCV", "AllocationCropDutyAmount",
           "AllocationExpirationDate",
           "AllocationChangeApplicationIndicator", "LegacyAllocationIDs", "AllocationBasisCV", "AllocationTimeframeStart",
           "AllocationTimeframeEnd", "AllocationAmount", "AllocationMaximum", "PopulationServed", "PowerType", "GeneratedPowerCapacityMW",
           "IrrigatedAcreage", "AllocationCommunityWaterSupplySystem", "AllocationSDWISIdentifierCV",
           "AllocationAssociatedWithdrawalSiteIDs", "AllocationAssociatedConsumptiveUseSiteIDs", "WaterAllocationNativeURL",
           "CustomerTypeCV", "IrrigationMethodCV", "CropTypeCV", "CommunityWaterSupplySystem", "DataPublicationDate",
           "DataPublicationDOI"]

# Placeholder only: no per-column dtypes are specified, and this name is not
# used by any cell visible in this part of the notebook.
dtypesx = [''] #here we could theoretically specify data types for each column name, but we didn't need to do that

In [5]:
### target dataFrame

# Empty frame that carries only the output column names; a later cell copies the
# mapped df100 columns into it, and the remaining columns are filled with constants.
# TODO: assumes dtypes inferred from CO file
outdf100=pd.DataFrame(columns=columns)

In [6]:
print("Reading inputs...")

# Water-rights flat file: sheet "ewrims_flat_file", first row is the header.
# No `encoding` argument is passed: pd.read_excel does not accept one (current
# pandas raises TypeError), and an .xlsx workbook is not a text file anyway, so
# text encoding only applies to the CSV look-ups below.
df100 = pd.read_excel(fileInput1, header=0, sheet_name="ewrims_flat_file", skiprows=0)

# sites look up
# NOTE(review): the saved output of this cell ends with the tail of a warning,
# presumably a pandas DtypeWarning from one of these CSV reads -- confirm, and pin
# the dtype of the key columns if so.
df500 = pd.read_csv(inp_sitdim, encoding = "ISO-8859-1")

# water sources look up
df400 = pd.read_csv(inp_wtrsrs, encoding = "ISO-8859-1")

Reading inputs...


  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# A water right can be listed once per site / point of diversion; keep a single
# row (the first one encountered) for each APPLICATION_NUMBER.
print("Dropping duplicates...")

print(len(df100.index))   # row count before de-duplication

df100 = (
    df100
    .drop_duplicates(subset=['APPLICATION_NUMBER'])
    .reset_index(drop=True)
)

print(len(df100.index))   # row count after de-duplication

# Missing values become empty strings so later cells can compare cells against ''.
df100 = df100.replace(np.nan, '')

df100

Dropping duplicates...
57736
57715


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,PETITION_STATUS_TYPE,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE
0,0,,,,,Not Determined,,,,,...,,,,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,,,,0
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,,,,0
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,,,,0
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,Pending,1578441600000000000,,1578564981000000000,34,,,,,0
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,,,,,0
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,Completed,1367798400000000000,1372636800000000000,1372778637000000000,4,,,,,0
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,,,,,0
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,,,,,0
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,,,,,0


In [8]:
print("Adding SiteUUID...")

def assignSiteID(colrowValue, df500):
    """Look up the SiteUUID(s) registered for one native site identifier.

    Parameters
    ----------
    colrowValue : scalar
        Native site identifier to search for; '' and null values mean "no site".
    df500 : pandas.DataFrame
        Sites look-up table with 'SiteNativeID' and 'SiteUUID' columns.

    Returns
    -------
    str
        Every matching SiteUUID, stringified and joined with ', ' in table
        order; '' when the identifier is blank/null or nothing matches.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''

    matchingUUIDs = df500.loc[df500['SiteNativeID'] == colrowValue, 'SiteUUID']
    # Joining an empty selection already produces '', so "no match" needs no
    # separate branch.
    return ', '.join(map(str, matchingUUIDs))

# Create SiteUUID as an empty-string column, then fill it row by row: each row's
# POD_ID is matched against SiteNativeID in the sites look-up (df500).
df100 = (
    df100
    .assign(SiteUUID='')
    .assign(SiteUUID=lambda frame: frame.apply(
        lambda row: assignSiteID(row['POD_ID'], df500),
        axis=1))
)
df100

Adding SiteUUID...


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,DATE_RECEIVED,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID
0,0,,,,,Not Determined,,,,,...,,,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,,,0,CA_60498
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,,,0,CA_34881
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,,,0,CA_28036
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,1578441600000000000,,1578564981000000000,34,,,,,0,CA_23233
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,,,,0,CA_405
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,1367798400000000000,1372636800000000000,1372778637000000000,4,,,,,0,CA_6233
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,,,,0,CA_40741
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,,,,0,CA_29033
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,,,,0,CA_20017


In [9]:
print("Water sources...")

def assignWaterSourceID(colrowValue11, df400):
    """Return the WaterSourceUUID of the first water source with a matching name.

    Parameters
    ----------
    colrowValue11 : scalar
        Water source name; surrounding whitespace is ignored. Null values
        (None, NaN, NaT) and blank strings mean "no source".
    df400 : pandas.DataFrame
        Water-sources look-up table with 'WaterSourceName' and
        'WaterSourceUUID' columns.

    Returns
    -------
    The 'WaterSourceUUID' value of the first row whose 'WaterSourceName' equals
    the stripped name, or '' when the name is null/blank or nothing matches.
    """
    # Test for missing values BEFORE stringifying: str(nan) is 'nan' and
    # str(None) is 'None', which are neither empty nor null, so a null check made
    # after the conversion can never fire and the literal text would be looked up.
    if pd.isnull(colrowValue11):
        return ''

    sourceName = str(colrowValue11).strip()
    if sourceName == '':
        return ''

    matches = df400.loc[df400['WaterSourceName'] == sourceName, 'WaterSourceUUID']
    return '' if matches.empty else matches.iloc[0]

# Create WaterSourceUUID as an empty-string column, then fill it row by row from
# the water-sources look-up (df400), keyed on SOURCE_NAME.
df100 = (
    df100
    .assign(WaterSourceUUID='')
    .assign(WaterSourceUUID=lambda frame: frame.apply(
        lambda row: assignWaterSourceID(row['SOURCE_NAME'], df400),
        axis=1))
)

df100

Water sources...


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,DATE_COMPLETED,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID,WaterSourceUUID
0,0,,,,,Not Determined,,,,,...,,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98,,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,,0,CA_60498,CA_2
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,,0,CA_34881,CA_3
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,,0,CA_28036,CA_4
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,,1578564981000000000,34,,,,,0,CA_23233,CA_5
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,,,0,CA_405,CA_6
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,1372636800000000000,1372778637000000000,4,,,,,0,CA_6233,CA_4
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,,,0,CA_40741,CA_4
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,,,0,CA_29033,CA_7
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,,,0,CA_20017,CA_8


In [10]:
print("Allocation priority date...")

# input format 1924-12-06 00:00:00 format
def formatDateString(inString1, outFormat='%m/%d/%Y'):
    """Format a date-like value as text, returning '' when it cannot be parsed.

    Parameters
    ----------
    inString1 : any
        Value to format. It is stringified and stripped first, so timestamps
        and date strings are both accepted.
    outFormat : str, optional
        strftime pattern for the result; defaults to month/day/year.

    Returns
    -------
    str
        The formatted date, or '' for blank input, null-like input ('nan',
        'NaT', ...) and anything pandas cannot parse as a date.
    """
    inString = str(inString1).strip()
    if inString == '':
        return ''

    try:
        valD = pd.to_datetime(inString)
        # Null markers such as 'nan' parse to NaT; handle that explicitly rather
        # than relying on NaT.strftime raising.
        if pd.isnull(valD):
            return ''
        return valD.date().strftime(outFormat)
    except Exception:
        # Best effort: unparseable input yields ''. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit still stop a long apply().
        return ''

# Priority date per water right: format PRIORITY_DATE when it is present,
# otherwise fall back to APPLICATION_ACCEPTANCE_DATE.
df100 = (
    df100
    .assign(AllocationPriorityDate='')
    .assign(AllocationPriorityDate=lambda frame: frame.apply(
        lambda row: formatDateString(
            row['APPLICATION_ACCEPTANCE_DATE'] if str(row['PRIORITY_DATE']) == ''
            else row['PRIORITY_DATE']),
        axis=1))
)

df100

Allocation priority date...


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,PET_LAST_UPDATE_DATE,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID,WaterSourceUUID,AllocationPriorityDate
0,0,,,,,Not Determined,,,,,...,,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98,,,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,,0,CA_60498,CA_2,04/18/2013
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,,0,CA_34881,CA_3,02/15/1915
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,,0,CA_28036,CA_4,03/03/1915
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,1578564981000000000,34,,,,,0,CA_23233,CA_5,03/27/1915
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,,0,CA_405,CA_6,05/14/1915
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,1372778637000000000,4,,,,,0,CA_6233,CA_4,04/02/1915
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,,0,CA_40741,CA_4,04/02/1915
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,,0,CA_29033,CA_7,05/17/1915
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,,0,CA_20017,CA_8,06/03/1915


In [12]:
print("Timeframe start and time frame end...")

def formatDateString2(inString1, outFormat='%m/%d'):
    """Format a date-like value as month/day text, returning '' when it cannot be parsed.

    Parameters
    ----------
    inString1 : any
        Value to format. It is stringified and stripped first, so timestamps
        and date strings are both accepted.
    outFormat : str, optional
        strftime pattern for the result; defaults to month/day (no year).

    Returns
    -------
    str
        The formatted date, or '' for blank input, null-like input ('nan',
        'NaT', ...) and anything pandas cannot parse as a date.
    """
    # NOTE(review): if season values arrive as year-less strings (e.g. '02/29'),
    # the outcome depends on the default year the parser fills in -- confirm the
    # input format.
    inString = str(inString1).strip()
    if inString == '':
        return ''

    try:
        valD = pd.to_datetime(inString)
        # Null markers such as 'nan' parse to NaT; handle that explicitly rather
        # than relying on NaT.strftime raising.
        if pd.isnull(valD):
            return ''
        return valD.date().strftime(outFormat)
    except Exception:
        # Best effort: unparseable input yields ''. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit still stop a long apply().
        return ''

Timeframe start and time frame end...


In [13]:
print("Timeframe start...")

# Season start: DIRECT_DIV_SEASON_START run through formatDateString2, row by row.
df100 = (
    df100
    .assign(AllocationTimeframeStart='')
    .assign(AllocationTimeframeStart=lambda frame: frame.apply(
        lambda row: formatDateString2(row['DIRECT_DIV_SEASON_START']),
        axis=1))
)
df100

Timeframe start...


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,NUM_OF_PETITIONS,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID,WaterSourceUUID,AllocationPriorityDate,AllocationTimeframeStart
0,0,,,,,Not Determined,,,,,...,,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98,,,,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,,0,CA_60498,CA_2,04/18/2013,01/01
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,,0,CA_34881,CA_3,02/15/1915,01/01
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,,0,CA_28036,CA_4,03/03/1915,03/01
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,34,,,,,0,CA_23233,CA_5,03/27/1915,04/01
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,,0,CA_405,CA_6,05/14/1915,05/01
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,4,,,,,0,CA_6233,CA_4,04/02/1915,04/01
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,,0,CA_40741,CA_4,04/02/1915,05/01
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,,0,CA_29033,CA_7,05/17/1915,06/01
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,,0,CA_20017,CA_8,06/03/1915,01/01


In [15]:
print("Timeframe end...")

# Season end: DIRECT_DIV_SEASON_END run through formatDateString2, row by row.
df100 = (
    df100
    .assign(AllocationTimeframeEnd='')
    .assign(AllocationTimeframeEnd=lambda frame: frame.apply(
        lambda row: formatDateString2(row['DIRECT_DIV_SEASON_END']),
        axis=1))
)
df100

Timeframe end...


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,ENF_CASE_NUMBER,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID,WaterSourceUUID,AllocationPriorityDate,AllocationTimeframeStart,AllocationTimeframeEnd
0,0,,,,,Not Determined,,,,,...,ENF03549,1573084800000000000,1577750400000000000,Santa Clara,98,,,,,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,,0,CA_60498,CA_2,04/18/2013,01/01,12/31
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,,0,CA_34881,CA_3,02/15/1915,01/01,12/31
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,,0,CA_28036,CA_4,03/03/1915,03/01,11/01
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,,,,,0,CA_23233,CA_5,03/27/1915,04/01,07/01
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,,0,CA_405,CA_6,05/14/1915,05/01,10/31
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,,,,,0,CA_6233,CA_4,04/02/1915,04/01,10/15
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,,0,CA_40741,CA_4,04/02/1915,05/01,11/30
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,,0,CA_29033,CA_7,05/17/1915,06/01,10/01
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,,0,CA_20017,CA_8,06/03/1915,01/01,12/31


In [16]:
print("Allocation amount adjust units")

# MAX_RATE_OF_DIVERSION is treated as gallons per minute (GPM); multiplying by
# convFact expresses it in cubic feet per second (CFS). Blank rates stay blank.
convFact = 0.00222800926
df100 = (
    df100
    .assign(AllocationAmount='')
    .assign(AllocationAmount=lambda frame: frame.apply(
        lambda row: float(row['MAX_RATE_OF_DIVERSION']) * convFact
        if str(row['MAX_RATE_OF_DIVERSION']) != '' else '',
        axis=1))
)

df100

Allocation amount adjust units


Unnamed: 0,WR_WATER_RIGHT_ID,APPLICATION_NUMBER,CERTIFICATE_ID,PERMIT_ID,LICENSE_ID,WATER_RIGHT_TYPE,WATER_RIGHT_STATUS,APPLICATION_NUMBER_PARTY,PWSS_ID,PRIORITY_DATE,...,ENF_CASE_START_DATE,ENF_CASE_CLOSURE_DATE,ENF_CASE_COUNTY,NUMBER_OF_ENFORCEMENT_CASE,SiteUUID,WaterSourceUUID,AllocationPriorityDate,AllocationTimeframeStart,AllocationTimeframeEnd,AllocationAmount
0,0,,,,,Not Determined,,,,,...,1573084800000000000,1577750400000000000,Santa Clara,98,,,,,,
1,1,T032025,,,,Temporary Permit,Cancelled,T032025,,,...,,,,0,CA_60498,CA_2,04/18/2013,01/01,12/31,
2,2,A000016,41,30,41,Appropriative,Licensed,A000016,,,...,,,,0,CA_34881,CA_3,02/15/1915,01/01,12/31,
3,3,A000018,2871,29,2871,Appropriative,Licensed,A000018,,,...,,,,0,CA_28036,CA_4,03/03/1915,03/01,11/01,
4,4,A000023,1986,273,1986,Appropriative,Licensed,A000023,,,...,,,,0,CA_23233,CA_5,03/27/1915,04/01,07/01,
5,5,A000026,36,4,36,Appropriative,Licensed,A000026,,,...,,,,0,CA_405,CA_6,05/14/1915,05/01,10/31,
6,6,A000027,3165,31,3165,Appropriative,Licensed,A000027,,,...,,,,0,CA_6233,CA_4,04/02/1915,04/01,10/15,
7,7,A000027A,2762,29,2762,Appropriative,Revoked,A000027A,,,...,,,,0,CA_40741,CA_4,04/02/1915,05/01,11/30,
8,8,A000042,211,274,211,Appropriative,Licensed,A000042,,,...,,,,0,CA_29033,CA_7,05/17/1915,06/01,10/01,
9,9,A000051,622,81,622,Appropriative,Licensed,A000051,,,...,,,,0,CA_20017,CA_8,06/03/1915,01/01,12/31,


In [17]:
print("Copying all columns...")

# (output column, df100 source column) pairs, kept side by side so the two name
# lists below cannot drift out of step.
# Not mapped yet: AllocationApplicationDate, AllocationCropDutyAmount
# (IRRIGATION_DEPLETION), AllocationExpirationDate (DATE_TERMINATED),
# WaterAllocationNativeURL (wris_link).
columnPairs = [
    ("SiteUUID", "SiteUUID"),
    ("WaterSourceUUID", "WaterSourceUUID"),
    ("AllocationNativeID", "APPLICATION_NUMBER"),
    ("BeneficialUseCategory", "USE_CODE"),
    ("AllocationOwner", "PRIMARY_OWNER_NAME"),
    ("AllocationTypeCV", "WATER_RIGHT_TYPE"),
    ("AllocationPriorityDate", "AllocationPriorityDate"),
    ("AllocationLegalStatusCV", "WATER_RIGHT_STATUS"),
    ("IrrigatedAcreage", "USE_NET_ACREAGE"),
    ("AllocationTimeframeStart", "AllocationTimeframeStart"),
    ("AllocationTimeframeEnd", "AllocationTimeframeEnd"),
    ("AllocationAmount", "AllocationAmount"),
    ("AllocationMaximum", "USE_DIRECT_DIV_ANNUAL_AMOUNT"),
]
destCols = [dest for dest, _ in columnPairs]
srsCols = [src for _, src in columnPairs]

# Columns are paired by position: the i-th source column fills the i-th
# destination column.
outdf100[destCols] = df100[srsCols]

outdf100

Copying all columns...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,,,,,,,,,Not Determined,,...,,,,,,,,,,
1,,CA_60498,,CA_2,,,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,,
2,,CA_34881,,CA_3,,,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,,
3,,CA_28036,,CA_4,,,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,,
4,,CA_23233,,CA_5,,,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,,
5,,CA_405,,CA_6,,,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,,
6,,CA_6233,,CA_4,,,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,,
7,,CA_40741,,CA_4,,,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,,
8,,CA_29033,,CA_7,,,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,,
9,,CA_20017,,CA_8,,,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,,


In [18]:
# Columns that carry the same constant value on every row.
print("Hard coded...")
outdf100 = outdf100.assign(
    OrganizationUUID="CSWRCB",
    VariableSpecificUUID="CSWRCB Allocation all",
    MethodUUID="CSWRCB-Water Rights",
    AllocationBasisCV="Unknown",
    # check this later: every right is labelled "Irrigation" regardless of its
    # BeneficialUseCategory
    PrimaryUseCategory="Irrigation",
    # publication date is the date the notebook is run
    DataPublicationDate=datetime.now().strftime('%m/%d/%Y'),
).replace(np.nan, '')   # columns that were never filled become empty strings

outdf100

Hard coded...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,,CSWRCB Allocation all,,CSWRCB-Water Rights,Irrigation,,,Not Determined,,...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [19]:
print("Droping null allocations...")
# Rows with neither an AllocationAmount nor an AllocationMaximum are written to
# waterallocations_missing.csv and removed from the output table.
outdf100 = outdf100.replace(np.nan, '')   # NaN -> '' so blank cells compare equal to ''
outdf100purge = outdf100.loc[
    outdf100["AllocationAmount"].eq('') & outdf100["AllocationMaximum"].eq('')
]
if not outdf100purge.empty:
    outdf100purge.to_csv('waterallocations_missing.csv')
    # the filtered frame keeps the original row labels, so its index is the drop list
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null allocations...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [20]:
print("Droping null SiteUUIDs...")
# Remove rows whose SiteUUID is blank, i.e. no site was matched for them.
outdf100nullID = outdf100.loc[outdf100["SiteUUID"].eq('')]
if not outdf100nullID.empty:
    # the filtered frame keeps the original row labels, so its index is the drop list
    dropIndex = outdf100nullID.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null SiteUUIDs...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [21]:
print("Droping null Priority date...")
# Remove rows whose AllocationPriorityDate is blank.
outdf100nullPR = outdf100.loc[outdf100["AllocationPriorityDate"].eq('')]
if not outdf100nullPR.empty:
    # the filtered frame keeps the original row labels, so its index is the drop list
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null Priority date...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [22]:
print("Droping null WaterSourceUUID ...")
# Remove rows whose WaterSourceUUID is blank, i.e. no water source was matched.
# (The holder name outdf100nullPR is reused here for the blank-WaterSourceUUID rows.)
outdf100nullPR = outdf100.loc[outdf100["WaterSourceUUID"].eq('')]
if not outdf100nullPR.empty:
    # the filtered frame keeps the original row labels, so its index is the drop list
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)
outdf100

Droping null WaterSourceUUID ...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [23]:
print("Droping duplicates...")
# Safety net: rows that repeat an earlier row in every column are written to
# waterallocations_duplicaterows.csv and then removed.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if not outdf100Duplicated.empty:
    outdf100Duplicated.to_csv("waterallocations_duplicaterows.csv")
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [24]:
print("Checking required is not null...")
# Columns that must hold a value in every allocation row.
# SiteUUID is deliberately not part of this check (kept as in the original list).
requiredCols = ["OrganizationUUID", "VariableSpecificUUID", "WaterSourceUUID",
                "MethodUUID", "AllocationPriorityDate"] #SiteUUID
# A required cell counts as missing when it is null (NaN/None) OR an empty string.
# Checking only == '' would let genuine nulls slip through unnoticed.
requiredValues = outdf100[requiredCols]
missingRequired = (requiredValues.isnull() | requiredValues.eq('')).any(axis=1)
outdf100_nullMand = outdf100.loc[missingRequired]
if len(outdf100_nullMand.index) > 0:
    # Offending rows are exported for manual inspection; outdf100 itself is not modified.
    outdf100_nullMand.to_csv('waterallocations_mandatoryFieldMissing.csv')  # index=False,
#ToDO: purge these cells if there is any missing? #For now left to be inspected

Checking required is not null...


In [25]:
print("Fields longer than 100 chars...")

# Columns of the allocation table to scan, in the order they are reported.
tarCols = [
    "OrganizationUUID", "SiteUUID", "VariableSpecificUUID", "WaterSourceUUID", "MethodUUID",
    "PrimaryUseCategory", "BeneficialUseCategory",
    "AllocationNativeID", "AllocationTypeCV", "AllocationOwner",
    "AllocationApplicationDate", "AllocationPriorityDate", "AllocationLegalStatusCV",
    "AllocationCropDutyAmount", "AllocationExpirationDate",
    "AllocationChangeApplicationIndicator", "LegacyAllocationIDs", "AllocationBasisCV",
    "AllocationTimeframeStart", "AllocationTimeframeEnd",
    "AllocationAmount", "AllocationMaximum",
    "PopulationServed", "PowerType", "GeneratedPowerCapacityMW", "IrrigatedAcreage",
    "AllocationCommunityWaterSupplySystem", "AllocationSDWISIdentifierCV",
    "AllocationAssociatedWithdrawalSiteIDs", "AllocationAssociatedConsumptiveUseSiteIDs",
    "WaterAllocationNativeURL",
    "CustomerTypeCV", "IrrigationMethodCV", "CropTypeCV", "CommunityWaterSupplySystem",
    "DataPublicationDate", "DataPublicationDOI",
]

for colVal in tarCols:
    # Echo the column name so progress is visible in the cell output.
    print(colVal)
    # Measure the text form of each cell so non-string values can be sized too.
    exceedsLimit = outdf100[colVal].apply(lambda cell: len(str(cell)) > 100)
    outdf100Long = outdf100[exceedsLimit]
    if outdf100Long.shape[0] > 0:
        print("There are rows with > 100 chars in "+colVal)

Fields longer than 100 chars...
OrganizationUUID
SiteUUID
VariableSpecificUUID
WaterSourceUUID
MethodUUID
PrimaryUseCategory
BeneficialUseCategory
AllocationNativeID
AllocationTypeCV
AllocationOwner
AllocationApplicationDate
AllocationPriorityDate
AllocationLegalStatusCV
AllocationCropDutyAmount
AllocationExpirationDate
AllocationChangeApplicationIndicator
LegacyAllocationIDs
AllocationBasisCV
AllocationTimeframeStart
AllocationTimeframeEnd
AllocationAmount
AllocationMaximum
PopulationServed
PowerType
GeneratedPowerCapacityMW
IrrigatedAcreage
AllocationCommunityWaterSupplySystem
AllocationSDWISIdentifierCV
AllocationAssociatedWithdrawalSiteIDs
AllocationAssociatedConsumptiveUseSiteIDs
WaterAllocationNativeURL
CustomerTypeCV
IrrigationMethodCV
CropTypeCV
CommunityWaterSupplySystem
DataPublicationDate
DataPublicationDOI


In [26]:
##### Do not run the following with the rest of the code  (it is for inspection)
print("Long site ids...")

# Total number of allocation rows, for reference.
print (len(outdf100.index))

# Rows whose SiteUUID text is longer than 250 characters.
# len(str(x)) is used instead of len(x) so that a non-string SiteUUID (e.g. NaN)
# does not raise TypeError; this matches the per-column length check on all fields.
outdf100Long = outdf100[outdf100['SiteUUID'].apply(lambda x: len(str(x)) > 250)]
# Flag read by the follow-up cell that removes these rows.
longSiteIDs = len(outdf100Long.index) > 0
if longSiteIDs:
    print("There are rows with too long siteids")
    # Export the offending rows (index included) for inspection.
    outdf100Long.to_csv("waterallocations_longsiteid.csv")  # index=False,
#outdf100

outdf100Long

Long site ids...
37783


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI


In [27]:
##### Optional clean-up: removes rows from outdf100. Requires longSiteIDs, which is set
##### by the "Long site ids..." inspection cell, so run that cell first.
print("Long site ids remove rows...")

if longSiteIDs:
    print("There are rows with too long siteids")
    # len(str(x)) instead of len(x): a non-string SiteUUID (e.g. NaN) would otherwise
    # raise TypeError. astype(bool) keeps the mask boolean even for an empty frame.
    siteIdTooLong = outdf100['SiteUUID'].apply(lambda x: len(str(x)) > 250).astype(bool)
    # Filter with the boolean mask rather than dropping by index label, so only the
    # matching rows are removed even if index labels are not unique.
    outdf100 = outdf100[~siteIdTooLong].reset_index(drop=True)

outdf100

Long site ids remove rows...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,CSWRCB,CA_60498,CSWRCB Allocation all,CA_2,CSWRCB-Water Rights,Irrigation,Dust Control,T032025,Temporary Permit,"569 EAST COUNTY BOULEVARD, LLC",...,,,,,,,,,01/23/2020,
1,CSWRCB,CA_34881,CSWRCB Allocation all,CA_3,CSWRCB-Water Rights,Irrigation,Domestic,A000016,Appropriative,LILAC HILLS ESTATES LP,...,,,,,,,,,01/23/2020,
2,CSWRCB,CA_28036,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000018,Appropriative,GLENN-COLUSA IRRIGATION DISTRICT,...,,,,,,,,,01/23/2020,
3,CSWRCB,CA_23233,CSWRCB Allocation all,CA_5,CSWRCB-Water Rights,Irrigation,Domestic,A000023,Appropriative,U.S. BUREAU OF RECLAMATION,...,,,,,,,,,01/23/2020,
4,CSWRCB,CA_405,CSWRCB Allocation all,CA_6,CSWRCB-Water Rights,Irrigation,Domestic,A000026,Appropriative,"JAMES S PHELPS, TRUSTEE",...,,,,,,,,,01/23/2020,
5,CSWRCB,CA_6233,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027,Appropriative,RECLAMATION DISTRICT #1004,...,,,,,,,,,01/23/2020,
6,CSWRCB,CA_40741,CSWRCB Allocation all,CA_4,CSWRCB-Water Rights,Irrigation,Irrigation,A000027A,Appropriative,RUTH SPENCE,...,,,,,,,,,01/23/2020,
7,CSWRCB,CA_29033,CSWRCB Allocation all,CA_7,CSWRCB-Water Rights,Irrigation,Domestic,A000042,Appropriative,MEADOW RANCH CORP,...,,,,,,,,,01/23/2020,
8,CSWRCB,CA_20017,CSWRCB Allocation all,CA_8,CSWRCB-Water Rights,Irrigation,Power,A000051,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,
9,CSWRCB,CA_29035,CSWRCB Allocation all,CA_9,CSWRCB-Water Rights,Irrigation,Power,A000052,Appropriative,SOUTHERN CALIFORNIA EDISON COMPANY,...,,,,,,,,,01/23/2020,


In [28]:
print("Writing outputs...")

# Persist the final allocation table to the path held in out_alloc:
# UTF-8 encoded, without the DataFrame index column.
outdf100.to_csv(
    out_alloc,
    encoding="utf-8",
    index=False,
)

print("Done Water Allocation")

Writing outputs...
Done Water Allocation
