In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse

In [2]:
# working directory
working_dir = "./ProcessedInputData"
# The path is relative, so running this cell a second time would try to enter
# ProcessedInputData/ProcessedInputData and raise. Only change directory when
# the kernel is not already sitting inside the working directory.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [3]:
# ---- Input files ----
fileInput1 = "Permits.csv"           # permits table
inp_wtrsrs = "watersources.csv"      # water sources look up
inp_sitdim = "sites.csv"             # sites look up

# ---- Output file ----
out_alloc = "waterallocations.csv"   # water allocation table

In [4]:
######## WaDE columns

# The following fields differ between the table here (edited by DPL) and the one on the schema website
# http://schema.westernstateswater.org/tables/Input_AllocationAmounts_fact.html
#   names used here:               BeneficialUseCategory, PrimaryUseCategory, AllocationTimeframeStart, AllocationTimeframeEnd, " "
#   other names (presumably site): BeneficialUseCategoryCV, PrimaryUseCategoryCV, TimeframeStartDate, TimeframeEndDate, Geometry

# Column order of the output table. The first five entries are the UUIDs that
# link to the dim tables:
# OrganizationUUID, SiteUUID, VariableSpecificUUID, WaterSourceUUID, MethodUUID
columns = [
    "OrganizationUUID",
    "SiteUUID",
    "VariableSpecificUUID",
    "WaterSourceUUID",
    "MethodUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "AllocationNativeID",
    "AllocationTypeCV",
    "AllocationOwner",
    "AllocationApplicationDate",
    "AllocationPriorityDate",
    "AllocationLegalStatusCV",
    "AllocationCropDutyAmount",
    "AllocationExpirationDate",
    "AllocationChangeApplicationIndicator",
    "LegacyAllocationIDs",
    "AllocationBasisCV",
    "AllocationTimeframeStart",
    "AllocationTimeframeEnd",
    "AllocationAmount",
    "AllocationMaximum",
    "PopulationServed",
    "PowerType",
    "GeneratedPowerCapacityMW",
    "IrrigatedAcreage",
    "AllocationCommunityWaterSupplySystem",
    "AllocationSDWISIdentifierCV",
    "AllocationAssociatedWithdrawalSiteIDs",
    "AllocationAssociatedConsumptiveUseSiteIDs",
    "WaterAllocationNativeURL",
    "CustomerTypeCV",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "DataPublicationDate",
    "DataPublicationDOI",
]

# Placeholder: data types could be specified per column name here, but that was not needed
dtypesx = ['']

In [5]:
### Target DataFrame

# Empty frame that carries only the WaDE column names; no dtypes are set here.
# TODO: assumes dtypes inferred from CO file
outdf100 = pd.DataFrame(data=None, columns=columns)

In [6]:
print("Reading inputs...")

# Every CSV is decoded as ISO-8859-1 (encoding="utf-8" is the alternative).

# permits
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

# sites look up
df500 = pd.read_csv(inp_sitdim, encoding="ISO-8859-1")

# water sources look up, keeping one row per (name, type) pair.
# The de-duplication becomes unnecessary once the water sources table itself
# is refined to contain no duplicates.
df400 = pd.read_csv(inp_wtrsrs, encoding="ISO-8859-1").drop_duplicates(
    subset=['WaterSourceName', 'WaterSourceTypeCV']
)
df400

Reading inputs...


Unnamed: 0,WaterSourceUUID,WaterSourceNativeID,WaterSourceName,WaterSourceTypeCV,WaterQualityIndicatorCV,GNISFeatureNameCV,Geometry
0,ND_1,1,Unspecified,Ground Water,Fresh,,
1,ND_2,2,Unspecified,Surface Water,Fresh,,
2,ND_3,3,Charbonneau Creek,Surface Water,Fresh,,
3,ND_4,4,Beaver Creek,Surface Water,Fresh,,
4,ND_5,5,Nelson Creek,Surface Water,Fresh,,
5,ND_6,6,Little Creek,Surface Water,Fresh,,
6,ND_7,7,Little Muddy River,Surface Water,Fresh,,
7,ND_8,8,Un-named coulee & Dry Fork Coulee,Surface Water,Fresh,,
8,ND_9,9,"Intermittent draws, trib. to Little Miss",Surface Water,Fresh,,
9,ND_10,10,Heart River,Surface Water,Fresh,,


In [7]:
# Keep a single row per water right (permit_num); a right may appear on
# several rows because it can have multiple sites/pods.
print("Dropping duplicates...")

print (len(df100.index))

# First occurrence of each permit_num is kept, then the index is renumbered.
df100 = df100.drop_duplicates(subset=['permit_num']).reset_index(drop=True)

print (len(df100.index))

df100

Dropping duplicates...
12181
7128


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,...,discharge_,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,...,,,,0,0.0,0.0,0.0,0.0,-99.789880,46.111300
1,2,1E,2,15310236CC,"HYDE, GEORGE H.",8/15/1901,Irrigation,Cancelled,,,...,,,,0,0.0,0.0,0.0,0.0,-103.752160,48.026220
2,3,2B,3,15310236BA,"SLATER, A. L.",9/2/1901,Irrigation,Cancelled,,,...,,,,0,0.0,0.0,0.0,0.0,-103.746910,48.037070
3,4,2D,4,14910026AB,"GUDMUNSEN, ROBERT AND LOWRAINE",1/26/1906,Irrigation,Perfected,4/30/1937,,...,,,,0,0.0,0.0,0.0,0.0,-103.441260,47.701760
4,5,3C,5,15009823DC,"HARTEL, LEMOINE",2/3/1906,Irrigation,Cancelled,1/10/1990,6/7/2017,...,,,,0,0.0,0.0,0.0,0.0,-103.184202,47.791708
5,6,4C,6,15310234DD,"WOOD, ARTHUR",4/18/1902,Irrigation,Cancelled,,,...,,,,0,0.0,0.0,0.0,0.0,-103.779470,48.026220
6,7,7D,7,15507714DA,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,3/16/1932,Irrigation,Conditionally Approved,2/19/1958,,...,,,,0,0.0,0.0,0.0,0.0,-100.520880,48.244860
7,8,7E,8,15110331AA,"LASSEY, JERRY AND RODNEY",9/21/1937,Irrigation,Perfected,,,...,,,,0,0.0,0.0,0.0,0.0,-103.907440,47.860910
8,9,8A,9,15509628C,"WEYRAUCH, DANIEL",6/18/1901,Irrigation,Perfected,2/2/1995,,...,,,,0,0.0,0.0,0.0,0.0,-103.038480,48.215410
9,10,8D,10,15510030BB,"ROLFSTAD, DORA",4/16/1932,Irrigation,Cancelled,,9/30/1993,...,,,,0,0.0,0.0,0.0,0.0,-103.601270,48.224830


In [8]:
print("Adding SiteUUID...")

def assignSiteID(colrowValue, df500):
    """Return the SiteUUID(s) whose SiteNativeID equals ``colrowValue``.

    Parameters
    ----------
    colrowValue : native site identifier to look up.
    df500 : sites look-up frame with 'SiteNativeID' and 'SiteUUID' columns.

    Returns
    -------
    str
        All matching SiteUUIDs joined with ', ' (in look-up order), or ''
        when the value is blank/null or nothing matches.
    """
    # Blank or missing identifier: nothing to look up.
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''

    matching_uuids = df500.loc[df500['SiteNativeID'] == colrowValue, 'SiteUUID']
    # Joining an empty selection yields '', which covers the no-match case.
    return ', '.join(map(str, matching_uuids))

# Append a 'SiteUUID' column holding, for each permit row, the SiteUUID(s)
# found in the sites look-up for that row's 'pod' value.
df100 = df100.assign(
    SiteUUID=df100['pod'].map(lambda pod_id: assignSiteID(pod_id, df500))
)
df100

Adding SiteUUID...


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,...,period_sta,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude,SiteUUID
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,...,,,0,0.0,0.0,0.0,0.0,-99.789880,46.111300,ND_1
1,2,1E,2,15310236CC,"HYDE, GEORGE H.",8/15/1901,Irrigation,Cancelled,,,...,,,0,0.0,0.0,0.0,0.0,-103.752160,48.026220,ND_2
2,3,2B,3,15310236BA,"SLATER, A. L.",9/2/1901,Irrigation,Cancelled,,,...,,,0,0.0,0.0,0.0,0.0,-103.746910,48.037070,ND_3
3,4,2D,4,14910026AB,"GUDMUNSEN, ROBERT AND LOWRAINE",1/26/1906,Irrigation,Perfected,4/30/1937,,...,,,0,0.0,0.0,0.0,0.0,-103.441260,47.701760,ND_4
4,5,3C,5,15009823DC,"HARTEL, LEMOINE",2/3/1906,Irrigation,Cancelled,1/10/1990,6/7/2017,...,,,0,0.0,0.0,0.0,0.0,-103.184202,47.791708,"ND_5, ND_52"
5,6,4C,6,15310234DD,"WOOD, ARTHUR",4/18/1902,Irrigation,Cancelled,,,...,,,0,0.0,0.0,0.0,0.0,-103.779470,48.026220,ND_6
6,7,7D,7,15507714DA,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,3/16/1932,Irrigation,Conditionally Approved,2/19/1958,,...,,,0,0.0,0.0,0.0,0.0,-100.520880,48.244860,ND_7
7,8,7E,8,15110331AA,"LASSEY, JERRY AND RODNEY",9/21/1937,Irrigation,Perfected,,,...,,,0,0.0,0.0,0.0,0.0,-103.907440,47.860910,"ND_8, ND_5521"
8,9,8A,9,15509628C,"WEYRAUCH, DANIEL",6/18/1901,Irrigation,Perfected,2/2/1995,,...,,,0,0.0,0.0,0.0,0.0,-103.038480,48.215410,"ND_9, ND_9355, ND_9937"
9,10,8D,10,15510030BB,"ROLFSTAD, DORA",4/16/1932,Irrigation,Cancelled,,9/30/1993,...,,,0,0.0,0.0,0.0,0.0,-103.601270,48.224830,ND_10


In [9]:
print("Water sources...")

def assignWaterSourceID2(colrowValue11, colrowValue22, df400):
    """Return the WaterSourceUUID matching a water source name and type.

    Parameters
    ----------
    colrowValue11 : water source name; None, NaN or a blank string falls
        back to 'Unspecified'.
    colrowValue22 : water source type; None, NaN or a blank string falls
        back to 'Unknown'.
    df400 : water sources look-up frame with 'WaterSourceName',
        'WaterSourceTypeCV' and 'WaterSourceUUID' columns.

    Returns
    -------
    The WaterSourceUUID of the first matching row, or '' when no row matches.
    """
    # Test for missing values BEFORE converting to str: str(np.nan) is 'nan'
    # and str(None) is 'None', neither of which would be recognised as missing
    # afterwards, so the defaults below would never be applied.
    colrowValue1 = '' if pd.isnull(colrowValue11) else str(colrowValue11).strip()
    colrowValue2 = '' if pd.isnull(colrowValue22) else str(colrowValue22).strip()
    if colrowValue1 == '':
        colrowValue1 = 'Unspecified'
    if colrowValue2 == '':
        colrowValue2 = 'Unknown'

    ml = df400.loc[(df400['WaterSourceName'] == colrowValue1)
                   & (df400['WaterSourceTypeCV'] == colrowValue2),
                   'WaterSourceUUID']
    if ml.empty:
        return ''
    # Several rows may match; the first one wins.
    return ml.iloc[0]

# Add an empty WaterSourceUUID column and turn every NaN in the frame into ''.
df100 = df100.assign(WaterSourceUUID='').replace(np.nan, '')

# Look up the UUID row by row from the (name, type) pair of each permit.
df100['WaterSourceUUID'] = [
    assignWaterSourceID2(source_name, source_type, df400)
    for source_name, source_type in zip(df100['source_nam'], df100['source'])
]

df100

Water sources...


Unnamed: 0,permit_ind,permit_num,pod_index,pod,permit_hol,priority_d,use_type,status,date_issue,date_cance,...,period_end,return_qua,held_acft,held_acre,held_rate,held_stora,longitude,latitude,SiteUUID,WaterSourceUUID
0,1,4407,1,13007302B,"KETTERLING, ROLAND & LORRAINE",3/4/1991,Irrigation,Denied,,,...,,0,0.0,0.0,0.0,0.0,-99.789880,46.111300,ND_1,ND_1
1,2,1E,2,15310236CC,"HYDE, GEORGE H.",8/15/1901,Irrigation,Cancelled,,,...,,0,0.0,0.0,0.0,0.0,-103.752160,48.026220,ND_2,ND_2
2,3,2B,3,15310236BA,"SLATER, A. L.",9/2/1901,Irrigation,Cancelled,,,...,,0,0.0,0.0,0.0,0.0,-103.746910,48.037070,ND_3,ND_2
3,4,2D,4,14910026AB,"GUDMUNSEN, ROBERT AND LOWRAINE",1/26/1906,Irrigation,Perfected,4/30/1937,,...,,0,0.0,0.0,0.0,0.0,-103.441260,47.701760,ND_4,ND_2
4,5,3C,5,15009823DC,"HARTEL, LEMOINE",2/3/1906,Irrigation,Cancelled,1/10/1990,6/7/2017,...,,0,0.0,0.0,0.0,0.0,-103.184202,47.791708,"ND_5, ND_52",ND_2
5,6,4C,6,15310234DD,"WOOD, ARTHUR",4/18/1902,Irrigation,Cancelled,,,...,,0,0.0,0.0,0.0,0.0,-103.779470,48.026220,ND_6,ND_2
6,7,7D,7,15507714DA,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,3/16/1932,Irrigation,Conditionally Approved,2/19/1958,,...,,0,0.0,0.0,0.0,0.0,-100.520880,48.244860,ND_7,ND_2
7,8,7E,8,15110331AA,"LASSEY, JERRY AND RODNEY",9/21/1937,Irrigation,Perfected,,,...,,0,0.0,0.0,0.0,0.0,-103.907440,47.860910,"ND_8, ND_5521",ND_3
8,9,8A,9,15509628C,"WEYRAUCH, DANIEL",6/18/1901,Irrigation,Perfected,2/2/1995,,...,,0,0.0,0.0,0.0,0.0,-103.038480,48.215410,"ND_9, ND_9355, ND_9937",ND_4
9,10,8D,10,15510030BB,"ROLFSTAD, DORA",4/16/1932,Irrigation,Cancelled,,9/30/1993,...,,0,0.0,0.0,0.0,0.0,-103.601270,48.224830,ND_10,ND_2


In [10]:
print("Copying all columns...")

# Each pair reads (destination WaDE column, source column in df100).
# Unzipping the pairs gives the two parallel lists used for the copy below.
destCols, srsCols = map(list, zip(
    ("SiteUUID",                 "SiteUUID"),
    ("WaterSourceUUID",          "WaterSourceUUID"),
    ("AllocationNativeID",       "permit_num"),
    ("AllocationLegalStatusCV",  "status"),
    ("BeneficialUseCategory",    "use_type"),
    ("AllocationOwner",          "permit_hol"),
    ("AllocationPriorityDate",   "priority_d"),
    ("AllocationAmount",         "req_rate"),
    ("AllocationMaximum",        "req_acft"),
    ("AllocationTimeframeStart", "period_sta"),
    ("AllocationTimeframeEnd",   "period_end"),
))
# Currently not copied from df100: AllocationTypeCV, AllocationApplicationDate,
# IrrigatedAcreage, AllocationCropDutyAmount, AllocationExpirationDate,
# WaterAllocationNativeURL.

outdf100[destCols] = df100[srsCols]

outdf100

Copying all columns...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,,ND_1,,ND_1,,,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,,
1,,ND_2,,ND_2,,,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,,
2,,ND_3,,ND_2,,,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,,
3,,ND_4,,ND_2,,,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,,
4,,"ND_5, ND_52",,ND_2,,,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,,
5,,ND_6,,ND_2,,,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,,
6,,ND_7,,ND_2,,,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,,
7,,"ND_8, ND_5521",,ND_3,,,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,,
8,,"ND_9, ND_9355, ND_9937",,ND_4,,,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,,
9,,ND_10,,ND_2,,,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,,


In [11]:
# Constant values, the same for every row
print("Hard coded...")

outdf100 = outdf100.assign(
    OrganizationUUID="NDSWC",
    VariableSpecificUUID="NDSWC Allocation all",
    MethodUUID="NDSWC-Water Rights",
    AllocationBasisCV="Unknown",
    PrimaryUseCategory="Irrigation",  # check this later
    # AllocationTimeframeStart / AllocationTimeframeEnd are not hard coded
    # ("01/01" and "12/31" were the candidate values).
    DataPublicationDate=datetime.now().strftime('%m/%d/%Y'),  # date of the code run
)

outdf100

Hard coded...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [12]:
print("Droping null allocations...")
# Rows where both AllocationAmount and AllocationMaximum are empty are saved
# to waterallocations_missing.csv and then removed.
outdf100 = outdf100.replace(np.nan, '')  # NaN -> blank string, so the == '' tests below see them
outdf100purge = outdf100.loc[
    (outdf100["AllocationAmount"] == '') & (outdf100["AllocationMaximum"] == '')
]
if outdf100purge.shape[0] > 0:
    outdf100purge.to_csv('waterallocations_missing.csv')
    # The purge frame keeps the labels of the rows it was selected from.
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null allocations...


  result = method(y)


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [13]:
print("Droping null SiteUUIDs...")
# Rows whose SiteUUID is a blank string are removed.
outdf100nullID = outdf100.loc[outdf100["SiteUUID"] == '']
if outdf100nullID.shape[0] > 0:
    dropIndex = outdf100nullID.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null SiteUUIDs...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [14]:
print("Droping null Priority date...")
# Rows whose AllocationPriorityDate is a blank string are removed.
outdf100nullPR = outdf100.loc[outdf100["AllocationPriorityDate"] == '']
if outdf100nullPR.shape[0] > 0:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

Droping null Priority date...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [15]:
print("Droping null WaterSourceUUID ...")
# Rows whose WaterSourceUUID is a blank string are removed.
# (The outdf100nullPR name is reused here for the selected rows.)
outdf100nullPR = outdf100.loc[outdf100["WaterSourceUUID"] == '']
if outdf100nullPR.shape[0] > 0:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)
outdf100

Droping null WaterSourceUUID ...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [16]:
print("Droping duplicates...")
# Safety net: rows that repeat an earlier row in every column are written to
# a CSV for inspection and then removed.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if outdf100Duplicated.shape[0] > 0:
    outdf100Duplicated.to_csv("waterallocations_duplicaterows.csv")
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

Droping duplicates...


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI
0,NDSWC,ND_1,NDSWC Allocation all,ND_1,NDSWC-Water Rights,Irrigation,Irrigation,4407,,"KETTERLING, ROLAND & LORRAINE",...,,,,,,,,,01/07/2020,
1,NDSWC,ND_2,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,1E,,"HYDE, GEORGE H.",...,,,,,,,,,01/07/2020,
2,NDSWC,ND_3,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2B,,"SLATER, A. L.",...,,,,,,,,,01/07/2020,
3,NDSWC,ND_4,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,2D,,"GUDMUNSEN, ROBERT AND LOWRAINE",...,,,,,,,,,01/07/2020,
4,NDSWC,"ND_5, ND_52",NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,3C,,"HARTEL, LEMOINE",...,,,,,,,,,01/07/2020,
5,NDSWC,ND_6,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,4C,,"WOOD, ARTHUR",...,,,,,,,,,01/07/2020,
6,NDSWC,ND_7,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,7D,,MCHENRY COUNTY BOARD OF FLOOD IRRIGATION,...,,,,,,,,,01/07/2020,
7,NDSWC,"ND_8, ND_5521",NDSWC Allocation all,ND_3,NDSWC-Water Rights,Irrigation,Irrigation,7E,,"LASSEY, JERRY AND RODNEY",...,,,,,,,,,01/07/2020,
8,NDSWC,"ND_9, ND_9355, ND_9937",NDSWC Allocation all,ND_4,NDSWC-Water Rights,Irrigation,Irrigation,8A,,"WEYRAUCH, DANIEL",...,,,,,,,,,01/07/2020,
9,NDSWC,ND_10,NDSWC Allocation all,ND_2,NDSWC-Water Rights,Irrigation,Irrigation,8D,,"ROLFSTAD, DORA",...,,,,,,,,,01/07/2020,


In [17]:
print("Checking required is not null...")
# Columns that must not be blank (SiteUUID is not part of this check)
requiredCols = ["OrganizationUUID", "VariableSpecificUUID", "WaterSourceUUID",
                "MethodUUID", "AllocationPriorityDate"]

# Select every row in which at least one required column is a blank string.
outdf100_nullMand = outdf100.loc[(outdf100[requiredCols] == '').any(axis=1)]
if outdf100_nullMand.shape[0] > 0:
    outdf100_nullMand.to_csv('waterallocations_mandatoryFieldMissing.csv')
# TODO: purge these rows if any are found? For now they are only written out for inspection.

Checking required is not null...


In [18]:
print("Writing outputs...")
# Save the allocations table as UTF-8, without the index column
outdf100.to_csv(out_alloc, encoding="utf-8", index=False)

print("Done Water Allocation")

Writing outputs...
Done Water Allocation


### The following code checks if the SiteIDs are too long (>250 characters)

In [20]:
##### Do not run the following with the rest of the code  (it is for inspection)
print("Long site ids...")

print (len(outdf100.index))

# Select rows whose SiteUUID string is longer than 250 characters
outdf100Long = outdf100[outdf100['SiteUUID'].map(len) > 250]
if outdf100Long.shape[0] > 0:
    print("There are rows with too long siteids")
outdf100Long

Long site ids...
7089


Unnamed: 0,OrganizationUUID,SiteUUID,VariableSpecificUUID,WaterSourceUUID,MethodUUID,PrimaryUseCategory,BeneficialUseCategory,AllocationNativeID,AllocationTypeCV,AllocationOwner,...,AllocationSDWISIdentifierCV,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,WaterAllocationNativeURL,CustomerTypeCV,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,DataPublicationDate,DataPublicationDOI


In [None]:
##### Do not run the following with the rest of the code  (it is for inspection)
print("Long site ids...")

print (len(outdf100.index))

# Rows whose SiteUUID string is longer than 250 characters are written to a
# CSV and then removed from the output frame.
outdf100Long = outdf100[outdf100['SiteUUID'].map(len) > 250]
if outdf100Long.shape[0] > 0:
    print("There are rows with too long siteids")
    outdf100Long.to_csv("waterallocations_longsiteid.csv")
    dropIndex = outdf100Long.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

print(len(outdf100Long))

print("Writing outputs...")
# Save the allocations table as UTF-8, without the index column
outdf100.to_csv(out_alloc, encoding="utf-8", index=False)

print("Done Water Allocation")

##### The following is an attempt to see whether the "Cross group error" on import to WaDE is fixed by making cells empty strings (rather than NaN)

In [None]:
##### Do not run the following with the rest of the code  (it is for inspection)
print("replace NAN with ''")

# Water allocation file to read back in (input of this step)
in_alloc = "waterallocations1.csv"
outdf100 = pd.read_csv(in_alloc, encoding="ISO-8859-1")  # or alternatively encoding="utf-8"

outdf100

In [None]:
# Replace every NaN with a blank string before writing the file again
outdf100 = outdf100.replace(np.nan, '')

print("Writing outputs...")
# output: water allocation (UTF-8, without the index column)
out_alloc = "waterallocations.csv"
outdf100.to_csv(out_alloc, encoding="utf-8", index=False)

print("Done Water Allocation")
outdf100