In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse

In [2]:
# Working directory: every input/output file name used below is relative to it.
working_dir = "./ProcessedInputData"
# Only change directory when not already inside it, so re-running this cell in a
# live kernel does not look for ./ProcessedInputData/ProcessedInputData and fail.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [None]:
# ---- Inputs (file names are relative to the working directory) ----
# main input table (read into df100 below)
fileInput1 = "wr_v_pod_public_xy.csv"
# water sources lookup
inp_wtrsrs = "watersources.csv"
# sites lookup
inp_sitdim = "sites.csv"

# ---- Output ----
# water allocations table
out_alloc = "waterallocations.csv"

In [None]:
######## WaDE columns
#
# The following fields differ between the table here (edited by DPL) and the one on the schema website
#   http://schema.westernstateswater.org/tables/Input_AllocationAmounts_fact.html
# Two name sets (the first row appears to be the one used in `columns` below):
#   BeneficialUseCategory, PrimaryUseCategory, AllocationTimeframeStart, AllocationTimeframeEnd, " "
#   BeneficialUseCategoryCV, PrimaryUseCategoryCV, TimeframeStartDate, TimeframeEndDate, Geometry
#
# UUIDs: the first five entries reference the dimension tables
# (OrganizationUUID, SiteUUID, VariableSpecificUUID, WaterSourceUUID, MethodUUID).
columns = [
    "OrganizationUUID",
    "SiteUUID",
    "VariableSpecificUUID",
    "WaterSourceUUID",
    "MethodUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "AllocationNativeID",
    "AllocationTypeCV",
    "AllocationOwner",
    "AllocationApplicationDate",
    "AllocationPriorityDate",
    "AllocationLegalStatusCV",
    "AllocationCropDutyAmount",
    "AllocationExpirationDate",
    "AllocationChangeApplicationIndicator",
    "LegacyAllocationIDs",
    "AllocationBasisCV",
    "AllocationTimeframeStart",
    "AllocationTimeframeEnd",
    "AllocationAmount",
    "AllocationMaximum",
    "PopulationServed",
    "PowerType",
    "GeneratedPowerCapacityMW",
    "IrrigatedAcreage",
    "AllocationCommunityWaterSupplySystem",
    "AllocationSDWISIdentifierCV",
    "AllocationAssociatedWithdrawalSiteIDs",
    "AllocationAssociatedConsumptiveUseSiteIDs",
    "WaterAllocationNativeURL",
    "CustomerTypeCV",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "DataPublicationDate",
    "DataPublicationDOI",
]

# Placeholder: data types per column could be listed here, but that was not needed.
dtypesx = ['']

In [None]:
### target dataFrame

# Empty frame carrying only the WaDE column headers from `columns`;
# its rows are filled from df100 in the "Copying all columns" cell.
# TODO: assumes dtypes inferred from CO file
outdf100=pd.DataFrame(columns=columns)

In [None]:
print("Reading inputs...")

# Main input table. All files are read as ISO-8859-1 (encoding="utf-8" is the alternative).
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")

# Sites lookup
df500 = pd.read_csv(inp_sitdim, encoding="ISO-8859-1")

# Water sources lookup, reduced to one row per (name, type) pair.
# The de-duplication becomes unnecessary once the water sources table itself has no duplicates.
df400 = pd.read_csv(inp_wtrsrs, encoding="ISO-8859-1").drop_duplicates(
    subset=['WaterSourceName', 'WaterSourceTypeCV'])
df400

In [None]:
# Keep a single row per water right (permit_nbr); a right may be listed on
# several rows because it can have multiple sites/PODs.
print("Dropping duplicates...")

print (len(df100.index))

# First occurrence of each permit_nbr is kept; the index is renumbered from 0.
df100 = df100.drop_duplicates(subset=['permit_nbr']).reset_index(drop=True)

print (len(df100.index))

df100

In [None]:
print("Adding SiteUUID...")

def assignSiteID(colrowValue, df500):
    """Look up the SiteUUID(s) registered for one native site id.

    Parameters
    ----------
    colrowValue : scalar
        Native id to look for in df500['SiteNativeID'].
    df500 : pandas.DataFrame
        Sites lookup with 'SiteNativeID' and 'SiteUUID' columns.

    Returns
    -------
    str
        All matching SiteUUID values joined with ', ' (in df500 row order),
        or '' when the id is blank/null or has no match.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    matchingUUIDs = df500.loc[df500['SiteNativeID'] == colrowValue, 'SiteUUID']
    # Joining an empty selection yields '', which is the "no match" result.
    return ', '.join(map(str, matchingUUIDs))

# Pre-create the column as empty strings, then fill it row by row.
df100 = df100.assign(SiteUUID='')

# The lookup key is each row's snp_id, matched against SiteNativeID in the sites table.
df100['SiteUUID'] = df100.apply(
    lambda rec: assignSiteID(rec['snp_id'], df500),
    axis=1)
df100

In [None]:
print("Water sources...")

# wr_type code -> WaterSourceTypeCV value used in the water sources lookup
WRSourceTypeCVDictOR = {
    "ST":"storage",
    "SW":"surface water",
    "GW":"groundwater"
}


def assignWaterSourceID2(colrowValue11, colrowValue22, df400):
    """Look up the WaterSourceUUID for a (source name, water-right type code) pair.

    Parameters
    ----------
    colrowValue11 : scalar
        Water source name; compared with df400['WaterSourceName'] after
        stripping surrounding whitespace.
    colrowValue22 : scalar
        Type code, a key of WRSourceTypeCVDictOR ("ST", "SW" or "GW").
    df400 : pandas.DataFrame
        Water sources lookup with 'WaterSourceName', 'WaterSourceTypeCV'
        and 'WaterSourceUUID' columns.

    Returns
    -------
    str
        WaterSourceUUID of the first matching row, or '' when the type code
        is blank/null or no row matches.

    Raises
    ------
    KeyError
        If the type code is non-blank but not a key of WRSourceTypeCVDictOR.
    """
    # Null handling must happen before str(): str(NaN) is 'nan', which is neither
    # blank nor null and used to fall through to a KeyError.
    sourceName = '' if pd.isnull(colrowValue11) else str(colrowValue11).strip()
    typeCode = '' if pd.isnull(colrowValue22) else str(colrowValue22).strip()

    # Without a type code nothing can match (this also covers "both blank").
    if typeCode == '':
        return ''

    # Unknown non-blank codes still fail loudly so they get noticed.
    sourceType = WRSourceTypeCVDictOR[typeCode]
    matches = df400.loc[(df400['WaterSourceName'] == sourceName)
                        & (df400['WaterSourceTypeCV'] == sourceType),
                        'WaterSourceUUID']
    return '' if matches.empty else matches.iloc[0]

# Pre-create the column, and turn every NaN in the table into '' so the
# lookup function receives blank strings for missing values.
df100 = df100.assign(WaterSourceUUID='').replace(np.nan, '')

# Key: the row's source name plus its wr_type code.
df100['WaterSourceUUID'] = df100.apply(
    lambda rec: assignWaterSourceID2(rec['source'], rec['wr_type'], df400),
    axis=1)

df100

In [None]:
print("AllocationTypeCV dictionary and function...")

# claim_char code -> allocation type text
claimCharDictOR = {
    "GR":"groundwater registrations",
    "PC":"power claim",
    "SW":"surface water registrations",
    "KL":"Klamath Adjudication claim",
    "KA":"Klamath Adjudication"
}

# permit_char code -> allocation type text
permitCharDictOR = {
    "E":"enlargement",
    "R":"reservoir",
    "G":"groundwater",
    "U":"underground",
    "S":"surface",
    "AL":"aquifer storage & recovery limited license"
}

def assignAllocTypeCVOR(colrowValue11, colrowValue22):
    """Derive the AllocationTypeCV text from a claim code and a permit code.

    The claim code takes precedence; the permit code is used only when the
    claim code is blank or null.

    Parameters
    ----------
    colrowValue11 : scalar
        Claim code, a key of claimCharDictOR (surrounding whitespace ignored).
    colrowValue22 : scalar
        Permit code, a key of permitCharDictOR (surrounding whitespace ignored).

    Returns
    -------
    str
        The mapped text, or '' when both codes are blank/null.

    Raises
    ------
    KeyError
        If the code that is used is non-blank but missing from its dictionary.
    """
    # Test for null before str(): str(NaN) / str(None) are non-blank text
    # ('nan' / 'None') and would be looked up as if they were codes.
    claimCode = '' if pd.isnull(colrowValue11) else str(colrowValue11).strip()
    permitCode = '' if pd.isnull(colrowValue22) else str(colrowValue22).strip()

    if claimCode != '':
        return claimCharDictOR[claimCode]
    if permitCode != '':
        return permitCharDictOR[permitCode]
    return ''



In [None]:
print("AllocationTypeCV...")

# Pre-create the column and blank out every NaN in the table before the lookup.
df100 = df100.assign(AllocationTypeCV='').replace(np.nan, '')

# The type is derived from the row's claim_char, falling back to permit_char.
df100['AllocationTypeCV'] = df100.apply(
    lambda rec: assignAllocTypeCVOR(rec['claim_char'], rec['permit_char']),
    axis=1)

df100

In [None]:
print("AllocationOwner functions...")

def assignownerName(colrowValue1, colrowValue2):
    """Combine two name parts into one owner string.

    Each part is stripped of surrounding whitespace; blank or null parts are
    left out. When both parts are present they are joined as
    "<part1>, <part2>" using the stripped text.

    Parameters
    ----------
    colrowValue1, colrowValue2 : scalar
        Name parts (the caller in this notebook passes last name, then first name).

    Returns
    -------
    str
        The combined name, a single part, or '' when both are blank/null.
    """
    nameParts = []
    for rawPart in (colrowValue1, colrowValue2):
        if pd.isnull(rawPart):
            continue
        # str() keeps non-text values (e.g. numbers) from failing on .strip()
        cleanPart = str(rawPart).strip()
        if cleanPart != '':
            nameParts.append(cleanPart)
    return ", ".join(nameParts)


def assignownerNameORCompany(colrowValue1, colrowValue2, colrowValue3):
    """Return the company name, or the combined personal name when there is no company.

    Parameters
    ----------
    colrowValue1 : scalar
        Company name; returned unchanged when it is neither '' nor null.
    colrowValue2, colrowValue3 : scalar
        Name parts handed to assignownerName when the company is missing.
    """
    companyMissing = colrowValue1 == '' or pd.isnull(colrowValue1)
    if companyMissing:
        return assignownerName(colrowValue2, colrowValue3)
    return colrowValue1

In [None]:
print("AllocationOwner...")

# Owner = company name when present, otherwise built from last and first name.
df100 = df100.assign(AllocationOwner='')
df100['AllocationOwner'] = df100.apply(
    lambda rec: assignownerNameORCompany(rec['name_company'],
                                         rec['name_last'],
                                         rec['name_first']),
    axis=1)
df100

In [None]:
print("Allocation priority date...")

# input format 1989-11-21T00:00:00.000
def formatDateString(inString):
    """Convert a 'YYYY-MM-DDT00:00:00.000' timestamp string to 'MM/DD/YYYY'.

    Parameters
    ----------
    inString : str or null
        Timestamp text whose time part is exactly 'T00:00:00.000'.

    Returns
    -------
    str
        The reformatted date, or '' when the input is blank, null, not a
        string, or not in the expected format.
    """
    try:
        if inString == '' or pd.isnull(inString):
            return ''
        parsedDate = datetime.strptime(inString, '%Y-%m-%dT00:00:00.000').date()
        return parsedDate.strftime('%m/%d/%Y')
    except (TypeError, ValueError):
        # Wrong type or wrong layout: treat as "no date". Other exceptions
        # (e.g. KeyboardInterrupt) are deliberately not swallowed.
        return ''

# Pre-create the column, then fill it with the reformatted priority_date of each row.
df100 = df100.assign(AllocationPriorityDate='')

df100['AllocationPriorityDate'] = df100.apply(
    lambda rec: formatDateString(rec['priority_date']),
    axis=1)

df100

In [None]:
# 
def formatDateString(inString1, inString2):
    """Two-argument variant of the date formatter; returns a string.

    NOTE(review): this definition reuses the name of the one-argument
    formatDateString defined in the previous cell, so once this cell runs the
    earlier version is no longer reachable by that name. No call to this
    two-argument form appears in the visible cells.

    NOTE(review): as written this looks like it returns '' for any scalar
    inputs -- see the notes on the condition and on the strptime call below.
    Confirm the intended behavior (in particular the role of inString2)
    before relying on it.
    """
    #print(inString)
    try:
        # NOTE(review): `|` binds tighter than `==`, so this is parsed as the chained
        # comparison  inString1 == ('' | pd.isnull(inString1) | inString2) == ('' | pd.isnull(inString2))
        # rather than four alternatives. For scalar inputs `'' | <bool>` presumably
        # raises TypeError, which the bare except below turns into ''.
        if inString1 == '' | pd.isnull(inString1) | inString2 == '' | pd.isnull(inString2):
            valndf = ''
        else:
            # NOTE(review): `inString` is neither a parameter nor a local here; unless a
            # global of that name exists this raises NameError, also swallowed below.
            valD = datetime.strptime(inString, '%Y-%m-%dT00:00:00.000')
            #print(valD)
            valnDd = valD.date()
            #print(valnDd)
            valndf = valnDd.strftime('%m/%d/%Y')
            #print('date:', valndf)
    except:
        # Any failure above (including the ones noted) results in an empty string.
        valndf = ''

    return valndf

In [None]:
print("Timeframe start...")

# Pre-create the column that is filled just below and later copied to the
# output table (same pattern as AllocationTimeframeEnd), so that no stray
# 'TimeframeStart' column is left in df100.
df100 = df100.assign(AllocationTimeframeStart='')

# Build "month/day" text from begin_month and begin_day. int() presumably
# normalizes values read as floats (e.g. 5.0 -> 5).
# NOTE(review): int() raises ValueError on a blank value, so every row is
# expected to carry both fields -- confirm against the input data.
df100['AllocationTimeframeStart'] = df100.apply(lambda row: 
                      str(int(row['begin_month'])).strip() + '/' + str(int(row['begin_day'])).strip(),
                                axis=1)
df100

In [None]:
print("Timeframe end...")

# Pre-create the column, then fill it with "month/day" text built from the
# integer values of end_month and end_day.
df100 = df100.assign(AllocationTimeframeEnd='')

df100['AllocationTimeframeEnd'] = df100.apply(
    lambda rec: '{}/{}'.format(int(rec['end_month']), int(rec['end_day'])),
    axis=1)
df100

In [None]:
print("Copying all columns...")

# (destination column in outdf100, source column in df100), copied pair by pair.
# Pairs that are commented out are candidate mappings that are currently not copied.
columnPairs = [
    ("SiteUUID",                 "SiteUUID"),
    ("WaterSourceUUID",          "WaterSourceUUID"),
    ("AllocationNativeID",       "permit_nbr"),
    #("AllocationLegalStatusCV", "WaRecProcessStatusTypeCode"),
    ("BeneficialUseCategory",    "use_code_description"),
    ("AllocationOwner",          "AllocationOwner"),
    ("AllocationTypeCV",         "AllocationTypeCV"),
    #("AllocationApplicationDate", "AllocationApplicationDate"),
    ("AllocationPriorityDate",   "AllocationPriorityDate"),
    ("AllocationAmount",         "rate_cfs"),
    ("AllocationMaximum",        "max_rate_acre_feet"),
    #("IrrigatedAcreage",        "IrrigatedAreaQuantity"),
    #("AllocationCropDutyAmount", "IRRIGATION_DEPLETION"),
    #("AllocationExpirationDate", "DATE_TERMINATED"),
    ("AllocationTimeframeStart", "AllocationTimeframeStart"),
    ("AllocationTimeframeEnd",   "AllocationTimeframeEnd"),
    ("WaterAllocationNativeURL", "wris_link"),
]
destCols = [destName for destName, srcName in columnPairs]
srsCols = [srcName for destName, srcName in columnPairs]

outdf100[destCols] = df100[srsCols]

outdf100

In [None]:
# hard coded
print("Hard coded...")

# Constant value written to every row of the named column.
hardCodedValues = {
    "OrganizationUUID": "OWRD",
    "VariableSpecificUUID": "OWRD Allocation all",
    "MethodUUID": "OWRD-Water Rights",
    "AllocationBasisCV": "Unknown",
    # check this later
    "PrimaryUseCategory": "Irrigation",
    # Not hard coded (left disabled):
    #   AllocationTimeframeStart = "01/01", AllocationTimeframeEnd = "12/31"
    # Publication date = the date this code is run, e.g. "10/31/2019"
    "DataPublicationDate": datetime.now().strftime('%m/%d/%Y'),
}
for columnName, constantValue in hardCodedValues.items():
    outdf100[columnName] = constantValue

outdf100

In [None]:
print("Droping null allocations...")
# Rows where both AllocationAmount and AllocationMaximum are empty are saved to
# waterallocations_missing.csv and removed from the table.
outdf100 = outdf100.replace(np.nan, '')  # NaN -> '' so "empty" can be tested as == ''
amountBlank = outdf100["AllocationAmount"] == ''
maximumBlank = outdf100["AllocationMaximum"] == ''
outdf100purge = outdf100.loc[amountBlank & maximumBlank]
if not outdf100purge.empty:
    outdf100purge.to_csv('waterallocations_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100
#outdf100purge

In [None]:
print("Droping null SiteUUIDs...")
# Rows whose SiteUUID is the empty string are removed and the index is renumbered.
outdf100nullID = outdf100.loc[outdf100["SiteUUID"] == '']
if not outdf100nullID.empty:
    dropIndex = outdf100nullID.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100

In [None]:
print("Droping null Priority date...")
# Rows whose AllocationPriorityDate is the empty string are removed and the index is renumbered.
outdf100nullPR = outdf100.loc[outdf100["AllocationPriorityDate"] == '']
if not outdf100nullPR.empty:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

outdf100
#outdf100nullPR

In [None]:
print("Droping null WaterSourceUUID ...")
# Rows whose WaterSourceUUID is the empty string are removed and the index is renumbered.
# (The selection reuses the outdf100nullPR name from the priority-date cell.)
outdf100nullPR = outdf100.loc[outdf100["WaterSourceUUID"] == '']
if not outdf100nullPR.empty:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)
outdf100

In [None]:
print("Droping duplicates...")
# Safety net: fully identical rows (beyond their first occurrence) are saved to
# waterallocations_duplicaterows.csv and removed.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if not outdf100Duplicated.empty:
    outdf100Duplicated.to_csv("waterallocations_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

outdf100

In [None]:
print("Checking required is not null...")
# Columns that must not be empty (SiteUUID is deliberately not in the list).
requiredCols = ["OrganizationUUID", "VariableSpecificUUID", "WaterSourceUUID", 
                "MethodUUID", "AllocationPriorityDate"] #SiteUUID

# A row is reported when any of the required columns holds the empty string.
anyRequiredBlank = (outdf100[requiredCols] == '').any(axis=1)
outdf100_nullMand = outdf100.loc[anyRequiredBlank]
if not outdf100_nullMand.empty:
    outdf100_nullMand.to_csv('waterallocations_mandatoryFieldMissing.csv')  # index=False,
#ToDO: purge these cells if there is any missing? #For now left to be inspected
#outdf100_nullMand

In [None]:
print("Writing outputs...")
# Write the final allocations table to out_alloc as UTF-8, without the index column.
outdf100.to_csv(out_alloc, index=False, encoding = "utf-8")

print("Done Water Allocation")

### Do not run the following with the rest of the code  (it is for inspection)

In [None]:
##### Do not run the following with the rest of the code  (it is for inspection)
print("Long site ids...")

# Allocations file that is re-read here for inspection
in_alloc = "waterallocations1.csv"
outdf100 = pd.read_csv(in_alloc, encoding = "ISO-8859-1") #, or alternatively encoding = "utf-8"
print (len(outdf100.index))

# Rows whose SiteUUID text is longer than 250 characters are saved to
# waterallocations_longsiteid.csv and removed.
# read_csv turns empty cells into NaN (a float), on which len() fails, so the
# length is measured on text with missing values counted as length 0.
siteIdTooLong = outdf100['SiteUUID'].fillna('').astype(str).str.len() > 250
outdf100Long = outdf100[siteIdTooLong]
if len(outdf100Long.index) > 0:
    print("There are rows with too long siteids")
    outdf100Long.to_csv("waterallocations_longsiteid.csv")  # index=False,
    dropIndex = outdf100Long.index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)

print(len(outdf100Long))

print("Writing outputs...")
# Write the filtered table to the regular output file name.
out_alloc = "waterallocations.csv"
outdf100.to_csv(out_alloc, index=False, encoding = "utf-8")

print("Done Water Allocation")

In [3]:
##### Do not run the following with the rest of the code  (it is for inspection)
# This cell only reloads the file; the NaN -> '' replacement announced by the
# message below is done in the next cell.
print("replace NAN with ''")

# Allocations file that is re-read here for inspection.
# NOTE(review): the name differs from out_alloc ("waterallocations.csv") -- presumably a renamed copy; confirm.
in_alloc = "waterallocations1.csv"    #output
# NOTE(review): the saved output of this cell shows an "Unnamed: 0" column, which suggests
# the file was written with its index -- confirm whether that column should be kept.
outdf100 = pd.read_csv(in_alloc, encoding = "ISO-8859-1") #, or alternatively encoding = "utf-8"

outdf100

replace NAN with ''


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,OrganizationUUID,VariableSpecificUUID,WaterSourceUUID,SiteUUID,MethodUUID,PrimaryUseCategory,DataPublicationDate,DataPublicationDOI,BeneficialUseCategory,AllocationNativeID,...,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,AllocationChangeApplicationIndicator,LegacyAllocationIDs,CropTypeCV,CustomerTypeCV,IrrigationMethodCV,WaterAllocationNativeURL,CommunityWaterSupplySystem,PowerType
0,OWRD,OWRD Allocation all,OR_4,OR_3,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12684.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
1,OWRD,OWRD Allocation all,OR_4,OR_4,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12750.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
2,OWRD,OWRD Allocation all,OR_4,OR_5,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12779.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
3,OWRD,OWRD Allocation all,OR_445,"OR_6, OR_7, OR_8, OR_9",OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13450.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
4,OWRD,OWRD Allocation all,OR_4,OR_10,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13525.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
5,OWRD,OWRD Allocation all,OR_4,OR_11,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13584.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
6,OWRD,OWRD Allocation all,OR_4,OR_12,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,14006.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
7,OWRD,OWRD Allocation all,OR_4,OR_13,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,15090.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
8,OWRD,OWRD Allocation all,OR_16821,OR_18,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,46057.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
9,OWRD,OWRD Allocation all,OR_3191,"OR_19, OR_20, OR_21, OR_22, OR_4686",OWRD-Water Rights,Irrigation,12/30/2019,,MINING,46165.0,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,


In [4]:
# Replace every NaN in the reloaded table with an empty string.
outdf100 = outdf100.replace(np.nan, '')

print("Writing outputs...")
# Write the cleaned table to the regular output file name as UTF-8, without the index column.
out_alloc = "waterallocations.csv"    #output
outdf100.to_csv(out_alloc, index=False, encoding = "utf-8")

print("Done Water Allocation")
outdf100

Writing outputs...
Done Water Allocation


Unnamed: 0,OrganizationUUID,VariableSpecificUUID,WaterSourceUUID,SiteUUID,MethodUUID,PrimaryUseCategory,DataPublicationDate,DataPublicationDOI,BeneficialUseCategory,AllocationNativeID,...,AllocationAssociatedWithdrawalSiteIDs,AllocationAssociatedConsumptiveUseSiteIDs,AllocationChangeApplicationIndicator,LegacyAllocationIDs,CropTypeCV,CustomerTypeCV,IrrigationMethodCV,WaterAllocationNativeURL,CommunityWaterSupplySystem,PowerType
0,OWRD,OWRD Allocation all,OR_4,OR_3,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12684,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
1,OWRD,OWRD Allocation all,OR_4,OR_4,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12750,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
2,OWRD,OWRD Allocation all,OR_4,OR_5,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,12779,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
3,OWRD,OWRD Allocation all,OR_445,"OR_6, OR_7, OR_8, OR_9",OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13450,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
4,OWRD,OWRD Allocation all,OR_4,OR_10,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13525,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
5,OWRD,OWRD Allocation all,OR_4,OR_11,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,13584,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
6,OWRD,OWRD Allocation all,OR_4,OR_12,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,14006,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
7,OWRD,OWRD Allocation all,OR_4,OR_13,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,15090,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
8,OWRD,OWRD Allocation all,OR_16821,OR_18,OWRD-Water Rights,Irrigation,12/30/2019,,MINING,46057,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
9,OWRD,OWRD Allocation all,OR_3191,"OR_19, OR_20, OR_21, OR_22, OR_4686",OWRD-Water Rights,Irrigation,12/30/2019,,MINING,46165,...,,,,,,,,http://apps.wrd.state.or.us/apps/wr/wrinfo/wr_...,,
