In [2]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
from waterallocationsFunctions import *

In [3]:
# Working directory: every input/output file name used below is a bare relative
# name, so it is resolved against this folder after the chdir.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
working_dir = "C:/tseg/OKTest"
os.chdir(working_dir)

In [None]:
# ---- Inputs --------------------------------------------------------------
fileInput1 = 'Permitted_Groundwater_Wells.csv'               # groundwater wells
FileInput2 = 'Permitted_Surface_Water_Diversion_Points.csv'  # points of diversion
FileInput3 = 'Areas_of_Use.csv'                              # areas of use

# ---- Look-up tables ------------------------------------------------------
inp_wtrsrs = 'watersources.csv'   # water sources
inp_sitdim = 'sites.csv'          # sites

# ---- Output --------------------------------------------------------------
out_alloc = 'waterallocations.csv'   # water allocations

In [4]:
######## WaDE columns

# The following fields differ between the table used here (edited by DPL) and
# the one published on the schema website:
# http://schema.westernstateswater.org/tables/Input_AllocationAmounts_fact.html
#
#   here:    BeneficialUseCategory, PrimaryUseCategory,
#            AllocationTimeframeStart, AllocationTimeframeEnd, " "
#   website: BeneficialUseCategoryCV, PrimaryUseCategoryCV,
#            TimeframeStartDate, TimeframeEndDate, Geometry

# Ordered column layout of the output table. The first five entries are the
# UUIDs that link to the dimension tables.
columns = [
    # UUIDs
    "OrganizationUUID",
    "SiteUUID",
    "VariableSpecificUUID",
    "WaterSourceUUID",
    "MethodUUID",
    # use categories
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    # allocation description
    "AllocationNativeID",
    "AllocationTypeCV",
    "AllocationOwner",
    "AllocationApplicationDate",
    "AllocationPriorityDate",
    "AllocationLegalStatusCV",
    "AllocationCropDutyAmount",
    "AllocationExpirationDate",
    "AllocationChangeApplicationIndicator",
    "LegacyAllocationIDs",
    "AllocationBasisCV",
    "AllocationTimeframeStart",
    "AllocationTimeframeEnd",
    # amounts
    "AllocationAmount",
    "AllocationMaximum",
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    # associations and miscellaneous
    "AllocationCommunityWaterSupplySystem",
    "AllocationSDWISIdentifierCV",
    "AllocationAssociatedWithdrawalSiteIDs",
    "AllocationAssociatedConsumptiveUseSiteIDs",
    "WaterAllocationNativeURL",
    "CustomerTypeCV",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "DataPublicationDate",
    "DataPublicationDOI",
]

# Placeholder: per-column data types could be listed here, but none were needed.
dtypesx = ['']

In [None]:
### Target DataFrame

# Empty output table that carries only the WaDE column layout; rows are added
# by the later cells. No dtypes are declared here, so they are whatever pandas
# infers when the columns are filled.
outdf100=pd.DataFrame(columns=columns)

In [None]:
print("Reading inputs...")

# All inputs are read as ISO-8859-1 (utf-8 is the alternative if the exports change).

# Groundwater wells
df100_l = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(len(df100_l.index))

# Surface water diversion points
df200 = pd.read_csv(FileInput2, encoding="ISO-8859-1")
print(len(df200.index))

# Left join on OBJECTID: every groundwater row is kept.
# TODO: check whether the join type needs to be refined.
# NOTE(review): if the two extracts share column names besides OBJECTID, pandas
# renames them with _x/_y suffixes, while later cells look up unsuffixed names
# such as 'Permit Number' and 'Water Type' -- confirm against the real CSV headers.
df100 = df100_l.merge(df200, left_on='OBJECTID', right_on='OBJECTID', how='left')
print(len(df100.index))

# Water sources look-up. De-duplicated on name; this becomes unnecessary once
# the water sources table itself no longer contains duplicates.
df400 = pd.read_csv(inp_wtrsrs, encoding="ISO-8859-1").drop_duplicates(subset=['WaterSourceName'])

# Sites look-up
df500 = pd.read_csv(inp_sitdim, encoding="ISO-8859-1")

In [None]:
# Keep one row per water right, identified by its permit number.
print("Dropping duplicates...")

# Fixes two defects in the previous version:
#  * drop_duplicates(..., inplace=True) returns None, so assigning its result
#    turned the frame into None and the following reset_index call failed;
#  * it was applied to outdf100, the still-empty output table, which has no
#    'Permit Number' column. The permit numbers live in the source frame df100.
df100 = df100.drop_duplicates(subset=['Permit Number']).reset_index(drop=True)

In [7]:
print("Adding SiteUUID...")


def _site_uuid_for_row(row):
    """Look up the SiteUUID for one source row from its permit number.

    Delegates to assignSiteID (star-imported from waterallocationsFunctions)
    with the sites look-up table df500.
    """
    return assignSiteID(row['Permit Number'], df500)


# Create the column as empty strings first, then fill it row by row.
df100 = df100.assign(SiteUUID='')
df100['SiteUUID'] = df100.apply(_site_uuid_for_row, axis=1)


Adding SiteUUID...


In [27]:
print("Water sources...")


def _water_source_uuid_for_row(row):
    """Return 'OK_1' when the row's 'Water Type' is 'Groundwater', else 'UT_2'.

    NOTE(review): the 'UT_' prefix looks inconsistent with 'OK_1' -- confirm the
    identifier against watersources.csv.
    NOTE(review): .strip() assumes every 'Water Type' value is a string; a
    missing (NaN) value would raise here -- confirm the column has no blanks.
    """
    if row['Water Type'].strip() == 'Groundwater':
        return 'OK_1'
    return 'UT_2'


# Create the column as empty strings first, then fill it row by row.
df100 = df100.assign(WaterSourceUUID='')
df100['WaterSourceUUID'] = df100.apply(_water_source_uuid_for_row, axis=1)

In [None]:
print("Copying all columns...")

# (source column in df100, destination column in outdf100), kept side by side
# so the pairing cannot drift out of step.
_column_pairs = [
    ("SiteUUID", "SiteUUID"),
    ("WaterSourceUUID", "WaterSourceUUID"),
    ("Permit Number", "AllocationNativeID"),
    ("Permit Type", "AllocationTypeCV"),
    ("Status", "AllocationLegalStatusCV"),
    ("Primary Purpose", "BeneficialUseCategory"),
    ("Entity Name", "AllocationOwner"),
    ("Date Filed", "AllocationApplicationDate"),
    ("Date Issued", "AllocationPriorityDate"),
    # ("", "AllocationAmount"),
    ("Total Amount (AFY)", "AllocationMaximum"),
    # ("Areas_of_Use.SHAPE.AREA", "IrrigatedAcreage"),
    # ("IRRIGATION_DEPLETION", "AllocationCropDutyAmount"),
    # ("DATE_TERMINATED", "AllocationExpirationDate"),
    # ("USE_BEG_DATE", "AllocationTimeframeStart"),
    # ("USE_END_DATE", "AllocationTimeframeEnd"),
]

sourCols = [source_name for source_name, _dest_name in _column_pairs]
destCols = [dest_name for _source_name, dest_name in _column_pairs]

# Copy the source columns into their destination columns in one assignment.
outdf100[destCols] = df100[sourCols]

In [None]:
# Constant values that are the same for every output row.
print("Hard coded...")

# Columns are set with bracket indexing on purpose: attribute assignment
# (outdf100.Name = value) only updates a column that already exists under that
# exact name; otherwise it sets a plain Python attribute and writes nothing
# to the table.
outdf100["OrganizationUUID"] = "OWRB"
outdf100["VariableSpecificUUID"] = "OWRB Allocation All"
outdf100["MethodUUID"] = "OK_WaterAllocation"
outdf100["AllocationBasisCV"] = "Unknown"
# check this later
outdf100["PrimaryUseCategory"] = "Irrigation"
# Fix: these were written to 'TimeframeStart' / 'TimeframeEnd', which are not
# columns of the output table, so the timeframe never reached the output file.
outdf100["AllocationTimeframeStart"] = "01/01"
outdf100["AllocationTimeframeEnd"] = "12/31"
# Publication date is the date the notebook is run (MM/DD/YYYY).
outdf100["DataPublicationDate"] = datetime.now().strftime('%m/%d/%Y')

In [None]:
print("Droping null allocations...")
# A row is dropped, and saved to waterallocations_missing.csv, when BOTH the
# allocation amount and the allocation maximum are empty.
#
# Fix: "empty" now covers NaN as well as ''. Blank CSV cells are read as NaN and
# columns that were never filled are NaN too, so comparing against '' alone
# never matched anything.
amount_missing = outdf100["AllocationAmount"].isna() | (outdf100["AllocationAmount"] == '')
maximum_missing = outdf100["AllocationMaximum"].isna() | (outdf100["AllocationMaximum"] == '')
no_allocation = amount_missing & maximum_missing

outdf100purge = outdf100.loc[no_allocation]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('waterallocations_missing.csv')    #index=False,
    outdf100 = outdf100.loc[~no_allocation].reset_index(drop=True)

In [None]:
print("Droping null SiteUUIDs...")
# Remove rows whose SiteUUID is an empty string, then renumber the index.
# NOTE(review): only '' is matched; if assignSiteID can return NaN/None for an
# unmatched permit, those rows pass through -- confirm its return value.
blank_site_uuid = outdf100["SiteUUID"] == ''
outdf100nullID = outdf100.loc[blank_site_uuid]
if len(outdf100nullID.index) > 0:
    dropIndex = outdf100nullID.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

In [None]:
print("Droping null Priority date...")
# Remove rows without a priority date, then renumber the index.
#
# Fix: a missing date is now detected whether it is NaN or ''. The column is
# copied from the input CSV, where blank cells are read as NaN, so comparing
# against '' alone let those rows through.
priority_date_missing = (outdf100["AllocationPriorityDate"].isna()
                         | (outdf100["AllocationPriorityDate"] == ''))
outdf100nullPR = outdf100.loc[priority_date_missing]
if len(outdf100nullPR.index) > 0:
    outdf100 = outdf100.loc[~priority_date_missing].reset_index(drop=True)

In [None]:
print("Droping null WaterSourceUUID ...")
# Remove rows whose WaterSourceUUID is an empty string, then renumber the index.
blank_water_source = outdf100["WaterSourceUUID"] == ''
outdf100nullPR = outdf100.loc[blank_water_source]
if len(outdf100nullPR.index) > 0:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

In [None]:
print("Droping duplicates...")
# Safety net: fully identical rows are saved to a side file for inspection and
# then removed, keeping the first occurrence of each.
repeated_rows = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[repeated_rows]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("waterallocations_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)

In [None]:
print("Checking required is not null...")
# Columns that must be filled in every output row (SiteUUID is deliberately
# left out of this check).
requiredCols = ["OrganizationUUID", "VariableSpecificUUID", "WaterSourceUUID",
                "MethodUUID", "AllocationPriorityDate"] #SiteUUID

# Fix: the mask is now built from requiredCols (the list was defined but unused
# while the same columns were spelled out by hand) and treats both NaN and ''
# as missing. Previously NaN values went unreported.
required_values = outdf100[requiredCols]
missing_required = (required_values.isna() | (required_values == '')).any(axis=1)

# Offending rows are only written out for inspection; they are not removed.
# TODO: decide whether rows with a missing mandatory field should be purged.
outdf100_nullMand = outdf100.loc[missing_required]
if len(outdf100_nullMand.index) > 0:
    outdf100_nullMand.to_csv('waterallocations_mandatoryFieldMissing.csv')  # index=False,

In [None]:
print("Writing outputs...")
# Save the finished allocation table as UTF-8 CSV, without the row index.
outdf100.to_csv(out_alloc, encoding="utf-8", index=False)

print("Done Water Allocation")