In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
from waterallocationsFunctions import *

In [2]:
# Working directory: the relative file names used by this notebook are resolved
# against it. The historical location stays the default; set the
# WADE_WORKING_DIR environment variable to run the notebook from another
# folder without editing this cell.
working_dir = os.environ.get("WADE_WORKING_DIR", "C:/tseg/OKTest")
os.chdir(working_dir)

In [3]:
# --- Input files ---
# Groundwater wells, surface-water points of diversion, and areas of use.
# NOTE(review): FileInput3 is not read by any cell visible here -- confirm it is still needed.
fileInput1, FileInput2, FileInput3 = (
    'Permitted_Groundwater_Wells.csv',
    'Permitted_Surface_Water_Diversion_Points.csv',
    'Areas_of_Use.csv',
)

# --- Lookup tables ---
inp_wtrsrs = 'watersources.csv'   # water sources
inp_sitdim = "sites.csv"          # sites

# --- Output ---
out_alloc = 'waterallocations.csv'   # water allocation table

In [4]:
######## WaDE columns

# The following fields differ between the table used here (edited by DPL) and
# the one published on the schema website
# http://schema.westernstateswater.org/tables/Input_AllocationAmounts_fact.html
#
#   here:    BeneficialUseCategory, PrimaryUseCategory, AllocationTimeframeStart, AllocationTimeframeEnd, " "
#   website: BeneficialUseCategoryCV, PrimaryUseCategoryCV, TimeframeStartDate, TimeframeEndDate, Geometry
#
# UUIDs: add UUIDs for all dim tables
# (OrganizationUUID, SiteUUID, VariableSpecificUUID, WaterSourceUUID, MethodUUID)
columns = [
    "OrganizationUUID",
    "SiteUUID",
    "VariableSpecificUUID",
    "WaterSourceUUID",
    "MethodUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "AllocationNativeID",
    "AllocationTypeCV",
    "AllocationOwner",
    "AllocationApplicationDate",
    "AllocationPriorityDate",
    "AllocationLegalStatusCV",
    "AllocationCropDutyAmount",
    "AllocationExpirationDate",
    "AllocationChangeApplicationIndicator",
    "LegacyAllocationIDs",
    "AllocationBasisCV",
    "AllocationTimeframeStart",
    "AllocationTimeframeEnd",
    "AllocationAmount",
    "AllocationMaximum",
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    "AllocationCommunityWaterSupplySystem",
    "AllocationSDWISIdentifierCV",
    "AllocationAssociatedWithdrawalSiteIDs",
    "AllocationAssociatedConsumptiveUseSiteIDs",
    "WaterAllocationNativeURL",
    "CustomerTypeCV",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "DataPublicationDate",
    "DataPublicationDOI",
]

# Placeholder: data types could be specified per column name here, but that has not been needed.
dtypesx = ['']

In [5]:
### Target DataFrame

# Empty frame that carries the WaDE column layout; later cells fill it in.
# TODO: assumes dtypes inferred from CO file
outdf100 = pd.DataFrame(data=None, columns=columns)

In [6]:
print("Reading inputs...")

# Read the inputs and combine the tables.
# TODO: We are joining 'on-left': keep all rows of the master table (check if this needs to be refined)

# Every CSV in this cell is read with the same encoding (alternatively "utf-8").
csv_encoding = "ISO-8859-1"

# ground water
df100_l = pd.read_csv(fileInput1, encoding=csv_encoding)
print(len(df100_l))

# surface water
df200 = pd.read_csv(FileInput2, encoding=csv_encoding)
print(len(df200))

# Stack the two water-right tables under a fresh 0..n-1 index.
df100 = pd.concat([df100_l, df200], ignore_index=True)
print(len(df100))

# For a quick test run the frame can be truncated here, e.g. df100 = df100.head(10000)

# Water sources look up. Duplicate names are dropped; this is unnecessary once
# the water sources table has been refined to remove duplicates.
df400 = pd.read_csv(inp_wtrsrs, encoding=csv_encoding).drop_duplicates(subset=['WaterSourceName'])

# sites look up
df500 = pd.read_csv(inp_sitdim, encoding=csv_encoding)

# Preview of the combined water-right table
df100.head(20)

Reading inputs...
20859
3422
24281


Unnamed: 0,X,Y,OBJECTID,RECORD_ID,PERMIT_NUMBER,LATITUDE,LONGITUDE,RECORD_TYPE,WATER,STATUS,...,RANGE,COUNTY,PERMIT_TYPE,TOTAL_PERMITTED_ACRE_FEET,PRIMARY_PURPOSE,DATE_FILED,DATE_ISSUED,HYDRO_UNIT,STREAM_SYSTEM,RECORD_ID2
0,-101.896349,36.574734,561,9753,19980623,36.574728,-101.89634,Permit,Groundwater,Active,...,11EC,Texas,Regular,10.0,Agriculture,1998-11-20T00:00:00.000Z,1999-09-14T00:00:00.000Z,,,9753
1,-101.57512,36.516345,752,50052,20020591,36.516338,-101.575112,Permit,Groundwater,Active,...,14EC,Texas,Regular,1280.0,Irrigation,2002-09-20T00:00:00.000Z,2003-05-03T00:00:00.000Z,,,50052
2,-99.052511,34.582855,944,53324,20040578,34.582849,-99.052503,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53324
3,-99.050317,34.590121,954,53325,20040578,34.590116,-99.050308,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53325
4,-99.050317,34.586494,945,53326,20040578,34.586489,-99.050308,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53326
5,-99.052504,34.581036,953,53327,20040578,34.58103,-99.052496,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53327
6,-99.048097,34.581052,946,53328,20040578,34.581046,-99.048089,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53328
7,-99.045896,34.581059,952,53329,20040578,34.581053,-99.045888,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53329
8,-99.045926,34.582872,947,53330,20040578,34.582866,-99.045918,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53330
9,-99.045928,34.586498,951,53331,20040578,34.586492,-99.04592,Permit,Groundwater,Active,...,18WI,Tillman,Regular,314.0,Irrigation,2004-09-07T00:00:00.000Z,2005-05-10T00:00:00.000Z,,,53331


In [7]:
# Column names of the combined water-right table
df100.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'RECORD_ID',
 'PERMIT_NUMBER',
 'LATITUDE',
 'LONGITUDE',
 'RECORD_TYPE',
 'WATER',
 'STATUS',
 'ENTITY_NAME',
 'QUARTER3',
 'QUARTER2',
 'QUARTER1',
 'SECTION',
 'TOWNSHIP',
 'RANGE',
 'COUNTY',
 'PERMIT_TYPE',
 'TOTAL_PERMITTED_ACRE_FEET',
 'PRIMARY_PURPOSE',
 'DATE_FILED',
 'DATE_ISSUED',
 'HYDRO_UNIT',
 'STREAM_SYSTEM',
 'RECORD_ID2']

In [8]:
# Keep a single row per water right, identified by its permit number
print("Dropping duplicates...")

df100 = (
    df100
    .drop_duplicates(subset=['PERMIT_NUMBER'])   # pandas default: the first occurrence is kept
    .reset_index(drop=True)
)

print(len(df100))

Dropping duplicates...
13042


In [9]:
# OBJECTID values that remain after de-duplication
df100.loc[:, 'OBJECTID']

0           561
1           752
2           944
3           564
4           544
5           266
6           685
7           624
8           924
9           484
10          119
11          257
12          720
13          218
14          440
15           11
16          781
17          480
18          848
19          386
20          588
21          207
22           95
23          155
24          527
25          264
26          176
27          873
28          120
29          340
          ...  
13012    171896
13013    163896
13014    163907
13015    164228
13016    164856
13017    164857
13018    163906
13019      1451
13020     10396
13021     20492
13022     23702
13023     12699
13024     11010
13025     35560
13026      4180
13027     16294
13028     23704
13029      4191
13030      3511
13031     17041
13032     10476
13033     10468
13034    190154
13035     22756
13036      5127
13037     14060
13038      4469
13039     19610
13040     11974
13041     16841
Name: OBJECTID, Length: 

In [10]:
# Native site identifiers available in the sites lookup
df500.loc[:, 'SiteNativeID']

0           561
1           752
2           944
3           954
4           945
5           953
6           946
7           952
8           947
9           951
10          948
11          950
12          949
13          564
14          558
15          563
16          562
17          544
18          542
19          543
20          266
21          685
22          686
23          624
24          924
25          484
26          485
27          119
28          257
29          258
          ...  
24251    163906
24252      1451
24253     10396
24254     10397
24255     20492
24256     23702
24257     12699
24258     11010
24259     35560
24260      4180
24261     16294
24262     23704
24263     23705
24264      4191
24265      4190
24266      3511
24267     17041
24268     10476
24269     11445
24270     10468
24271     11404
24272    190154
24273     22756
24274      5127
24275      5128
24276     14060
24277      4469
24278     19610
24279     11974
24280     16841
Name: SiteNativeID, Leng

In [11]:
print("Adding SiteUUID...")

# Resolve each row's OBJECTID against the sites lookup (df500) through assignSiteID.
# NOTE(review): assignSiteID is not defined in this notebook (presumably it comes from the
# waterallocationsFunctions star import); its result for an unmatched id is not visible here.
site_uuids = df100.apply(lambda rec: assignSiteID(rec['OBJECTID'], df500), axis=1)
df100 = df100.assign(SiteUUID=site_uuids)


Adding SiteUUID...


In [12]:
# Inspect the assigned site identifiers
df100.loc[:, 'SiteUUID']

0           OK_561
1           OK_752
2           OK_944
3           OK_564
4           OK_544
5           OK_266
6           OK_685
7           OK_624
8           OK_924
9           OK_484
10          OK_119
11          OK_257
12          OK_720
13          OK_218
14          OK_440
15           OK_11
16          OK_781
17          OK_480
18          OK_848
19          OK_386
20          OK_588
21          OK_207
22           OK_95
23          OK_155
24          OK_527
25          OK_264
26          OK_176
27          OK_873
28          OK_120
29          OK_340
           ...    
13012    OK_171896
13013    OK_163896
13014    OK_163907
13015    OK_164228
13016    OK_164856
13017    OK_164857
13018    OK_163906
13019      OK_1451
13020     OK_10396
13021     OK_20492
13022     OK_23702
13023     OK_12699
13024     OK_11010
13025     OK_35560
13026      OK_4180
13027     OK_16294
13028     OK_23704
13029      OK_4191
13030      OK_3511
13031     OK_17041
13032     OK_10476
13033     OK

In [13]:
print("Water sources...")

# Map the WATER column onto a WaterSourceUUID:
#   'Groundwater' (surrounding whitespace ignored) -> 'OK_1'
#   missing / blank value                          -> ''  (left empty so the later
#                                                          null-WaterSourceUUID filter removes the row)
#   anything else                                  -> 'OK_2'
# Fixes:
#   * the non-groundwater code used to be 'UT_2', a Utah prefix that looks like a copy-paste
#     leftover in this Oklahoma notebook.
#     NOTE(review): confirm 'OK_2' matches the surface-water entry in watersources.csv.
#   * row-wise `row['WATER'].strip()` raised AttributeError on a missing (NaN) value; the
#     vectorized string handling below does not.
water_type = df100['WATER'].fillna('').astype(str).str.strip()
df100['WaterSourceUUID'] = np.select(
    [water_type == 'Groundwater', water_type == ''],
    ['OK_1', ''],
    default='OK_2',
)

Water sources...


In [14]:
print("Allocation application date...")

# The application date is the DATE_FILED value passed through formatDateString.
# NOTE(review): formatDateString is not defined in this notebook (presumably it comes from the
# waterallocationsFunctions star import); the displayed results look like MM/DD/YYYY strings.
application_dates = df100['DATE_FILED'].apply(formatDateString)
df100 = df100.assign(AllocationApplicationDate=application_dates)

Allocation application date...


In [15]:
# Inspect the formatted application dates
df100.loc[:, 'AllocationApplicationDate']

0        11/20/1998
1        09/20/2002
2        09/07/2004
3        03/31/1995
4        12/15/1995
5        01/25/1965
6        12/04/1997
7        05/05/1972
8        10/30/1996
9        01/24/1974
10       02/06/1975
11       04/28/1967
12       09/11/1978
13       06/24/1964
14       05/28/1965
15       04/24/1970
16       05/22/1978
17       08/12/1998
18       11/03/1954
19       04/19/1954
20       03/24/1992
21       07/15/1971
22       04/23/1964
23       08/20/1964
24       07/25/1968
25       05/04/1950
26       02/13/1964
27       09/12/1978
28       12/24/1968
29       07/29/1988
            ...    
13012    03/03/2017
13013    03/09/2017
13014    03/10/2017
13015    04/10/2017
13016    04/26/2017
13017    05/01/2017
13018    05/12/2017
13019    11/10/1949
13020    01/31/1956
13021    04/10/1954
13022    06/05/1978
13023    12/07/1976
13024    08/17/1972
13025    12/13/1976
13026    08/04/2008
13027    10/13/1981
13028    12/04/1985
13029    10/27/2008
13030    03/01/2007


In [16]:
print("Allocation priority date...")

# The priority date is the DATE_ISSUED value passed through formatDateString.
# NOTE(review): confirm that the issue date (rather than the filing date) is the intended
# source for the priority date.
priority_dates = df100['DATE_ISSUED'].apply(formatDateString)
df100 = df100.assign(AllocationPriorityDate=priority_dates)

Allocation priority date...


In [17]:
print("Copying all columns...")

# (destination column in outdf100, source column in df100), copied pair by pair.
# Not mapped yet (kept from the original notes):
#   AllocationAmount          <- (no source column)
#   IrrigatedAcreage          <- Areas_of_Use.SHAPE.AREA
#   AllocationCropDutyAmount  <- IRRIGATION_DEPLETION
#   AllocationExpirationDate  <- DATE_TERMINATED
#   AllocationTimeframeStart  <- USE_BEG_DATE
#   AllocationTimeframeEnd    <- USE_END_DATE
column_pairs = [
    ("SiteUUID", "SiteUUID"),
    ("WaterSourceUUID", "WaterSourceUUID"),
    ("AllocationNativeID", "PERMIT_NUMBER"),
    ("AllocationTypeCV", "PERMIT_TYPE"),
    ("AllocationLegalStatusCV", "STATUS"),
    ("BeneficialUseCategory", "PRIMARY_PURPOSE"),
    ("AllocationOwner", "ENTITY_NAME"),
    ("AllocationApplicationDate", "AllocationApplicationDate"),
    ("AllocationPriorityDate", "AllocationPriorityDate"),
    ("AllocationMaximum", "TOTAL_PERMITTED_ACRE_FEET"),
]
destCols = [dest for dest, src in column_pairs]
sourCols = [src for dest, src in column_pairs]

# Columns are matched by position: destCols[i] receives df100[sourCols[i]].
outdf100[destCols] = df100[sourCols]

Copying all columns...


In [18]:
# Constant values shared by every allocation row
print("Hard coded...")

# Columns are assigned with bracket syntax on purpose: attribute-style assignment
# (outdf100.Name = value) only writes a column when that column already exists,
# otherwise it just sets a Python attribute on the DataFrame object.
outdf100["OrganizationUUID"] = "OWRB"
outdf100["VariableSpecificUUID"] = "OWRB Allocation All"
outdf100["MethodUUID"] = "OK_WaterAllocation"
outdf100["AllocationBasisCV"] = "Unknown"
# check this later
outdf100["PrimaryUseCategory"] = "Irrigation"
# Fix: these used to be written to `TimeframeStart` / `TimeframeEnd`, which are not
# columns of the output table, so the real columns were left empty.
outdf100["AllocationTimeframeStart"] = "01/01"
outdf100["AllocationTimeframeEnd"] = "12/31"
# Publication date is the date this cell runs, formatted MM/DD/YYYY
outdf100["DataPublicationDate"] = datetime.now().strftime('%m/%d/%Y')

Hard coded...


In [19]:
print("Droping null allocations...")
# If both AllocationAmount and AllocationMaximum are empty, save the row to
# waterallocations_missing.csv and drop it from the output.
# Fix: the previous `== ''` test never matched NaN, which is what an unpopulated or
# numeric column holds, so the filter never fired. A value now counts as missing when
# it is NaN/None or a blank (whitespace-only) string.
alloc_amount = outdf100["AllocationAmount"]
alloc_maximum = outdf100["AllocationMaximum"]
amount_missing = alloc_amount.isnull() | alloc_amount.astype(str).str.strip().eq('')
maximum_missing = alloc_maximum.isnull() | alloc_maximum.astype(str).str.strip().eq('')
both_missing = amount_missing & maximum_missing

outdf100purge = outdf100.loc[both_missing]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('waterallocations_missing.csv')    # index is written as well
    outdf100 = outdf100.loc[~both_missing].reset_index(drop=True)

Droping null allocations...


  result = method(y)


In [20]:
print("Droping null SiteUUIDs...")
# Drop rows that have no site identifier.
# A SiteUUID counts as missing when it is NaN/None or a blank (whitespace-only) string;
# the previous `== ''` test let NaN values through.
site_uuid = outdf100["SiteUUID"]
site_missing = site_uuid.isnull() | site_uuid.astype(str).str.strip().eq('')

outdf100nullID = outdf100.loc[site_missing]
if len(outdf100nullID.index) > 0:
    outdf100 = outdf100.loc[~site_missing].reset_index(drop=True)

Droping null SiteUUIDs...


In [21]:
print("Droping null Priority date...")
# Drop rows that have no priority date.
# A date counts as missing when it is NaN/None or a blank (whitespace-only) string;
# the previous `== ''` test let NaN values through.
priority_date = outdf100["AllocationPriorityDate"]
priority_missing = priority_date.isnull() | priority_date.astype(str).str.strip().eq('')

outdf100nullPR = outdf100.loc[priority_missing]
if len(outdf100nullPR.index) > 0:
    outdf100 = outdf100.loc[~priority_missing].reset_index(drop=True)

Droping null Priority date...


In [22]:
print("Droping null WaterSourceUUID ...")
# Drop rows that have no water source identifier.
# A WaterSourceUUID counts as missing when it is NaN/None or a blank (whitespace-only)
# string; the previous `== ''` test let NaN values through.
water_source_uuid = outdf100["WaterSourceUUID"]
water_source_missing = water_source_uuid.isnull() | water_source_uuid.astype(str).str.strip().eq('')

# NOTE(review): the variable name outdf100nullPR is kept from the original cell even
# though it holds the rows without a WaterSourceUUID here.
outdf100nullPR = outdf100.loc[water_source_missing]
if len(outdf100nullPR.index) > 0:
    outdf100 = outdf100.loc[~water_source_missing].reset_index(drop=True)

Droping null WaterSourceUUID ...


In [23]:
print("Droping duplicates...")
# Safety net: remove rows that repeat an earlier row in every column.
repeated_row = outdf100.duplicated()          # True for every repeat after the first occurrence
outdf100Duplicated = outdf100.loc[repeated_row]
if repeated_row.any():
    # Keep a record of what is being removed (the index is written as well)
    outdf100Duplicated.to_csv("waterallocations_duplicaterows.csv")
    outdf100 = outdf100.loc[~repeated_row].reset_index(drop=True)

Droping duplicates...


In [24]:
print("Checking required is not null...")
# Report rows in which any required column is empty. Rows are only written to a
# CSV for inspection; nothing is removed from outdf100 here.
# ToDO: purge these rows if there is any missing? For now left to be inspected.
requiredCols = ["OrganizationUUID", "VariableSpecificUUID", "WaterSourceUUID",
                "MethodUUID", "AllocationPriorityDate"] #SiteUUID

# The check is driven by requiredCols (previously defined but unused) and is NaN-aware:
# a cell counts as missing when it is NaN/None or a blank (whitespace-only) string.
# The earlier `== ''` comparisons did not catch NaN.
required_values = outdf100[requiredCols]
blank_cells = required_values.astype(str).apply(lambda col: col.str.strip()).eq('')
missing_cells = required_values.isnull() | blank_cells

outdf100_nullMand = outdf100.loc[missing_cells.any(axis=1)]
if len(outdf100_nullMand.index) > 0:
    outdf100_nullMand.to_csv('waterallocations_mandatoryFieldMissing.csv')  # index is written as well

Checking required is not null...


In [25]:
print("Writing outputs...")
# Save the finished allocation table as UTF-8 CSV, without the row index
outdf100.to_csv(path_or_buf=out_alloc, encoding="utf-8", index=False)

print("Done Water Allocation")

Writing outputs...
Done Water Allocation
