In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse

In [2]:
# working directory
working_dir = "./ProcessedInputData/"
os.chdir(working_dir)

In [3]:
target_columns = ["OrganizationUUID", "VariableSpecificUUID", 
                  "ReportingUnitUUID",
                  "PrimaryUseCategory", "BeneficialUseCategory", 
                  "WaterSourceUUID", "MethodUUID", "TimeframeStart", "TimeframeEnd", 
                  "DataPublicationDate", "DataPublicationDOI", "ReportYearCV", "Amount",  
                  "PopulationServed", "PowerGeneratedGWh", "IrrigatedAcreage",
                  "InterbasinTransferToID", "InterbasinTransferFromID", "CustomerTypeCV",
                  "AllocationCropDutyAmount", "IrrigationMethodCV", "CropTypeCV",
                  "CommunityWaterSupplySystem", "SDWISIdentifierCV", "PowerType"]

In [4]:
outdf100 = pd.DataFrame(columns=target_columns)

In [6]:
# Input files
fileInput1_prefix = "CA-DWR-WaterBalance-Level2-DP-1000-"
fileInput1_postfix = "-DAUCO.csv"

# reporting units
# ---- no need here ---- inp_repunts = 'reportingunits.csv'

In [9]:
print("Reading input...")

# combine data from multiple files to one dataFrame
startYear = 2011
endYear = 2015
numYears = 5
yearList = np.linspace(startYear, endYear, numYears)
df500_list = []
for isx in range (numYears):
    fileInput1 = fileInput1_prefix + str(int(yearList[isx])) + fileInput1_postfix
    df50 = pd.read_csv(fileInput1,encoding = "ISO-8859-1") #, usecols =input_owner_cols) 
    df500_list.append(df50)
    
df500 = pd.concat(df500_list, sort=True, ignore_index=True)

df500.drop_duplicates(inplace=True)
print(len(df500.index))

#df500.head(5)
df500

Reading reporting units input...
686057


Unnamed: 0,CategoryA,CategoryB,CategoryC,CategoryD,DAU,DAU_NAME,HR_CODE,HR_NAME,KAcreFt,Latitude,Longitude,PA,Year
0,Agriculture,1,Applied Water,AG1,DAU04827,Pressure,3,Central Coast,89.2,36.639420,-121.637711,301,2011
1,Agriculture,2,Applied Water - Groundwater Recharge,AG2,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
2,Agriculture,22,Conveyance Deep Percolation,AG22,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
3,Agriculture,18f,Conveyance Deep Percolation to Mexico,AG18F,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
4,Agriculture,18e,Conveyance Deep Percolation to Nevada,AG18E,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
5,Agriculture,18d,Conveyance Deep Percolation to Oregon,AG18D,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
6,Agriculture,23,Conveyance Deep Percolation to Salt Sink,AG23,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
7,Agriculture,17,Conveyance Evaporation and ETAW,AG17,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
8,Agriculture,19b,Conveyance Return Flow for Delta Outflow,AG19B,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
9,Agriculture,20a,Conveyance Return Flow to Developed Supply (Ot...,AG20A,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011


In [10]:
print("Sum/Aggregate amount by Ben use, DAU, and Year...")

df200 = df500.groupby(['CategoryA', 'DAU', 'Year']).agg({'KAcreFt': sum})
df200

Sum/Aggregate amount by Ben use, DAU, and Year...


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,KAcreFt
CategoryA,DAU,Year,Unnamed: 3_level_1
Agriculture,DAU00125,2011,579.3
Agriculture,DAU00125,2012,723.7
Agriculture,DAU00125,2013,543.7
Agriculture,DAU00125,2014,513.6
Agriculture,DAU00125,2015,478.5
Agriculture,DAU00147,2011,772.6
Agriculture,DAU00147,2012,651.6
Agriculture,DAU00147,2013,657.9
Agriculture,DAU00147,2014,584.6
Agriculture,DAU00147,2015,553.3


In [11]:
# reset index
df200KA = pd.DataFrame(columns=['KAcreFt'])
df200KA['KAcreFt'] = df200['KAcreFt']
df200KA = df200KA.reset_index(drop=True)
df200KA

Unnamed: 0,KAcreFt
0,579.3
1,723.7
2,543.7
3,513.6
4,478.5
5,772.6
6,651.6
7,657.9
8,584.6
9,553.3


In [12]:
# change units to AF from KAF

print("Adjust units to AF...")

df200KA = df200KA.assign(AmountAF=np.nan)
df200KA['AmountAF'] = df200KA.apply(lambda row: float(row['KAcreFt'])/1000.0, axis=1)
df200KA

Adjust units to AF...


Unnamed: 0,KAcreFt,AmountAF
0,579.3,0.5793
1,723.7,0.7237
2,543.7,0.5437
3,513.6,0.5136
4,478.5,0.4785
5,772.6,0.7726
6,651.6,0.6516
7,657.9,0.6579
8,584.6,0.5846
9,553.3,0.5533


In [13]:
print("Find unique rows by Ben use, DAU, and Year...")

df100 = df500.drop_duplicates(subset=['CategoryA', 'DAU', 'Year'])
df100 = df100.reset_index(drop=True)
df100

Find unique rows by Ben use, DAU, and Year...


Unnamed: 0,CategoryA,CategoryB,CategoryC,CategoryD,DAU,DAU_NAME,HR_CODE,HR_NAME,KAcreFt,Latitude,Longitude,PA,Year
0,Agriculture,1,Applied Water,AG1,DAU04827,Pressure,3,Central Coast,89.2,36.639420,-121.637711,301,2011
1,Instream Flow Requirements,1,Applied Water,IFR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
2,Managed Wetlands,1,Applied Water,MW1,DAU04827,Pressure,3,Central Coast,0.4,36.639420,-121.637711,301,2011
3,Required Delta Outflow,1,Applied Water,RDO1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
4,Urban,5,Applied Water - Commercial Use,URB5,DAU04827,Pressure,3,Central Coast,2.2,36.639420,-121.637711,301,2011
5,Wild and Scenic River,1,Applied Water,WSR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011
6,Agriculture,1,Applied Water,AG1,DAU01949,Gualala,1,North Coast,0.4,38.663926,-123.280350,103,2011
7,Instream Flow Requirements,1,Applied Water,IFR1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011
8,Managed Wetlands,1,Applied Water,MW1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011
9,Required Delta Outflow,1,Applied Water,RDO1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011


In [14]:
# combine
df100['AmountAF'] = df200KA['AmountAF']
df100

Unnamed: 0,CategoryA,CategoryB,CategoryC,CategoryD,DAU,DAU_NAME,HR_CODE,HR_NAME,KAcreFt,Latitude,Longitude,PA,Year,AmountAF
0,Agriculture,1,Applied Water,AG1,DAU04827,Pressure,3,Central Coast,89.2,36.639420,-121.637711,301,2011,0.5793
1,Instream Flow Requirements,1,Applied Water,IFR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.7237
2,Managed Wetlands,1,Applied Water,MW1,DAU04827,Pressure,3,Central Coast,0.4,36.639420,-121.637711,301,2011,0.5437
3,Required Delta Outflow,1,Applied Water,RDO1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.5136
4,Urban,5,Applied Water - Commercial Use,URB5,DAU04827,Pressure,3,Central Coast,2.2,36.639420,-121.637711,301,2011,0.4785
5,Wild and Scenic River,1,Applied Water,WSR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.7726
6,Agriculture,1,Applied Water,AG1,DAU01949,Gualala,1,North Coast,0.4,38.663926,-123.280350,103,2011,0.6516
7,Instream Flow Requirements,1,Applied Water,IFR1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.6579
8,Managed Wetlands,1,Applied Water,MW1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.5846
9,Required Delta Outflow,1,Applied Water,RDO1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.5533


In [15]:
print("reporting units...")

#ReportingUnitUUID	CA_DAU

df100 = df100.assign(ReportingUnitUUID='')
df100['ReportingUnitUUID'] = df100.apply(lambda row: '_'.join(["CA", str(row["DAU"])]), axis=1)
df100

reporting units...


Unnamed: 0,CategoryA,CategoryB,CategoryC,CategoryD,DAU,DAU_NAME,HR_CODE,HR_NAME,KAcreFt,Latitude,Longitude,PA,Year,AmountAF,ReportingUnitUUID
0,Agriculture,1,Applied Water,AG1,DAU04827,Pressure,3,Central Coast,89.2,36.639420,-121.637711,301,2011,0.5793,CA_DAU04827
1,Instream Flow Requirements,1,Applied Water,IFR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.7237,CA_DAU04827
2,Managed Wetlands,1,Applied Water,MW1,DAU04827,Pressure,3,Central Coast,0.4,36.639420,-121.637711,301,2011,0.5437,CA_DAU04827
3,Required Delta Outflow,1,Applied Water,RDO1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.5136,CA_DAU04827
4,Urban,5,Applied Water - Commercial Use,URB5,DAU04827,Pressure,3,Central Coast,2.2,36.639420,-121.637711,301,2011,0.4785,CA_DAU04827
5,Wild and Scenic River,1,Applied Water,WSR1,DAU04827,Pressure,3,Central Coast,0.0,36.639420,-121.637711,301,2011,0.7726,CA_DAU04827
6,Agriculture,1,Applied Water,AG1,DAU01949,Gualala,1,North Coast,0.4,38.663926,-123.280350,103,2011,0.6516,CA_DAU01949
7,Instream Flow Requirements,1,Applied Water,IFR1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.6579,CA_DAU01949
8,Managed Wetlands,1,Applied Water,MW1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.5846,CA_DAU01949
9,Required Delta Outflow,1,Applied Water,RDO1,DAU01949,Gualala,1,North Coast,0.0,38.663926,-123.280350,103,2011,0.5533,CA_DAU01949


In [16]:
print("Copying Columns...")

destCols = ["ReportingUnitUUID", "BeneficialUseCategory", "Amount", "ReportYearCV"]
srsCols = ["ReportingUnitUUID", "CategoryA", "AmountAF", "Year"]

outdf100[destCols] = df100[srsCols]

outdf100

Copying Columns...


Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV,PowerType
0,,,CA_DAU04827,,Agriculture,,,,,,...,,,,,,,,,,
1,,,CA_DAU04827,,Instream Flow Requirements,,,,,,...,,,,,,,,,,
2,,,CA_DAU04827,,Managed Wetlands,,,,,,...,,,,,,,,,,
3,,,CA_DAU04827,,Required Delta Outflow,,,,,,...,,,,,,,,,,
4,,,CA_DAU04827,,Urban,,,,,,...,,,,,,,,,,
5,,,CA_DAU04827,,Wild and Scenic River,,,,,,...,,,,,,,,,,
6,,,CA_DAU01949,,Agriculture,,,,,,...,,,,,,,,,,
7,,,CA_DAU01949,,Instream Flow Requirements,,,,,,...,,,,,,,,,,
8,,,CA_DAU01949,,Managed Wetlands,,,,,,...,,,,,,,,,,
9,,,CA_DAU01949,,Required Delta Outflow,,,,,,...,,,,,,,,,,


In [17]:
# hardcoded

outdf100.OrganizationUUID = "CDWR"
#
outdf100.VariableSpecificUUID = "Consumptive Use"
outdf100.WaterSourceUUID = "CA_1"
outdf100.MethodUUID = "CDWR_Water_uses"

# check this later
outdf100.PrimaryUseCategory = "Irrigation"
outdf100.TimeframeStart = "01/01"
outdf100.TimeframeEnd = "12/31"
#
outdf100.DataPublicationDate = datetime.now().strftime('%m/%d/%Y') 

outdf100

Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV,PowerType
0,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Agriculture,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
1,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Instream Flow Requirements,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
2,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Managed Wetlands,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
3,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Required Delta Outflow,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
4,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Urban,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
5,CDWR,Consumptive Use,CA_DAU04827,Irrigation,Wild and Scenic River,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
6,CDWR,Consumptive Use,CA_DAU01949,Irrigation,Agriculture,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
7,CDWR,Consumptive Use,CA_DAU01949,Irrigation,Instream Flow Requirements,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
8,CDWR,Consumptive Use,CA_DAU01949,Irrigation,Managed Wetlands,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,
9,CDWR,Consumptive Use,CA_DAU01949,Irrigation,Required Delta Outflow,CA_1,CDWR_Water_uses,01/01,12/31,01/03/2020,...,,,,,,,,,,


In [18]:
print("Droping null amounts...")

# if Amount empty drop row and save it to a _missing.csv

#outdf100 = outdf100.replace(np.nan, '') #replace NaN by blank strings

outdf100purge = outdf100.loc[(outdf100["Amount"] == '') | (outdf100["Amount"] == np.nan)]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('aggregatedallocations_missing.csv')    #index=False,
    dropIndex = outdf100.loc[(outdf100["Amount"] == '') | (outdf100["Amount"] == np.nan)].index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)
#outdf100["PopulationServed"]

Droping null amounts...


  result = method(y)


In [19]:
print("Droping null ReportingUnitID ...")
outdf100nullPR = outdf100.loc[(outdf100["ReportingUnitUUID"] == '') 
                              | (outdf100["ReportingUnitUUID"] == np.nan)]
if len(outdf100nullPR.index) > 0:
    dropIndex = outdf100.loc[(outdf100["ReportingUnitUUID"] == '') 
                              | (outdf100["ReportingUnitUUID"] == np.nan)].index
    outdf100 = outdf100.drop(dropIndex)
    outdf100 = outdf100.reset_index(drop=True)
#outdf100

Droping null ReportingUnitID ...


In [20]:
print("Droping duplicates...")
#drop duplicate rows; just make sure
outdf100Duplicated=outdf100.loc[outdf100.duplicated()]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("aggregatedallocations_duplicaterows.csv")  # index=False,
    outdf100.drop_duplicates(inplace=True)   #
    outdf100 = outdf100.reset_index(drop=True)
#outdf100

Droping duplicates...


In [21]:
print("Writing outputs...")

#replace NaN by blank strings--this is to avoid blank columns getting default 0 by the import code
outdf100 = outdf100.replace(np.nan, '') 
# outputs aggregated amounts
out_agamount = "aggregatedamounts.csv"
outdf100.to_csv(out_agamount, index=False, encoding = "utf-8")

print("Done Aggregated amount")

Writing outputs...
Done Aggregated amount


In [None]:
### The following is only for inspection when something seems not right from the above output

In [None]:
print("Inspect duplicates for subset of columns...")

target_columns = ["OrganizationUUID", "VariableSpecificUUID", 
                  "ReportingUnitUUID",
                  #"PrimaryUseCategory", "BeneficialUseCategory", 
                  "WaterSourceUUID", "MethodUUID", "ReportYearCV"]

out_agamount1 = "aggregatedamounts.csv"
outdf100 = pd.read_csv(out_agamount1,encoding = "ISO-8859-1")
#drop duplicate rows; just make sure
outdf100Duplicated=outdf100.loc[outdf100.duplicated(subset = target_columns)]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("aggregateuse_duplicateID_rows.csv")  # index=False,
    outdf100.drop_duplicates(subset = target_columns, inplace=True)   #
    outdf100 = outdf100.reset_index(drop=True)
    
#TODO: These may need removing 
