In [None]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse

In [None]:
# Working directory: the input CSVs are read from, and the outputs written to,
# paths relative to this directory.
working_dir = "./ProcessedInputData/"
# os.chdir with a relative path is not repeatable: re-running this cell from
# inside the directory would look for ProcessedInputData/ProcessedInputData and
# raise FileNotFoundError. Only change directory when we are not already there.
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(working_dir)):
    os.chdir(working_dir)

In [None]:
# Column layout of the output table, in the order the columns are written out.
target_columns = [
    "OrganizationUUID",
    "VariableSpecificUUID",
    "ReportingUnitUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "WaterSourceUUID",
    "MethodUUID",
    "TimeframeStart",
    "TimeframeEnd",
    "DataPublicationDate",
    "DataPublicationDOI",
    "ReportYearCV",
    "Amount",
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    "InterbasinTransferToID",
    "InterbasinTransferFromID",
    "CustomerTypeCV",
    "AllocationCropDutyAmount",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "SDWISIdentifierCV",
    "PowerType",
]

In [None]:
# Empty output frame that carries every target column; later cells copy the
# source columns into it and fill in the constant-valued ones.
outdf100 = pd.DataFrame(columns=target_columns)

In [None]:
# Input files: one CSV per year. The file name is built as
# <prefix><year><postfix> when the files are read.
fileInput1_prefix = "CA-DWR-WaterBalance-Level2-DP-1000-"
fileInput1_postfix = "-DAUCO.csv"

# Reporting units: no separate reporting-units file is read in this notebook.
# ---- no need here ---- inp_repunts = 'reportingunits.csv'

In [None]:
print("Reading input...")

# Combine the per-year input files into a single DataFrame.
startYear = 2011
endYear = 2015
# Derive the list of years from the two bounds. A separately hard-coded count
# combined with np.linspace yields fractional (then truncated) years as soon
# as one bound is edited without the other.
yearList = np.arange(startYear, endYear + 1)
numYears = len(yearList)
df500_list = []
for year in yearList:
    fileInput1 = fileInput1_prefix + str(int(year)) + fileInput1_postfix
    df50 = pd.read_csv(fileInput1, encoding="ISO-8859-1")
    df500_list.append(df50)

df500 = pd.concat(df500_list, sort=True, ignore_index=True)

# Remove rows that are exact duplicates; rebinding instead of inplace=True
# keeps the cell safe to re-run.
df500 = df500.drop_duplicates()
print(len(df500.index))

# Preview only the first rows rather than rendering the whole frame.
df500.head(5)

In [None]:
print("Sum/Aggregate amount by Ben use, DAU, and Year...")

# Total KAcreFt per (CategoryA, DAU, Year). The result is indexed by those
# three keys and sorted by them (groupby's default).
# The aggregation is named by the string 'sum': passing the Python builtin
# `sum` is deprecated in recent pandas, which warns that the callable will
# stop being mapped to the groupby sum.
df200 = df500.groupby(['CategoryA', 'DAU', 'Year']).agg({'KAcreFt': 'sum'})
df200

In [None]:
# Keep only the summed column and replace the (CategoryA, DAU, Year) index
# with a plain 0..n-1 index; row order is unchanged.
df200KA = df200[['KAcreFt']].reset_index(drop=True)
df200KA

In [None]:
# change units to AF from KAF

print("Adjust units to AF...")

# Going from thousand acre-feet (KAF) to acre-feet (AF) is a multiplication by
# 1000; the earlier code divided, making every amount a factor of 1e6 too small.
# NOTE(review): this assumes the KAcreFt column really is in thousand
# acre-feet, as its name and the comment above indicate - confirm against the
# source data dictionary.
# Vectorized instead of a row-wise apply; the result is the same float column.
df200KA['AmountAF'] = df200KA['KAcreFt'].astype(float) * 1000.0
df200KA

In [None]:
print("Find unique rows by Ben use, DAU, and Year...")

# One row per (CategoryA, DAU, Year): the first occurrence is kept, in the
# order the rows appear in df500, and the index is renumbered from 0.
df100 = (
    df500
    .drop_duplicates(subset=['CategoryA', 'DAU', 'Year'])
    .reset_index(drop=True)
)
df100

In [None]:
# combine
# The aggregated amounts come out of groupby sorted by (CategoryA, DAU, Year),
# while df100 keeps the first-appearance order of the input rows. Copying the
# amounts over by row position therefore attaches them to the wrong rows (and
# the two frames need not even have the same length if a key is missing).
# Re-attach the group keys to the amounts and join on the keys instead.
amount_by_key = pd.Series(
    df200KA['AmountAF'].to_numpy(),  # df200KA has the same row order as df200
    index=df200.index,
    name='AmountAF',
)
df100 = (
    df100
    .drop(columns='AmountAF', errors='ignore')  # lets the cell be re-run
    .join(amount_by_key, on=['CategoryA', 'DAU', 'Year'])
)
df100

In [None]:
print("reporting units...")

# ReportingUnitUUID has the form CA_<DAU>, with the DAU value rendered by str().
df100 = df100.assign(ReportingUnitUUID="CA_" + df100["DAU"].astype(str))
df100

In [None]:
print("Copying Columns...")

# Output column names and, position by position, the df100 columns they are
# filled from.
destCols = ["ReportingUnitUUID", "BeneficialUseCategory", "Amount", "ReportYearCV"]
srsCols = ["ReportingUnitUUID", "CategoryA", "AmountAF", "Year"]

# Copy one column at a time, pairing each destination with its source.
for dest_name, src_name in zip(destCols, srsCols):
    outdf100[dest_name] = df100[src_name]

outdf100

In [None]:
# hardcoded

# Columns that receive the same constant value on every row.
fixed_values = {
    "OrganizationUUID": "CDWR",
    "VariableSpecificUUID": "CA_Consumptive Use",
    "WaterSourceUUID": "CA_1",
    "MethodUUID": "CDWR_Water_uses",
    # check this later
    "PrimaryUseCategory": "Irrigation",
    "TimeframeStart": "01/01",
    "TimeframeEnd": "12/31",
    # publication date is the day the notebook is run
    "DataPublicationDate": datetime.now().strftime('%m/%d/%Y'),
}
for column_name, constant in fixed_values.items():
    outdf100[column_name] = constant

outdf100

In [None]:
print("Droping null amounts...")

# if Amount empty drop row and save it to a _missing.csv

# NaN never compares equal to anything, itself included, so a test written as
# `== np.nan` matches no row and missing amounts were never dropped. isna() is
# the check that actually finds them. Blank strings are still matched as well.
missingAmount = outdf100["Amount"].isna() | (outdf100["Amount"] == '')
outdf100purge = outdf100.loc[missingAmount]
if len(outdf100purge.index) > 0:
    # keep a record of what is being removed
    outdf100purge.to_csv('aggregatedallocations_missing.csv')    #index=False,
    outdf100 = outdf100.loc[~missingAmount].reset_index(drop=True)
#outdf100["PopulationServed"]

In [None]:
print("Droping null ReportingUnitID ...")
# NaN never compares equal to anything, itself included, so a test written as
# `== np.nan` matches no row; isna() is the check that finds missing values.
# Blank strings are still matched as well.
missingReportingUnit = (
    outdf100["ReportingUnitUUID"].isna() | (outdf100["ReportingUnitUUID"] == '')
)
outdf100nullPR = outdf100.loc[missingReportingUnit]
if len(outdf100nullPR.index) > 0:
    outdf100 = outdf100.loc[~missingReportingUnit].reset_index(drop=True)
#outdf100

In [None]:
print("Droping duplicates...")
# Rows that repeat an earlier row in every column are saved for inspection and
# then removed, keeping the first occurrence.
outdf100Duplicated = outdf100.loc[outdf100.duplicated()]
if not outdf100Duplicated.empty:
    outdf100Duplicated.to_csv("aggregatedallocations_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)
#outdf100

In [None]:
print("Writing outputs...")

# Destination file for the aggregated amounts.
out_agamount = "aggregatedamounts.csv"

# Blank out NaN first, so that empty columns are not given a default 0 by the
# import code that consumes this file.
outdf100 = outdf100.replace(np.nan, '')
outdf100.to_csv(out_agamount, encoding="utf-8", index=False)

print("Done Aggregated amount")

In [None]:
### The following cell is only for inspection, in case the output written above does not look right

In [None]:
print("Inspect duplicates for subset of columns...")

# NOTE: this rebinds target_columns to a shorter list of identifying columns.
# Re-running the cell that builds the empty output frame after this one would
# pick up the shorter list, so restart the kernel before a fresh run.
target_columns = ["OrganizationUUID", "VariableSpecificUUID", 
                  "ReportingUnitUUID",
                  #"PrimaryUseCategory", "BeneficialUseCategory", 
                  "WaterSourceUUID", "MethodUUID", "ReportYearCV"]

out_agamount1 = "aggregatedamounts.csv"
# The file is written with encoding="utf-8"; reading it back as ISO-8859-1
# would garble any non-ASCII character, so use the same encoding here.
outdf100 = pd.read_csv(out_agamount1, encoding="utf-8")
# Rows that repeat an earlier row on the identifying columns are saved for
# inspection and then removed, keeping the first occurrence.
outdf100Duplicated = outdf100.loc[outdf100.duplicated(subset=target_columns)]
if len(outdf100Duplicated.index) > 0:
    outdf100Duplicated.to_csv("aggregateuse_duplicateID_rows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates(subset=target_columns).reset_index(drop=True)
    
#TODO: These may need removing 
