In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
from utilityFunctions import *

In [2]:
# working directory
working_dir = "./ProcessedInputData/"

# os.chdir with a relative path is not re-runnable: when this cell is executed
# a second time the kernel is already inside the directory and the relative
# path no longer resolves. Skip the chdir only in that specific situation
# (relative path does not exist AND the current directory already has the
# target's name); in every other case behave exactly as before, including
# raising if the directory is genuinely missing.
_working_dir_name = os.path.basename(os.path.normpath(working_dir))
_already_inside = (not os.path.isdir(working_dir)
                   and os.path.basename(os.getcwd()) == _working_dir_name)
if not _already_inside:
    os.chdir(working_dir)

In [None]:
# Ordered list of the columns of the output table (outdf100) built by this
# notebook. One name per line so additions/removals show up as one-line diffs.
target_columns = [
    "OrganizationUUID",
    "VariableSpecificUUID",
    "ReportingUnitUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "WaterSourceUUID",
    "MethodUUID",
    "TimeframeStart",
    "TimeframeEnd",
    "DataPublicationDate",
    "DataPublicationDOI",
    "ReportYearCV",
    "Amount",
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    "InterbasinTransferToID",
    "InterbasinTransferFromID",
    "CustomerTypeCV",
    "AllocationCropDutyAmount",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "SDWISIdentifierCV",
]

In [None]:
# Empty output table with the full set of target columns and no rows yet;
# later cells fill its columns from df100 and with constant values.
outdf100 = pd.DataFrame(columns=target_columns)

In [None]:
# Input files, given as bare names and therefore resolved against the current
# working directory when they are read.
fileInput1, fileInput2 = "S_USE_AMOUNT.csv", "SUMMARY_USE.csv"

# Inputs that are currently disabled (kept for reference):
#   fileInput3 = "REPORTING_UNIT.csv"
#   fileInput4 = "REPORT.csv"
#   fileInput5 = "S_USE_IRRIGATION.csv"
#   fileInput6 = "LU_BENEFICIAL_USE.csv"
#
# Reporting-units lookup (also disabled):
#   inp_repunts = 'reportingunits.csv'

In [None]:
print("Reading inputs...")

# Main input table; every column is read.
df100 = pd.read_csv(fileInput1, encoding="ISO-8859-1")
print(len(df100.index))

# Only these two columns are taken from the second file.
input_cols = ['POPULATION_SERVED', 'POWER_GENERATED']
df20 = pd.read_csv(fileInput2, encoding="ISO-8859-1", usecols=input_cols)
print(len(df20.index))

# The two files are combined purely by row position (there is no key join), on
# the stated premise that they have the same length and that row i of each
# describes the same water use. Enforce the part of that premise that can be
# checked here: with different lengths the column assignment below would align
# on the index and silently produce NaN-padded or truncated data.
if len(df100.index) != len(df20.index):
    raise ValueError(
        "Row count mismatch between {} ({}) and {} ({}); "
        "the files cannot be combined by row position.".format(
            fileInput1, len(df100.index), fileInput2, len(df20.index)))

# Plain column assignment creates the columns directly; no empty-string
# placeholder columns are needed first.
for _column in input_cols:
    df100[_column] = df20[_column]
print (len(df100.index))

# Disabled alternatives kept for reference:
#   - dropping duplicate rows of df100 right after reading it
#   - a reporting-units lookup table (df400) read from inp_repunts and
#     de-duplicated on 'ReportingUnitName'

In [None]:
print("reporting units...")

# ReportingUnitUUID  <-  "TX_" + REPORTING_UNIT_ID
#
# Built column-wise instead of with a row-by-row DataFrame.apply(axis=1): the
# result only depends on one column, so there is no need to materialise every
# row as a Series, and the column-wise form also works on an empty frame.
# Note that a missing REPORTING_UNIT_ID is stringified as well (e.g. "TX_nan"),
# exactly as str() did in the row-wise version.
df100["ReportingUnitUUID"] = "TX_" + df100["REPORTING_UNIT_ID"].astype(str)

# Preview only; avoids rendering the whole table in the notebook output.
df100.head()

In [None]:
# Lookup from beneficial-use ID (as a string, "1".."18") to a label of the
# form "<use category>_<water source>". IDs are numbered category by category,
# and within each category in the order ground, surface, reuse.
_use_categories = ("Irrigated agriculture", "Livestock", "Mining",
                   "Steam-Electric Power", "Manufacturing", "Municipal")
_source_kinds = ("ground", "surface", "reuse")

ben_use_dictionary = {
    str(cat_pos * len(_source_kinds) + src_pos + 1): category + "_" + source
    for cat_pos, category in enumerate(_use_categories)
    for src_pos, source in enumerate(_source_kinds)
}

In [None]:
print("Beneficial uses...")

# BeneficialUseCategory  <-  ben_use_dictionary[BENEFICIAL_USE_ID]
#
# IDs are normalised the same way as before (stringified, surrounding
# whitespace stripped) but for the whole column at once.
_use_ids = df100["BENEFICIAL_USE_ID"].astype(str).str.strip()

# Fail fast, as the row-wise dictionary lookup did, but report every
# unmapped ID in one message instead of stopping at the first one.
_unknown_ids = _use_ids[~_use_ids.isin(list(ben_use_dictionary))].unique()
if len(_unknown_ids) > 0:
    raise KeyError(
        "BENEFICIAL_USE_ID values missing from ben_use_dictionary: "
        + ", ".join(sorted(str(value) for value in _unknown_ids)))

df100["BeneficialUseCategory"] = _use_ids.map(ben_use_dictionary)

# Preview only; avoids rendering the whole table in the notebook output.
df100.head()

In [None]:
print ("water source UUID...")

# Three types of water sources: ground, surface, reuse
def mapWaterSourceUUID(inString1):
    """Translate a label containing a water-source keyword into its UUID.

    The argument is converted with str() and stripped, then searched for the
    keywords in a fixed order; the first keyword found decides the result:

        "ground"  -> "TWDB_1"
        "surface" -> "TWDB_2"
        "reuse"   -> "TWDB_3"

    If no keyword occurs, "erroneous input string" is printed and an empty
    string is returned.
    """
    label = str(inString1).strip()
    keyword_to_uuid = (
        ("ground", "TWDB_1"),
        ("surface", "TWDB_2"),
        ("reuse", "TWDB_3"),
    )
    for keyword, source_uuid in keyword_to_uuid:
        if keyword in label:
            return source_uuid
    print("erroneous input string")
    return ''
    
# WaterSourceUUID is derived from BeneficialUseCategory alone, so map that one
# column element-wise instead of running a row-by-row apply over the whole
# frame. Every value (including missing ones) is still passed through
# mapWaterSourceUUID, and the column-wise form also works on an empty frame.
df100["WaterSourceUUID"] = df100["BeneficialUseCategory"].map(mapWaterSourceUUID)

# Preview only; avoids rendering the whole table in the notebook output.
df100.head()

In [None]:
print("Copying Columns...")

# (destination column in outdf100, source column in df100), in copy order.
# NOTE(review): "PrimaryUseCategory" is not created on df100 by any cell shown
# in this notebook; it presumably has to come from the input CSV - confirm,
# otherwise the selection below raises KeyError.
_column_pairs = [
    ("ReportingUnitUUID", "ReportingUnitUUID"),
    ("WaterSourceUUID", "WaterSourceUUID"),
    ("PrimaryUseCategory", "PrimaryUseCategory"),
    ("BeneficialUseCategory", "BeneficialUseCategory"),
    ("ReportYearCV", "REPORT_ID"),
    ("Amount", "AMOUNT"),
    ("PopulationServed", "POPULATION_SERVED"),
    ("PowerGeneratedGWh", "POWER_GENERATED"),
    ("TimeframeStart", "START_DATE"),
    ("TimeframeEnd", "END_DATE"),
]
destCols = [dest for dest, _ in _column_pairs]
srsCols = [src for _, src in _column_pairs]

# Positional pairing: the i-th source column lands in the i-th destination.
outdf100[destCols] = df100[srsCols]

outdf100

In [None]:
# Columns that carry the same hardcoded value on every row. All of these
# columns already exist in outdf100 (they are part of target_columns), so item
# assignment simply overwrites them.
_constant_columns = {
    "OrganizationUUID": "TWDB",
    "VariableSpecificUUID": "Consumptive Use",
    "MethodUUID": "TWDB_Water_uses",
    # stamped with the date on which the notebook is run
    "DataPublicationDate": datetime.now().strftime('%m/%d/%Y'),
}
for _column, _value in _constant_columns.items():
    outdf100[_column] = _value

# Left unset on purpose for now ("check this later" in the original notes):
#   variableSpecificCV = 'Allocation All'
#   AllocationBasisCV = "Unknown"
#   PrimaryUseCategory = "Irrigation"
#   TimeframeStart = "01/01", TimeframeEnd = "12/31"

outdf100

In [None]:
print("Droping null amounts...")

# Rows without an Amount are saved to a side file and removed from the output.
#
# NaN never compares equal to anything (not even to itself), so a test of the
# form `column == np.nan` is False for every row and missing amounts were never
# detected; isna() is the correct check. The empty-string test is kept for
# amounts that arrive as blank text.
_amount_missing = outdf100["Amount"].isna() | (outdf100["Amount"] == '')

outdf100purge = outdf100.loc[_amount_missing]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('aggregatedallocations_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

In [None]:
print("Droping null ReportingUnitID ...")

# Rows without a reporting unit are removed from the output.
#
# `column == np.nan` is False for every row (NaN is not equal to anything), so
# missing values are detected with isna() instead. In addition, the UUID is
# built earlier in this notebook as "TX_" + str(REPORTING_UNIT_ID), which turns
# a missing source ID into the literal text "TX_nan"; that value is treated as
# missing here too, since it can never be '' or NaN.
_unit_uuid = outdf100["ReportingUnitUUID"]
_unit_missing = _unit_uuid.isna() | (_unit_uuid == '') | (_unit_uuid == 'TX_nan')

outdf100nullPR = outdf100.loc[_unit_missing]
if len(outdf100nullPR.index) > 0:
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)

In [None]:
print("Droping duplicates...")
# Rows that repeat an earlier row in every column are saved to a side file and
# removed; the first occurrence of each row is kept.
_is_repeat = outdf100.duplicated()
outdf100Duplicated = outdf100[_is_repeat]
if outdf100Duplicated.shape[0] > 0:
    outdf100Duplicated.to_csv("aggregatedallocations_duplicaterows.csv")
    outdf100 = outdf100[~_is_repeat].reset_index(drop=True)

In [None]:
print("Inspect duplicates for subset of columns...")

# Columns that together should identify one output row. Kept under its own
# name so that target_columns (the full output schema) is not overwritten.
duplicate_key_columns = ["OrganizationUUID", "VariableSpecificUUID",
                         "ReportingUnitUUID",
                         #"PrimaryUseCategory", "BeneficialUseCategory",
                         "WaterSourceUUID", "MethodUUID", "ReportYearCV"]

# Work on the in-memory table. Re-reading "aggregatedamounts.csv" at this point
# would either fail on a fresh run (the file is only written by the final cell)
# or replace the data processed above with the result of a previous run.
outdf100Duplicated = outdf100.loc[outdf100.duplicated(subset=duplicate_key_columns)]
if len(outdf100Duplicated.index) > 0:
    # Save the rows that repeat an earlier key, then keep the first of each key.
    outdf100Duplicated.to_csv("aggregateuse_duplicateID_rows.csv")  # index=False,
    outdf100 = (outdf100
                .drop_duplicates(subset=duplicate_key_columns)
                .reset_index(drop=True))

#TODO: These may need removing
outdf100Duplicated

In [8]:
print("Writing outputs...")

# Final table is written as UTF-8 CSV without the row index, into the current
# working directory.
out_agamount = "aggregatedamounts.csv"
outdf100.to_csv(path_or_buf=out_agamount, encoding="utf-8", index=False)

print("Done Water Allocation")

Writing outputs...
Done Water Allocation
