In [1]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
from waterallocationsFunctions import *

In [2]:
# working directory
# The folder can be overridden with the NM_AGG_WORKING_DIR environment variable
# so the notebook can run on a machine that does not have this exact path.
# When the variable is unset, the original hard-coded location is used.
working_dir = os.environ.get("NM_AGG_WORKING_DIR", "C:/tseg/NMTest/aggregatedamounts/")
# all input/output file names used below are relative to this directory
os.chdir(working_dir)

In [3]:
# Column names of the output table, listed in the order they appear in it.
target_columns = [
    "OrganizationUUID",
    "VariableSpecificUUID",
    "ReportingUnitUUID",
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    "WaterSourceUUID",
    "MethodUUID",
    "TimeframeStart",
    "TimeframeEnd",
    "DataPublicationDate",
    "DataPublicationDOI",
    "ReportYearCV",
    "Amount",
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    "InterbasinTransferToID",
    "InterbasinTransferFromID",
    "CustomerTypeCV",
    "AllocationCropDutyAmount",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "SDWISIdentifierCV",
]

In [4]:
# Empty output table: no rows yet, one column per entry of target_columns.
outdf100 = pd.DataFrame(
    data=None,
    columns=target_columns,
)

In [5]:
# --- inputs -----------------------------------------------------------------
# source workbooks (file names are relative to the working directory)
fileInput1 = 'Summary of withdrawals by county 90-15.xlsx'
fileInput2 = 'Summary of withdrawals by River Basin 90-15.xlsx'

# lookup table of reporting units
inp_repunts = 'reportingunits.csv'

# --- outputs ----------------------------------------------------------------
# aggregated amounts table written at the end of the notebook
out_agamount = 'aggregatedamounts.csv'

In [6]:
# Read the first sheet of the county workbook; the first row is skipped and the
# next row is used as the header.
# `encoding` is intentionally not passed: it has no meaning for an Excel
# workbook and pandas 1.0+ rejects it with
# "TypeError: read_excel() got an unexpected keyword argument 'encoding'".
df10 = pd.read_excel(fileInput1, header=0, sheet_name=0, skiprows=1)
#df20 = pd.read_excel(fileInput2, header=0, sheet_name=0, skiprows=1)
list(df10.columns)
#list(df20.columns)

# reporting units look up (a CSV file, so the text encoding does apply here)
df400 = pd.read_csv(inp_repunts, encoding="ISO-8859-1")
# keep one row per ReportingUnitName
# ---this one is not necessary once the table is refined to remove duplicates
df400 = df400.drop_duplicates(subset=['ReportingUnitName'])
#df400

In [7]:
# combine multiple sheets to one dataFrame

startYear = 1990
endYear = 2015
numSheets = 5
# One report year per sheet, evenly spaced between startYear and endYear.
# NOTE(review): with these settings np.linspace yields 1990, 1996.25, 2002.5,
# 2008.75, 2015, which are truncated to 1990, 1996, 2002, 2008, 2015 below.
# Confirm that these are the years the workbook sheets actually hold - TODO
# verify numSheets and the year spacing against the workbook.
yearList = np.linspace(startYear, endYear, numSheets)

df100_list = []
for isx in range(numSheets):
    # `encoding` is not passed: it does not apply to Excel workbooks and
    # pandas 1.0+ raises TypeError when read_excel receives it.
    df10 = pd.read_excel(fileInput1, header=0, sheet_name=isx, skiprows=1)
    # tag every row of the sheet with its (integer) report year
    df10 = df10.assign(ReportYearCV=int(yearList[isx]))
    df100_list.append(df10)

# stack all sheets; sort=True orders the columns alphabetically
df100 = pd.concat(df100_list, sort=True, ignore_index=True)

# preview only the first rows instead of echoing the whole frame
df100.head(10)

Unnamed: 0,CAT,CN,COUNTY,ReportYearCV,TW,WGW,WSW
0,Public Water Supply,1,Bernalillo,1990,125483.156250,125483.156250,0.000000
1,Domestic (self-supplied),1,Bernalillo,1990,3561.899902,3561.899902,0.000000
2,Irrigated Agriculture,1,Bernalillo,1990,77764.000000,4037.000000,73727.000000
3,Livestock (self-supplied),1,Bernalillo,1990,789.530029,753.200012,36.330002
4,Commercial (self-supplied),1,Bernalillo,1990,3711.300049,3711.300049,0.000000
5,Industrial (self-supplied),1,Bernalillo,1990,485.049988,485.049988,0.000000
6,Mining (self-supplied),1,Bernalillo,1990,324.739990,324.739990,0.000000
7,Power (self-supplied),1,Bernalillo,1990,179.360001,179.360001,0.000000
8,Reservoir Evaporation,1,Bernalillo,1990,0.000000,0.000000,0.000000
9,Public Water Supply,3,Catron,1990,125.440002,125.440002,0.000000


In [8]:
print("WatersourceUUID and amount...")

# Every input row carries two amounts (columns WGW and WSW). Build one table
# per amount column, label it with its water source id, then stack the tables.
keyCols = ['COUNTY', 'CAT', 'ReportYearCV']
amountSources = [("WGW", "Fresh_Ground"), ("WSW", "Fresh_Surface")]

df100_1, df100_2 = [
    df100[keyCols + [amountCol]]
    .rename(columns={amountCol: "Amount"})
    .assign(WaterSourceUUID=sourceId)
    for amountCol, sourceId in amountSources
]

# ground-water rows first, surface-water rows second; columns sorted by name
df100 = pd.concat([df100_1, df100_2], sort=True, ignore_index=True)

print(len(df100))

df100

WatersourceUUID and amount...
2970


Unnamed: 0,Amount,CAT,COUNTY,ReportYearCV,WaterSourceUUID
0,125483.156250,Public Water Supply,Bernalillo,1990,Fresh_Ground
1,3561.899902,Domestic (self-supplied),Bernalillo,1990,Fresh_Ground
2,4037.000000,Irrigated Agriculture,Bernalillo,1990,Fresh_Ground
3,753.200012,Livestock (self-supplied),Bernalillo,1990,Fresh_Ground
4,3711.300049,Commercial (self-supplied),Bernalillo,1990,Fresh_Ground
5,485.049988,Industrial (self-supplied),Bernalillo,1990,Fresh_Ground
6,324.739990,Mining (self-supplied),Bernalillo,1990,Fresh_Ground
7,179.360001,Power (self-supplied),Bernalillo,1990,Fresh_Ground
8,0.000000,Reservoir Evaporation,Bernalillo,1990,Fresh_Ground
9,125.440002,Public Water Supply,Catron,1990,Fresh_Ground


In [9]:
print("reporting units...")


def _reportingUnitForRow(row):
    """Look up the reporting unit id for one row from its COUNTY value."""
    return assignReportingUnitsID(row['COUNTY'], df400)


# start with a blank column, then fill it row by row from the lookup table
df100 = df100.assign(ReportingUnitUUID='')
df100['ReportingUnitUUID'] = df100.apply(_reportingUnitForRow, axis=1)

df100

reporting units...


Unnamed: 0,Amount,CAT,COUNTY,ReportYearCV,WaterSourceUUID,ReportingUnitUUID
0,125483.156250,Public Water Supply,Bernalillo,1990,Fresh_Ground,NM_1
1,3561.899902,Domestic (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
2,4037.000000,Irrigated Agriculture,Bernalillo,1990,Fresh_Ground,NM_1
3,753.200012,Livestock (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
4,3711.300049,Commercial (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
5,485.049988,Industrial (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
6,324.739990,Mining (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
7,179.360001,Power (self-supplied),Bernalillo,1990,Fresh_Ground,NM_1
8,0.000000,Reservoir Evaporation,Bernalillo,1990,Fresh_Ground,NM_1
9,125.440002,Public Water Supply,Catron,1990,Fresh_Ground,NM_2


In [11]:
print("Copying columns...")
# destination columns in the output table and, position by position, the
# source columns they are filled from
destCols = ["WaterSourceUUID", "ReportingUnitUUID", "ReportYearCV", "Amount", "BeneficialUseCategory"]
srsCols = ["WaterSourceUUID", "ReportingUnitUUID", "ReportYearCV", "Amount", "CAT"]

# copy one column at a time, pairing each destination with its source
for destName, srcName in zip(destCols, srsCols):
    outdf100[destName] = df100[srcName]

outdf100

Copying columns...


Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,PowerGeneratedGWh,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV
0,,,NM_1,,Public Water Supply,Fresh_Ground,,,,,...,,,,,,,,,,
1,,,NM_1,,Domestic (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
2,,,NM_1,,Irrigated Agriculture,Fresh_Ground,,,,,...,,,,,,,,,,
3,,,NM_1,,Livestock (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
4,,,NM_1,,Commercial (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
5,,,NM_1,,Industrial (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
6,,,NM_1,,Mining (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
7,,,NM_1,,Power (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
8,,,NM_1,,Reservoir Evaporation,Fresh_Ground,,,,,...,,,,,,,,,,
9,,,NM_2,,Public Water Supply,Fresh_Ground,,,,,...,,,,,,,,,,


In [12]:
# hardcoded
# Values that are the same for every row of the output table.
hardcodedValues = {
    "OrganizationUUID": "NMOSE",
    # variableSpecificCV = 'Allocation All'
    "VariableSpecificUUID": "Consumptive Use",
    # AllocationBasisCV = "Unknown" is intentionally not set
    "MethodUUID": "NMOSE_Water_uses",
    # check this later
    "PrimaryUseCategory": "Irrigation",
    "TimeframeStart": "01/01",
    "TimeframeEnd": "12/31",
    # the date on which the notebook is executed
    "DataPublicationDate": datetime.now().strftime('%m/%d/%Y'),
}

for columnName, constantValue in hardcodedValues.items():
    outdf100[columnName] = constantValue

outdf100

Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,PowerGeneratedGWh,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV
0,NMOSE,Consumptive Use,NM_1,Irrigation,Public Water Supply,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
1,NMOSE,Consumptive Use,NM_1,Irrigation,Domestic (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
2,NMOSE,Consumptive Use,NM_1,Irrigation,Irrigated Agriculture,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
3,NMOSE,Consumptive Use,NM_1,Irrigation,Livestock (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
4,NMOSE,Consumptive Use,NM_1,Irrigation,Commercial (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
5,NMOSE,Consumptive Use,NM_1,Irrigation,Industrial (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
6,NMOSE,Consumptive Use,NM_1,Irrigation,Mining (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
7,NMOSE,Consumptive Use,NM_1,Irrigation,Power (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
8,NMOSE,Consumptive Use,NM_1,Irrigation,Reservoir Evaporation,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,
9,NMOSE,Consumptive Use,NM_2,Irrigation,Public Water Supply,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,11/21/2019,...,,,,,,,,,,


In [16]:
print("Droping null amounts...")

# Rows without an Amount are written to a separate CSV and removed.

# NaN becomes an empty string so that missing values can be found with == ''
outdf100 = outdf100.replace(np.nan, '')

amountMissing = outdf100["Amount"] == ''
outdf100purge = outdf100.loc[amountMissing]
if amountMissing.any():
    outdf100purge.to_csv('aggregatedallocations_missing.csv')    #index=False,
    dropIndex = outdf100purge.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)
#outdf100

Droping null amounts...


In [17]:
print("Droping null ReportingUnitID ...")
# Rows whose ReportingUnitUUID is an empty string are removed and the index
# is renumbered.
unitMissing = outdf100["ReportingUnitUUID"] == ''
outdf100nullPR = outdf100.loc[unitMissing]
if unitMissing.any():
    dropIndex = outdf100nullPR.index
    outdf100 = outdf100.drop(dropIndex).reset_index(drop=True)
#outdf100

Droping null ReportingUnitID ...


In [18]:
print("Droping duplicates...")
# Safety net: rows that repeat an earlier row are saved to a CSV and removed,
# keeping the first occurrence.
isRepeat = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[isRepeat]
if isRepeat.any():
    outdf100Duplicated.to_csv("aggregatedallocations_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)
#outdf100

Droping duplicates...


In [19]:
print("Writing outputs...")

# Save the aggregated amounts table as UTF-8 CSV, without the row index.
outdf100.to_csv(
    out_agamount,
    encoding="utf-8",
    index=False,
)

print("Done Water Allocation")

Writing outputs...
Done Water Allocation
