In [15]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import os
from datetime import datetime
from dateutil.parser import parse
from utilityFunctions import *

In [16]:
# Working directory that the notebook's relative input/output paths are
# resolved against. The original author's location stays the default; set the
# WADE_WORKING_DIR environment variable to run from a different location
# without editing this cell.
working_dir = os.environ.get(
    "WADE_WORKING_DIR",
    "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/NewMexico/AggregatedAmounts/oldcode",
)
os.chdir(working_dir)

In [17]:
# Column layout of the aggregated-amounts output table, in output order.
target_columns = [
    # identifiers
    "OrganizationUUID",
    "VariableSpecificUUID",
    "ReportingUnitUUID",
    # use categories
    "PrimaryUseCategory",
    "BeneficialUseCategory",
    # source / method / time frame
    "WaterSourceUUID",
    "MethodUUID",
    "TimeframeStart",
    "TimeframeEnd",
    # publication info and reported value
    "DataPublicationDate",
    "DataPublicationDOI",
    "ReportYearCV",
    "Amount",
    # optional descriptive fields
    "PopulationServed",
    "PowerGeneratedGWh",
    "IrrigatedAcreage",
    "InterbasinTransferToID",
    "InterbasinTransferFromID",
    "CustomerTypeCV",
    "AllocationCropDutyAmount",
    "IrrigationMethodCV",
    "CropTypeCV",
    "CommunityWaterSupplySystem",
    "SDWISIdentifierCV",
]

In [18]:
# Empty output frame (no rows yet) whose columns are exactly target_columns, in that order.
outdf100 = pd.DataFrame(columns=target_columns)

In [19]:
# Input files (relative paths, resolved against the current working directory).
# County-level withdrawals workbook; read with pd.read_excel.
fileInput1 = "RawInputData/Summary of withdrawals by county 90-15.xlsx" 
# River-basin workbook, currently disabled:
# fileInput2 = "RawInputData/Summary of withdrawals by River Basin 90-15.xlsx" 

# Reporting-units lookup table (CSV) used to resolve ReportingUnitUUID values.
inp_repunts = "ProcessedInputData/reportingunits.csv"

# Destination CSV for the final aggregated-amounts table.
out_agamount = "ProcessedInputData/aggregatedamounts.csv"

In [20]:
# Preview the first sheet of the county workbook: the first row of the sheet is
# skipped and the next row supplies the column headers.
# .xlsx is a binary format, so no text encoding applies. The former
# encoding="ISO-8859-1" argument is not a read_excel parameter and newer pandas
# releases no longer accept it, so it has been removed.
df10 = pd.read_excel(fileInput1, header=0, sheet_name=0, skiprows=1)
# River-basin workbook is currently disabled:
#df20 = pd.read_excel(fileInput2, header=0, sheet_name=0, skiprows=1)
# list(df10.columns)
#list(df20.columns)
df10

Unnamed: 0,CN,COUNTY,CAT,WSW,WGW,TW
0,1,Bernalillo,Public Water Supply,0.000000,125483.156250,125483.156250
1,1,Bernalillo,Domestic (self-supplied),0.000000,3561.899902,3561.899902
2,1,Bernalillo,Irrigated Agriculture,73727.000000,4037.000000,77764.000000
3,1,Bernalillo,Livestock (self-supplied),36.330002,753.200012,789.530029
4,1,Bernalillo,Commercial (self-supplied),0.000000,3711.300049,3711.300049
...,...,...,...,...,...,...
292,61,Valencia,Commercial (self-supplied),0.000000,1025.689941,1025.689941
293,61,Valencia,Industrial (self-supplied),0.000000,84.800003,84.800003
294,61,Valencia,Mining (self-supplied),0.000000,3.600000,3.600000
295,61,Valencia,Power (self-supplied),0.000000,0.000000,0.000000


In [21]:
# Reporting-units lookup table.
# Only one row per ReportingUnitName is kept; this de-duplication becomes
# unnecessary once the lookup file itself is cleaned of duplicates.
df400 = (
    pd.read_csv(inp_repunts, encoding="ISO-8859-1")
    .drop_duplicates(subset=['ReportingUnitName'])
)
df400

Unnamed: 0,ReportingUnitUUID,ReportingUnitNativeID,ReportingUnitName,ReportingUnitTypeCV,ReportingUnitUpdateDate,ReportingUnitProductVersion,StateCV,EPSGCodeCV,Geometry
0,NM_C_35001,35001,Bernalillo,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
1,NM_C_35003,35003,Catron,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
2,NM_C_35005,35005,Chaves,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
3,NM_C_35006,35006,Cibola,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
4,NM_C_35007,35007,Colfax,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
5,NM_C_35009,35009,Curry,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
6,NM_C_35011,35011,De Baca,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
7,NM_C_35013,35013,Dona Ana,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
8,NM_C_35015,35015,Eddy,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."
9,NM_C_35017,35017,Grant,County,,,NM,EPSG:4326,"POLYGON((-99.54319297853704 37.15853229006052,..."


In [22]:
# Read the first numSheets tabs of the county workbook, tag each with its
# ReportYearCV, and combine them into a single DataFrame (df100).
#
# NOTE(review): the year label for sheet i comes from numSheets evenly spaced
# points between startYear and endYear, truncated to int. With the values below
# the interior points are fractional (1996.25, 2002.5, 2008.75), so the sheets
# are labelled 1990, 1996, 2002, 2008, 2015. Confirm that these labels, and the
# number of sheets, match the tabs actually present in the workbook.

startYear = 1990
endYear = 2015
numSheets = 5
yearList = np.linspace(startYear, endYear, numSheets)

# Surface the truncation instead of letting it pass silently.
if not np.all(yearList == np.floor(yearList)):
    print("WARNING: computed sheet years are not whole numbers and will be truncated:",
          yearList.tolist())

df100_list = []
for isx in range(numSheets):
    # .xlsx is binary: read_excel takes no text-encoding argument, and newer
    # pandas releases reject the former encoding="ISO-8859-1" keyword.
    df10 = pd.read_excel(fileInput1, header=0, sheet_name=isx, skiprows=1)
    df10 = df10.assign(ReportYearCV=yearList[isx])
    df10.ReportYearCV = df10.ReportYearCV.astype(int)  # float year -> int (truncates)
    df100_list.append(df10)

# sort=True orders the combined columns alphabetically; rows are renumbered 0..n-1
df100 = pd.concat(df100_list, sort=True, ignore_index=True)

df100

Unnamed: 0,CAT,CN,COUNTY,ReportYearCV,TW,WGW,WSW
0,Public Water Supply,1,Bernalillo,1990,125483.156250,125483.156250,0.000000
1,Domestic (self-supplied),1,Bernalillo,1990,3561.899902,3561.899902,0.000000
2,Irrigated Agriculture,1,Bernalillo,1990,77764.000000,4037.000000,73727.000000
3,Livestock (self-supplied),1,Bernalillo,1990,789.530029,753.200012,36.330002
4,Commercial (self-supplied),1,Bernalillo,1990,3711.300049,3711.300049,0.000000
...,...,...,...,...,...,...,...
1480,Livestock (self-supplied),61,Valencia,2015,888.255259,841.114994,47.140265
1481,Mining (self-supplied),61,Valencia,2015,178.559000,178.559000,0.000000
1482,Power (self-supplied),61,Valencia,2015,6.000000,6.000000,0.000000
1483,Public Water Supply,61,Valencia,2015,6553.686693,6553.686693,0.000000


In [24]:
print("WatersourceUUID and amount...")

# Every input row carries two amounts: groundwater (WGW) and surface water (WSW).
# Build one table per amount column, each with a single "Amount" column and a
# WaterSourceUUID tag, then stack them into one long table.
id_columns = ['COUNTY', 'CAT', 'ReportYearCV']
source_by_amount_column = [("WGW", "Fresh_Ground"), ("WSW", "Fresh_Surface")]

df100_1, df100_2 = [
    df100[id_columns + [amount_column]]
    .rename(columns={amount_column: "Amount"})
    .assign(WaterSourceUUID=source_id)
    for amount_column, source_id in source_by_amount_column
]

# groundwater rows first, then surface-water rows; columns sorted, index renumbered
df100 = pd.concat([df100_1, df100_2], sort=True, ignore_index=True)

print(len(df100))

df100

WatersourceUUID and amount...
2970


Unnamed: 0,Amount,CAT,COUNTY,ReportYearCV,WaterSourceUUID
0,125483.156250,Public Water Supply,Bernalillo,1990,Fresh_Ground
1,3561.899902,Domestic (self-supplied),Bernalillo,1990,Fresh_Ground
2,4037.000000,Irrigated Agriculture,Bernalillo,1990,Fresh_Ground
3,753.200012,Livestock (self-supplied),Bernalillo,1990,Fresh_Ground
4,3711.300049,Commercial (self-supplied),Bernalillo,1990,Fresh_Ground
...,...,...,...,...,...
2965,47.140265,Livestock (self-supplied),Valencia,2015,Fresh_Surface
2966,0.000000,Mining (self-supplied),Valencia,2015,Fresh_Surface
2967,0.000000,Power (self-supplied),Valencia,2015,Fresh_Surface
2968,0.000000,Public Water Supply,Valencia,2015,Fresh_Surface


In [23]:
print("reporting units...")

# Resolve each row's COUNTY to a ReportingUnitUUID through the df400 lookup
# table. assignReportingUnitsID comes from utilityFunctions; it is called once
# per row with the county value and the lookup frame.
# Applying over the COUNTY column directly avoids materialising a full row
# Series per record (DataFrame.apply with axis=1) just to read one field, and
# makes the former placeholder assign(ReportingUnitUUID='') unnecessary.
df100['ReportingUnitUUID'] = df100['COUNTY'].apply(
    lambda county: assignReportingUnitsID(county, df400)
)

df100

reporting units...


Unnamed: 0,CAT,CN,COUNTY,ReportYearCV,TW,WGW,WSW,ReportingUnitUUID
0,Public Water Supply,1,Bernalillo,1990,125483.156250,125483.156250,0.000000,NM_C_35001
1,Domestic (self-supplied),1,Bernalillo,1990,3561.899902,3561.899902,0.000000,NM_C_35001
2,Irrigated Agriculture,1,Bernalillo,1990,77764.000000,4037.000000,73727.000000,NM_C_35001
3,Livestock (self-supplied),1,Bernalillo,1990,789.530029,753.200012,36.330002,NM_C_35001
4,Commercial (self-supplied),1,Bernalillo,1990,3711.300049,3711.300049,0.000000,NM_C_35001
...,...,...,...,...,...,...,...,...
1480,Livestock (self-supplied),61,Valencia,2015,888.255259,841.114994,47.140265,NM_C_35061
1481,Mining (self-supplied),61,Valencia,2015,178.559000,178.559000,0.000000,NM_C_35061
1482,Power (self-supplied),61,Valencia,2015,6.000000,6.000000,0.000000,NM_C_35061
1483,Public Water Supply,61,Valencia,2015,6553.686693,6553.686693,0.000000,NM_C_35061


In [10]:
print("Copying columns...")

# (destination column in outdf100, source column in df100)
column_pairs = [
    ("WaterSourceUUID", "WaterSourceUUID"),
    ("ReportingUnitUUID", "ReportingUnitUUID"),
    ("ReportYearCV", "ReportYearCV"),
    ("Amount", "Amount"),
    ("BeneficialUseCategory", "CAT"),
]
destCols = [dest_name for dest_name, _ in column_pairs]
srsCols = [src_name for _, src_name in column_pairs]

# copy positionally paired columns one at a time
for dest_name, src_name in column_pairs:
    outdf100[dest_name] = df100[src_name]

outdf100

Copying columns...


Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,PowerGeneratedGWh,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV
0,,,NM_C_35001,,Public Water Supply,Fresh_Ground,,,,,...,,,,,,,,,,
1,,,NM_C_35001,,Domestic (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
2,,,NM_C_35001,,Irrigated Agriculture,Fresh_Ground,,,,,...,,,,,,,,,,
3,,,NM_C_35001,,Livestock (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
4,,,NM_C_35001,,Commercial (self-supplied),Fresh_Ground,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,,,NM_C_35061,,Livestock (self-supplied),Fresh_Surface,,,,,...,,,,,,,,,,
2966,,,NM_C_35061,,Mining (self-supplied),Fresh_Surface,,,,,...,,,,,,,,,,
2967,,,NM_C_35061,,Power (self-supplied),Fresh_Surface,,,,,...,,,,,,,,,,
2968,,,NM_C_35061,,Public Water Supply,Fresh_Surface,,,,,...,,,,,,,,,,


In [11]:
# Constant metadata written to every output row.
# NOTE(review): PrimaryUseCategory is set to "Irrigation" for all rows regardless
# of BeneficialUseCategory; the original author marked it "check this later".
# Formerly considered and left disabled: variableSpecificCV = 'Allocation All',
# AllocationBasisCV = "Unknown".
constant_values = {
    "OrganizationUUID": "NMOSE",
    "VariableSpecificUUID": "Consumptive Use",
    "MethodUUID": "NMOSE_Water_uses",
    "PrimaryUseCategory": "Irrigation",
    "TimeframeStart": "01/01",
    "TimeframeEnd": "12/31",
    # publication date is the day the notebook is run, formatted MM/DD/YYYY
    "DataPublicationDate": datetime.now().strftime('%m/%d/%Y'),
}

for column_name, constant in constant_values.items():
    outdf100[column_name] = constant

outdf100

Unnamed: 0,OrganizationUUID,VariableSpecificUUID,ReportingUnitUUID,PrimaryUseCategory,BeneficialUseCategory,WaterSourceUUID,MethodUUID,TimeframeStart,TimeframeEnd,DataPublicationDate,...,PowerGeneratedGWh,IrrigatedAcreage,InterbasinTransferToID,InterbasinTransferFromID,CustomerTypeCV,AllocationCropDutyAmount,IrrigationMethodCV,CropTypeCV,CommunityWaterSupplySystem,SDWISIdentifierCV
0,NMOSE,Consumptive Use,NM_C_35001,Irrigation,Public Water Supply,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
1,NMOSE,Consumptive Use,NM_C_35001,Irrigation,Domestic (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
2,NMOSE,Consumptive Use,NM_C_35001,Irrigation,Irrigated Agriculture,Fresh_Ground,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
3,NMOSE,Consumptive Use,NM_C_35001,Irrigation,Livestock (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
4,NMOSE,Consumptive Use,NM_C_35001,Irrigation,Commercial (self-supplied),Fresh_Ground,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2965,NMOSE,Consumptive Use,NM_C_35061,Irrigation,Livestock (self-supplied),Fresh_Surface,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
2966,NMOSE,Consumptive Use,NM_C_35061,Irrigation,Mining (self-supplied),Fresh_Surface,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
2967,NMOSE,Consumptive Use,NM_C_35061,Irrigation,Power (self-supplied),Fresh_Surface,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,
2968,NMOSE,Consumptive Use,NM_C_35061,Irrigation,Public Water Supply,Fresh_Surface,NMOSE_Water_uses,01/01,12/31,06/25/2020,...,,,,,,,,,,


In [12]:
print("Droping null amounts...")

# Rows without an Amount are removed from the output and saved to a side CSV.

# Flag missing amounts BEFORE NaN is replaced by ''. Comparing a numeric Amount
# column with the string '' does not compare element by element and produces a
# comparison warning; isna() detects the missing values directly. The
# object-dtype check additionally catches amounts that are already blank strings.
amount_missing = outdf100["Amount"].isna() | (outdf100["Amount"].astype(object) == '')

# Blank out NaN in every column; later cells test for '' rather than NaN.
outdf100 = outdf100.replace(np.nan, '') #replace NaN by blank strings

outdf100purge = outdf100.loc[amount_missing]
if len(outdf100purge.index) > 0:
    outdf100purge.to_csv('aggregatedallocations_missing.csv')    #index=False,
    outdf100 = outdf100.loc[~amount_missing].reset_index(drop=True)
#outdf100

Droping null amounts...


  res_values = method(rvalues)


In [13]:
print("Droping null ReportingUnitID ...")

# Rows whose ReportingUnitUUID is an empty string are removed from the output;
# the removed rows stay available in outdf100nullPR for inspection.
unit_is_blank = outdf100["ReportingUnitUUID"] == ''
outdf100nullPR = outdf100.loc[unit_is_blank]
if not outdf100nullPR.empty:
    outdf100 = outdf100.loc[~unit_is_blank].reset_index(drop=True)
#outdf100

Droping null ReportingUnitID ...


In [14]:
print("Droping duplicates...")

# Safety net: fully duplicated rows (repeats of an earlier row) are saved to a
# side CSV and then removed, keeping the first occurrence of each.
duplicate_mask = outdf100.duplicated()
outdf100Duplicated = outdf100.loc[duplicate_mask]
if duplicate_mask.any():
    outdf100Duplicated.to_csv("aggregatedallocations_duplicaterows.csv")  # index=False,
    outdf100 = outdf100.drop_duplicates().reset_index(drop=True)
#outdf100

Droping duplicates...


In [15]:
print("Writing outputs...")

# Write the final aggregated-amounts table to the path held in out_agamount as
# a UTF-8 CSV, without the DataFrame index column.
outdf100.to_csv(out_agamount, index=False, encoding = "utf-8")

print("Done Water Allocation")

Writing outputs...
Done Water Allocation
