# Working with TX Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shp files, and converting to WKT for ReportingUnit geometry.

#### Notes:
- Will need to use a loop to read in each xlsx source file by year.
- Will need to separate out by Basin & County, then combine.
- TX data also includes the summation of amounts by surface water, groundwater, and reuse.  Only pull in those categories and leave out the summation.
- Will need to assign water source type; use the ben use string to determine this.
- Will need to fix the ben use string and remove errors.

In [None]:
#Needed Libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
# Raise the display limit so that up to 999 columns are shown when a DataFrame is rendered.
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

# Set the working directory to the raw-input folder.  The relative paths used
# later in this notebook (per-year workbooks, "shapefiles/...", output CSVs)
# are resolved against it.
workingDir = "G:/Shared drives/WaDE Data/Texas/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

# Basin Data

In [None]:
# Read in Basin xlsx data, one workbook per year, 2000-2016.
# Each year's sheet is collected in a list and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration.
basinFrames = []
for yearCount in range(2000, 2017):
    basinInputString = str(yearCount) + "/SumFinal_BasinReport.xlsx"
    # skiprows=1 skips the first row of the sheet before the header is read.
    dfbinput = pd.read_excel(basinInputString, skiprows=1)
    basinFrames.append(dfbinput)

dfBasin = pd.concat(basinFrames)  # working dataframe

# Blank out missing values, then drop rows that are exact duplicates.
dfBasin = dfBasin.replace(np.nan, '').drop_duplicates()
print(len(dfBasin.index))
dfBasin.head(3)

In [None]:
# Produce Temporary Basin out dataframe.
# Reshapes the wide Basin table (one column per use/source combination) into a
# long table with one row per input row and use/source column.

# Identifier columns carried onto every output row.
idColumns = ["Year", "Basin", "Population"]
dfBasinV2 = dfBasin[idColumns].rename(columns={"Basin": "in_ReportingUnitName"})

############################################
# Use/source columns to unpivot.  Only the per-source columns are listed; the
# summed totals that TX also reports are left out.
# NOTE(review): "Irrig ation Reuse" and "Municipal Ground  Water" are kept with
# their irregular spelling - presumably they match the workbook headers; both
# spellings are handled later by BenUseDict.  Confirm before "fixing" them.
columnsList = [
"Irrig ation Reuse",
"Irrigation Ground Water",
"Irrigation Surface Water",
"Livestock Ground Water",
"Livestock Reuse",
"Livestock Surface Water",
"Mfg Ground Water",
"Mfg Reuse",
"Mfg Surface Water",
"Mining Ground Water",
"Mining Reuse",
"Mining Surface Water",
"Municipal Ground  Water",
"Municipal Reuse",
"Municipal Surface Water",
"Power Ground Water",
"Power Reuse",
"Power Surface Water"]

############################################
# One frame per use/source column, each tagged with the column label, its
# amounts and the reporting unit type.  The frames are concatenated once;
# DataFrame.append was removed in pandas 2.0.
basinPieces = [
    dfBasinV2.assign(
        TX_BenUse=benUseColumn,
        in_Amount=dfBasin[benUseColumn],
        in_ReportingUnitType="Basin",
    )
    for benUseColumn in columnsList
]
dfBasinOut = pd.concat(basinPieces)

############################################

print(len(dfBasinOut.index))
dfBasinOut.head(3)

# County Data

In [None]:
# Read in County xlsx data, one workbook per year, 2000-2016.
# Each year's sheet is collected in a list and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration.
countyFrames = []
for yearCount in range(2000, 2017):
    countyInputString = str(yearCount) + "/SumFinal_CountyReport.xlsx"
    # skiprows=1 skips the first row of the sheet before the header is read.
    dfcinput = pd.read_excel(countyInputString, skiprows=1)
    countyFrames.append(dfcinput)

dfCounty = pd.concat(countyFrames)  # working dataframe

# Blank out missing values, then drop rows that are exact duplicates.
dfCounty = dfCounty.replace(np.nan, '').drop_duplicates()
print(len(dfCounty.index))
dfCounty.head(3)

In [None]:
# Produce Temporary County out dataframe.
# Reshapes the wide County table (one column per use/source combination) into
# a long table with one row per input row and use/source column.

# Identifier columns carried onto every output row.
idColumns = ["Year", "County", "Population"]
dfCountyV2 = dfCounty[idColumns].rename(columns={"County": "in_ReportingUnitName"})

############################################
# Use/source columns to unpivot.  Only the per-source columns are listed; the
# summed totals that TX also reports are left out.
columnsList = [
"Irrigation Ground Water",
"Irrigation Reuse",
"Irrigation Surface Water",
"Livestock Ground Water",
"Livestock Reuse",
"Livestock Surface Water",
"Mfg Ground Water",
"Mfg Reuse",
"Mfg Surface Water",
"Mining Ground Water",
"Mining Reuse",
"Mining Surface Water",
"Municipal Ground Water",
"Municipal Reuse",
"Municipal Surface Water",
"Power Ground Water",
"Power Reuse",
"Power Surface Water"]

############################################
# One frame per use/source column, each tagged with the column label, its
# amounts and the reporting unit type.  The frames are concatenated once;
# DataFrame.append was removed in pandas 2.0.
countyPieces = [
    dfCountyV2.assign(
        TX_BenUse=benUseColumn,
        in_Amount=dfCounty[benUseColumn],
        in_ReportingUnitType="County",
    )
    for benUseColumn in columnsList
]
dfCountyOut = pd.concat(countyPieces)

############################################

print(len(dfCountyOut.index))
dfCountyOut.head(3)

# Output Dataframe

In [None]:
# Stack the Basin rows and the County rows into the single working dataframe.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dfout = pd.concat([dfBasinOut, dfCountyOut])
print(len(dfout.index))
dfout.head(3)

In [None]:
# WaterSourceType.  Use the TX Beneficial Use to help determine this.

# Add the in_WaterSourceType column as an empty-string placeholder; it is
# filled in further down in this cell from the TX_BenUse label.
dfout = dfout.assign(in_WaterSourceType='')

def defineWaterSourceType(colrowValue):
    """Return the water source type named in a TX use/source label.

    Parameters
    ----------
    colrowValue : object
        A TX use/source label such as "Irrigation Surface Water".  Blank and
        null values are accepted.

    Returns
    -------
    str
        "Surface Water", "Groundwater" or "Reuse" depending on which keyword
        the label contains (checked in that order), otherwise "Unspecified".
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return 'Unspecified'

    # Collapse runs of whitespace so irregular labels such as
    # "Municipal Ground  Water" hit the same keyword as the clean spelling.
    label = " ".join(str(colrowValue).split())

    if "Surface Water" in label:
        return "Surface Water"
    if "Ground Water" in label:
        return "Groundwater"
    if "Reuse" in label:
        return "Reuse"
    return "Unspecified"

# Derive the water source type from each row's TX use/source label.
# Series.map calls the function once per value without building a Series for
# every row, and still returns a Series when dfout is empty.
dfout['in_WaterSourceType'] = dfout['TX_BenUse'].map(defineWaterSourceType)
dfout

In [None]:
# TimeframeStart & TimeframeEnd
# Each record spans its calendar year: 01/01/<Year> through 12/31/<Year>.

yearText = dfout['Year'].astype(str)
dfout['inTimeframeStart'] = '01/01/' + yearText
dfout['inTimeframeEnd'] = '12/31/' + yearText
dfout

In [None]:
# Fixing Beneficial use

# Maps every TX use/source column label to its beneficial-use name.  The last
# two keys are the irregularly spelled labels used in the Basin column list.
BenUseDict = {
"Irrigation Ground Water" : "Irrigation",
"Irrigation Reuse" : "Irrigation",
"Irrigation Surface Water" : "Irrigation",
"Livestock Ground Water" : "Livestock",
"Livestock Reuse" : "Livestock",
"Livestock Surface Water" : "Livestock",
"Mfg Ground Water" : "Manufacturing",
"Mfg Reuse" : "Manufacturing",
"Mfg Surface Water" : "Manufacturing",
"Mining Ground Water" : "Mining",
"Mining Reuse" : "Mining",
"Mining Surface Water" : "Mining",
"Municipal Ground Water" : "Municipal",
"Municipal Reuse" : "Municipal",
"Municipal Surface Water" : "Municipal",
"Power Ground Water" : "Power",
"Power Reuse" : "Power",
"Power Surface Water" : "Power",
"Irrig ation Reuse" : "Irrigation",
"Municipal Ground  Water" : "Municipal"}

def fixTX_BenUse(colrowValue):
    """Translate a TX use/source label into its beneficial-use name.

    Parameters
    ----------
    colrowValue : object
        A TX use/source label (a key of BenUseDict).  Blank and null values
        are accepted.

    Returns
    -------
    str
        The matching BenUseDict value, or 'UnSpecified' when the value is
        blank, null, or not a key of BenUseDict.
    """
    # NOTE(review): this fallback is spelled 'UnSpecified' while
    # defineWaterSourceType returns 'Unspecified'; left as-is because the
    # string ends up in the exported data - confirm which spelling is wanted.
    if colrowValue == '' or pd.isnull(colrowValue):
        return 'UnSpecified'
    try:
        return BenUseDict[colrowValue]
    except (KeyError, TypeError):
        # KeyError: unknown label.  TypeError: unhashable value.
        return 'UnSpecified'

# Replace each TX use/source label with its beneficial-use name.
# Series.map calls the function once per value without building a Series for
# every row, and still returns a Series when dfout is empty.
# NOTE: this overwrites TX_BenUse in place.  The cleaned names are not keys of
# BenUseDict, so running this cell a second time turns every value into
# 'UnSpecified'.
dfout['TX_BenUse'] = dfout['TX_BenUse'].map(fixTX_BenUse)
dfout

In [None]:
# Dropping the 'STATE TOTAL' unit name.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments made to dfout later in the notebook do not raise
# SettingWithCopyWarning.

dfout = dfout[dfout['in_ReportingUnitName'] != "STATE TOTAL"].copy()
dfout

In [None]:
# in_VariableSpecificCV Info
# Built as "Consumptive Use_Annual_<beneficial use>_<water source type>".
variablePrefix = "Consumptive Use_Annual_"
benUsePart = dfout['TX_BenUse']
waterSourcePart = dfout['in_WaterSourceType']
dfout['in_VariableSpecificCV'] = variablePrefix + benUsePart + "_" + waterSourcePart
dfout['in_VariableSpecificCV'].unique()

## WaDE Custom Elements (due to missing reporting unit and water source info)

In [None]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    """Return the custom water source native ID for a counter value.

    The ID is the prefix "WaDETX_WS" followed by str(colrowValue).
    """
    return "".join(["WaDETX_WS", str(colrowValue)])

# Lookup table: one row per distinct water source type, in order of first
# appearance in dfout, numbered from 1.
# The index is reset because dfout carries repeated index labels; without the
# reset the lookup inherits them and the ID assignment only works because the
# two indexes happen to be equal.
waterSourceTypes = dfout['in_WaterSourceType'].drop_duplicates().reset_index(drop=True)
dfWaterSourceNativeID = pd.DataFrame({'in_WaterSourceType': waterSourceTypes})
dfWaterSourceNativeID['in_WaterSourceNativeID'] = [
    assignWaterSourceNativeID(count)
    for count in range(1, len(dfWaterSourceNativeID.index) + 1)
]

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    """Look up the custom native ID for water source type A.

    Searches the module-level dfWaterSourceNativeID table for rows whose
    'in_WaterSourceType' equals A and returns the first matching
    'in_WaterSourceNativeID'.  Returns '' when A is blank or null, or when no
    row matches.
    """
    if (A == '') or (pd.isnull(A)):
        return ''

    sameType = dfWaterSourceNativeID['in_WaterSourceType'] == A
    matches = dfWaterSourceNativeID.loc[sameType, 'in_WaterSourceNativeID']
    if matches.empty:
        return ''
    return matches.iloc[0]

# Attach the custom water source native ID to every row of dfout.
# Series.map calls the lookup once per value without building a Series for
# every row, and still returns a Series when dfout is empty.
dfout['in_WaterSourceNativeID'] = dfout['in_WaterSourceType'].map(retrieveWaterSourceNativeID)
dfout

In [None]:
# Creating WaDE Custom reportingunit native ID for easy site identification
# ----------------------------------------------------------------------------------------------------

# Create temp ReportingUnitNativeID dataframe of unique reporting units.
def assignReportingUnitNativeID(colrowValue):
    """Return the custom reporting unit native ID for a counter value.

    The ID is the prefix "WaDETX_RU" followed by str(colrowValue).
    """
    return "".join(["WaDETX_RU", str(colrowValue)])

# Lookup table: one row per distinct (reporting unit name, reporting unit
# type) pair, in order of first appearance in dfout, numbered from 1.
# The index is reset because dfout carries repeated index labels; without the
# reset the lookup inherits them and the ID assignment only works because the
# two indexes happen to be equal.
dfReportingUnitNativeID = (
    dfout[['in_ReportingUnitName', 'in_ReportingUnitType']]
    .drop_duplicates()
    .reset_index(drop=True)
)
dfReportingUnitNativeID['in_ReportingUnitNativeID'] = [
    assignReportingUnitNativeID(count)
    for count in range(1, len(dfReportingUnitNativeID.index) + 1)
]

# ----------------------------------------------------------------------------------------------------

# Retrieve WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    """Look up the custom native ID for reporting unit name A of type B.

    Searches the module-level dfReportingUnitNativeID table for rows whose
    'in_ReportingUnitName' equals A and whose 'in_ReportingUnitType' equals B,
    and returns the first matching 'in_ReportingUnitNativeID'.  Returns ''
    when A and B are both blank or both null, or when no row matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    lookup = dfReportingUnitNativeID
    sameUnit = (lookup['in_ReportingUnitName'] == A) & (lookup['in_ReportingUnitType'] == B)
    matches = lookup.loc[sameUnit, 'in_ReportingUnitNativeID']
    if matches.empty:
        return ''
    return matches.iloc[0]

# Attach the custom reporting unit native ID to every row of dfout.
# Walking the two columns in lockstep avoids building a Series for every row
# (as apply(axis=1) does) and also works when dfout is empty.
dfout['in_ReportingUnitNativeID'] = [
    retrieveReportingUnitNativeID(unitName, unitType)
    for unitName, unitType in zip(dfout['in_ReportingUnitName'], dfout['in_ReportingUnitType'])
]
dfout

# Shapefile Data

In [None]:
# Shapefile input
# Both paths are relative, so they are resolved against the current working directory.
inputBasinShape = gpd.read_file('shapefiles/TX_Basin.shp')
inputCountyShape = gpd.read_file('shapefiles/TX_County.shp')

In [None]:
#Basin Shapefile
# Wrap the shapefile table that was read in as a plain pandas DataFrame.
dfBasinShapetemp = pd.DataFrame(inputBasinShape)

# Copy the needed shapefile fields across under their WaDE input names.
columnsList = ['in_ReportingUnitName', 'in_ReportingUnitType', 'geometry']
basinSourceFields = ['Reportin_1', 'Reportin_2', 'geometry']
dfBasinShape = pd.DataFrame(columns=columnsList)
for targetName, sourceName in zip(columnsList, basinSourceFields):
    dfBasinShape[targetName] = dfBasinShapetemp[sourceName]


# Retrieve WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    """Look up the custom native ID for reporting unit name A of type B.

    This redefinition searches the module-level dfout table rather than the
    dfReportingUnitNativeID lookup used by the earlier definition.  It returns
    the first 'in_ReportingUnitNativeID' from the rows whose
    'in_ReportingUnitName' equals A and whose 'in_ReportingUnitType' equals B.
    Returns '' when A and B are both blank or both null, or when no row
    matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    sameUnit = (dfout['in_ReportingUnitName'] == A) & (dfout['in_ReportingUnitType'] == B)
    matches = dfout.loc[sameUnit, 'in_ReportingUnitNativeID']
    if matches.empty:
        return ''
    return matches.iloc[0]

# Attach the custom reporting unit native ID to each basin shape.
# Walking the two columns in lockstep avoids building a Series for every row
# (as apply(axis=1) does) and also works when the frame is empty.
dfBasinShape['in_ReportingUnitNativeID'] = [
    retrieveReportingUnitNativeID(unitName, unitType)
    for unitName, unitType in zip(dfBasinShape['in_ReportingUnitName'], dfBasinShape['in_ReportingUnitType'])
]

# Drop rows that are exact duplicates, keeping the first occurrence.  These
# are the drop_duplicates defaults, so no arguments are passed.
dfBasinShape = dfBasinShape.drop_duplicates()
dfBasinShape.head(3)

In [None]:
#CountyShapefile
# Wrap the shapefile table that was read in as a plain pandas DataFrame.
dfCountyShapetemp = pd.DataFrame(inputCountyShape)

columnsList = ['in_ReportingUnitName', 'in_ReportingUnitType', 'geometry']
dfCountyShape = pd.DataFrame(columns=columnsList)
# Name and type are converted to text and trimmed of surrounding whitespace so
# they can match the names in dfout.  The previous .clip() call had no bounds
# and therefore changed nothing; .str.strip() is the trimming operation.
dfCountyShape['in_ReportingUnitName'] = dfCountyShapetemp['Reportin_3'].astype(str).str.strip()
dfCountyShape['in_ReportingUnitType'] = dfCountyShapetemp['Reportin_4'].astype(str).str.strip()
dfCountyShape['geometry'] = dfCountyShapetemp['geometry']

# Retrieve WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    """Look up the custom native ID for reporting unit name A of type B.

    Searches the module-level dfout table and returns the first
    'in_ReportingUnitNativeID' from the rows whose 'in_ReportingUnitName'
    equals A and whose 'in_ReportingUnitType' equals B.  Returns '' when A and
    B are both blank or both null, or when no row matches.
    """
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        return ''

    nameMatches = dfout['in_ReportingUnitName'] == A
    typeMatches = dfout['in_ReportingUnitType'] == B
    found = dfout.loc[nameMatches & typeMatches, 'in_ReportingUnitNativeID']
    return '' if found.empty else found.iloc[0]

# Attach the custom reporting unit native ID to each county shape.
# Walking the two columns in lockstep avoids building a Series for every row
# (as apply(axis=1) does) and also works when the frame is empty.
dfCountyShape['in_ReportingUnitNativeID'] = [
    retrieveReportingUnitNativeID(unitName, unitType)
    for unitName, unitType in zip(dfCountyShape['in_ReportingUnitName'], dfCountyShape['in_ReportingUnitType'])
]

# Drop rows that are exact duplicates, keeping the first occurrence.  These
# are the drop_duplicates defaults, so no arguments are passed.
dfCountyShape = dfCountyShape.drop_duplicates()
dfCountyShape

In [None]:
# Concatenate shp dataframes together.
frames = [dfBasinShape, dfCountyShape]
dfAllShape = pd.concat(frames)
# NOTE(review): reset_index() without drop=True keeps the old row labels as a
# new "index" column, which then appears in the exported geometry CSV -
# confirm that column is wanted.
dfAllShape = dfAllShape.reset_index()
dfAllShape

### Inspect Output Data & Export

In [None]:
# Export out to CSV.
# Both files are written without the row index, to paths relative to the
# current working directory.
dfout.to_csv('P_txAggMaster.csv', index=False) # The output.
dfAllShape.to_csv('P_TXGeometry.csv', index=False) # The output geometry.