# Working with TX Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in shp file, and coverting to WKT for ReportingUnit geometry.

#### Notes:
- Will need to do a while loop to read in each csv source file by year.
- Will need to seperate out by Basin & County, then combine.
- TX data also includes the summation of amounts by surface water, groundwater, and reuse.  Only pull in those catagories and leave out the summation.
- Will need to assign water source type, use ben use string to determine this.
- Will need to fix ben use string and remove errors.

In [1]:
#Needed Libararies
import os
import numpy as np
import pandas as pd
from datetime import datetime
import geopandas as gpd # the library that lets us read in shapefiles
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

#Setting work directory, reading inputs, creating dataframe
workingDir = "C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Texas/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

# Basin Data

In [2]:
#Read in Basin xlsx data.  2000-2016.
yearCount = 2000
dfbinput = pd.DataFrame() #input dataframe
dfBasin = pd.DataFrame() #working dataframe

while yearCount < 2017:
    basinInputString = str(yearCount) + "/SumFinal_BasinReport.xlsx"
    dfbinput = pd.read_excel(basinInputString, skiprows=1)
    dfBasin = dfBasin.append(dfbinput)
    yearCount = yearCount + 1

dfBasin = dfBasin.replace(np.nan, '').drop_duplicates()
print(len(dfBasin.index))
dfBasin.head(3)

408


Unnamed: 0,Year,Basin,Population,Municipal,Manu-facturing,Mining,Steam Electric (Power),Irrigation,Livestock,Municipal Ground Water,Municipal Surface Water,Mfg Ground Water,Mfg Surface Water,Mining Ground Water,Mining Surface Water,Power Ground Water,Power Surface Water,Irrigation Ground Water,Irrigation Surface Water,Livestock Ground Water,Livestock Surface Water,Municipal Reuse,Mfg Reuse,Mining Reuse,Power Reuse,Irrig ation Reuse,Livestock Reuse
0,2000,BRAZOS,2127781,409144,142466,8091,148315,2804513,74755,167201,241946,47774,94692,3288,4803,19415,128901,2711021,93492,31646,43109,,,,,,
1,2000,BRAZOS-COLORADO,82624,11431,29601,5414,0,263843,2185,10460,971,1441,28160,964,4450,0,0,70159,193684,1141,1044,,,,,,
2,2000,CANADIAN,172241,42267,45010,392,7704,1952208,29810,28210,14057,41998,3010,392,0,4042,3662,1950566,1642,16583,13227,,,,,,


In [3]:
# Produce Temporary Basin out dataframe
columnsList = ["Year", "Basin", "Population"]
dfBasinV2 = pd.DataFrame(columns=columnsList)
dfBasinV2 = dfBasin[columnsList]
dfBasinV2 = dfBasinV2.rename(columns={"Basin": "in_ReportingUnitName"})

############################################

dfBasinV2 = dfBasinV2.assign(TX_BenUse='')
dfBasinV2 = dfBasinV2.assign(in_Amount='')
dfBasinV2 = dfBasinV2.assign(in_ReportingUnitType='')
dfBasinOut = pd.DataFrame()

############################################
columnsList = [
"Irrig ation Reuse",
"Irrigation Ground Water",
"Irrigation Surface Water",
"Livestock Ground Water",
"Livestock Reuse",
"Livestock Surface Water",
"Mfg Ground Water",
"Mfg Reuse",
"Mfg Surface Water",
"Mining Ground Water",
"Mining Reuse",
"Mining Surface Water",
"Municipal Ground  Water",
"Municipal Reuse",
"Municipal Surface Water",
"Power Ground Water",
"Power Reuse",
"Power Surface Water"]
lenList = len(columnsList)


############################################
for i in range(lenList):
    BenuseString = columnsList[i]
    dfBasinV2['TX_BenUse'] = BenuseString
    dfBasinV2['in_Amount'] = dfBasin[columnsList[i]]
    dfBasinV2['in_ReportingUnitType'] = "Basin"
    dfBasinOut = dfBasinOut.append(dfBasinV2)
    
############################################

print(len(dfBasinOut.index))
dfBasinOut.head(3)

7344


Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType
0,2000,BRAZOS,2127781,Irrig ation Reuse,,Basin
1,2000,BRAZOS-COLORADO,82624,Irrig ation Reuse,,Basin
2,2000,CANADIAN,172241,Irrig ation Reuse,,Basin


# County Data

In [4]:
# #Read in County xlsx data.  2000-2016.
yearCount = 2000
dfcinput = pd.DataFrame() #input dataframe
dfCounty = pd.DataFrame() #working dataframe

while yearCount < 2017:
    countyInputString = str(yearCount) + "/SumFinal_CountyReport.xlsx"
    dfcinput = pd.read_excel(countyInputString, skiprows=1)
    dfCounty = dfCounty.append(dfcinput)
    yearCount = yearCount + 1

dfCounty = dfCounty.replace(np.nan, '').drop_duplicates()
print(len(dfCounty.index))
dfCounty.head(3)

4335


Unnamed: 0,Year,County,Population,Municipal,Manu-facturing,Mining,Steam Electric (Power),Irrigation,Livestock,Municipal Ground Water,Municipal Surface Water,Mfg Ground Water,Mfg Surface Water,Mining Ground Water,Mining Surface Water,Power Ground Water,Power Surface Water,Irrigation Ground Water,Irrigation Surface Water,Livestock Ground Water,Livestock Surface Water,Municipal Reuse,Mfg Reuse,Mining Reuse,Power Reuse,Irrigation Reuse,Livestock Reuse
0,2000,ANDERSON,55109,13255,340,0,0,192,1708,9625,3630,340,0,0,0,0,0,96,96,683,1025,,,,,,
1,2000,ANDREWS,13004,3482,1014,3,0,18482,319,3482,0,1014,0,3,0,0,0,18482,0,255,64,,,,,,
2,2000,ANGELINA,80130,12648,27024,0,0,30,578,12648,0,15467,11557,0,0,0,0,30,0,231,347,,,,,,


In [5]:
# Produce Temporary County out dataframe
columnsList = ["Year", "County", "Population"]
dfCountyV2 = pd.DataFrame(columns=columnsList)
dfCountyV2 = dfCounty[columnsList]
dfCountyV2 = dfCountyV2.rename(columns={"County": "in_ReportingUnitName"})

############################################

dfCountyV2 = dfCountyV2.assign(TX_BenUse='')
dfCountyV2 = dfCountyV2.assign(in_Amount='')
dfCountyV2 = dfCountyV2.assign(in_ReportingUnitType='')
dfCountyOut = pd.DataFrame()

############################################
columnsList = [
"Irrigation Ground Water",
"Irrigation Reuse",
"Irrigation Surface Water",
"Livestock Ground Water",
"Livestock Reuse",
"Livestock Surface Water",
"Mfg Ground Water",
"Mfg Reuse",
"Mfg Surface Water",
"Mining Ground Water",
"Mining Reuse",
"Mining Surface Water",
"Municipal Ground Water",
"Municipal Reuse",
"Municipal Surface Water",
"Power Ground Water",
"Power Reuse",
"Power Surface Water"]
lenList = len(columnsList)


############################################
for i in range(lenList):
    BenuseString = columnsList[i]
    dfCountyV2['TX_BenUse'] = BenuseString
    dfCountyV2['in_Amount'] = dfCounty[columnsList[i]]
    dfCountyV2['in_ReportingUnitType'] = "County"
    dfCountyOut = dfCountyOut.append(dfCountyV2)
    
############################################

print(len(dfCountyOut.index))
dfCountyOut.head(3)

78030


Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType
0,2000,ANDERSON,55109,Irrigation Ground Water,96,County
1,2000,ANDREWS,13004,Irrigation Ground Water,18482,County
2,2000,ANGELINA,80130,Irrigation Ground Water,30,County


# Output Dataframe

In [6]:
dfout = dfBasinOut
dfout = dfout.append(dfCountyOut)
print(len(dfout.index))
dfout.head(3)

85374


Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType
0,2000,BRAZOS,2127781,Irrig ation Reuse,,Basin
1,2000,BRAZOS-COLORADO,82624,Irrig ation Reuse,,Basin
2,2000,CANADIAN,172241,Irrig ation Reuse,,Basin


In [7]:
# WaterSourceType.  Use the TX Benefical Use to help determine this.

dfout = dfout.assign(in_WaterSourceType='')

def defineWaterSourceType(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outString = 'Unspecified'
    else:
        try:
            if "Surface Water" in str(colrowValue).strip():
                outString = "Surface Water"
            elif "Ground Water" in str(colrowValue).strip():
                outString = "Groundwater"
            elif "Ground  Water" in str(colrowValue).strip():
                outString = "Groundwater"
            elif "Reuse" in str(colrowValue).strip():
                outString = "Reuse"
            else:
                outString = "Unspecified"
        except:
            outString = 'Unspecified'
    return outString

dfout['in_WaterSourceType'] = dfout.apply(lambda row: defineWaterSourceType(row['TX_BenUse']), axis=1)
dfout

Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType
0,2000,BRAZOS,2127781,Irrig ation Reuse,,Basin,Reuse
1,2000,BRAZOS-COLORADO,82624,Irrig ation Reuse,,Basin,Reuse
2,2000,CANADIAN,172241,Irrig ation Reuse,,Basin,Reuse
3,2000,COLORADO,1595971,Irrig ation Reuse,,Basin,Reuse
4,2000,COLORADO-LAVACA,24181,Irrig ation Reuse,,Basin,Reuse
...,...,...,...,...,...,...,...
250,2016,YOAKUM,8488,Power Surface Water,0,County,Surface Water
251,2016,YOUNG,18152,Power Surface Water,368,County,Surface Water
252,2016,ZAPATA,14349,Power Surface Water,0,County,Surface Water
253,2016,ZAVALA,12023,Power Surface Water,0,County,Surface Water


In [8]:
# TimeframeStart & TimeframeEnd

dfout['inTimeframeStart'] = '01/01/' + dfout['Year'].astype(str)
dfout['inTimeframeEnd'] = '12/31/' + dfout['Year'].astype(str)
dfout

Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType,inTimeframeStart,inTimeframeEnd
0,2000,BRAZOS,2127781,Irrig ation Reuse,,Basin,Reuse,01/01/2000,12/31/2000
1,2000,BRAZOS-COLORADO,82624,Irrig ation Reuse,,Basin,Reuse,01/01/2000,12/31/2000
2,2000,CANADIAN,172241,Irrig ation Reuse,,Basin,Reuse,01/01/2000,12/31/2000
3,2000,COLORADO,1595971,Irrig ation Reuse,,Basin,Reuse,01/01/2000,12/31/2000
4,2000,COLORADO-LAVACA,24181,Irrig ation Reuse,,Basin,Reuse,01/01/2000,12/31/2000
...,...,...,...,...,...,...,...,...,...
250,2016,YOAKUM,8488,Power Surface Water,0,County,Surface Water,01/01/2016,12/31/2016
251,2016,YOUNG,18152,Power Surface Water,368,County,Surface Water,01/01/2016,12/31/2016
252,2016,ZAPATA,14349,Power Surface Water,0,County,Surface Water,01/01/2016,12/31/2016
253,2016,ZAVALA,12023,Power Surface Water,0,County,Surface Water,01/01/2016,12/31/2016


In [9]:
# Fixing Benefical use

BenUseDict = {
"Irrigation Ground Water" : "Irrigation",
"Irrigation Reuse" : "Irrigation",
"Irrigation Surface Water" : "Irrigation",
"Livestock Ground Water" : "Livestock",
"Livestock Reuse" : "Livestock",
"Livestock Surface Water" : "Livestock",
"Mfg Ground Water" : "Manufacturing",
"Mfg Reuse" : "Manufacturing",
"Mfg Surface Water" : "Manufacturing",
"Mining Ground Water" : "Mining",
"Mining Reuse" : "Mining",
"Mining Surface Water" : "Mining",
"Municipal Ground Water" : "Municipal",
"Municipal Reuse" : "Municipal",
"Municipal Surface Water" : "Municipal",
"Power Ground Water" : "Power",
"Power Reuse" : "Power",
"Power Surface Water" : "Power",
"Irrig ation Reuse" : "Irrigation",
"Municipal Ground  Water" : "Municipal"}

def fixTX_BenUse(colrowValue):
    if colrowValue == '' or pd.isnull(colrowValue):
        outString = 'UnSpecified'
    else:
        String1 = colrowValue
        try:
            outString = BenUseDict[String1]
        except:
            outString = 'UnSpecified'
    return outString

dfout['TX_BenUse'] = dfout.apply(lambda row: fixTX_BenUse(row['TX_BenUse']), axis=1)
dfout

Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType,inTimeframeStart,inTimeframeEnd
0,2000,BRAZOS,2127781,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
1,2000,BRAZOS-COLORADO,82624,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
2,2000,CANADIAN,172241,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
3,2000,COLORADO,1595971,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
4,2000,COLORADO-LAVACA,24181,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
...,...,...,...,...,...,...,...,...,...
250,2016,YOAKUM,8488,Power,0,County,Surface Water,01/01/2016,12/31/2016
251,2016,YOUNG,18152,Power,368,County,Surface Water,01/01/2016,12/31/2016
252,2016,ZAPATA,14349,Power,0,County,Surface Water,01/01/2016,12/31/2016
253,2016,ZAVALA,12023,Power,0,County,Surface Water,01/01/2016,12/31/2016


In [10]:
# Dropping the 'STATE TOTAL' unit name.

dfout = dfout[dfout.in_ReportingUnitName != "STATE TOTAL"]
dfout

Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType,inTimeframeStart,inTimeframeEnd
0,2000,BRAZOS,2127781,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
1,2000,BRAZOS-COLORADO,82624,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
2,2000,CANADIAN,172241,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
3,2000,COLORADO,1595971,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
4,2000,COLORADO-LAVACA,24181,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000
...,...,...,...,...,...,...,...,...,...
249,2016,WOOD,44227,Power,0,County,Surface Water,01/01/2016,12/31/2016
250,2016,YOAKUM,8488,Power,0,County,Surface Water,01/01/2016,12/31/2016
251,2016,YOUNG,18152,Power,368,County,Surface Water,01/01/2016,12/31/2016
252,2016,ZAPATA,14349,Power,0,County,Surface Water,01/01/2016,12/31/2016


## WaDE Custom Elements (due to missing reporting unit and water source info)

In [11]:
# Creating WaDE Custom water source native ID for easy water source identification
# ----------------------------------------------------------------------------------------------------

# Create temp WaterSourceNativeID dataframe of unique water source.
def assignWaterSourceNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDETX_WS" + string1
    return outstring

dfWaterSourceNativeID = pd.DataFrame()
dfWaterSourceNativeID['in_WaterSourceType'] = dfout['in_WaterSourceType']
dfWaterSourceNativeID = dfWaterSourceNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfWaterSourceNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfWaterSourceNativeID['in_WaterSourceNativeID'] = dftemp.apply(lambda row: assignWaterSourceNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom water source native ID
def retrieveWaterSourceNativeID(A):
    if (A == '') or (pd.isnull(A)):
        outList = ''
    else:
        ml = dfWaterSourceNativeID.loc[(dfWaterSourceNativeID['in_WaterSourceType'] == A), 'in_WaterSourceNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceType']), axis=1)
dfout

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfout['in_WaterSourceNativeID'] = dfout.apply(lambda row: retrieveWaterSourceNativeID(row['in_WaterSourceType']), axis=1)


Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType,inTimeframeStart,inTimeframeEnd,in_WaterSourceNativeID
0,2000,BRAZOS,2127781,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1
1,2000,BRAZOS-COLORADO,82624,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1
2,2000,CANADIAN,172241,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1
3,2000,COLORADO,1595971,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1
4,2000,COLORADO-LAVACA,24181,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1
...,...,...,...,...,...,...,...,...,...,...
249,2016,WOOD,44227,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3
250,2016,YOAKUM,8488,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3
251,2016,YOUNG,18152,Power,368,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3
252,2016,ZAPATA,14349,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3


In [12]:
# Creating WaDE Custom reportingunit native ID for easy site identificaiion
# ----------------------------------------------------------------------------------------------------

# Create temp ReportingUnitNativeID dataframe of unique reporting units.
def assignReportingUnitNativeID(colrowValue):
    string1 = str(colrowValue)
    outstring = "WaDETX_RU" + string1
    return outstring

dfReportingUnitNativeID = pd.DataFrame()
dfReportingUnitNativeID['in_ReportingUnitName'] = dfout['in_ReportingUnitName']
dfReportingUnitNativeID['in_ReportingUnitType'] = dfout['in_ReportingUnitType']
dfReportingUnitNativeID = dfReportingUnitNativeID.drop_duplicates()

dftemp = pd.DataFrame(index=dfReportingUnitNativeID.index)
dftemp["Count"] = range(1, len(dftemp.index) + 1)
dfReportingUnitNativeID['in_ReportingUnitNativeID'] = dftemp.apply(lambda row: assignReportingUnitNativeID(row['Count']), axis=1)

# ----------------------------------------------------------------------------------------------------

# Retreive WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfReportingUnitNativeID.loc[(dfReportingUnitNativeID['in_ReportingUnitName'] == A) & 
                                         (dfReportingUnitNativeID['in_ReportingUnitType'] == B), 'in_ReportingUnitNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfout['in_ReportingUnitNativeID'] = dfout.apply(lambda row: retrieveReportingUnitNativeID( row['in_ReportingUnitName'], row['in_ReportingUnitType']), axis=1)
dfout

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfout['in_ReportingUnitNativeID'] = dfout.apply(lambda row: retrieveReportingUnitNativeID( row['in_ReportingUnitName'], row['in_ReportingUnitType']), axis=1)


Unnamed: 0,Year,in_ReportingUnitName,Population,TX_BenUse,in_Amount,in_ReportingUnitType,in_WaterSourceType,inTimeframeStart,inTimeframeEnd,in_WaterSourceNativeID,in_ReportingUnitNativeID
0,2000,BRAZOS,2127781,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1,WaDETX_RU1
1,2000,BRAZOS-COLORADO,82624,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1,WaDETX_RU2
2,2000,CANADIAN,172241,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1,WaDETX_RU3
3,2000,COLORADO,1595971,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1,WaDETX_RU4
4,2000,COLORADO-LAVACA,24181,Irrigation,,Basin,Reuse,01/01/2000,12/31/2000,WaDETX_WS1,WaDETX_RU5
...,...,...,...,...,...,...,...,...,...,...,...
249,2016,WOOD,44227,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3,WaDETX_RU273
250,2016,YOAKUM,8488,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3,WaDETX_RU274
251,2016,YOUNG,18152,Power,368,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3,WaDETX_RU275
252,2016,ZAPATA,14349,Power,0,County,Surface Water,01/01/2016,12/31/2016,WaDETX_WS3,WaDETX_RU276


# Shapefile Data

In [13]:
# Shapefile input
inputBasinShape = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Texas/AggregatedAmounts/RawInputData/shapefiles/TX_Basin.shp')
inputCountyShape = gpd.read_file('C:/Users/rjame/Documents/WSWC Documents/MappingStatesDataToWaDE2.0/Texas/AggregatedAmounts/RawInputData/shapefiles/TX_County.shp')

In [14]:
#Basin Shapefile
dfBasinShapetemp = pd.DataFrame(inputBasinShape)

columnsList = ['in_ReportingUnitName', 'in_ReportingUnitType', 'geometry']
dfBasinShape = pd.DataFrame(columns=columnsList)
dfBasinShape['in_ReportingUnitName'] = dfBasinShapetemp['Reportin_1']
dfBasinShape['in_ReportingUnitType'] = dfBasinShapetemp['Reportin_2']
dfBasinShape['geometry'] = dfBasinShapetemp['geometry']


# Retreive WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfout.loc[(dfout['in_ReportingUnitName'] == A) & 
                       (dfout['in_ReportingUnitType'] == B), 'in_ReportingUnitNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfBasinShape['in_ReportingUnitNativeID'] = dfBasinShape.apply(lambda row: retrieveReportingUnitNativeID( row['in_ReportingUnitName'], row['in_ReportingUnitType']), axis=1)
dfBasinShape


dfBasinShape = dfBasinShape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfBasinShape.head(3)

Unnamed: 0,in_ReportingUnitName,in_ReportingUnitType,geometry,in_ReportingUnitNativeID
0,BRAZOS,Basin,"MULTIPOLYGON (((-95.38128 28.87588, -95.38851 ...",WaDETX_RU1
1,BRAZOS-COLORADO,Basin,"MULTIPOLYGON (((-95.75817 28.73139, -95.75682 ...",WaDETX_RU2
2,CANADIAN,Basin,"POLYGON ((-100.00040 35.81694, -100.02271 35.8...",WaDETX_RU3


In [15]:
#CountyShapefile
dfCountyShapetemp = pd.DataFrame(inputCountyShape)

columnsList = ['in_ReportingUnitName', 'in_ReportingUnitType', 'geometry']
dfCountyShape = pd.DataFrame(columns=columnsList)
dfCountyShape['in_ReportingUnitName'] = dfCountyShapetemp['Reportin_3'].astype(str).clip()
dfCountyShape['in_ReportingUnitType'] = dfCountyShapetemp['Reportin_4'].astype(str).clip()
dfCountyShape['geometry'] = dfCountyShapetemp['geometry']

# Retreive WaDE Custom reportingunit native ID
def retrieveReportingUnitNativeID(A, B):
    if (A == '' and B == '') or (pd.isnull(A) and pd.isnull(B)):
        outList = ''
    else:
        ml = dfout.loc[(dfout['in_ReportingUnitName'] == A) & 
                       (dfout['in_ReportingUnitType'] == B), 'in_ReportingUnitNativeID']
        if not (ml.empty):  # check if the series is empty
            outList = ml.iloc[0]
        else:
            outList = ''
    return outList

dfCountyShape['in_ReportingUnitNativeID'] = dfCountyShape.apply(lambda row: retrieveReportingUnitNativeID( row['in_ReportingUnitName'], row['in_ReportingUnitType']), axis=1)
dfCountyShape

dfCountyShape = dfCountyShape.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)
dfCountyShape

Unnamed: 0,in_ReportingUnitName,in_ReportingUnitType,geometry,in_ReportingUnitNativeID
0,GOLIAD,County,"POLYGON ((-97.77853 28.66803, -97.41734 28.925...",WaDETX_RU111
1,GRIMES,County,"POLYGON ((-96.18831 30.59961, -96.18678 30.605...",WaDETX_RU116
2,HIDALGO,County,"POLYGON ((-98.58529 26.26027, -98.32067 26.783...",WaDETX_RU131
3,HOUSTON,County,"POLYGON ((-95.77535 31.12205, -95.77425 31.138...",WaDETX_RU136
4,IRION,County,"POLYGON ((-101.27189 31.29403, -101.26795 31.5...",WaDETX_RU141
...,...,...,...,...
249,BROWN,County,"POLYGON ((-99.20341 31.75822, -99.19587 32.079...",WaDETX_RU48
250,CLAY,County,"POLYGON ((-98.42358 33.83605, -98.42353 34.082...",WaDETX_RU62
251,FRANKLIN,County,"POLYGON ((-95.30872 32.99456, -95.30859 33.377...",WaDETX_RU103
252,HAYS,County,"POLYGON ((-98.29417 30.04680, -98.17298 30.356...",WaDETX_RU128


In [16]:
# Concatenate shp datafraes together.
frames = [dfBasinShape, dfCountyShape]
dfAllShape = pd.concat(frames).reset_index()
dfAllShape

Unnamed: 0,index,in_ReportingUnitName,in_ReportingUnitType,geometry,in_ReportingUnitNativeID
0,0,BRAZOS,Basin,"MULTIPOLYGON (((-95.38128 28.87588, -95.38851 ...",WaDETX_RU1
1,1,BRAZOS-COLORADO,Basin,"MULTIPOLYGON (((-95.75817 28.73139, -95.75682 ...",WaDETX_RU2
2,2,CANADIAN,Basin,"POLYGON ((-100.00040 35.81694, -100.02271 35.8...",WaDETX_RU3
3,3,COLORADO,Basin,"MULTIPOLYGON (((-95.98255 28.59958, -95.98390 ...",WaDETX_RU4
4,4,COLORADO-LAVACA,Basin,"MULTIPOLYGON (((-96.37258 28.38773, -96.37073 ...",WaDETX_RU5
...,...,...,...,...,...
272,249,BROWN,County,"POLYGON ((-99.20341 31.75822, -99.19587 32.079...",WaDETX_RU48
273,250,CLAY,County,"POLYGON ((-98.42358 33.83605, -98.42353 34.082...",WaDETX_RU62
274,251,FRANKLIN,County,"POLYGON ((-95.30872 32.99456, -95.30859 33.377...",WaDETX_RU103
275,252,HAYS,County,"POLYGON ((-98.29417 30.04680, -98.17298 30.356...",WaDETX_RU128


### Inspect Output Data & Export

In [18]:
# Export out to CSV.
dfout.to_csv('P_txAggMaster.csv', index=False) # The output.
dfAllShape.to_csv('P_TXGeometry.csv', index=False) # The output geometry.