# Working with CA Aggregated Data

Pre-processing input data for a smoother upload experience of the state data to the WaDE 2.0 database.
Using geopandas to read in the shapefiles (.shp), and converting to WKT for the ReportingUnit geometry.

#### Notes:
- Three ReportingUnitTypeCV: PA, HR, DAU

In [None]:
# Needed Libraries

# working with data
import os
import numpy as np
import pandas as pd
import geopandas as gpd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Cleanup
from datetime import datetime
pd.set_option('display.max_columns', 999)  # How to display all columns of a Pandas DataFrame in Jupyter Notebook

In [None]:
# Set the working directory to the raw-input folder on the shared drive, so the
# relative input and output paths used in the cells below resolve against it.
workingDir = "G:/Shared drives/WaDE Data/California/AggregatedAmounts/RawInputData"
os.chdir(workingDir)

## Input Data

### Hydrologic Region (HR_CODE)

In [None]:
# Read the hydrologic region (HR) water-balance CSV for every report year and
# stack them into one dataframe. One file per year, 2002 through 2016.
hrYears = range(2002, 2017)
hrPathTemplate = "HR_input/CA-DWR-WaterBalance-Level2-DP-1000-{}-HR.csv"

frames = []
for year in hrYears:
    dfYear = pd.read_csv(hrPathTemplate.format(year))
    print(year, len(dfYear))  # per-year row count, as a quick sanity check
    frames.append(dfYear)

# Concatenate HR inputs into one dataframe
dfHR = pd.concat(frames).reset_index(drop=True)
print(len(dfHR))
dfHR['Year'].unique()

### Planning Area (PA)

In [None]:
# Read the planning area (PA) water-balance CSV for every report year and
# stack them into one dataframe. One file per year, 2002 through 2016.
paYears = range(2002, 2017)
paPathTemplate = "PA_input/CA-DWR-WaterBalance-Level2-DP-1000-{}-PA.csv"

frames = []
for year in paYears:
    dfYear = pd.read_csv(paPathTemplate.format(year))
    print(year, len(dfYear))  # per-year row count, as a quick sanity check
    frames.append(dfYear)

# Concatenate PA inputs into one dataframe
dfPA = pd.concat(frames).reset_index(drop=True)
print(len(dfPA))
dfPA['Year'].unique()

### Detailed Analysis Units by County (DAU)

In [None]:
# Read the Detailed Analysis Units by County (DAUCO) water-balance CSV for
# every report year and stack them into one dataframe.
# One file per year, 2002 through 2016.
dauYears = range(2002, 2017)
dauPathTemplate = "DAU_input/CA-DWR-WaterBalance-Level2-DP-1000-{}-DAUCO.csv"

frames = []
for year in dauYears:
    dfYear = pd.read_csv(dauPathTemplate.format(year))
    print(year, len(dfYear))  # per-year row count, as a quick sanity check
    frames.append(dfYear)

# Concatenate DAU inputs into one dataframe
dfDAU = pd.concat(frames).reset_index(drop=True)
print(len(dfDAU))
dfDAU['Year'].unique()

## Clean Data
- We only want the Applied Water Use and Depletion values.

In [None]:
# HR
# Keep only the Applied Water Use and Depletion rows, renumber them, then
# order the result so related records sit next to each other.
hrWantedCategories = ['Applied Water Use', 'Depletion']
hrSortColumns = ['Year', 'CategoryC', 'HR', 'CategoryA', 'KAcreFt']
hrIsWanted = dfHR['CategoryC'].isin(hrWantedCategories)
dfHR_2 = dfHR[hrIsWanted].reset_index(drop=True).sort_values(by=hrSortColumns)
print(len(dfHR_2))
print(dfHR_2['CategoryC'].unique())
dfHR_2.head(1)

In [None]:
# PA
# Keep only the Applied Water Use and Depletion rows, renumber them, then
# order the result so related records sit next to each other.
paWantedCategories = ['Applied Water Use', 'Depletion']
paSortColumns = ['Year', 'CategoryC', 'PA', 'CategoryA', 'KAcreFt']
paIsWanted = dfPA['CategoryC'].isin(paWantedCategories)
dfPA_2 = dfPA[paIsWanted].reset_index(drop=True).sort_values(by=paSortColumns)
print(len(dfPA_2))
print(dfPA_2['CategoryC'].unique())
dfPA_2.head(1)

In [None]:
# DAU
# Keep only the Applied Water Use and Depletion rows, renumber them, then
# order the result so related records sit next to each other.
dauWantedCategories = ['Applied Water Use', 'Depletion']
dauSortColumns = ['Year', 'CategoryC', 'DAU', 'CategoryA', 'KAcreFt']
dauIsWanted = dfDAU['CategoryC'].isin(dauWantedCategories)
dfDAU_2 = dfDAU[dauIsWanted].reset_index(drop=True).sort_values(by=dauSortColumns)
print(len(dfDAU_2))
print(dfDAU_2['CategoryC'].unique())
dfDAU_2.head(1)

## Output Dataframes

In [None]:
# HR
# Source columns that feed more than one output column.
hrVariable = dfHR_2['CategoryC']
hrBenUse = dfHR_2['CategoryA']
hrYearText = dfHR_2['Year'].astype(str)

# Build the output frame in one step, on the same row index as dfHR_2.
dfHR_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': hrVariable,
        'in_VariableSpecificCV': hrVariable + "_Annual_" + hrBenUse + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfHR_2['HR'],
        'in_ReportingUnitNativeID': "",
        'in_ReportingUnitTypeCV': "Hydrologic Region",
        # AggregatedAmounts Info
        'in_Amount': dfHR_2['KAcreFt'],
        'in_BenUse': hrBenUse,
        'in_ReportYearCV': dfHR_2['Year'].astype(int),
        'in_TimeframeStart': hrYearText + "/01/01",
        'in_TimeframeEnd': hrYearText + "/12/31",
    },
    index=dfHR_2.index,
)

print(len(dfHR_3))
dfHR_3.head(1)

In [None]:
# For creating ReportingUnitNativeID for HR data

# Unique (HR_CODE, HR_NAME) pairs, taken from the DAU rows.
dftempHR_CODE = dfDAU_2[['HR_CODE', 'HR_NAME']].drop_duplicates().reset_index(drop=True)

# Lookup table: hydrologic region name -> hydrologic region code.
HR_Code_dict = dftempHR_CODE.set_index('HR_NAME')['HR_CODE'].to_dict()

def retrieveReportingUnitNativeID(colrowValue):
    """Return the HR code stored in HR_Code_dict for a hydrologic region name.

    Parameters
    ----------
    colrowValue
        A reporting unit name (one cell value).

    Returns
    -------
    The value stored in HR_Code_dict for that name; '' when the input is
    empty or null; the input itself when the name is not in HR_Code_dict.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    try:
        return HR_Code_dict[colrowValue]
    except (KeyError, TypeError):
        # Unknown (or unhashable) name: fall back to the name itself rather
        # than leaving the native ID blank.
        return colrowValue

# Fill the HR native IDs by looking up each reporting unit name. The lookup
# only needs the name column, so apply it element-wise to that column instead
# of building a row object for every record.
dfHR_3['in_ReportingUnitNativeID'] = dfHR_3['in_ReportingUnitName'].apply(retrieveReportingUnitNativeID)
dfHR_3['in_ReportingUnitNativeID'].unique()

In [None]:
# PA
# Source columns that feed more than one output column.
paVariable = dfPA_2['CategoryC']
paBenUse = dfPA_2['CategoryA']
paYearText = dfPA_2['Year'].astype(str)

# Build the output frame in one step, on the same row index as dfPA_2.
dfPA_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': paVariable,
        'in_VariableSpecificCV': paVariable + "_Annual_" + paBenUse + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfPA_2['PA'],
        'in_ReportingUnitNativeID': dfPA_2['PA'],  # setting native id = name in this unique situation
        'in_ReportingUnitTypeCV': "Planning Area",
        # AggregatedAmounts Info
        'in_Amount': dfPA_2['KAcreFt'],
        'in_BenUse': paBenUse,
        'in_ReportYearCV': dfPA_2['Year'].astype(int),
        'in_TimeframeStart': paYearText + "/01/01",
        'in_TimeframeEnd': paYearText + "/12/31",
    },
    index=dfPA_2.index,
)

print(len(dfPA_3))
dfPA_3.head(1)

In [None]:
# DAU
# Source columns that feed more than one output column.
dauVariable = dfDAU_2['CategoryC']
dauBenUse = dfDAU_2['CategoryA']
dauYearText = dfDAU_2['Year'].astype(str)

# Build the output frame in one step, on the same row index as dfDAU_2.
dfDAU_3 = pd.DataFrame(
    {
        # Variable Info
        'VariableCV': dauVariable,
        'in_VariableSpecificCV': dauVariable + "_Annual_" + dauBenUse + "_Surface Ground Water",
        # ReportingUnits Info
        'in_ReportingUnitName': dfDAU_2['DAU_NAME'],
        'in_ReportingUnitNativeID': dfDAU_2['DAU'],
        'in_ReportingUnitTypeCV': "Detailed Analysis Units by County",
        # AggregatedAmounts Info
        'in_Amount': dfDAU_2['KAcreFt'],
        'in_BenUse': dauBenUse,
        'in_ReportYearCV': dfDAU_2['Year'].astype(int),
        'in_TimeframeStart': dauYearText + "/01/01",
        'in_TimeframeEnd': dauYearText + "/12/31",
    },
    index=dfDAU_2.index,
)

print(len(dfDAU_3))
dfDAU_3.head(1)

In [None]:
# Stack the HR, PA and DAU output dataframes into a single output dataframe,
# renumbering the rows from 0.
frames = [dfHR_3, dfPA_3, dfDAU_3]
dfout = pd.concat(frames, ignore_index=True)
print(len(dfout))
dfout['in_ReportingUnitTypeCV'].unique()

In [None]:
# Convert the timeframe strings (built upstream as "<Year>/01/01" and
# "<Year>/12/31") to pandas datetimes. The format is given explicitly rather
# than inferred, and any value that does not match becomes NaT.
for dateColumn in ['in_TimeframeStart', 'in_TimeframeEnd']:
    dfout[dateColumn] = pd.to_datetime(dfout[dateColumn], format='%Y/%m/%d', errors='coerce')

dfout.head()

## WaDE Custom Elements (due to missing info)

# Shapefile Data

In [None]:
# Shapefile input
def _readShapefileAsWGS84(shpPath):
    """Read a shapefile and return it as a GeoDataFrame in EPSG:4326.

    `crs` is not a parameter of geopandas.read_file; passing it there neither
    sets nor reprojects the coordinate reference system, so that is done
    explicitly here.
    """
    gdf = gpd.read_file(shpPath)
    if gdf.crs is None:
        # NOTE(review): no CRS stored with the file, so the coordinates are
        # labelled as EPSG:4326 without being transformed - confirm that the
        # source data really is lon/lat.
        return gdf.set_crs("EPSG:4326")
    return gdf.to_crs("EPSG:4326")


HydrologicRegionsShape = _readShapefileAsWGS84('Hydrologic_Regions-shp/Hydrologic_Regions.shp')
WaterPlanAreaShape = _readShapefileAsWGS84('Water_Plan_Planning_Areas-shp/Water_Plan_Planning_Areas.shp')
DAUCOShape = _readShapefileAsWGS84('DAUCO-shp/WaDECADAU.shp')

In [None]:
#check shp input Hydrologic Region
# Wrap the hydrologic region shapefile data in a pandas DataFrame; its
# HR_NAME and geometry columns are read further down in this cell.
dfHRshapetemp = pd.DataFrame(HydrologicRegionsShape)

# Hydrologic region name -> region ID (kept as text).
HydrologicRegionIDdict = {
    "North Coast": "1",
    "San Francisco Bay": "2",
    "Central Coast": "3",
    "South Coast": "4",
    "Sacramento River": "5",
    "San Joaquin River": "6",
    "Tulare Lake": "7",
    "North Lahontan": "8",
    "South Lahontan": "9",
    "Colorado River": "10",
}


def retrieveHRID(colrowValue):
    """Return the region ID for a hydrologic region name.

    Parameters
    ----------
    colrowValue
        A hydrologic region name (one cell value). Leading and trailing
        whitespace is ignored.

    Returns
    -------
    str
        The ID from HydrologicRegionIDdict, or '' when the input is empty,
        null, or not a known region name.
    """
    if colrowValue == '' or pd.isnull(colrowValue):
        return ''
    # str() keeps a non-text cell value from raising; it simply will not match.
    return HydrologicRegionIDdict.get(str(colrowValue).strip(), '')

columnsList = ['RU_ID', 'geometry']

# Translate each region name to its ID, pair it with that row's geometry,
# and drop rows that are exact repeats.
hrRegionIDs = dfHRshapetemp.apply(lambda row: retrieveHRID(row['HR_NAME']), axis=1)
dfHRshape = pd.DataFrame(
    {'RU_ID': hrRegionIDs, 'geometry': dfHRshapetemp['geometry']},
    columns=columnsList,
)
dfHRshape = dfHRshape.drop_duplicates()
dfHRshape

In [None]:
#check shp input Water Plan Area
dfWPAshapetemp = pd.DataFrame(WaterPlanAreaShape)

columnsList = ['RU_ID', 'geometry']

# Keep the planning area number (as RU_ID) and geometry, dropping rows that
# are exact repeats.
dfWPAshape = (
    dfWPAshapetemp[['PA_NO', 'geometry']]
    .rename(columns={'PA_NO': 'RU_ID'})
    .drop_duplicates()
)
dfWPAshape.head(3)

In [None]:
#check shp input DAUCO
dfDAUCOshapetemp = pd.DataFrame(DAUCOShape)

columnsList = ['RU_ID', 'geometry']

# Keep the RU_ID and geometry columns, dropping rows that are exact repeats.
dfDAUCOshape = dfDAUCOshapetemp[columnsList].drop_duplicates()
dfDAUCOshape.head(3)

In [None]:
# Stack the HR, planning area and DAUCO geometry tables into one dataframe,
# renumbering the rows from 0.
frames = [dfHRshape, dfWPAshape, dfDAUCOshape]
dfShape = pd.concat(frames, ignore_index=True)
dfShape

### Inspect Output Data & Export

In [None]:
# Column names, dtypes and non-null counts of the aggregated-amounts output.
dfout.info()

In [None]:
# Column names, dtypes and non-null counts of the geometry output.
dfShape.info()

In [None]:
# Export out to CSV (written relative to the current working directory,
# without the row index).
csvOutputs = (
    (dfout, 'P_caAggMaster.csv'),  # The output.
    (dfShape, 'P_caGeometry.csv'),  # The output geometry.
)
for outFrame, outFileName in csvOutputs:
    outFrame.to_csv(outFileName, index=False)

In [None]:
# List the distinct in_VariableSpecificCV values present in the output.
dfout['in_VariableSpecificCV'].unique()

In [None]:
# # Creating WaDE Custom reporting unit native ID for easy water site identification
# # create by unique ReportingUnitName & ReportingUnitTypeCV
# # only need for PA areas.
# # ----------------------------------------------------------------------------------------------------

# # Create temp ReportingUnitNativeID dataframe of unique reporting unit native ID areas.
# def assignReportingUnitNativeID(colrowValue):
#     string1 = str(colrowValue)
#     outstring = "WaDECA_RU" + string1
#     return outstring

# dfReportingUnitNativeID = pd.DataFrame()
# dfReportingUnitNativeID['in_ReportingUnitName'] = dfout['in_ReportingUnitName']
# dfReportingUnitNativeID['in_ReportingUnitTypeCV'] = dfout['in_ReportingUnitTypeCV']
# dfReportingUnitNativeID = dfReportingUnitNativeID.drop_duplicates()

# dftemp = pd.DataFrame(index=dfReportingUnitNativeID.index)
# dftemp["Count"] = range(1, len(dftemp.index) + 1)
# dfReportingUnitNativeID['in_ReportingUnitNativeID'] = dftemp.apply(lambda row: assignReportingUnitNativeID(row['Count']), axis=1)

# # ----------------------------------------------------------------------------------------------------

# # Retrieve WaDE Custom reporting unit native ID areas.
# def retrieveReportingUnitNativeID(A, B, C):
#     # check if A is empty or null
#     if A == "" or pd.isnull(A):
#         ml = dfReportingUnitNativeID.loc[(dfReportingUnitNativeID['in_ReportingUnitName'] == B) & 
#                                          (dfReportingUnitNativeID['in_ReportingUnitTypeCV'] == C), 'in_ReportingUnitNativeID']
#         if not (ml.empty):  # check if the series is empty
#             outString = ml.iloc[0]
#         else:
#             outString = A
#     else:
#         outString = A

#     return outString

# dfout['in_ReportingUnitNativeID'] = dfout.apply(lambda row: retrieveReportingUnitNativeID(row['in_ReportingUnitNativeID'],
#                                                                                           row['in_ReportingUnitName'],
#                                                                                           row['in_ReportingUnitTypeCV']), axis=1)
# print(dfout['in_ReportingUnitNativeID'].unique())