### Annual Sales Tax Revenue Analysis

#### [Data Source](https://tax.utah.gov/econstats/sales)
- wait for full year to be released
- download the [historical annual taxable sales 1998 - Present](https://tax.utah.gov/esu/sales-year/annual-sales-historical.xlsx)

#### Notes:
- after updating the data in AGOL, set AAAA to "All Sectors" in the fields/domains tab
- if changing any categories, ensure the category names match in both the sector and supersector lookup dictionaries
- city boundaries may need to be updated
- ensure city names in the Excel sheet match those in the cities shapefile
- city shapefile was simplified to reduce file size


In [2]:
import arcpy
from arcpy import env
import os
import numpy as np
from arcgis import GIS
from arcgis.features import GeoAccessor
from arcgis.features import GeoSeriesAccessor
import pandas as pd
import glob
import numpy as np

# arcpy geoprocessing environment: overwrite existing outputs and set the
# parallel processing factor to 90%
env.overwriteOutput = True
env.parallelProcessingFactor = "90%"

# display every column when a dataframe is rendered
pd.set_option("display.max_columns", None)

# pd.DataFrame.spatial.from_featureclass(???)  
# df.spatial.to_featureclass(location=???,sanitize_columns=False)  

# gsa = arcgis.features.GeoSeriesAccessor(df['SHAPE'])  
# df['AREA'] = gsa.area  # KNOW YOUR UNITS

In [3]:
# Output workspace as [folder, file geodatabase name]; `gdb` is the joined path
outputs = ['.\\Outputs', 'Utah.gdb']
gdb = os.path.join(*outputs)

# create the output folder on first run
if not os.path.exists('Outputs'):
    os.makedirs('Outputs')

# create the file geodatabase inside the output folder if it is not there yet
if not arcpy.Exists(gdb):
    arcpy.CreateFileGDB_management(*outputs)


In [4]:
# # dictionary for renaming tax revenue sectors
# global sector_lu
# sector_lu= {
#               'ACCOMMODATION (721)':'L_ACCOMMODATION',
#               'ADMIN. & SUPPORT & WASTE MANAG. & REMED. SERVICES (56)':'S_ADMIN_SUPPORT',
#               'AGRICULTURE, FORESTRY, FISHING & HUNTING (11)':'A_AG_WILDLIFE',
#               'ARTS, ENTERTAINMENT AND RECREATION (71)':'L_CULTURAL_REC', 
#               'CONSTRUCTION (23)':'C_CONSTRUCTION',
#               'EDUCATIONAL SERVICES (61)':'E_EDUCATION', 
#               'FINANCE & INSURANCE (52)':'S_FINANCIAL',
#               'FOOD SERVICES & DRINKING PLACES (722)':'L_RSTRNT_BAR',
#               'HEALTH CARE & SOCIAL ASSISTANCE (62)':'H_HEALTH_CARE', 
#               'INFORMATION (51)':'S_IT_DATA',
#               'MANAGEMENT OF COMPANIES & ENTERPRISES (55)':'S_CORPORATE_MGMT', 
#               'MANUFACTURING (31-33)':'M_MANUFACTURING',
#               'MINING, QUARRYING, & OIL & GAS EXTRACTION (21)':'E_EXTRACTION',
#               'OTHER SERVICES-EXCEPT PUBLIC ADMINISTRATION (81)':'O_OTHER',
#               'PRIOR-PERIOD PAYMENTS & REFUNDS':'X_ADJUSTMENTS', 
#               'PRIVATE MOTOR VEHICLE SALES':'R_AUTO_PRIVATE',
#               'PROFESSIONAL, SCIENTIFIC & TECHNICAL SERVICES (54)':'S_PROF_TECH_SRV',
#               'PUBLIC ADMINISTRATION (92)':'G_GOVERNMENT', 
#               'REAL ESTATE, RENTAL & LEASING (53)':'S_RENTAL_LEASING',
#               'RETAIL-BUILD. MATERIAL, GARDEN EQUIP. & SUPPLIES DEALERS (444)':'R_BUILDING_SUPPLY',
#               'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448)':'R_CLOTHING',
#               'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448,458)':'R_CLOTHING',
#               'RETAIL-ELECTRONICS & APPLIANCE STORES (443)':'R_ELECTRONICS',
#               'RETAIL-ELECTRONICS & APPLIANCE STORES (443,4492)':'R_ELECTRONICS',
#               'RETAIL-FOOD & BEVERAGE STORES (445)':'R_GROCERY_BEV',
#               'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442)':'R_FURNITURE',
#               'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442,4491)':'R_FURNITURE',
#               'RETAIL-GASOLINE STATIONS (447)':'R_GAS_STATIONS',
#               'RETAIL-GASOLINE STATIONS (447,457)': 'R_GAS_STATIONS',
#               'RETAIL-GENERAL MERCHANDISE STORES (452)':'R_GENERAL_RETAIL',
#               'RETAIL-GENERAL MERCHANDISE STORES (452,455)':'R_GENERAL_RETAIL',
#               'RETAIL-HEALTH & PERSONAL CARE STORES (446)':'R_HEALTH_RETAIL',
#               'RETAIL-HEALTH & PERSONAL CARE STORES (446,456)':'R_HEALTH_RETAIL',
#               'RETAIL-MISCELLANEOUS STORE RETAILERS (453)':'R_OTHER_RETAIL',
#               'RETAIL-MISCELLANEOUS STORE RETAILERS (453,4593-4599)':'R_OTHER_RETAIL',
#               'RETAIL-MOTOR VEHICLE & PARTS DEALERS (441)':'R_AUTO_RETAIL',
#               'RETAIL-NONSTORE RETAILERS (454)':'R_NONSTORE_RETAIL',
#               'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451)':'R_SPORT_HOBBY',
#               'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451,4591,4592)':'R_SPORT_HOBBY',
#               'SPECIAL EVENT SALES':'L_SPECIAL_EVENT', 
#               'TRANSPORTATION & WAREHOUSING (48-49)':'W_DISTRIBUTION',
#               'UNKNOWN/NONCLASSIFIABLE':'X_UNKNOWN', 
#               'UTILITIES (22)':'U_UTILITIES',
#               'WHOLESALE TRADE-DURABLE GOODS (423)':'W_WHLSALE_DURABLE',
#               'WHOLESALE TRADE-ELECTRONIC MARKETS (425)':'S_WHLSLE_ETRADE',
#               'WHOLESALE TRADE-AGENTS & BROKERS (425)':'S_WHLSLE_ETRADE',
#               'WHOLESALE TRADE-NONDURABLE GOODS (424)':'W_WHLSLE_NDURABLE'            
# }

In [5]:
# # dictionary for renaming tax revenue sectors
# global sector_lu
# sector_lu= {
#               'ACCOMMODATION (721)':'ACCOMMODATION',
#               'ADMIN. & SUPPORT & WASTE MANAG. & REMED. SERVICES (56)':'ADMIN_SUPPORT',
#               'AGRICULTURE, FORESTRY, FISHING & HUNTING (11)':'AGRI_WILDLIFE',
#               'ARTS, ENTERTAINMENT AND RECREATION (71)':'CULTURAL_REC', 
#               'CONSTRUCTION (23)':'CONSTRUCTION',
#               'EDUCATIONAL SERVICES (61)':'EDUCATION', 
#               'FINANCE & INSURANCE (52)':'FINANCIAL',
#               'FOOD SERVICES & DRINKING PLACES (722)':'RESTAURANT_BAR',
#               'HEALTH CARE & SOCIAL ASSISTANCE (62)':'HEALTHCARE', 
#               'INFORMATION (51)':'IT_DATA',
#               'MANAGEMENT OF COMPANIES & ENTERPRISES (55)':'CORPORATE_MGMT', 
#               'MANUFACTURING (31-33)':'MANUFACTURING',
#               'MINING, QUARRYING, & OIL & GAS EXTRACTION (21)':'EXTRACTION',
#               'OTHER SERVICES-EXCEPT PUBLIC ADMINISTRATION (81)':'OTHER_SERVICES',
#               'PRIOR-PERIOD PAYMENTS & REFUNDS':'ADJUSTMENTS', 
#               'PRIVATE MOTOR VEHICLE SALES':'AUTO_PRIVATE',
#               'PROFESSIONAL, SCIENTIFIC & TECHNICAL SERVICES (54)':'PROF_TECH_SERVICES',
#               'PUBLIC ADMINISTRATION (92)':'GOVERNMENT', 
#               'REAL ESTATE, RENTAL & LEASING (53)':'RENTAL_LEASING',
#               'RETAIL-BUILD. MATERIAL, GARDEN EQUIP. & SUPPLIES DEALERS (444)':'BUILDING_SUPPLY',
#               'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448)':'CLOTHING',
#               'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448,458)':'CLOTHING',
#               'RETAIL-ELECTRONICS & APPLIANCE STORES (443)':'ELECTRONICS',
#               'RETAIL-ELECTRONICS & APPLIANCE STORES (443,4492)':'ELECTRONICS',
#               'RETAIL-FOOD & BEVERAGE STORES (445)':'GROCERY_BEVERAGE',
#               'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442)':'FURNITURE',
#               'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442,4491)':'FURNITURE',
#               'RETAIL-GASOLINE STATIONS (447)':'GAS_STATIONS',
#               'RETAIL-GASOLINE STATIONS (447,457)': 'GAS_STATIONS',
#               'RETAIL-GENERAL MERCHANDISE STORES (452)':'GENERAL_RETAIL',
#               'RETAIL-GENERAL MERCHANDISE STORES (452,455)':'GENERAL_RETAIL',
#               'RETAIL-HEALTH & PERSONAL CARE STORES (446)':'HEALTH_RETAIL',
#               'RETAIL-HEALTH & PERSONAL CARE STORES (446,456)':'HEALTH_RETAIL',
#               'RETAIL-MISCELLANEOUS STORE RETAILERS (453)':'OTHER_RETAIL',
#               'RETAIL-MISCELLANEOUS STORE RETAILERS (453,4593-4599)':'OTHER_RETAIL',
#               'RETAIL-MOTOR VEHICLE & PARTS DEALERS (441)':'AUTO_RETAIL',
#               'RETAIL-NONSTORE RETAILERS (454)':'NONSTORE_RETAIL',
#               'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451)':'SPORT_HOBBY',
#               'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451,4591,4592)':'SPORT_HOBBY',
#               'SPECIAL EVENT SALES':'SPECIAL_EVENT', 
#               'TRANSPORTATION & WAREHOUSING (48-49)':'DISTRIBUTION',
#               'UNKNOWN/NONCLASSIFIABLE':'UNKNOWN', 
#               'UTILITIES (22)':'UTILITIES',
#               'WHOLESALE TRADE-DURABLE GOODS (423)':'WHOLESALE_DURABLE',
#               'WHOLESALE TRADE-ELECTRONIC MARKETS (425)':'WHOLESALE_ETRADE',
#               'WHOLESALE TRADE-AGENTS & BROKERS (425)':'WHOLESALE_ETRADE',
#               'WHOLESALE TRADE-NONDURABLE GOODS (424)':'WHOLESALE_NONDURABLE'            
# }

In [6]:
# Lookup used to rename the tax report's sector labels (keys) to the short
# sector names used in the output (values).
# Several keys intentionally map to the same short name: the same sector shows
# up under more than one source label (e.g. with a different NAICS code list).
# Every value must also be a key of the supersector lookup, spelled identically
# (the two 'Wholesale ETrade' entries previously differed in case, which split
# the sector in two and left one half without a supersector).
sector_lu = {
    'ACCOMMODATION (721)': 'Accommodation',
    'ADMIN. & SUPPORT & WASTE MANAG. & REMED. SERVICES (56)': 'Admin Support',
    'AGRICULTURE, FORESTRY, FISHING & HUNTING (11)': 'Agri Wildlife',
    'ARTS, ENTERTAINMENT AND RECREATION (71)': 'Cultural Rec',
    'CONSTRUCTION (23)': 'Construction',
    'EDUCATIONAL SERVICES (61)': 'Education',
    'FINANCE & INSURANCE (52)': 'Financial',
    'FOOD SERVICES & DRINKING PLACES (722)': 'Restaurant Bar',
    'HEALTH CARE & SOCIAL ASSISTANCE (62)': 'Healthcare',
    'INFORMATION (51)': 'IT Data',
    'MANAGEMENT OF COMPANIES & ENTERPRISES (55)': 'Corporate Mgmt',
    'MANUFACTURING (31-33)': 'Manufacturing',
    'MINING, QUARRYING, & OIL & GAS EXTRACTION (21)': 'Extraction',
    'OTHER SERVICES-EXCEPT PUBLIC ADMINISTRATION (81)': 'Other Services',
    'PRIOR-PERIOD PAYMENTS & REFUNDS': 'Adjustments',
    'PRIVATE MOTOR VEHICLE SALES': 'Auto Private',
    'PROFESSIONAL, SCIENTIFIC & TECHNICAL SERVICES (54)': 'Prof Tech Services',
    'PUBLIC ADMINISTRATION (92)': 'Government',
    'REAL ESTATE, RENTAL & LEASING (53)': 'Rental Leasing',
    'RETAIL-BUILD. MATERIAL, GARDEN EQUIP. & SUPPLIES DEALERS (444)': 'Building Supply',
    'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448)': 'Clothing',
    'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448,458)': 'Clothing',
    'RETAIL-ELECTRONICS & APPLIANCE STORES (443)': 'Electronics',
    'RETAIL-ELECTRONICS & APPLIANCE STORES (443,4492)': 'Electronics',
    'RETAIL-FOOD & BEVERAGE STORES (445)': 'Grocery Beverage',
    'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442)': 'Furniture',
    'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442,4491)': 'Furniture',
    'RETAIL-GASOLINE STATIONS (447)': 'Gas Stations',
    'RETAIL-GASOLINE STATIONS (447,457)': 'Gas Stations',
    'RETAIL-GENERAL MERCHANDISE STORES (452)': 'General Retail',
    'RETAIL-GENERAL MERCHANDISE STORES (452,455)': 'General Retail',
    'RETAIL-HEALTH & PERSONAL CARE STORES (446)': 'Health Retail',
    'RETAIL-HEALTH & PERSONAL CARE STORES (446,456)': 'Health Retail',
    'RETAIL-MISCELLANEOUS STORE RETAILERS (453)': 'Other Retail',
    'RETAIL-MISCELLANEOUS STORE RETAILERS (453,4593-4599)': 'Other Retail',
    'RETAIL-MOTOR VEHICLE & PARTS DEALERS (441)': 'Auto Retail',
    'RETAIL-NONSTORE RETAILERS (454)': 'Nonstore Retail',
    'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451)': 'Sport Hobby',
    'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451,4591,4592)': 'Sport Hobby',
    'SPECIAL EVENT SALES': 'Special Event',
    'TRANSPORTATION & WAREHOUSING (48-49)': 'Distribution',
    'UNKNOWN/NONCLASSIFIABLE': 'Unknown',
    'UTILITIES (22)': 'Utilities',
    'WHOLESALE TRADE-DURABLE GOODS (423)': 'Wholesale Durable',
    'WHOLESALE TRADE-ELECTRONIC MARKETS (425)': 'Wholesale ETrade',
    'WHOLESALE TRADE-AGENTS & BROKERS (425)': 'Wholesale ETrade',
    'WHOLESALE TRADE-NONDURABLE GOODS (424)': 'Wholesale Nondurable'
}

In [7]:
# # dictionary for renaming tax revenue sectors
# global super_sector_lu
# super_sector_lu= {
#                 'ACCOMMODATION':	'LEISURE & HOSPITALITY',
#                 'ADMIN_SUPPORT':	'PROFESSIONAL SERVICES',
#                 'AGRI_WILDLIFE':	'OTHER',
#                 'CULTURAL_REC':	'LEISURE & HOSPITALITY', 
#                 'CONSTRUCTION':	'OTHER',
#                 'EDUCATION':	'GOVT & EDU', 
#                 'FINANCIAL':	'PROFESSIONAL SERVICES',
#                 'RESTAURANT_BAR':	'LEISURE & HOSPITALITY',
#                 'HEALTHCARE':	'GOVT & EDU', 
#                 'IT_DATA':	'PROFESSIONAL SERVICES',
#                 'CORPORATE_MGMT': 	'PROFESSIONAL SERVICES', 
#                 'MANUFACTURING':	'MANU, WHOLESALE, & DIST',
#                 'EXTRACTION':	'OTHER',
#                 'OTHER_SERVICES':	'PROFESSIONAL SERVICES',
#                 'ADJUSTMENTS': 	'EXCLUDE', 
#                 'AUTO_PRIVATE':	'RETAIL',
#                 'PROF_TECH_SERVICES':	'PROFESSIONAL SERVICES',
#                 'GOVERNMENT': 	'GOVT & EDU', 
#                 'RENTAL_LEASING':'PROFESSIONAL SERVICES',
#                 'BUILDING_SUPPLY':	'RETAIL',
#                 'CLOTHING':	'RETAIL',
#                 'ELECTRONICS':	'RETAIL',
#                 'GROCERY_BEVERAGE':	'RETAIL',
#                 'FURNITURE':	'RETAIL',
#                 'GAS_STATIONS':'RETAIL',
#                 'GENERAL_RETAIL':	'RETAIL',
#                 'HEALTH_RETAIL':	'RETAIL',
#                 'OTHER_RETAIL':	'RETAIL',
#                 'AUTO_RETAIL':	'RETAIL',
#                 'NONSTORE_RETAIL':	'RETAIL',
#                 'SPORT_HOBBY':	'RETAIL',
#                 'SPECIAL_EVENT': 'LEISURE & HOSPITALITY', 
#                 'DISTRIBUTION':	'MANU, WHOLESALE, & DIST',
#                 'UNKNOWN': 	'EXCLUDE', 
#                 'UTILITIES':'OTHER',
#                 'WHOLESALE_DURABLE':'MANU, WHOLESALE, & DIST',
#                 'WHOLESALE_ETRADE':	'PROFESSIONAL SERVICES',
#                 'WHOLESALE_ETRADE':	'PROFESSIONAL SERVICES',
#                 'WHOLESALE_NONDURABLE':	'MANU, WHOLESALE, & DIST' 
           
# }

In [8]:
# Lookup that groups each short sector name into its supersector.
# Written as (sector, supersector) pairs; sectors mapped to 'Exclude' are the
# ones flagged for removal from the output.
super_sector_lu = dict([
    ('Accommodation', 'Leisure & Hospitality'),
    ('Admin Support', 'Professional Services'),
    ('Agri Wildlife', 'Other'),
    ('Cultural Rec', 'Leisure & Hospitality'),
    ('Construction', 'Other'),
    ('Education', 'Govt & Edu'),
    ('Financial', 'Professional Services'),
    ('Restaurant Bar', 'Leisure & Hospitality'),
    ('Healthcare', 'Govt & Edu'),
    ('IT Data', 'Professional Services'),
    ('Corporate Mgmt', 'Professional Services'),
    ('Manufacturing', 'Manu, Wholesale, & Dist'),
    ('Extraction', 'Other'),
    ('Other Services', 'Professional Services'),
    ('Adjustments', 'Exclude'),
    ('Auto Private', 'Retail'),
    ('Prof Tech Services', 'Professional Services'),
    ('Government', 'Govt & Edu'),
    ('Rental Leasing', 'Professional Services'),
    ('Building Supply', 'Retail'),
    ('Clothing', 'Retail'),
    ('Electronics', 'Retail'),
    ('Grocery Beverage', 'Retail'),
    ('Furniture', 'Retail'),
    ('Gas Stations', 'Retail'),
    ('General Retail', 'Retail'),
    ('Health Retail', 'Retail'),
    ('Other Retail', 'Retail'),
    ('Auto Retail', 'Retail'),
    ('Nonstore Retail', 'Retail'),
    ('Sport Hobby', 'Retail'),
    ('Special Event', 'Leisure & Hospitality'),
    ('Distribution', 'Manu, Wholesale, & Dist'),
    ('Unknown', 'Exclude'),
    ('Utilities', 'Other'),
    ('Wholesale Durable', 'Manu, Wholesale, & Dist'),
    ('Wholesale ETrade', 'Professional Services'),
    ('Wholesale Nondurable', 'Manu, Wholesale, & Dist'),
])

In [9]:
# fill NA values in Spatially enabled dataframes (ignores SHAPE column)
def fill_na_sedf(df_with_shape_column, fill_value=0):
    """Return a copy of the dataframe with NA values filled everywhere except 'SHAPE'.

    Parameters
    ----------
    df_with_shape_column : pandas.DataFrame
        Dataframe that contains a 'SHAPE' column. It is not modified.
    fill_value : scalar, default 0
        Value passed to ``DataFrame.fillna`` for the non-SHAPE columns.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as the input; 'SHAPE' is left untouched and placed
        as the last column.

    Raises
    ------
    ValueError
        If the dataframe has no 'SHAPE' column.
    """
    if 'SHAPE' not in df_with_shape_column.columns:
        raise ValueError("Dataframe does not include 'SHAPE' column")

    shape_column = df_with_shape_column['SHAPE']
    filled = df_with_shape_column.drop(columns='SHAPE').fillna(fill_value)

    # Re-attach SHAPE by direct assignment instead of an index-on-index merge:
    # a merge multiplies rows whenever the index contains duplicate labels.
    filled['SHAPE'] = shape_column
    return filled

In [10]:
def get_cities_from_report(_xlsx):
    """Return the non-missing values of the 'City' column of a report's 'Table 9' sheet.

    The sheet is read with ``header=5``. Values are returned in sheet order and
    are not de-duplicated.
    """
    table = pd.read_excel(_xlsx, sheet_name='Table 9', header=5)
    cities = []
    for name in table['City'].to_list():
        # NaN is the only value that is not equal to itself
        if name != name:
            continue
        cities.append(name)
    return cities

In [11]:
# Gather the annual taxable sales reports from the Inputs folder, skipping any
# path containing '~' (in case the spreadsheet is open)
# NOTE(review): this pattern does not match the historical workbook used further
# down, so the city list comes from the per-year reports only - confirm intended.
reports = [
    path
    for path in glob.glob('.\\Inputs\\*-annual-sales.xlsx')
    if '~' not in path
]

# city names per report, then the de-duplicated union across all reports
cities_lists = [get_cities_from_report(report) for report in reports]
cities_list = list({city for per_report in cities_lists for city in per_report})

In [12]:
# Read the cities shapefile as a spatial dataframe, keep the name, population,
# acreage and geometry columns, and expose the name column as 'City'
cities_sdf = pd.DataFrame.spatial.from_featureclass(r".\Inputs\Cities_Simplified_ZJ.shp")
cities_sdf = cities_sdf[['NAME','POPLASTCEN','POPLASTEST', 'ACRES','SHAPE']].rename(columns={'NAME':'City'})

# keep only the cities that appear in the tax reports
in_reports = cities_sdf['City'].isin(cities_list)
cities_sdf = cities_sdf[in_reports]

# attribute-only companion table holding just the 'City' column
cities_df = cities_sdf.drop(columns=['SHAPE','POPLASTCEN','POPLASTEST','ACRES'])

In [13]:
def _pivot_year_by_sector(_tsr, _year, _sectors):
    """Return one row per city in ``cities_df`` with one taxable-sales column per sector.

    Rows of ``_tsr`` for ``_year`` are pivoted to City x sector, sector labels are
    renamed through ``sector_lu``, sectors absent that year are added as NaN,
    columns are sorted, and the row total is stored in 'AAAA'. The result is
    left-joined onto ``cities_df`` (cities without data get NaN) and indexed by 'City'.
    """
    wide = pd.pivot_table(
        _tsr[_tsr['Year'] == _year],
        values='Taxable Sales',
        index='City',
        columns='Economic Sector (NAICS Code)',
        aggfunc='first',
    )
    wide = wide.fillna(0).rename(columns=sector_lu)

    # add missing columns so every year exposes the same sector columns, then sort
    for sector in _sectors:
        if sector not in wide.columns:
            wide[sector] = np.nan
    wide = wide.sort_index(axis=1).reset_index()

    # The total field is named "AAAA" because ESRI dashboards only let you set the
    # default to the first or last item in the sorted list.
    # `_sectors` must hold each name once: selecting a label twice would add it twice.
    wide['AAAA'] = wide[_sectors].sum(axis=1)
    return cities_df.merge(wide, on='City', how='left').set_index('City')


def _melt_by_sector(_wide, _value_cols, _value_name):
    """Unpivot a City-indexed wide table to one ``_value_name`` row per (City, Sector)."""
    long = pd.melt(_wide.reset_index(), id_vars=['City'], value_vars=_value_cols,
                   var_name='Sector', value_name=_value_name)
    return long.set_index(['City', 'Sector'])


def process_historical_sales_to_shape(_historical_taxable_sales_file, _gdb, _min_end_year=2015):
    """Build year-over-year taxable sales by city and sector and export them to a feature class.

    Reads the 'Table 9' sheet of the workbook, compares every year with the year
    before it, and writes one row per city, sector and year pair (with the start
    value, end value, difference and percent difference) joined to the city
    geometries in ``cities_sdf``. Relies on the notebook-level ``sector_lu``,
    ``super_sector_lu``, ``cities_df`` and ``cities_sdf``.

    Parameters
    ----------
    _historical_taxable_sales_file : str
        Path of the historical taxable sales workbook, passed to ``pandas.read_excel``.
    _gdb : str
        Workspace path; the result is written to ``<_gdb>/Taxable_Sales``.
    _min_end_year : int, default 2015
        Year pairs whose end year is below this value are dropped from the output.

    Raises
    ------
    ValueError
        If no pair of consecutive years could be compared, so there is nothing to export.
    """
    tsr = pd.read_excel(_historical_taxable_sales_file, sheet_name='Table 9', header=5)

    # drop the last two rows of the sheet (not data rows - presumably empty footer rows; confirm)
    tsr = tsr.drop(tsr.tail(2).index)

    # forward fill the identifying columns, which are blank on continuation rows
    id_cols = ['County', 'Location Code', 'City']
    tsr[id_cols] = tsr[id_cols].ffill()

    # '*' marker column -> boolean flag (not used further down in this function)
    tsr = tsr.rename(columns={'Unnamed: 6': 'Rounded_Up'})
    tsr['Rounded_Up'] = tsr['Rounded_Up'].map({'*': True, np.nan: False})

    # keep the last four characters of the year label
    tsr['Year'] = tsr['Year'].str[-4:]
    years = sorted(set(tsr['Year'].dropna()))

    # Several source labels share one short name, so de-duplicate (order kept);
    # duplicates would double-count those sectors in the 'AAAA' total.
    sectors = list(dict.fromkeys(sector_lu.values()))
    value_cols = sectors + ['AAAA']

    frames = []
    for year in years[1:]:
        previous_year = str(int(year) - 1)
        if previous_year not in years:
            print(f'no data for {previous_year}; skipping {previous_year}-{year}')
            continue

        tsr_year = _pivot_year_by_sector(tsr, year, sectors)
        tsr_previous_year = _pivot_year_by_sector(tsr, previous_year, sectors)

        # the element-wise difference below is only meaningful on identical layouts
        if list(tsr_year.columns) != list(tsr_previous_year.columns):
            print('column name (sectors) mismatch')
            continue
        if tsr_year.index.values.tolist() != tsr_previous_year.index.values.tolist():
            print('row name (cities) mismatch')
            continue

        tsr_diff = tsr_year - tsr_previous_year

        # unpivot the three tables and line them up on (City, Sector)
        tsr_full = (
            _melt_by_sector(tsr_year, value_cols, 'End_Value')
            .merge(_melt_by_sector(tsr_previous_year, value_cols, 'Start_Value'),
                   left_index=True, right_index=True, how='left')
            .merge(_melt_by_sector(tsr_diff, value_cols, 'Difference'),
                   left_index=True, right_index=True, how='left')
            .reset_index()
        )
        tsr_full['Start_Year'] = int(year) - 1
        tsr_full['End_Year'] = int(year)
        tsr_full['Time_Frame'] = f"{int(year)-1}-{int(year)}"
        frames.append(tsr_full)

    if not frames:
        raise ValueError('no consecutive years could be compared; nothing to export')

    # single concat instead of growing a dataframe inside the loop
    df = pd.concat(frames, ignore_index=True).reindex(
        columns=['City', 'Time_Frame', 'Start_Year', 'End_Year', 'Start_Value',
                 'End_Value', 'Difference', 'Percent_Difference', 'Sector'])

    # percent change is only filled in where both values are positive
    both_positive = (df['Start_Value'] > 0) & (df['End_Value'] > 0)
    percent = ((df['End_Value'] - df['Start_Value']) / df['Start_Value'] * 100).round(2)
    df.loc[both_positive, 'Percent_Difference'] = percent[both_positive]

    # Drop sectors flagged for exclusion. The comparison ignores case because the
    # lookup spells the flag 'Exclude'; rows without a supersector (such as the
    # 'AAAA' total, which still sums every sector) are kept.
    df['Supersector'] = df['Sector'].map(super_sector_lu)
    df = df[df['Supersector'].astype(str).str.upper() != 'EXCLUDE']
    df = df[df['End_Year'] >= _min_end_year]

    # export to gdb
    sdf = df.merge(cities_sdf, on='City', how='inner')
    sdf.spatial.to_featureclass(location=os.path.join(_gdb, 'Taxable_Sales'), sanitize_columns=False)

In [14]:
# Run the export for the 2024 historical workbook
historical_tax_file = r".\Inputs\annual-sales-historical-2024.xlsx"
process_historical_sales_to_shape(
    _historical_taxable_sales_file=historical_tax_file,
    _gdb=gdb,
)

  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():


# Graveyard

In [15]:
# # process from  historical spreadsheet
# def process_historical_sales_to_shape( _gdb):
    
#     tsr = pd.read_excel(r".\Inputs\annual-sales-historical-2023.xlsx", sheet_name='Table 9', header=5)
#     tsr.drop(tsr.tail(2).index,inplace=True)
#     tsr['County'] = tsr['County'].fillna(method='ffill')
#     tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
#     tsr['City'] = tsr['City'].fillna(method='ffill')
#     tsr.rename(columns={'Unnamed: 6':'Rounded_Up'}, inplace=True)
#     crosswalk = {'*':True, np.nan:False}
#     tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)
#     tsr['Year'] = tsr['Year'].str[-4:]

#     years = sorted(list(set(tsr['Year'].to_list()))) 
#     for idx, year in enumerate(years):
#         # print(year)
#         tsr_year = tsr[tsr['Year']==year].copy()
#         tsr_year = pd.pivot_table(tsr_year, values = 'Taxable Sales', index ='City', columns = 'Economic Sector (NAICS Code)', aggfunc='first').reset_index()
#         tsr_year = tsr_year.fillna(0)
#         tsr_year.rename(sector_lu, axis=1, inplace=True)
#         tsr_year.set_index('City', inplace=True)


#         # add missing columns, sort data frame
#         tsr_year_cols = list(tsr_year.columns)
#         tsr_year_cols = [c for c in tsr_year_cols if c != 'City']
#         missing_cols = [c for c in list(sector_lu.values()) if c not in tsr_year_cols]
#         for c in missing_cols: tsr_year[c]= np.nan
#         tsr_year = tsr_year.sort_index(axis=1).reset_index()

#         tsr_year['TOTAL'] = tsr_year[list(sector_lu.values())].sum(axis=1)
#         tsr_year = cities_df.merge(tsr_year, on='City', how='left')
#         tsr_year.set_index('City', inplace=True)
        
#         if idx != 0:
#             tsr_previous_year = tsr[tsr['Year'] == str(int(year) - 1)].copy()
#             tsr_previous_year = pd.pivot_table(tsr_previous_year, values = 'Taxable Sales', index ='City', columns = 'Economic Sector (NAICS Code)', aggfunc='first').reset_index()
#             tsr_previous_year = tsr_previous_year.fillna(0)
#             tsr_previous_year.rename(sector_lu, axis=1, inplace=True)
#             tsr_previous_year.set_index('City', inplace=True)

#             # add missing columns, sort data frame
#             tsr_previous_year_cols = list(tsr_previous_year.columns)
#             tsr_previous_year_cols = [c for c in tsr_previous_year_cols if c != 'City']
#             missing_cols = [c for c in list(sector_lu.values()) if c not in tsr_previous_year_cols]
#             for c in missing_cols: tsr_previous_year[c]= np.nan
#             tsr_previous_year = tsr_previous_year.sort_index(axis=1).reset_index()

#             tsr_previous_year['TOTAL'] = tsr_previous_year[list(sector_lu.values())].sum(axis=1)
#             tsr_previous_year = cities_df.merge(tsr_previous_year, on='City', how='left')
#             tsr_previous_year.set_index('City', inplace=True)
            
#             # calculate the difference
#             if list(tsr_year.columns) == list(tsr_previous_year.columns):
#                 if  tsr_year.index.values.tolist() == tsr_previous_year.index.values.tolist():
#                     tsr_diff = tsr_year - tsr_previous_year
#                     tsr_diff = cities_sdf.merge(tsr_diff, on='City', how='left')
#                     tsr_diff.spatial.to_featureclass(location=os.path.join(_gdb,f'Taxable_Sales_Diff_by_City_{year}'),sanitize_columns=False)
#                 else:
#                     print(year)
#                     print('row name (cities) mismatch')
#             else:
#                 print(year)
#                 print('column name (sectors) mismatch')

#         tsr_year = cities_sdf.merge(tsr_year, on='City', how='left')
#         tsr_year.spatial.to_featureclass(location=os.path.join(_gdb,f'Taxable_Sales_by_City_{year}'),sanitize_columns=False)

In [16]:
# # This version export each year individually
# def process_annual_sales_to_shape(_xlsx, _gdb):
    
#     year = os.path.split(_xlsx)[-1][:4]
#     tsr = pd.read_excel(_xlsx, sheet_name='Table 9', header=5)

#     # Drop last empty two rows
#     tsr.drop(tsr.tail(2).index,inplace=True)

#     # forward fill values from merged cells
#     tsr['County'] = tsr['County'].fillna(method='ffill')
#     tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
#     tsr['City'] = tsr['City'].fillna(method='ffill')

#     # Figures with less than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
#     # Rename column and convert values to boolean
#     tsr.rename(columns={'Unnamed: 5':'Rounded_Up'}, inplace=True)
#     crosswalk = {'*':True, np.nan:False}
#     tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)

#     # pivot the table to get sectors as columns with tax values by city
#     tsr_by_sector = pd.pivot_table(tsr,values = f'CY{year}', index ='City', columns = 'Economic Sector (NAICS Code)', 
#                                     aggfunc='first').reset_index()

#     # fill NAs with 0
#     tsr_by_sector = tsr_by_sector.fillna(0)

#     # rename sectors
#     # _sector_lu = sector_lu
#     # _sector_lu = {k:v + f"_{year[-2:]}" for (k,v) in _sector_lu.items()}
#     tsr_by_sector.rename(sector_lu, axis=1, inplace=True)

#     # get the total sales tax revenue
#     tsr_by_sector['TOTAL'] = tsr_by_sector[list(sector_lu.values())].sum(axis=1)

#     # export
#     tsr_by_sector_sdf = cities_sdf.merge(tsr_by_sector, on='City', how='left')
#     return tsr_by_sector_sdf.spatial.to_featureclass(location=os.path.join(_gdb,f'Taxable_Sales_by_City_{year}'),sanitize_columns=False)


In [17]:
# # gather annual taxable sales reports
# reports = glob.glob('.\\Inputs\\*-annual-sales.xlsx')
# reports = [r for r in reports if '~' not in r] # in case the spreadsheet is open

# base = cities_sdf

# # export to shape
# for report in reports:
#     process_annual_sales_to_shape(report, gdb)


In [18]:
# tsr = pd.read_excel(r".\Inputs\annual-sales-historical.xlsx", sheet_name='Table 9', header=5)
# tsr.drop(tsr.tail(2).index,inplace=True)
# tsr['County'] = tsr['County'].fillna(method='ffill')
# tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
# tsr['City'] = tsr['City'].fillna(method='ffill')
# tsr.rename(columns={'Unnamed: 6':'Rounded_Up'}, inplace=True)
# crosswalk = {'*':True, np.nan:False}
# tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)
# tsr['Year'] = tsr['Year'].str[-4:]

# years = sorted(list(set(tsr['Year'].to_list()))) 
# for idx, year in enumerate(years):
#     print(year)
#     tsr_year = tsr[tsr['Year']==year].copy()
#     tsr_year = pd.pivot_table(tsr_year, values = 'Taxable Sales', index ='City', columns = 'Economic Sector (NAICS Code)', aggfunc='first').reset_index()
#     tsr_year = tsr_year.fillna(0)
#     tsr_year.rename(sector_lu, axis=1, inplace=True)
#     tsr_year.set_index('City', inplace=True)


#     # add missing columns, sort data frame
#     tsr_year_cols = list(tsr_year.columns)
#     tsr_year_cols = [c for c in tsr_year_cols if c != 'City']
#     missing_cols = [c for c in list(sector_lu.values()) if c not in tsr_year_cols]
#     for c in missing_cols: tsr_year[c]= np.nan
#     tsr_year = tsr_year.sort_index(axis=1).reset_index()

#     tsr_year['TOTAL'] = tsr_year[list(sector_lu.values())].sum(axis=1)
#     tsr_year = cities_df.merge(tsr_year, on='City', how='left')
#     tsr_year.set_index('City', inplace=True)
    
#     if idx != 0:
#         tsr_previous_year = tsr[tsr['Year'] == str(int(year) - 1)].copy()
#         tsr_previous_year = pd.pivot_table(tsr_previous_year, values = 'Taxable Sales', index ='City', columns = 'Economic Sector (NAICS Code)', aggfunc='first').reset_index()
#         tsr_previous_year = tsr_previous_year.fillna(0)
#         tsr_previous_year.rename(sector_lu, axis=1, inplace=True)
#         tsr_previous_year.set_index('City', inplace=True)

#         # add missing columns, sort data frame
#         tsr_previous_year_cols = list(tsr_previous_year.columns)
#         tsr_previous_year_cols = [c for c in tsr_previous_year_cols if c != 'City']
#         missing_cols = [c for c in list(sector_lu.values()) if c not in tsr_previous_year_cols]
#         for c in missing_cols: tsr_previous_year[c]= np.nan
#         tsr_previous_year = tsr_previous_year.sort_index(axis=1).reset_index()

#         tsr_previous_year['TOTAL'] = tsr_previous_year[list(sector_lu.values())].sum(axis=1)
#         tsr_previous_year = cities_df.merge(tsr_previous_year, on='City', how='left')
#         tsr_previous_year.set_index('City', inplace=True)
        
#         # calculate the difference
#         if list(tsr_year.columns) == list(tsr_previous_year.columns):
#             if  tsr_year.index.values.tolist() == tsr_previous_year.index.values.tolist():
#                 tsr_diff = tsr_year - tsr_previous_year
#                 tsr_diff = cities_sdf.merge(tsr_diff, on='City', how='left')
#                 tsr_diff.spatial.to_featureclass(location=os.path.join(gdb,f'Taxable_Sales_Diff_by_City_{year}'),sanitize_columns=False)
#             else:
#                 print('row name (cities) mismatch')
#         else:
#             print('column name (sectors) mismatch')

#     tsr_year = cities_sdf.merge(tsr_year, on='City', how='left')
#     tsr_year.spatial.to_featureclass(location=os.path.join(gdb,f'Taxable_Sales_by_City_{year}'),sanitize_columns=False)

In [19]:
## This version is for horizontal concatenation of records (sector columns get a two-digit year suffix)
# NOTE(review): every line of this cell is commented out, so process_annual_sales_to_df
# is not defined when the notebook runs.
# When enabled it would: read sheet 'Table 9' of one annual taxable sales workbook,
# pivot it to one row per city with one column per economic sector, rename the sector
# columns via sector_lu plus a "_<yy>" suffix, add a TOTAL_<yy> column, and return the frame.
# NOTE(review): relies on a module-level `sector_lu` mapping defined outside this function.
# def process_annual_sales_to_df(_xlsx):
    
#     # year = first four characters of the workbook file name (kept as a string)
#     year = os.path.split(_xlsx)[-1][:4]
#     tsr = pd.read_excel(_xlsx, sheet_name='Table 9', header=5)

#     # Drop the last two rows (described by the author as empty)
#     tsr.drop(tsr.tail(2).index,inplace=True)

#     # Forward-fill values that were blank because of merged cells in the spreadsheet
#     # NOTE(review): fillna(method='ffill') is deprecated in pandas 2.x in favour of .ffill()
#     # -- check the installed pandas version before re-enabling.
#     tsr['County'] = tsr['County'].fillna(method='ffill')
#     tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
#     tsr['City'] = tsr['City'].fillna(method='ffill')

#     # Figures with fewer than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
#     # Rename the unnamed flag column ('Unnamed: 5') to Rounded_Up and map '*' -> True, NaN -> False
#     tsr.rename(columns={'Unnamed: 5':'Rounded_Up'}, inplace=True)
#     crosswalk = {'*':True, np.nan:False}
#     tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)

#     # Pivot so each sector becomes a column holding the CY<year> value for each city
#     # NOTE(review): the index is City alone with aggfunc='first', so rows sharing a city name
#     # (e.g. in different counties) would collapse to the first value -- confirm names are unique.
#     tsr_by_sector = pd.pivot_table(tsr,values = f'CY{year}', index ='City', columns = 'Economic Sector (NAICS Code)', 
#                                     aggfunc='first').reset_index()

#     # Fill NAs with 0
#     tsr_by_sector = tsr_by_sector.fillna(0)

#     # Rename sectors: build a year-suffixed copy of the lookup (the first assignment is only an alias;
#     # the comprehension on the next line creates a new dict and leaves sector_lu untouched)
#     _sector_lu = sector_lu
#     _sector_lu = {k:v + f"_{year[-2:]}" for (k,v) in _sector_lu.items()}
#     tsr_by_sector.rename(_sector_lu, axis=1, inplace=True)

#     # Sum the renamed sector columns into a year-suffixed total
#     # NOTE(review): presumably every sector in sector_lu appears in each workbook; if one is
#     # missing from the pivot, this column selection raises KeyError -- verify.
#     tsr_by_sector[f'TOTAL_{year[-2:]}'] = tsr_by_sector[_sector_lu.values()].sum(axis=1)

#     # Return the pivoted frame (nothing is written to disk here)
#     return tsr_by_sector

In [20]:
# ## This version is for vertical concatenation of records (same column names every year plus a YEAR column)
# NOTE(review): every line of this cell is commented out, so process_annual_sales_to_df2
# is not defined when the notebook runs.
# When enabled it would: read sheet 'Table 9' of one annual taxable sales workbook,
# pivot it to one row per city with one column per economic sector, rename the sector
# columns via sector_lu, add TOTAL and YEAR columns, and return the result inner-merged
# onto cities_sdf.
# NOTE(review): relies on module-level `sector_lu` and `cities_sdf` defined outside this function.
# def process_annual_sales_to_df2(_xlsx):
    
#     # year = first four characters of the workbook file name (kept as a string)
#     year = os.path.split(_xlsx)[-1][:4]
#     tsr = pd.read_excel(_xlsx, sheet_name='Table 9', header=5)

#     # Drop the last two rows (described by the author as empty)
#     tsr.drop(tsr.tail(2).index,inplace=True)

#     # Forward-fill values that were blank because of merged cells in the spreadsheet
#     # NOTE(review): fillna(method='ffill') is deprecated in pandas 2.x in favour of .ffill()
#     # -- check the installed pandas version before re-enabling.
#     tsr['County'] = tsr['County'].fillna(method='ffill')
#     tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
#     tsr['City'] = tsr['City'].fillna(method='ffill')

#     # Figures with fewer than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
#     # Rename the unnamed flag column ('Unnamed: 5') to Rounded_Up and map '*' -> True, NaN -> False
#     tsr.rename(columns={'Unnamed: 5':'Rounded_Up'}, inplace=True)
#     crosswalk = {'*':True, np.nan:False}
#     tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)

#     # Pivot so each sector becomes a column holding the CY<year> value for each city
#     # NOTE(review): the index is City alone with aggfunc='first', so rows sharing a city name
#     # (e.g. in different counties) would collapse to the first value -- confirm names are unique.
#     tsr_by_sector = pd.pivot_table(tsr,values = f'CY{year}', index ='City', columns = 'Economic Sector (NAICS Code)', 
#                                     aggfunc='first').reset_index()

#     # Fill NAs with 0
#     tsr_by_sector = tsr_by_sector.fillna(0)

#     # Rename sector columns using the sector_lu lookup
#     tsr_by_sector.rename(sector_lu, axis=1, inplace=True)

#     # Sum the renamed sector columns into TOTAL
#     # NOTE(review): presumably every sector in sector_lu appears in each workbook; if one is
#     # missing from the pivot, this column selection raises KeyError -- verify.
#     tsr_by_sector['TOTAL'] = tsr_by_sector[sector_lu.values()].sum(axis=1)

#     # Add the year (stored as the string sliced from the file name, not as an integer)
#     tsr_by_sector['YEAR'] = year

#     # Attach the city attributes with an inner merge on City (cities present on only one side
#     # are dropped) and return the merged frame; nothing is written to disk here
#     tsr_by_sector_sdf = cities_sdf.merge(tsr_by_sector, on='City', how='inner')
#     return tsr_by_sector_sdf

In [21]:
# # (disabled cell) gather every annual taxable sales workbook matching .\Inputs\*-annual-sales.xlsx
# reports = glob.glob('.\\Inputs\\*-annual-sales.xlsx')
# # skip any path containing '~' (presumably Excel's temporary lock file for an open workbook)
# reports = [r for r in reports if '~' not in r] # in case the spreadsheet is open

# # NOTE(review): `base` is assigned here but not used anywhere else in this cell
# base = cities_sdf

# # process each workbook, stack the per-year frames vertically, and write a single
# # 'Taxable_Sales_by_City' feature class into the file geodatabase at `gdb` (not a shapefile)
# tsr_dataframes = [process_annual_sales_to_df2(r) for r in reports]
# tsr_complete = pd.concat(tsr_dataframes)
# tsr_complete.spatial.to_featureclass(location=os.path.join(gdb,'Taxable_Sales_by_City'),sanitize_columns=False)

In [22]:
# # (disabled cell) read in one taxable sales report (Excel format); the path is hard-coded to the 2021 workbook
# xlsx = '.\\Inputs\\2021-annual-sales.xlsx'
# # year = first four characters of the file name, kept as a string
# year = os.path.split(xlsx)[-1][:4]
# # header=5: column names are taken from zero-based row 5 of sheet 'Table 9'
# tsr = pd.read_excel(xlsx, sheet_name='Table 9', header=5)

In [23]:
# # (disabled cell) Drop the last two rows (described by the author as empty); mutates tsr in place
# tsr.drop(tsr.tail(2).index,inplace=True)

# # Forward-fill values that were blank because of merged cells in the spreadsheet
# # NOTE(review): fillna(method='ffill') is deprecated in pandas 2.x in favour of .ffill()
# # -- check the installed pandas version before re-enabling.
# tsr['County'] = tsr['County'].fillna(method='ffill')
# tsr['Location Code'] = tsr['Location Code'].fillna(method='ffill')
# tsr['City'] = tsr['City'].fillna(method='ffill')

In [24]:
# # (disabled cell) Figures with fewer than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
# # Rename the unnamed flag column ('Unnamed: 5') to Rounded_Up and map '*' -> True, NaN -> False
# tsr.rename(columns={'Unnamed: 5':'Rounded_Up'}, inplace=True)
# crosswalk = {'*':True, np.nan:False}
# tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)

In [25]:
# # (disabled cell) pivot so each sector becomes a column holding the CY<year> value for each city
# # NOTE(review): the index is City alone with aggfunc='first', so rows sharing a city name
# # (e.g. in different counties) would collapse to the first value -- confirm names are unique.
# tsr_by_sector = pd.pivot_table(tsr,values = f'CY{year}', index ='City', columns = 'Economic Sector (NAICS Code)', 
#                                  aggfunc='first').reset_index()

# # fill NAs with 0
# tsr_by_sector = tsr_by_sector.fillna(0)

In [26]:
# # (disabled cell) rename sector columns using the module-level sector_lu lookup (mutates tsr_by_sector in place)
# tsr_by_sector.rename(sector_lu, axis=1, inplace=True)

# # sum the renamed sector columns into TOTAL
# # NOTE(review): presumably every sector in sector_lu appears in the workbook; if one is
# # missing from the pivot, this column selection raises KeyError -- verify.
# tsr_by_sector['TOTAL'] = tsr_by_sector[sector_lu.values()].sum(axis=1)
# # the bare expression below displays the entire frame as the cell output
# tsr_by_sector

In [27]:
# # (disabled cell) load the cities shapefile as a spatially enabled DataFrame
# cities_sdf = pd.DataFrame.spatial.from_featureclass(r".\Inputs\Cities_v2.shp")
# # keep only the name, the two population fields, and the geometry
# cities_sdf = cities_sdf[['NAME','POPLASTCEN','POPLASTEST','SHAPE']].copy()
# # rename NAME -> City so it matches the merge key used by the sales table
# cities_sdf.rename({'NAME':'City'}, axis=1, inplace=True)
# # inner merge on City: cities present on only one side are dropped
# tsr_by_sector_sdf = cities_sdf.merge(tsr_by_sector, on='City', how='inner')
# # NOTE(review): `gdb2` is not defined in the portion of the notebook reviewed (the setup cell
# # defines `gdb`) -- confirm the intended geodatabase before re-enabling.
# tsr_by_sector_sdf.spatial.to_featureclass(location=os.path.join(gdb2,f'Taxable_Sales_by_City_{year}'),sanitize_columns=False)

### Calculate the differences 
*2020 - 2019 = Diff*