### Annual Sales Tax Revenue Analysis

#### [Data Source](https://tax.utah.gov/econstats/sales)
- Wait until the full year of data has been released before updating.
- download the annual release of [historical annual taxable sales 1998 - Present](https://tax.utah.gov/esu/sales-year/annual-sales-historical.xlsx)

#### Important Notes:
- After updating the data in AGOL, confirm that `AAAA` is displayed as "All Sectors" in the Fields/Domains tab (this is now set automatically, but verify it in AGOL).
- If any categories change, make sure they match in both the sector and supersector lookups.
- City boundaries may need to be updated periodically.
- Ensure the city names in the Excel sheet match those in the cities shapefile.
- The city shapefile was simplified to reduce file size.
- Zip `TaxableSales.gdb` into `Utah.gdb.zip`.


In [31]:
import arcpy
from arcpy import env
import os
import numpy as np
from arcgis import GIS
from arcgis.features import GeoAccessor
from arcgis.features import GeoSeriesAccessor
import pandas as pd
import glob
import numpy as np

# Geoprocessing environment: let tools overwrite existing outputs and set the
# parallel processing factor used by tools that support it.
env.overwriteOutput = True
env.parallelProcessingFactor = "90%"

# Render every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Reference snippets for spatially enabled DataFrames (kept for convenience):
# pd.DataFrame.spatial.from_featureclass(???)
# df.spatial.to_featureclass(location=???,sanitize_columns=False)

# gsa = arcgis.features.GeoSeriesAccessor(df['SHAPE'])
# df['AREA'] = gsa.area  # KNOW YOUR UNITS

In [32]:
# Output workspace: outputs[0] is the folder, outputs[1] the file geodatabase
# inside it that receives the final feature class.
outputs = ['.\\Outputs', 'TaxableSales.gdb']
gdb = os.path.join(outputs[0], outputs[1])

# Create the folder from the same path the geodatabase uses, so the two cannot
# drift apart if the folder is renamed. exist_ok makes the cell safe to re-run.
os.makedirs(outputs[0], exist_ok=True)

if not arcpy.Exists(gdb):
    arcpy.CreateFileGDB_management(outputs[0], outputs[1])


In [33]:
# Domain details
domain_name = "SectorDomain"
domain_description = "Sector types"
field_type = "Text"  # Could be "Short", "Long", "Text", "Float", etc.
domain_type = "CODED"  # Could be "CODED" or "RANGE"

# Coded values for the domain (code -> description). Every sector label that
# sector_lu can produce needs an entry here, plus "AAAA", the all-sector total,
# which is described as "All Sectors".
coded_values = {
    "Cultural Rec": "Cultural Rec",
    "Education": "Education",
    "Sport Hobby": "Sport Hobby",
    "Nonstore Retail": "Nonstore Retail",
    "Other Retail": "Other Retail",
    "Extraction": "Extraction",
    "Construction": "Construction",
    "Distribution": "Distribution",
    "Admin Support": "Admin Support",
    "Healthcare": "Healthcare",
    "Accommodation": "Accommodation",
    "Grocery Beverage": "Grocery Beverage",
    "Utilities": "Utilities",
    "Unknown": "Unknown",
    "Other Services": "Other Services",
    "Wholesale Nondurable": "Wholesale Nondurable",
    "Adjustments": "Adjustments",
    "General Retail": "General Retail",
    "Rental Leasing": "Rental Leasing",
    "Auto Private": "Auto Private",
    "Auto Retail": "Auto Retail",
    "AAAA": "All Sectors",
    "Financial": "Financial",
    "Government": "Government",
    "Wholesale Durable": "Wholesale Durable",
    "Health Retail": "Health Retail",
    "IT Data": "IT Data",
    "Manufacturing": "Manufacturing",
    "Corporate Mgmt": "Corporate Mgmt",
    "Gas Stations": "Gas Stations",
    "Electronics": "Electronics",
    "Agri Wildlife": "Agri Wildlife",
    "Prof Tech Services": "Prof Tech Services",
    "Restaurant Bar": "Restaurant Bar",
    "Wholesale ETrade": "Wholesale ETrade",
    "Furniture": "Furniture",
    "Building Supply": "Building Supply",
    "Special Event": "Special Event",
    # 'Clothing' is a sector label in the lookups but was missing from the domain.
    "Clothing": "Clothing",
}

# Check if the domain already exists
existing_domains = arcpy.da.ListDomains(gdb)
domain_names = [d.name for d in existing_domains]

if domain_name in domain_names:
    print(f"Domain: '{domain_name}' already exists. Skipping creation.")
    # Remember which codes are already present so only new ones get added.
    existing_domain = next(d for d in existing_domains if d.name == domain_name)
    existing_codes = set(existing_domain.codedValues or {})
else:
    # Create the domain
    arcpy.management.CreateDomain(
        in_workspace=gdb,
        domain_name=domain_name,
        domain_description=domain_description,
        field_type=field_type,
        domain_type=domain_type
    )
    existing_codes = set()

# Add any coded value the domain does not have yet. This also runs for an
# existing domain so newly introduced sectors are picked up on re-run.
for code, desc in coded_values.items():
    if code in existing_codes:
        continue
    arcpy.management.AddCodedValueToDomain(
        in_workspace=gdb,
        domain_name=domain_name,
        code=code,
        code_description=desc
    )

Domain: 'SectorDomain' already exists. Skipping creation.


In [36]:
# Lookup from the sector label used in the taxable sales workbook (the
# 'Economic Sector (NAICS Code)' values) to the short sector name used in the
# output feature class.
#
# Several workbook labels map to the same short name: the same sector appears
# with different NAICS code lists, e.g. '(448)' and '(448,458)'.
# NOTE(review): presumably the label text changed between workbook releases -
# confirm against the source data. Because of these repeats,
# list(sector_lu.values()) contains duplicate names.
#
# The `global` statement below has no effect at module (cell) level.
global sector_lu
sector_lu= {
    'ACCOMMODATION (721)': 'Accommodation',
    'ADMIN. & SUPPORT & WASTE MANAG. & REMED. SERVICES (56)': 'Admin Support',
    'AGRICULTURE, FORESTRY, FISHING & HUNTING (11)': 'Agri Wildlife',
    'ARTS, ENTERTAINMENT AND RECREATION (71)': 'Cultural Rec',
    'CONSTRUCTION (23)': 'Construction',
    'EDUCATIONAL SERVICES (61)': 'Education',
    'FINANCE & INSURANCE (52)': 'Financial',
    'FOOD SERVICES & DRINKING PLACES (722)': 'Restaurant Bar',
    'HEALTH CARE & SOCIAL ASSISTANCE (62)': 'Healthcare',
    'INFORMATION (51)': 'IT Data',
    'MANAGEMENT OF COMPANIES & ENTERPRISES (55)': 'Corporate Mgmt',
    'MANUFACTURING (31-33)': 'Manufacturing',
    'MINING, QUARRYING, & OIL & GAS EXTRACTION (21)': 'Extraction',
    'OTHER SERVICES-EXCEPT PUBLIC ADMINISTRATION (81)': 'Other Services',
    'PRIOR-PERIOD PAYMENTS & REFUNDS': 'Adjustments',
    'PRIVATE MOTOR VEHICLE SALES': 'Auto Private',
    'PROFESSIONAL, SCIENTIFIC & TECHNICAL SERVICES (54)': 'Prof Tech Services',
    'PUBLIC ADMINISTRATION (92)': 'Government',
    'REAL ESTATE, RENTAL & LEASING (53)': 'Rental Leasing',
    # Retail sectors; several appear twice with different NAICS code lists.
    'RETAIL-BUILD. MATERIAL, GARDEN EQUIP. & SUPPLIES DEALERS (444)': 'Building Supply',
    'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448)': 'Clothing',
    'RETAIL-CLOTHING & CLOTHING ACCESSORIES STORES (448,458)': 'Clothing',
    'RETAIL-ELECTRONICS & APPLIANCE STORES (443)': 'Electronics',
    'RETAIL-ELECTRONICS & APPLIANCE STORES (443,4492)': 'Electronics',
    'RETAIL-FOOD & BEVERAGE STORES (445)': 'Grocery Beverage',
    'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442)': 'Furniture',
    'RETAIL-FURNITURE & HOME FURNISHINGS STORES (442,4491)': 'Furniture',
    'RETAIL-GASOLINE STATIONS (447)': 'Gas Stations',
    'RETAIL-GASOLINE STATIONS (447,457)': 'Gas Stations',
    'RETAIL-GENERAL MERCHANDISE STORES (452)': 'General Retail',
    'RETAIL-GENERAL MERCHANDISE STORES (452,455)': 'General Retail',
    'RETAIL-HEALTH & PERSONAL CARE STORES (446)': 'Health Retail',
    'RETAIL-HEALTH & PERSONAL CARE STORES (446,456)': 'Health Retail',
    'RETAIL-MISCELLANEOUS STORE RETAILERS (453)': 'Other Retail',
    'RETAIL-MISCELLANEOUS STORE RETAILERS (453,4593-4599)': 'Other Retail',
    'RETAIL-MOTOR VEHICLE & PARTS DEALERS (441)': 'Auto Retail',
    'RETAIL-NONSTORE RETAILERS (454)': 'Nonstore Retail',
    'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451)': 'Sport Hobby',
    'RETAIL-SPORTING GOODS, HOBBY, MUSIC & BOOK STORES (451,4591,4592)': 'Sport Hobby',
    'SPECIAL EVENT SALES': 'Special Event',
    'TRANSPORTATION & WAREHOUSING (48-49)': 'Distribution',
    'UNKNOWN/NONCLASSIFIABLE': 'Unknown',
    'UTILITIES (22)': 'Utilities',
    'WHOLESALE TRADE-DURABLE GOODS (423)': 'Wholesale Durable',
    # Two different (425) labels share the 'Wholesale ETrade' name.
    'WHOLESALE TRADE-ELECTRONIC MARKETS (425)': 'Wholesale ETrade',
    'WHOLESALE TRADE-AGENTS & BROKERS (425)': 'Wholesale ETrade',
    'WHOLESALE TRADE-NONDURABLE GOODS (424)': 'Wholesale Nondurable'
}

In [38]:
# Lookup from short sector name (the values of sector_lu) to its supersector
# grouping. Keys here must stay in sync with the sector names in sector_lu.
#
# 'Exclude' marks sectors that are not meant to be reported; any code that
# filters on this value must match the spelling and case used here exactly.
#
# The `global` statement below has no effect at module (cell) level.
global super_sector_lu
super_sector_lu= {
    'Accommodation': 'Leisure & Hospitality',
    'Admin Support': 'Professional Services',
    'Agri Wildlife': 'Other',
    'Cultural Rec': 'Leisure & Hospitality',
    'Construction': 'Other',
    'Education': 'Govt & Edu',
    'Financial': 'Professional Services',
    'Restaurant Bar': 'Leisure & Hospitality',
    'Healthcare': 'Govt & Edu',
    'IT Data': 'Professional Services',
    'Corporate Mgmt': 'Professional Services',
    'Manufacturing': 'Manu, Wholesale, & Dist',
    'Extraction': 'Other',
    'Other Services': 'Professional Services',
    'Adjustments': 'Exclude',
    'Auto Private': 'Retail',
    'Prof Tech Services': 'Professional Services',
    'Government': 'Govt & Edu',
    'Rental Leasing': 'Professional Services',
    'Building Supply': 'Retail',
    'Clothing': 'Retail',
    'Electronics': 'Retail',
    'Grocery Beverage': 'Retail',
    'Furniture': 'Retail',
    'Gas Stations': 'Retail',
    'General Retail': 'Retail',
    'Health Retail': 'Retail',
    'Other Retail': 'Retail',
    'Auto Retail': 'Retail',
    'Nonstore Retail': 'Retail',
    'Sport Hobby': 'Retail',
    'Special Event': 'Leisure & Hospitality',
    'Distribution': 'Manu, Wholesale, & Dist',
    'Unknown': 'Exclude',
    'Utilities': 'Other',
    'Wholesale Durable': 'Manu, Wholesale, & Dist',
    'Wholesale ETrade': 'Professional Services',
    'Wholesale Nondurable': 'Manu, Wholesale, & Dist'
    }

In [39]:
def fill_na_sedf(df_with_shape_column, fill_value=0):
    """Fill missing values in every column except 'SHAPE'.

    Parameters
    ----------
    df_with_shape_column : pandas.DataFrame
        Frame that contains a 'SHAPE' column. It is not modified.
    fill_value : scalar, default 0
        Value used to replace missing entries in the non-'SHAPE' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index, the filled attribute
        columns first and the untouched 'SHAPE' column last.

    Raises
    ------
    ValueError
        If the frame has no 'SHAPE' column.
    """
    if 'SHAPE' not in df_with_shape_column.columns:
        raise ValueError("Dataframe does not include 'SHAPE' column")

    filled = df_with_shape_column.drop(columns='SHAPE').fillna(fill_value)
    # Re-attach the geometry by position rather than by an index merge: a merge
    # on the index multiplies rows whenever index labels repeat.
    filled['SHAPE'] = df_with_shape_column['SHAPE'].values
    return filled

In [40]:
def get_cities_from_report(_xlsx):
    """Return the non-missing 'City' entries of a taxable sales workbook.

    Reads sheet 'Table 9' of ``_xlsx``, using the row at zero-based index 5 as
    the header, and returns the 'City' column as a list with NaN cells
    removed. Order and repeated names are preserved.
    """
    report = pd.read_excel(_xlsx, sheet_name='Table 9', header=5)
    city_cells = report['City'].tolist()
    # NaN is not equal to itself, so this keeps only the populated cells.
    return list(filter(lambda city: city == city, city_cells))

In [41]:
# Collect the annual taxable sales workbooks, skipping any path that contains
# '~' (e.g. the temporary file present while a spreadsheet is open).
reports = [
    path
    for path in glob.glob('.\\Inputs\\*-annual-sales.xlsx')
    if '~' not in path
]

# One list of city names per workbook, then the distinct names across all of them.
cities_lists = list(map(get_cities_from_report, reports))
cities_list = list({city for report_cities in cities_lists for city in report_cities})

In [42]:
# Load the city polygons as a spatially enabled DataFrame, keep only the
# columns used downstream, and expose the name column as 'City' so it lines up
# with the sales workbook.
cities_sdf = pd.DataFrame.spatial.from_featureclass(r".\Inputs\Cities_Simplified_ZJ_test.shp")
cities_sdf = cities_sdf[['NAME','POPLASTCEN','POPLASTEST', 'ACRES','SHAPE']].copy()
cities_sdf = cities_sdf.rename(columns={'NAME': 'City'})

# Keep only cities that appear in at least one sales report.
in_reports = cities_sdf['City'].isin(cities_list)
cities_sdf = cities_sdf[in_reports]

# Attribute-only list of city names (geometry and population columns removed).
cities_df = cities_sdf.drop(columns=['SHAPE','POPLASTCEN','POPLASTEST','ACRES'])

In [43]:
def _pivot_sales_for_year(tsr, year, sectors):
    """Build a city-by-sector table of taxable sales for one year.

    Rows of ``tsr`` whose 'Year' equals ``year`` are pivoted to one row per
    city and one column per sector (renamed through ``sector_lu``). Sectors in
    ``sectors`` that are absent that year are added as NaN columns, an 'AAAA'
    column holding the sum over ``sectors`` is appended, and the result is
    left-joined onto ``cities_df`` so every year has the same rows in the same
    order. The returned frame is indexed by 'City'.
    """
    year_rows = tsr[tsr['Year'] == year].copy()
    wide = pd.pivot_table(
        year_rows,
        values='Taxable Sales',
        index='City',
        columns='Economic Sector (NAICS Code)',
        aggfunc='first',
    ).reset_index()
    wide = wide.fillna(0)
    wide = wide.rename(sector_lu, axis=1)
    wide = wide.set_index('City')

    # add missing columns, sort data frame
    for sector in sectors:
        if sector not in wide.columns:
            wide[sector] = np.nan
    wide = wide.sort_index(axis=1).reset_index()

    # The total field is named "AAAA" because ESRI dashboards only let you set
    # the default to the first or last item in the sorted list.
    wide['AAAA'] = wide[sectors].sum(axis=1)
    wide = cities_df.merge(wide, on='City', how='left')
    return wide.set_index('City')


def process_historical_sales_to_shape(_historical_taxable_sales_file, _gdb):
    """Compute year-over-year taxable sales per city and sector and export them.

    Reads sheet 'Table 9' of the historical workbook, compares every year with
    the year before it, and writes one row per city, sector and year pair to
    the 'Taxable_Sales' feature class in ``_gdb``. The 'SectorDomain' domain is
    then assigned to the 'Sector' field.

    Relies on the module-level ``sector_lu``, ``super_sector_lu``,
    ``cities_df`` and ``cities_sdf``.

    Parameters
    ----------
    _historical_taxable_sales_file : str
        Path to the historical annual taxable sales workbook.
    _gdb : str
        Path to the file geodatabase that receives the feature class.

    Notes
    -----
    * Sector names are de-duplicated before use. ``sector_lu`` maps several
      workbook labels to the same name, and a repeated name would select the
      same column more than once, inflating 'AAAA' and duplicating rows.
    * Sectors whose supersector is 'Exclude' are dropped from the output.
    * Only rows with End_Year >= 2015 are exported.
    * 'Percent_Difference' is only filled where both values are positive.
    """
    tsr = pd.read_excel(_historical_taxable_sales_file, sheet_name='Table 9', header=5)
    # The last two rows of the sheet are dropped (not data rows).
    tsr.drop(tsr.tail(2).index, inplace=True)
    # Forward-fill the columns that are left blank on continuation rows.
    for col in ('County', 'Location Code', 'City'):
        tsr[col] = tsr[col].ffill()
    tsr.rename(columns={'Unnamed: 6': 'Rounded_Up'}, inplace=True)
    crosswalk = {'*': True, np.nan: False}
    tsr['Rounded_Up'] = tsr['Rounded_Up'].map(crosswalk)
    # Keep the last four characters of the label as the year.
    tsr['Year'] = tsr['Year'].str[-4:]

    # Distinct sector names, in first-seen order.
    sectors = list(dict.fromkeys(sector_lu.values()))
    value_columns = sectors + ['AAAA']

    df = pd.DataFrame(columns=['City', 'Time_Frame', 'Start_Year', 'End_Year', 'Start_Value', 'End_Value', 'Difference', 'Percent_Difference', 'Sector'])

    def _melt(frame, value_name):
        # Unpivot a City-indexed wide table to one row per (City, Sector).
        return pd.melt(
            frame.reset_index(),
            id_vars=['City'],
            value_vars=value_columns,
            var_name='Sector',
            value_name=value_name,
        ).set_index(['City', 'Sector'])

    years = sorted(set(tsr['Year'].to_list()))
    for idx, year in enumerate(years):
        if idx == 0:
            # The earliest year has no previous year to compare against.
            continue

        tsr_year = _pivot_sales_for_year(tsr, year, sectors)
        tsr_previous_year = _pivot_sales_for_year(tsr, str(int(year) - 1), sectors)

        # The subtraction below is only meaningful when both tables line up.
        if list(tsr_year.columns) != list(tsr_previous_year.columns):
            print('column name (sectors) mismatch')
            continue
        if tsr_year.index.values.tolist() != tsr_previous_year.index.values.tolist():
            print('row name (cities) mismatch')
            continue

        # calculate the difference
        tsr_diff = tsr_year - tsr_previous_year

        # unpivot data
        tsr_full = (
            _melt(tsr_year, 'End_Value')
            .merge(_melt(tsr_previous_year, 'Start_Value'), left_index=True, right_index=True, how='left')
            .merge(_melt(tsr_diff, 'Difference'), left_index=True, right_index=True, how='left')
            .reset_index()
        )
        tsr_full['Start_Year'] = int(year) - 1
        tsr_full['End_Year'] = int(year)
        tsr_full['Time_Frame'] = f"{int(year)-1}-{int(year)}"
        df = pd.concat([df, tsr_full], ignore_index=True)

    # Percent change is left empty unless both start and end values are positive.
    df.loc[(df['Start_Value'] > 0) & (df['End_Value'] > 0), 'Percent_Difference'] = round((df['End_Value'] - df['Start_Value']) / df['Start_Value'] * 100, 2)
    df['Supersector'] = df['Sector'].map(super_sector_lu)
    # Must match the 'Exclude' spelling used in super_sector_lu.
    df = df[df['Supersector'] != 'Exclude']
    # Copy so the dtype casts below write to an independent frame, not a slice.
    df = df[df['End_Year'] >= 2015].copy()
    # df.to_csv(os.path.join(outputs[0], 'test.csv'))

    # cast to int32
    df['Start_Year'] = df['Start_Year'].astype('int32')
    df['End_Year'] = df['End_Year'].astype('int32')

    # export to gdb
    feature_class = os.path.join(_gdb, 'Taxable_Sales')
    sdf = df.merge(cities_sdf, on='City', how='inner')
    sdf.spatial.to_featureclass(location=feature_class, sanitize_columns=False)

    # Assign the domain
    arcpy.management.AssignDomainToField(
        in_table=feature_class,
        field_name='Sector',
        domain_name='SectorDomain'
    )

In [44]:
# Build the Taxable_Sales feature class from the historical workbook.
historical_tax_file = r".\Inputs\annual-sales-historical-2024.xlsx"
process_historical_sales_to_shape(
    _historical_taxable_sales_file=historical_tax_file,
    _gdb=gdb,
)

  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
