#### Annual Sales Tax Revenue Analysis

### [Data Source](https://tax.utah.gov/econstats/sales)

### Cal Year

### Notes:


In [None]:
import pandas as pd
import os
import numpy as np
import arcpy
from arcgis.features import GeoAccessor, GeoSeriesAccessor
# arcpy environment setting: allow existing outputs to be overwritten on re-runs
arcpy.env.overwriteOutput = True

In [None]:
# Do not truncate wide tables when displaying them: no cap on shown columns
pd.set_option('display.max_columns', None)

In [None]:
# Turns a free-text label into an underscore-separated identifier
def replace(string):
    """Return *string* with punctuation stripped and separators turned into "_".

    "&", "(", ")", "," and "." are deleted; "-", "/" and spaces each become
    a single "_". Consecutive separators are not collapsed, so a deleted
    character sitting between two spaces leaves a double underscore.
    """
    # One translation pass: third argument lists characters to delete,
    # the first two map each separator onto an underscore.
    table = str.maketrans("-/ ", "___", "&(),.")
    return string.translate(table)


### Load Cal Year 2020
2020 sales tax revenue data

In [None]:
# load excel sheet into a dataframe
xlsx = '.\\Inputs\\2020-annual-sales.xlsx'

# `converters` must map column names to callables, so the "keep as object" columns
# are declared through `dtype` and only the numeric conversion stays a converter.
calyr_2020 = pd.read_excel(
    xlsx,
    sheet_name='Table 9',
    header=5,
    dtype={
        'County': object,
        'Location Code': object,
        'City': object,
        'Economic Sector (NAICS Code)': object,
    },
    converters={'CY2020': pd.to_numeric},
)

# Drop last empty two rows
calyr_2020.drop(calyr_2020.tail(2).index, inplace=True)

# forward fill values from merged cells
# (.ffill() replaces the deprecated fillna(method='ffill') spelling)
merged_cols = ['County', 'Location Code', 'City']
calyr_2020[merged_cols] = calyr_2020[merged_cols].ffill()

# Figures with less than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
# Rename the flag column and convert its values to boolean.
# NOTE(review): assumes the unlabeled flag column is read in as 'Unnamed: 5' - confirm
# against the workbook layout.
calyr_2020.rename(columns={'Unnamed: 5': 'Rounded Up'}, inplace=True)
crosswalk = {'*': True, np.nan: False}  # '*' marks a rounded-up figure, blank means not rounded
calyr_2020['Rounded Up'] = calyr_2020['Rounded Up'].map(crosswalk)

calyr_2020.head()


In [None]:
# Reshape to one row per city and one column per economic sector, holding CY2020 values.
# NOTE(review): aggfunc='first' keeps only the first value when a city/sector pair
# occurs more than once in the sheet - confirm the pairs are unique.
# City/sector combinations missing from the sheet are filled with 0 instead of NaN.
sectors_calyr_2020 = calyr_2020.pivot_table(
    values='CY2020',
    index='City',
    columns='Economic Sector (NAICS Code)',
    aggfunc='first',
).fillna(0)

In [None]:
# list the pivoted sector column labels (their order is what the renaming step relies on)
sectors_calyr_2020.columns

In [None]:
# Short field names for the sector columns, applied by position.
# The codes must stay in the same order as the pivoted sector columns.
sector_codes = [
    'L_ACCOMMODATION', 'S_ADMIN_SUPPORT', 'A_AG_WILDLIFE', 'L_CULTURAL_REC', 'C_CONSTRUCTION',
    'E_EDUCATION', 'S_FINANCIAL', 'L_RSTRNT_BAR', 'H_HEALTH_CARE', 'S_IT_DATA',
    'S_CORPORATE_MGMT', 'M_MANUFACTURING', 'E_EXTRACTION', 'O_OTHER', 'X_ADJUSTMENTS',
    'R_AUTO_PRIVATE', 'S_PROF_TECH_SRV', 'G_GOVERNMENT', 'S_RENTAL_LEASING',
    'R_BUILDING_SUPPLY', 'R_CLOTHING', 'R_ELECTRONICS', 'R_GROCERY_BEV', 'R_FURNITURE',
    'R_GAS_STATIONS', 'R_GENERAL_RETAIL', 'R_HEALTH_RETAIL', 'R_OTHER_RETAIL',
    'R_AUTO_RETAIL', 'R_NONSTORE_RETAIL', 'R_SPORT_HOBBY', 'L_SPECIAL_EVENT',
    'W_DISTRIBUTION', 'X_UNKNOWN', 'U_UTILITIES', 'W_WHLSALE_DURABLE', 'S_WHLSLE_ETRADE',
    'W_WHLSLE_NDURABLE',
]

# tag every field with the two-digit year
new_names = [code + '_20' for code in sector_codes]
sectors_calyr_2020.columns = new_names

# total sales tax revenue per city = sum across all sector columns
sectors_calyr_2020['TOTAL_20'] = sectors_calyr_2020[new_names].sum(axis=1)

sectors_calyr_2020.head()

### Load Cal Year 2019
2019 sales tax revenue data

In [None]:
# load excel sheet into a dataframe
xlsx = '.\\Inputs\\2019-annual-sales.xlsx'
calyr_2019 = pd.read_excel(xlsx, sheet_name='Table 9', header=5)

# Drop last empty two rows
calyr_2019.drop(calyr_2019.tail(2).index, inplace=True)

# forward fill values from merged cells
# (.ffill() replaces the deprecated fillna(method='ffill') spelling)
merged_cols = ['County', 'Location Code', 'City']
calyr_2019[merged_cols] = calyr_2019[merged_cols].ffill()

# Figures with less than 10 taxpayers have been rounded up per Tax Commission disclosure rules.
# Rename the flag column and convert its values to boolean.
# NOTE(review): assumes the unlabeled flag column is read in as 'Unnamed: 5' - confirm
# against the workbook layout.
calyr_2019.rename(columns={'Unnamed: 5': 'Rounded Up'}, inplace=True)
crosswalk = {'*': True, np.nan: False}  # '*' marks a rounded-up figure, blank means not rounded
calyr_2019['Rounded Up'] = calyr_2019['Rounded Up'].map(crosswalk)

calyr_2019.tail()

In [None]:
# Reshape to one row per city and one column per economic sector, holding CY2019 values.
# NOTE(review): aggfunc='first' keeps only the first value when a city/sector pair
# occurs more than once in the sheet - confirm the pairs are unique.
# City/sector combinations missing from the sheet are filled with 0 instead of NaN.
sectors_calyr_2019 = calyr_2019.pivot_table(
    values='CY2019',
    index='City',
    columns='Economic Sector (NAICS Code)',
    aggfunc='first',
).fillna(0)

# Short field names for the sector columns, applied by position.
# The codes must stay in the same order as the pivoted sector columns.
sector_codes = [
    'L_ACCOMMODATION', 'S_ADMIN_SUPPORT', 'A_AG_WILDLIFE', 'L_CULTURAL_REC', 'C_CONSTRUCTION',
    'E_EDUCATION', 'S_FINANCIAL', 'L_RSTRNT_BAR', 'H_HEALTH_CARE', 'S_IT_DATA',
    'S_CORPORATE_MGMT', 'M_MANUFACTURING', 'E_EXTRACTION', 'O_OTHER', 'X_ADJUSTMENTS',
    'R_AUTO_PRIVATE', 'S_PROF_TECH_SRV', 'G_GOVERNMENT', 'S_RENTAL_LEASING',
    'R_BUILDING_SUPPLY', 'R_CLOTHING', 'R_ELECTRONICS', 'R_GROCERY_BEV', 'R_FURNITURE',
    'R_GAS_STATIONS', 'R_GENERAL_RETAIL', 'R_HEALTH_RETAIL', 'R_OTHER_RETAIL',
    'R_AUTO_RETAIL', 'R_NONSTORE_RETAIL', 'R_SPORT_HOBBY', 'L_SPECIAL_EVENT',
    'W_DISTRIBUTION', 'X_UNKNOWN', 'U_UTILITIES', 'W_WHLSALE_DURABLE', 'S_WHLSLE_ETRADE',
    'W_WHLSLE_NDURABLE',
]

# tag every field with the two-digit year; X_UNKNOWN_19 is kept (not dropped) so the
# 2019 columns line up one-for-one with the 2020 columns
new_names = [code + '_19' for code in sector_codes]
sectors_calyr_2019.columns = new_names

# total sales tax revenue per city = sum across all sector columns
sectors_calyr_2019['TOTAL_19'] = sectors_calyr_2019[new_names].sum(axis=1)

sectors_calyr_2019.head()

### Calculate the differences 
*2020 - 2019 = Diff*

In [None]:
# Print the (rows, columns) shape of each yearly table so they can be compared
# before the year-over-year difference is calculated
print(sectors_calyr_2019.shape)
print(sectors_calyr_2020.shape)

In [None]:
# Year-over-year change per city and sector (2020 minus 2019).
# Subtracting the raw value arrays is positional, so first line the 2019 rows up with
# the 2020 city index; otherwise a city that is missing from, or ordered differently
# in, one of the years would be subtracted from the wrong row. Cities that have no
# 2019 row end up with NaN differences.
calyr_2019_aligned = sectors_calyr_2019.reindex(sectors_calyr_2020.index)

# field names for the difference table (sector columns followed by the total)
new_names = ['L_ACCOMMODATION_D20', 'S_ADMIN_SUPPORT_D20','A_AG_WILDLIFE_D20','L_CULTURAL_REC_D20','C_CONSTRUCTION_D20',
'E_EDUCATION_D20','S_FINANCIAL_D20','L_RSTRNT_BAR_D20','H_HEALTH_CARE_D20','S_IT_DATA_D20','S_CORPORATE_MGMT_D20',
'M_MANUFACTURING_D20','E_EXTRACTION_D20','O_OTHER_D20','X_ADJUSTMENTS_D20','R_AUTO_PRIVATE_D20','S_PROF_TECH_SRV_D20',
'G_GOVERNMENT_D20','S_RENTAL_LEASING_D20','R_BUILDING_SUPPLY_D20','R_CLOTHING_D20','R_ELECTRONICS_D20','R_GROCERY_BEV_D20',
'R_FURNITURE_D20','R_GAS_STATIONS_D20','R_GENERAL_RETAIL_D20','R_HEALTH_RETAIL_D20','R_OTHER_RETAIL_D20','R_AUTO_RETAIL_D20',
'R_NONSTORE_RETAIL_D20','R_SPORT_HOBBY_D20','L_SPECIAL_EVENT_D20','W_DISTRIBUTION_D20','X_UNKNOWN_D20','U_UTILITIES_D20',
'W_WHLSALE_DURABLE_D20','S_WHLSLE_ETRADE_D20','W_WHLSLE_NDURABLE_D20', 'TOTAL_D20']

# Columns are still matched by position: both yearly tables carry the same sector
# order with the total column last. Cities go back on as the index, labelled 'City'.
sectors_calyr_diff = pd.DataFrame(
    sectors_calyr_2020.to_numpy() - calyr_2019_aligned.to_numpy(),
    index=sectors_calyr_2020.index.rename('City'),
    columns=new_names,
)

#check table
sectors_calyr_diff.head(10)

### Load municipalities and townships shapefile and join to formatted sales tax data

In [None]:
# load cities shapefile into pandas spatial dataframe
cities_shp = '.\\Inputs\\Cities.shp'
cities_sdf = pd.DataFrame.spatial.from_featureclass(cities_shp)

# Rename columns by position.
# NOTE(review): assumes the shapefile yields exactly these 8 fields in this order;
# a different field count raises on assignment - confirm against Cities.shp.
cities_sdf.columns = ['FID', 'NAME', 'SOURCE', 'SALESTAXID', 'POPLASTCEN', 'POPLASTEST',
       'AREA_SQMI', 'SHAPE']

In [None]:
# Attach the 2019, 2020 and difference tables to the city features, one after another.
# Each inner join matches the feature NAME against the table's 'City' index, so only
# cities found in every table remain.
cities_sdf2 = cities_sdf
for yearly_table in (sectors_calyr_2019, sectors_calyr_2020, sectors_calyr_diff):
    cities_sdf2 = cities_sdf2.merge(yearly_table, left_on="NAME", right_on="City", how="inner")
cities_sdf2.tail(10)


In [None]:
# List tax-table cities that have no matching feature NAME.
# Salt Lake County (Unincorporated) should be the only one not joined.
l1 = calyr_2020['City'].dropna().unique().tolist()
l2 = cities_sdf['NAME'].dropna().unique().tolist()
list(set(l1).difference(l2))

In [None]:
# Export the joined spatial dataframe to a feature class inside a file geodatabase,
# creating the geodatabase first when it does not exist yet
outputs = '.\\Outputs'
gdb_name = "taxable_sales.gdb"

gdb = os.path.join(outputs, gdb_name)
if not arcpy.Exists(gdb):
    arcpy.CreateFileGDB_management(outputs, gdb_name)

feature_class = os.path.join(gdb, "taxable_sales_utah_calyr")
cities_sdf2.spatial.to_featureclass(location=feature_class)

### Analysis

In [None]:
# display the joined table (city features plus 2019, 2020 and difference columns)
cities_sdf2

In [None]:
# Get year specific column names.
# Match on the END of the field name: a suffix test cannot pick up a field that merely
# contains '_19' / '_20' somewhere in the middle, and '_D20' fields never end in '_20'.
standard_cols = ['FID','NAME','SOURCE','SALESTAXID','POPLASTCEN','POPLASTEST','AREA_SQMI', 'SHAPE']
names_19 = standard_cols + [col for col in cities_sdf2.columns if col.endswith('_19')]
names_20 = standard_cols + [col for col in cities_sdf2.columns if col.endswith('_20')]
names_D20 = standard_cols + [col for col in cities_sdf2.columns if col.endswith('_D20')]

# subset columns by time: 2019 values, 2020 values, and the 2020-2019 differences
cities_calyr_19 = cities_sdf2[names_19]
cities_calyr_20 = cities_sdf2[names_20]
cities_calyr_d20 = cities_sdf2[names_D20]


In [None]:
# 10 cities with the highest TOTAL_D20 (2020 minus 2019), i.e. the largest increase
# in total revenue; sorted descending
cities_calyr_d20[['NAME', 'POPLASTCEN', 'AREA_SQMI', 'TOTAL_D20']].sort_values('TOTAL_D20', ascending=False).head(10)

In [None]:
# 10 cities with the lowest TOTAL_D20 (2020 minus 2019), i.e. the largest decrease
# in total revenue; sorted ascending
cities_calyr_d20[['NAME', 'POPLASTCEN', 'AREA_SQMI', 'TOTAL_D20']].sort_values('TOTAL_D20', ascending=True).head(10)

#### Some industries of interest:
- RETAIL-GASOLINE STATIONS (447)
- RETAIL-FOOD & BEVERAGE STORES (445)
- REAL ESTATE, RENTAL & LEASING (53)
- ARTS, ENTERTAINMENT AND RECREATION (71)
- ACCOMMODATION (721)