#### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# code to ignore warnings
import warnings
warnings.filterwarnings("ignore")

root_csv = '../csv files/'
root_pickle = '../pickle files/'

#### Reading Files

In [2]:
df_obese = pd.read_excel(root_pickle + 'obese_County_Data_New.xlsx')

In [3]:
# Restricting to required columns
df_obese = df_obese[['State','County','CountyFIPS','Year','Tax_Type','Rate','Total Absolute','Total Percentage','Gender','Gender Absolute','Gender Percentage','Age','Age Obesity Absolute','Age Obesity Percentage','Notes_On_Tax_Rate']]

In [4]:
# Check for data type
df_obese.dtypes

State                      object
County                     object
CountyFIPS                  int64
Year                        int64
Tax_Type                   object
Rate                      float64
Total Absolute             object
Total Percentage           object
Gender                     object
Gender Absolute            object
Gender Percentage          object
Age                        object
Age Obesity Absolute       object
Age Obesity Percentage     object
Notes_On_Tax_Rate          object
dtype: object

In [5]:
# Converting data type
columns_to_convert = ['Total Absolute', 'Total Percentage', 'Gender Absolute', 'Gender Percentage', 'Age Obesity Absolute', 'Age Obesity Percentage']
for column in columns_to_convert:
    df_obese[column] = pd.to_numeric(df_obese[column], errors='coerce', downcast='float')

In [6]:
df_obese.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Tax_Type,Rate,Total Absolute,Total Percentage,Gender,Gender Absolute,Gender Percentage,Age,Age Obesity Absolute,Age Obesity Percentage,Notes_On_Tax_Rate
0,Alabama,Autauga County,1001,2004,Restaurant Tax,0.04,7213.0,21.9,Male,3466.0,22.1,20-44,3411.0,20.700001,
1,Alabama,Autauga County,1001,2004,Grocery Tax,0.04,7213.0,21.9,Male,3466.0,22.1,20-44,3411.0,20.700001,
2,Alabama,Autauga County,1001,2004,Restaurant Tax,0.04,7213.0,21.9,Male,3466.0,22.1,45-64,2898.0,25.299999,
3,Alabama,Autauga County,1001,2004,Grocery Tax,0.04,7213.0,21.9,Male,3466.0,22.1,45-64,2898.0,25.299999,
4,Alabama,Autauga County,1001,2004,Restaurant Tax,0.04,7213.0,21.9,Male,3466.0,22.1,65-beyond,904.0,17.9,


In [7]:
# Check for Null Values
df_obese.isna().sum()

State                          0
County                         0
CountyFIPS                     0
Year                           0
Tax_Type                       0
Rate                           0
Total Absolute               144
Total Percentage              84
Gender                         0
Gender Absolute              366
Gender Percentage             84
Age                            0
Age Obesity Absolute         612
Age Obesity Percentage        88
Notes_On_Tax_Rate         603702
dtype: int64

In [8]:
df_obese.dropna(subset={'Total Percentage','Total Absolute'},inplace=True)

In [9]:
# Pivot the Tax_Type column
df_obese_pivot = df_obese.pivot_table(index=['State', 'County', 'CountyFIPS', 'Year', 'Total Absolute','Total Percentage',
                                      'Gender', 'Gender Absolute', 'Gender Percentage','Age', 'Age Obesity Absolute',
                                     'Age Obesity Percentage'],
                           columns='Tax_Type', values='Rate', aggfunc='mean').reset_index()

# Rename the columns for clarity
df_obese_pivot.columns.name = None  # Remove the 'Tax_Type' label

df_obese_pivot.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Total Absolute,Total Percentage,Gender,Gender Absolute,Gender Percentage,Age,Age Obesity Absolute,Age Obesity Percentage,Grocery Tax,Restaurant Tax
0,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,20-44,3411.0,20.700001,0.04,0.04
1,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,45-64,2898.0,25.299999,0.04,0.04
2,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,65-beyond,904.0,17.9,0.04,0.04
3,Alabama,Autauga County,1001,2004,7213.0,21.9,Male,3466.0,22.1,20-44,3411.0,20.700001,0.04,0.04
4,Alabama,Autauga County,1001,2004,7213.0,21.9,Male,3466.0,22.1,45-64,2898.0,25.299999,0.04,0.04


In [10]:
df_obese_pivot.isna().sum()

State                     0
County                    0
CountyFIPS                0
Year                      0
Total Absolute            0
Total Percentage          0
Gender                    0
Gender Absolute           0
Gender Percentage         0
Age                       0
Age Obesity Absolute      0
Age Obesity Percentage    0
Grocery Tax               0
Restaurant Tax            0
dtype: int64

#### Saving pickle files for unified Obesity dataset pivot

In [11]:
df_obese_pivot.to_pickle(root_pickle + 'Obesity_Unified_Pivot.pkl')

In [12]:
df_obese_pivot = pd.read_pickle(root_pickle +'Obesity_Unified_Pivot.pkl')
df_obese_pivot.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Total Absolute,Total Percentage,Gender,Gender Absolute,Gender Percentage,Age,Age Obesity Absolute,Age Obesity Percentage,Grocery Tax,Restaurant Tax
0,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,20-44,3411.0,20.700001,0.04,0.04
1,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,45-64,2898.0,25.299999,0.04,0.04
2,Alabama,Autauga County,1001,2004,7213.0,21.9,Female,3747.0,21.700001,65-beyond,904.0,17.9,0.04,0.04
3,Alabama,Autauga County,1001,2004,7213.0,21.9,Male,3466.0,22.1,20-44,3411.0,20.700001,0.04,0.04
4,Alabama,Autauga County,1001,2004,7213.0,21.9,Male,3466.0,22.1,45-64,2898.0,25.299999,0.04,0.04


#### Saving pickle file for total abolute and percentage 

In [13]:
df_obese_total_pivot = df_obese_pivot[['State','County','CountyFIPS', 'Year', 'Total Absolute','Total Percentage','Grocery Tax','Restaurant Tax']]
df_obese_total_pivot.drop_duplicates(inplace=True)

In [14]:
df_obese_total_pivot['tax_delta'] = df_obese_total_pivot['Restaurant Tax'] - df_obese_total_pivot['Grocery Tax']

In [15]:
df_obese_total_pivot['Population'] = (df_obese_total_pivot['Total Absolute']*100) / df_obese_total_pivot['Total Percentage']

In [16]:
df_obese_total_pivot.to_pickle(root_pickle + 'Obesity_Total_Pivot.pkl')

In [17]:
df_obese_total_pivot = pd.read_pickle(root_pickle +'Obesity_Total_Pivot.pkl')
df_obese_total_pivot.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Total Absolute,Total Percentage,Grocery Tax,Restaurant Tax,tax_delta,Population
0,Alabama,Autauga County,1001,2004,7213.0,21.9,0.04,0.04,0.0,32936.073633
6,Alabama,Autauga County,1001,2005,8093.0,24.0,0.04,0.04,0.0,33720.833333
12,Alabama,Autauga County,1001,2006,7457.0,21.5,0.04,0.04,0.0,34683.72093
18,Alabama,Autauga County,1001,2007,7467.0,21.1,0.04,0.04,0.0,35388.624953
24,Alabama,Autauga County,1001,2008,9568.0,26.6,0.04,0.04,0.0,35969.924296


#### Saving pickle file for gender abolute and percentage 

In [18]:
df_obese_gender_pivot = df_obese_pivot[['State','County','CountyFIPS', 'Year', 'Gender Absolute','Gender Percentage','Gender','Grocery Tax','Restaurant Tax']]
df_obese_gender_pivot.drop_duplicates(inplace=True)

In [19]:
df_obese_gender_pivot['tax_delta'] = df_obese_gender_pivot['Restaurant Tax'] - df_obese_gender_pivot['Grocery Tax']

In [20]:
df_obese_gender_pivot['Population'] = (df_obese_gender_pivot['Gender Absolute']*100) / df_obese_gender_pivot['Gender Percentage']

In [21]:
df_obese_gender_pivot.to_pickle(root_pickle + 'Obesity_Gender_Pivot.pkl')

In [22]:
df_obese_gender_pivot = pd.read_pickle(root_pickle +'Obesity_Gender_Pivot.pkl')
df_obese_gender_pivot.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Gender Absolute,Gender Percentage,Gender,Grocery Tax,Restaurant Tax,tax_delta,Population
0,Alabama,Autauga County,1001,2004,3747.0,21.700001,Female,0.04,0.04,0.0,17267.280499
3,Alabama,Autauga County,1001,2004,3466.0,22.1,Male,0.04,0.04,0.0,15683.257648
6,Alabama,Autauga County,1001,2005,4141.0,23.4,Female,0.04,0.04,0.0,17696.581485
9,Alabama,Autauga County,1001,2005,3952.0,24.6,Male,0.04,0.04,0.0,16065.040401
12,Alabama,Autauga County,1001,2006,3874.0,21.299999,Female,0.04,0.04,0.0,18187.794079


#### Saving pickle file for age abolute and percentage

In [23]:
df_obese_age_pivot = df_obese_pivot[['State','County','CountyFIPS', 'Year', 'Age Obesity Absolute','Age Obesity Percentage','Age','Grocery Tax','Restaurant Tax']]
df_obese_age_pivot.drop_duplicates(inplace=True)

In [24]:
df_obese_age_pivot['tax_delta'] = df_obese_age_pivot['Restaurant Tax'] - df_obese_age_pivot['Grocery Tax']

In [25]:
df_obese_age_pivot['Population'] = (df_obese_age_pivot['Age Obesity Absolute']*100) / df_obese_age_pivot['Age Obesity Percentage']

In [26]:
df_obese_age_pivot.to_pickle(root_pickle + 'Obesity_Age_Pivot.pkl')

In [27]:
df_obese_age_pivot = pd.read_pickle(root_pickle +'Obesity_Age_Pivot.pkl')
df_obese_age_pivot.head(5)

Unnamed: 0,State,County,CountyFIPS,Year,Age Obesity Absolute,Age Obesity Percentage,Age,Grocery Tax,Restaurant Tax,tax_delta,Population
0,Alabama,Autauga County,1001,2004,3411.0,20.700001,20-44,0.04,0.04,0.0,16478.260262
1,Alabama,Autauga County,1001,2004,2898.0,25.299999,45-64,0.04,0.04,0.0,11454.5458
2,Alabama,Autauga County,1001,2004,904.0,17.9,65-beyond,0.04,0.04,0.0,5050.279437
6,Alabama,Autauga County,1001,2005,3714.0,22.200001,20-44,0.04,0.04,0.0,16729.729155
7,Alabama,Autauga County,1001,2005,3379.0,28.5,45-64,0.04,0.04,0.0,11856.140351
