In [2]:
### CODE TO CREATE DATABASE AND TABLE FOR THE DATA ###

### Dependencies and Configuration ###
import pandas as pd
from sqlalchemy import create_engine

In [4]:
#### IMPORT DATA ####
# Load the Food Environment Atlas workbook with pandas.read_excel.

EXCEL_PATH = 'data/ATLAS/FoodEnvironmentAtlas.xls'
VARIABLE_PATH = 'data/ATLAS/variables.csv'

# Passing a list of sheet names makes read_excel return a dictionary that
# maps each sheet name to the DataFrame parsed from that sheet.
global_dict = pd.read_excel(
    io=EXCEL_PATH,
    sheet_name=[
        'Supplemental Data - County',
        'ACCESS',
        'STORES',
        'RESTAURANTS',
        'ASSISTANCE',
        'INSECURITY',
        'LOCAL',
        'HEALTH',
        'SOCIOECONOMIC',
    ],
)


#global_dict

In [5]:
## CLEANING DATA ##
### Columns to keep from each sheet of the workbook ###
# Every list includes 'FIPS', so 'FIPS' appears once per list in GLOBAL_LIST.

POPULATION_LIST = [
    'FIPS', 'County', 'State',
    '2010_Census_Population',
    'Population_Estimate_2011', 'Population_Estimate_2012',
    'Population_Estimate_2013', 'Population_Estimate_2014',
    'Population_Estimate_2015', 'Population_Estimate_2016',
    'Population_Estimate_2017', 'Population_Estimate_2018',
]

ACCESS_LIST = [
    'FIPS',
    'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15',
    'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15',
    'PCT_LACCESS_HHNV10', 'PCT_LACCESS_HHNV15',
    'PCT_LACCESS_SNAP15',
]

STORES_LIST = [
    'FIPS',
    'GROCPTH11', 'GROCPTH16',
    'SUPERCPTH11', 'SUPERCPTH16',
    'CONVSPTH11', 'CONVSPTH16',
    'SNAPSPTH12', 'SNAPSPTH17',
    'WICSPTH11', 'WICSPTH16',
]

RESTAURANTS_LIST = [
    'FIPS',
    'FFRPTH11', 'FFRPTH16',
    'FSRPTH11', 'FSRPTH16',
    'PC_FFRSALES07', 'PC_FFRSALES12',
    'PC_FSRSALES07', 'PC_FSRSALES12',
]

ASSISTANCE_LIST = [
    'FIPS',
    'REDEMP_SNAPS12', 'REDEMP_SNAPS17',
    'PCT_SNAP12', 'PCT_SNAP17',
    'PC_SNAPBEN12', 'PC_SNAPBEN17',
    'SNAP_PART_RATE11', 'SNAP_PART_RATE16',
    'PC_WIC_REDEMP11', 'PC_WIC_REDEMP16',
    'REDEMP_WICS11', 'REDEMP_WICS16',
    'PCT_WIC12', 'PCT_WIC17',
    'PCT_WICINFANTCHILD14', 'PCT_WICINFANTCHILD16',
    'FOOD_BANKS18',
]

INSECURITY_LIST = [
    'FIPS',
    'FOODINSEC_12_14', 'FOODINSEC_15_17',
    'VLFOODSEC_12_14', 'VLFOODSEC_15_17',
]

LOCAL_LIST = [
    'FIPS',
    'FMRKTPTH13', 'FMRKTPTH18',
    'FOODHUB18',
]

HEALTH_LIST = [
    'FIPS',
    'PCT_DIABETES_ADULTS08', 'PCT_DIABETES_ADULTS13',
    'PCT_OBESE_ADULTS12', 'PCT_OBESE_ADULTS17',
    'PCT_HSPA17',
    'RECFACPTH11', 'RECFACPTH16',
]

SOCIOECONOMIC_LIST = [
    'FIPS',
    'MEDHHINC15',
    'POVRATE15',
    'PERPOV10',
    'CHILDPOVRATE15',
    'PERCHLDPOV10',
    'METRO13',
]

# Concatenation of all the lists above, in the same order.
# Used later to pick out the definitions of the selected variables.
GLOBAL_LIST = [
    *POPULATION_LIST,
    *ACCESS_LIST,
    *STORES_LIST,
    *RESTAURANTS_LIST,
    *ASSISTANCE_LIST,
    *INSECURITY_LIST,
    *LOCAL_LIST,
    *HEALTH_LIST,
    *SOCIOECONOMIC_LIST,
]

len(GLOBAL_LIST)

82

In [7]:
#### BUILD A TABLE OF VARIABLE DESCRIPTIONS ####

# Read the variable catalogue.
variables_df = pd.read_csv(VARIABLE_PATH)

# Keep only the rows whose 'Variable Code' is one of the selected columns.
definitions_df = variables_df[variables_df['Variable Code'].isin(GLOBAL_LIST)]

# Drop the three descriptive columns that are not exported.
definitions_clean_df = definitions_df.drop(
    columns=['Category Name', 'Subcategory Name', 'Geography']
)

### Write the variable definitions to a CSV file ###
definitions_clean_df.to_csv('data/ATLAS/definitions.csv')

In [8]:
### Build one DataFrame per sheet, restricted to the columns listed above ###

population_df    = global_dict['Supplemental Data - County'].loc[:, POPULATION_LIST]
access_df        = global_dict['ACCESS'].loc[:, ACCESS_LIST]
store_df         = global_dict['STORES'].loc[:, STORES_LIST]
restaurants_df   = global_dict['RESTAURANTS'].loc[:, RESTAURANTS_LIST]
assistance_df    = global_dict['ASSISTANCE'].loc[:, ASSISTANCE_LIST]
insecurity_df    = global_dict['INSECURITY'].loc[:, INSECURITY_LIST]
local_df         = global_dict['LOCAL'].loc[:, LOCAL_LIST]
health_df        = global_dict['HEALTH'].loc[:, HEALTH_LIST]
socioeconomic_df = global_dict['SOCIOECONOMIC'].loc[:, SOCIOECONOMIC_LIST]

#access_df.head()


In [9]:
## Write each dataframe to its own CSV file ##
# to_csv is called with its defaults, so the row index is written as the
# first column of every file.

for frame, csv_path in (
    (population_df, 'data/ATLAS/population.csv'),
    (access_df, 'data/ATLAS/access.csv'),
    (store_df, 'data/ATLAS/stores.csv'),
    (restaurants_df, 'data/ATLAS/restaurants.csv'),
    (assistance_df, 'data/ATLAS/assistance.csv'),
    (insecurity_df, 'data/ATLAS/insecurity.csv'),
    (local_df, 'data/ATLAS/local.csv'),
    (health_df, 'data/ATLAS/health.csv'),
    (socioeconomic_df, 'data/ATLAS/socioeconomic.csv'),
):
    frame.to_csv(csv_path)

In [10]:
### Load the dataframes into our database ###
### Each dataframe is loaded into its own table ###

# Credentials must not be hardcoded in the notebook. They are read from the
# project's `config` module, the same source the later "LOAD DATAFRAME FROM
# AWS SERVER" cell uses for its connection.
# NOTE(review): assumes config.aws_endpoint is the RDS host that used to be
# written out here and config.aws_password is that database's password —
# confirm, and rotate the password that was previously committed in this cell.
import config

# db location string
db_string = f'postgresql://postgres:{config.aws_password}@{config.aws_endpoint}:5432/postgres'

# Initialize the database engine
engine = create_engine(db_string)


In [13]:
## Write every dataframe to its own database table ##
# if_exists='replace' drops and recreates a table that already exists;
# index=False keeps the DataFrame index out of the table.
# The last call is left as the final expression so the cell displays its
# return value.

population_df.to_sql(name='population', con=engine, index=False, if_exists='replace')
access_df.to_sql(name='access', con=engine, index=False, if_exists='replace')
store_df.to_sql(name='stores', con=engine, index=False, if_exists='replace')
restaurants_df.to_sql(name='restaurants', con=engine, index=False, if_exists='replace')
assistance_df.to_sql(name='assistance', con=engine, index=False, if_exists='replace')
insecurity_df.to_sql(name='insecurity', con=engine, index=False, if_exists='replace')
local_df.to_sql(name='local', con=engine, index=False, if_exists='replace')
health_df.to_sql(name='health', con=engine, index=False, if_exists='replace')
socioeconomic_df.to_sql(name='socioeconomic', con=engine, index=False, if_exists='replace')

143

In [14]:
### Dependencies and Setup ###
### LOAD DATAFRAME FROM AWS SERVER

import pandas as pd
import sqlalchemy as sql
import config

# Connection settings: host and password come from the config module,
# the user name is fixed.
username = 'postgres'
password = config.aws_password
endpoint = config.aws_endpoint

engine = sql.create_engine(
    f'postgresql://{username}:{password}@{endpoint}:5432/postgres'
)

# Pull the whole 'final_new' table into a dataframe and preview it.
df = pd.read_sql_table(table_name='final_new', con=engine)
df.head()

Unnamed: 0,FIPS,_Census_Population,Population_Estimate_,PCT_LACCESS_POP,PCT_LACCESS_LOWI,PCT_LACCESS_HHNV,PCT_LACCESS_SNAP,GROCPTH,SUPERCPTH,CONVSPTH,...,PCT_DIABETES_ADULTS,PCT_OBESE_ADULTS,PCT_HSPA,RECFACPTH,MEDHHINC,POVRATE,PERPOV,CHILDPOVRATE,PERCHLDPOV,METRO
0,2016.0,5561.0,5722.875,18.05166,8.053843,10.22845,2.947771,0.616488,0.0,0.265989,...,6.05,29.95,18.4,0.0,80695.0,8.5,0.0,8.5,0.0,0.0
1,2020.0,291826.0,297012.5,28.787531,4.842837,0.945721,1.321403,0.109486,0.037046,0.187002,...,7.0,29.95,18.4,0.133044,77791.0,8.7,0.0,12.4,0.0,1.0
2,2050.0,17013.0,17889.75,72.819369,40.278691,57.193042,28.741752,1.299912,0.0,0.311299,...,5.65,29.95,18.4,0.027809,44849.0,24.2,0.0,31.5,1.0,0.0
3,2070.0,4847.0,4969.125,41.676237,24.355533,30.724431,18.608623,1.008235,0.0,0.302584,...,7.5,29.95,18.4,0.0,50753.0,20.0,0.0,30.4,1.0,0.0
4,2090.0,97581.0,99700.25,35.796631,8.627468,2.707728,1.5845,0.07555,0.035165,0.231371,...,5.35,29.95,18.4,0.095306,70881.0,8.1,0.0,10.1,0.0,1.0


In [2]:
# #### READING DIRECTLY FROM LOCAL FILE
# #### Unnecessary if you are reading from AWS

# import pandas as pd

# ### Read in the data from CSV as dataframe

# df=pd.read_csv('FoodEnvironmentAtlas_proposed.csv')
# df=df.iloc[:, 1:]

# ### Drop the rows with missing values from the end of the file

# df=df.drop(index=[3142, 3143])

In [16]:

# Example of parsing a dataframe column name: drop every digit character.
# Stripping the year digits gives columns that measure the same thing in
# different years a common name, so they can be grouped and averaged into
# a single column.

s = 'Population_Estimate_2011'
result = ''.join(filter(lambda ch: not ch.isdigit(), s))
result

'Population_Estimate_'

In [17]:
# Group the dataframe's columns by their digit-stripped name.
# Key:   the output column name (the original name with all digits removed)
# Value: list of the original column names that reduce to that key
# Many features are reported for multiple years; grouping them this way
# lets a later step average the values from those years.

column_dict = {}
for each_column in df.columns:
    result = ''.join(ch for ch in each_column if not ch.isdigit())
    # setdefault creates the list the first time a key is seen.
    column_dict.setdefault(result, []).append(each_column)

In [18]:
# Display Column Dictionary
column_dict

{'FIPS': ['FIPS'],
 '_Census_Population': ['_Census_Population'],
 'Population_Estimate_': ['Population_Estimate_'],
 'PCT_LACCESS_POP': ['PCT_LACCESS_POP'],
 'PCT_LACCESS_LOWI': ['PCT_LACCESS_LOWI'],
 'PCT_LACCESS_HHNV': ['PCT_LACCESS_HHNV'],
 'PCT_LACCESS_SNAP': ['PCT_LACCESS_SNAP'],
 'GROCPTH': ['GROCPTH'],
 'SUPERCPTH': ['SUPERCPTH'],
 'CONVSPTH': ['CONVSPTH'],
 'SNAPSPTH': ['SNAPSPTH'],
 'WICSPTH': ['WICSPTH'],
 'FFRPTH': ['FFRPTH'],
 'FSRPTH': ['FSRPTH'],
 'PC_FFRSALES': ['PC_FFRSALES'],
 'PC_FSRSALES': ['PC_FSRSALES'],
 'REDEMP_SNAPS': ['REDEMP_SNAPS'],
 'PCT_SNAP': ['PCT_SNAP'],
 'PC_SNAPBEN': ['PC_SNAPBEN'],
 'SNAP_PART_RATE': ['SNAP_PART_RATE'],
 'PC_WIC_REDEMP': ['PC_WIC_REDEMP'],
 'REDEMP_WICS': ['REDEMP_WICS'],
 'PCT_WIC': ['PCT_WIC'],
 'PCT_WICINFANTCHILD': ['PCT_WICINFANTCHILD'],
 'FOOD_BANKS': ['FOOD_BANKS'],
 'FOODINSEC__': ['FOODINSEC__'],
 'VLFOODSEC__': ['VLFOODSEC__'],
 'FMRKTPTH': ['FMRKTPTH'],
 'PCT_DIABETES_ADULTS': ['PCT_DIABETES_ADULTS'],
 'PCT_OBESE_ADULTS': 

In [19]:
# Start from an empty dataframe; the averaged columns are added to it next.
new_df = pd.DataFrame(data=None)

In [20]:
# Walk the dictionary and, for each group, store the row-wise mean
# (axis=1) of the group's source columns under the group's name.

for group_name, source_columns in column_dict.items():
    new_df[group_name] = df[source_columns].mean(axis=1)
new_df

Unnamed: 0,FIPS,_Census_Population,Population_Estimate_,PCT_LACCESS_POP,PCT_LACCESS_LOWI,PCT_LACCESS_HHNV,PCT_LACCESS_SNAP,GROCPTH,SUPERCPTH,CONVSPTH,...,PCT_DIABETES_ADULTS,PCT_OBESE_ADULTS,PCT_HSPA,RECFACPTH,MEDHHINC,POVRATE,PERPOV,CHILDPOVRATE,PERCHLDPOV,METRO
0,2016.0,5561.0,5722.875,18.051660,8.053843,10.228450,2.947771,0.616488,0.000000,0.265989,...,6.05,29.95,18.4,0.000000,80695.0,8.5,0.0,8.5,0.0,0.0
1,2020.0,291826.0,297012.500,28.787531,4.842837,0.945721,1.321403,0.109486,0.037046,0.187002,...,7.00,29.95,18.4,0.133044,77791.0,8.7,0.0,12.4,0.0,1.0
2,2050.0,17013.0,17889.750,72.819369,40.278691,57.193042,28.741752,1.299912,0.000000,0.311299,...,5.65,29.95,18.4,0.027809,44849.0,24.2,0.0,31.5,1.0,0.0
3,2070.0,4847.0,4969.125,41.676237,24.355533,30.724431,18.608623,1.008235,0.000000,0.302584,...,7.50,29.95,18.4,0.000000,50753.0,20.0,0.0,30.4,1.0,0.0
4,2090.0,97581.0,99700.250,35.796631,8.627468,2.707728,1.584500,0.075550,0.035165,0.231371,...,5.35,29.95,18.4,0.095306,70881.0,8.1,0.0,10.1,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1506,55133.0,52410.0,51691.750,38.410518,5.121070,1.827341,1.728491,0.140655,0.016394,0.295264,...,7.55,30.85,24.7,0.168373,78689.0,4.7,0.0,5.1,0.0,1.0
1507,55135.0,24496.0,24223.250,13.591013,4.437709,2.327224,1.115719,0.193002,0.000000,0.578317,...,9.10,30.85,24.7,0.106410,54849.0,10.7,0.0,14.4,0.0,0.0
1508,55137.0,166994.0,169361.375,1.447737,0.404642,3.695338,0.189869,0.329035,0.000000,0.616277,...,9.60,30.85,24.7,0.081817,49912.0,11.8,0.0,20.2,0.0,0.0
1509,55139.0,74749.0,73657.625,25.686514,6.601172,1.604715,1.595892,0.083004,0.011868,0.349962,...,8.10,30.85,24.7,0.088902,52725.0,11.3,0.0,13.3,0.0,1.0


In [21]:
# Display the column labels of new_df, i.e. the names of the averaged columns.
new_df.columns

Index(['FIPS', '_Census_Population', 'Population_Estimate_', 'PCT_LACCESS_POP',
       'PCT_LACCESS_LOWI', 'PCT_LACCESS_HHNV', 'PCT_LACCESS_SNAP', 'GROCPTH',
       'SUPERCPTH', 'CONVSPTH', 'SNAPSPTH', 'WICSPTH', 'FFRPTH', 'FSRPTH',
       'PC_FFRSALES', 'PC_FSRSALES', 'REDEMP_SNAPS', 'PCT_SNAP', 'PC_SNAPBEN',
       'SNAP_PART_RATE', 'PC_WIC_REDEMP', 'REDEMP_WICS', 'PCT_WIC',
       'PCT_WICINFANTCHILD', 'FOOD_BANKS', 'FOODINSEC__', 'VLFOODSEC__',
       'FMRKTPTH', 'PCT_DIABETES_ADULTS', 'PCT_OBESE_ADULTS', 'PCT_HSPA',
       'RECFACPTH', 'MEDHHINC', 'POVRATE', 'PERPOV', 'CHILDPOVRATE',
       'PERCHLDPOV', 'METRO'],
      dtype='object')

In [22]:
# Set the index to the FIPS values
# Each FIPS value is a single county
# The guard makes the cell safe to re-run: once FIPS has become the index it
# is no longer a column, and calling set_index('FIPS') again would raise
# KeyError.

if new_df.index.name != 'FIPS':
    new_df = new_df.set_index('FIPS')
new_df

Unnamed: 0_level_0,_Census_Population,Population_Estimate_,PCT_LACCESS_POP,PCT_LACCESS_LOWI,PCT_LACCESS_HHNV,PCT_LACCESS_SNAP,GROCPTH,SUPERCPTH,CONVSPTH,SNAPSPTH,...,PCT_DIABETES_ADULTS,PCT_OBESE_ADULTS,PCT_HSPA,RECFACPTH,MEDHHINC,POVRATE,PERPOV,CHILDPOVRATE,PERCHLDPOV,METRO
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016.0,5561.0,5722.875,18.051660,8.053843,10.228450,2.947771,0.616488,0.000000,0.265989,0.820833,...,6.05,29.95,18.4,0.000000,80695.0,8.5,0.0,8.5,0.0,0.0
2020.0,291826.0,297012.500,28.787531,4.842837,0.945721,1.321403,0.109486,0.037046,0.187002,0.394099,...,7.00,29.95,18.4,0.133044,77791.0,8.7,0.0,12.4,0.0,1.0
2050.0,17013.0,17889.750,72.819369,40.278691,57.193042,28.741752,1.299912,0.000000,0.311299,3.025798,...,5.65,29.95,18.4,0.027809,44849.0,24.2,0.0,31.5,1.0,0.0
2070.0,4847.0,4969.125,41.676237,24.355533,30.724431,18.608623,1.008235,0.000000,0.302584,1.704951,...,7.50,29.95,18.4,0.000000,50753.0,20.0,0.0,30.4,1.0,0.0
2090.0,97581.0,99700.250,35.796631,8.627468,2.707728,1.584500,0.075550,0.035165,0.231371,0.253797,...,5.35,29.95,18.4,0.095306,70881.0,8.1,0.0,10.1,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55133.0,52410.0,51691.750,38.410518,5.121070,1.827341,1.728491,0.140655,0.016394,0.295264,0.423746,...,7.55,30.85,24.7,0.168373,78689.0,4.7,0.0,5.1,0.0,1.0
55135.0,24496.0,24223.250,13.591013,4.437709,2.327224,1.115719,0.193002,0.000000,0.578317,0.713940,...,9.10,30.85,24.7,0.106410,54849.0,10.7,0.0,14.4,0.0,0.0
55137.0,166994.0,169361.375,1.447737,0.404642,3.695338,0.189869,0.329035,0.000000,0.616277,0.663996,...,9.60,30.85,24.7,0.081817,49912.0,11.8,0.0,20.2,0.0,0.0
55139.0,74749.0,73657.625,25.686514,6.601172,1.604715,1.595892,0.083004,0.011868,0.349962,0.554311,...,8.10,30.85,24.7,0.088902,52725.0,11.3,0.0,13.3,0.0,1.0


In [23]:
# Print new_df's summary (index, per-column non-null counts and dtypes)
# to check for null values or unexpected datatypes.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1511 entries, 2016.0 to 55141.0
Data columns (total 37 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   _Census_Population    1511 non-null   float64
 1   Population_Estimate_  1511 non-null   float64
 2   PCT_LACCESS_POP       1511 non-null   float64
 3   PCT_LACCESS_LOWI      1511 non-null   float64
 4   PCT_LACCESS_HHNV      1511 non-null   float64
 5   PCT_LACCESS_SNAP      1511 non-null   float64
 6   GROCPTH               1511 non-null   float64
 7   SUPERCPTH             1511 non-null   float64
 8   CONVSPTH              1511 non-null   float64
 9   SNAPSPTH              1511 non-null   float64
 10  WICSPTH               1511 non-null   float64
 11  FFRPTH                1511 non-null   float64
 12  FSRPTH                1511 non-null   float64
 13  PC_FFRSALES           1511 non-null   float64
 14  PC_FSRSALES           1511 non-null   float64
 15  REDEMP_SNAP

In [26]:
## Remove every row that has at least one missing value before analysis
new_df = new_df.dropna(axis=0, how='any')

In [27]:
# Name of the column the model will predict.
# Alternative target: 'PCT_DIABETES_ADULTS'

target = 'PCT_OBESE_ADULTS'

In [28]:
# y is the target column; X is every remaining column except the two
# health-outcome columns, which are both removed whichever one is the target.

y = new_df[target]
X = new_df.drop(['PCT_OBESE_ADULTS', 'PCT_DIABETES_ADULTS'], axis=1)

In [29]:
# Show both shapes; X and y should have the same number of rows.
display(X.shape, y.shape)

(1511, 35)

(1511,)

In [30]:
# Machine Learning Dependencies and tools


import sklearn

# get Standard Scaler from sklearn
from sklearn.preprocessing import StandardScaler
# get Random Forest Regressor from sklearn
from sklearn.ensemble import RandomForestRegressor

# Get train_test_split from sklearn
from sklearn.model_selection import train_test_split

In [31]:
# Split our data into training and testing sets.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the split, and every score below, changes between runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
# Fit the Standard Scaler on the training features only, then apply that
# same fitted transform to both the training and the testing features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
# Fit our Random Forest Regressor on the scaled training data.
# random_state pins the forest's internal randomness so the fitted model,
# its scores and its feature importances are the same on every run.

rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train_scaled, y_train)

In [34]:
# Model score on the data it was trained on (for a regressor, .score is R^2).
rfr.score(X=X_train_scaled, y=y_train)

0.9995941252363685

In [35]:
# Model score on the held-out testing data (for a regressor, .score is R^2).
# NOTE(review): a test score very close to 1.0 is suspicious. In the rows
# displayed earlier, PCT_OBESE_ADULTS looks identical for every county that
# shares a state FIPS prefix, so the target may be a state-level value that
# other state-level columns give away — TODO confirm before trusting this.
rfr.score(X=X_test_scaled, y=y_test)

0.9998744169946757

In [36]:
# Pair each feature name with the importance the model assigned to it and
# list the pairs in ascending order, so the least important feature is first.
sorted(zip(rfr.feature_importances_, X_train.columns))

[(3.869604893344258e-16, 'PERCHLDPOV'),
 (5.369405171331259e-16, 'PERPOV'),
 (3.910511844075195e-07, 'METRO'),
 (4.65676573044436e-06, 'FSRPTH'),
 (6.722909697566451e-06, 'FOOD_BANKS'),
 (8.365653381932037e-06, 'RECFACPTH'),
 (9.988309270899156e-06, 'PCT_LACCESS_LOWI'),
 (1.2582858452319578e-05, 'PC_SNAPBEN'),
 (1.27265110817525e-05, 'POVRATE'),
 (1.451587066275793e-05, 'PCT_LACCESS_HHNV'),
 (1.813231104679228e-05, '_Census_Population'),
 (2.1750259413905315e-05, 'GROCPTH'),
 (2.1994885871199797e-05, 'FMRKTPTH'),
 (2.5635784460900007e-05, 'CHILDPOVRATE'),
 (2.6445538764714193e-05, 'REDEMP_SNAPS'),
 (2.8301243482664968e-05, 'SUPERCPTH'),
 (3.166583438734816e-05, 'Population_Estimate_'),
 (3.919283142244958e-05, 'CONVSPTH'),
 (4.677174287542692e-05, 'PCT_LACCESS_POP'),
 (5.728321240372208e-05, 'PCT_LACCESS_SNAP'),
 (7.559323299860456e-05, 'MEDHHINC'),
 (8.362587172730744e-05, 'SNAPSPTH'),
 (0.00010467751761951526, 'WICSPTH'),
 (0.0002057272834249819, 'FFRPTH'),
 (0.00029907023951887193, 