<a href="https://colab.research.google.com/github/SamuelMiller413/N1-Health/blob/main/N1_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set Up

#### Imports

In [None]:
import numpy as np 
import pandas as pd 

                                                                          # PRE-PROCESSING
from sklearn.preprocessing import StandardScaler
                                                                          # FEATURE SELECTION
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel, mutual_info_regression, RFE, RFECV

                                                                          # PIPELINE
from sklearn.pipeline import Pipeline
                                                                          # REGRESSORS       
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
                                          
                                                                          # NEURAL NETWORK
from sklearn.neural_network import MLPRegressor

                                                                          # CROSS VALIDATION
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, StratifiedKFold
from sklearn.model_selection import learning_curve, cross_val_predict
from sklearn.model_selection import KFold, RandomizedSearchCV

                                                                          # EVALUATION
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import SCORERS

                                                                          # PLOTTING
import random
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

                                                                          # DATA SET



#### Drive Mount

In [None]:
# mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Data Load

##### FDA

The Food Atlas data span multiple CSV's:
* primary dataset
* variable list
* 2 supplemental datasets 
    * county
    * state
<br/>  

I'll be supplementing the primary
dataset with the county dataset and using the variable list for reference while merging and parsing the dataframes.

In [None]:
# path to primary csv
fda = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets//USDA ERS/StateAndCountyData.csv'

# path to variable list
fda_vars = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets/USDA ERS/VariableList.csv'

# supplemental FDA paths
fda_county = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets/USDA ERS/SupplementalDataCounty.csv'
# fda_state = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets/USDA ERS/SupplementalDataState.csv'

# read to df
df_fda_full = pd.read_csv(fda)
df_fda_vars = pd.read_csv(fda_vars)
df_fda_county = pd.read_csv(fda_county)
# df_fda_state = pd.read_csv(fda_state)

##### CDC

In [None]:
## path
# cdc = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets/CDC/500_Cities__Census_Tract-level_Data__GIS_Friendly_Format___2019_release.csv'

## read to df
# df_cdc = pd.read_csv(cdc)

##### CMS

In [None]:
# path
# cms = '/content/drive/MyDrive/N1 Health/N1 Health/Datasets/CMS/Geographic Variation Public Use File State County.csv'

# read to df
# df_cms = pd.read_csv(cms)

### Initial EDA

#### df_fda_vars

In [None]:
df_fda_vars

Unnamed: 0,Variable_Name,Category_Name,Category_Code,Subcategory_Name,Variable_Code,Geography,Units
0,"Population, low access to store, 2010",Access and Proximity to Grocery Store,ACCESS,Overall,LACCESS_POP10,CNTY10,Count
1,"Population, low access to store, 2015",Access and Proximity to Grocery Store,ACCESS,Overall,LACCESS_POP15,CNTY10,Count
2,"Population, low access to store (% change), 20...",Access and Proximity to Grocery Store,ACCESS,Overall,PCH_LACCESS_POP_10_15,CNTY10,% change
3,"Population, low access to store (%), 2010",Access and Proximity to Grocery Store,ACCESS,Overall,PCT_LACCESS_POP10,CNTY10,Percent
4,"Population, low access to store (%), 2015",Access and Proximity to Grocery Store,ACCESS,Overall,PCT_LACCESS_POP15,CNTY10,Percent
...,...,...,...,...,...,...,...
276,"Persistent-poverty counties, 2010",Socioeconomic Characteristics,SOCIOECONOMIC,Income Level,PERPOV10,CNTY10,Legend
277,"Child poverty rate, 2015",Socioeconomic Characteristics,SOCIOECONOMIC,Income Level,CHILDPOVRATE15,CNTY10,Percent
278,"Persistent-child-poverty counties, 2010",Socioeconomic Characteristics,SOCIOECONOMIC,Income Level,PERCHLDPOV10,CNTY10,Legend
279,"Metro/nonmetro counties, 2010",Socioeconomic Characteristics,SOCIOECONOMIC,Other,METRO13,CNTY10,Legend


In [None]:
df_fda_vars.Category_Code.unique().tolist()

['ACCESS',
 'STORES',
 'RESTAURANTS',
 'ASSISTANCE',
 'INSECURITY',
 'PRICES_TAXES',
 'LOCAL',
 'HEALTH',
 'SOCIOECONOMIC']

In [None]:
def cat_var(df):
    '''
    Creates adictionary of variables for each category
    in the variable list df.
    '''
    cat_list = df.Category_Code.unique().tolist()
    all_cats = {}
    for i in range(len(cat_list)):
        all_cats[cat_list[i]] = df[df.Category_Code == cat_list[i]].Variable_Code.tolist()
    for k,v in all_cats.items():
        print(f'{k}:')
        print(v)
        print('')
    return all_cats




In [None]:
all_cats = cat_var(df_fda_vars)

ACCESS:
['LACCESS_POP10', 'LACCESS_POP15', 'PCH_LACCESS_POP_10_15', 'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15', 'LACCESS_LOWI10', 'LACCESS_LOWI15', 'PCH_LACCESS_LOWI_10_15', 'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15', 'LACCESS_HHNV10', 'LACCESS_HHNV15', 'PCH_LACCESS_HHNV_10_15', 'PCT_LACCESS_HHNV10', 'PCT_LACCESS_HHNV15', 'LACCESS_SNAP15', 'PCT_LACCESS_SNAP15', 'LACCESS_CHILD10', 'LACCESS_CHILD15', 'PCH_LACCESS_CHILD_10_15', 'PCT_LACCESS_CHILD10', 'PCT_LACCESS_CHILD15', 'LACCESS_SENIORS10', 'LACCESS_SENIORS15', 'PCH_LACCESS_SENIORS_10_15', 'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15', 'LACCESS_WHITE15', 'PCT_LACCESS_WHITE15', 'LACCESS_BLACK15', 'PCT_LACCESS_BLACK15', 'LACCESS_HISP15', 'PCT_LACCESS_HISP15', 'LACCESS_NHASIAN15', 'PCT_LACCESS_NHASIAN15', 'LACCESS_NHNA15', 'PCT_LACCESS_NHNA15', 'LACCESS_NHPI15', 'PCT_LACCESS_NHPI15', 'LACCESS_MULTIR15', 'PCT_LACCESS_MULTIR15']

STORES:
['GROC11', 'GROC16', 'PCH_GROC_11_16', 'GROCPTH11', 'GROCPTH16', 'PCH_GROCPTH_11_16', 'SUPERC11', 'SUPE

In [None]:
print(all_cats.keys())

dict_keys(['ACCESS', 'STORES', 'RESTAURANTS', 'ASSISTANCE', 'INSECURITY', 'PRICES_TAXES', 'LOCAL', 'HEALTH', 'SOCIOECONOMIC'])


In [53]:
# select categories relevant to the project
cat_select = ['ACCESS','ASSISTANCE', 'INSECURITY','HEALTH', 'SOCIOECONOMIC']
print(cat_select)

['ACCESS', 'ASSISTANCE', 'INSECURITY', 'HEALTH', 'SOCIOECONOMIC']


#### df_fda_county

In [None]:
df_fda_county

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga County,2010_Census_Population,54571
1,1001,AL,Autauga County,Population_Estimate_2011,55208
2,1001,AL,Autauga County,Population_Estimate_2012,54936
3,1001,AL,Autauga County,Population_Estimate_2013,54713
4,1001,AL,Autauga County,Population_Estimate_2014,54876
...,...,...,...,...,...
28273,56045,WY,Weston County,Population_Estimate_2014,7138
28274,56045,WY,Weston County,Population_Estimate_2015,7197
28275,56045,WY,Weston County,Population_Estimate_2016,7213
28276,56045,WY,Weston County,Population_Estimate_2017,6986


In [None]:
# shape
print(f'Shape of df_fda_county: {df_fda_county.shape}')

print('')

# columns
print(f'Columns in df_fda_county: {df_fda_county.columns}')

Shape of df_fda_county: (28278, 5)

Columns in df_fda_county: Index(['FIPS', 'State', 'County', 'Variable_Code', 'Value'], dtype='object')


In [None]:
# display
df_fda_county.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga County,2010_Census_Population,54571
1,1001,AL,Autauga County,Population_Estimate_2011,55208
2,1001,AL,Autauga County,Population_Estimate_2012,54936
3,1001,AL,Autauga County,Population_Estimate_2013,54713
4,1001,AL,Autauga County,Population_Estimate_2014,54876


In [None]:
# pivot
df_fda_county = df_fda_county.pivot_table(values='Value', index=['FIPS', 'State', 'County'], columns='Variable_Code')
df_fda_county.reset_index(inplace=True)

In [None]:
# display
df_fda_county.head()

Variable_Code,FIPS,State,County,2010_Census_Population,Population_Estimate_2011,Population_Estimate_2012,Population_Estimate_2013,Population_Estimate_2014,Population_Estimate_2015,Population_Estimate_2016,Population_Estimate_2017,Population_Estimate_2018
0,1001,AL,Autauga County,54571,55208,54936,54713,54876,54838,55242,55443,55601
1,1003,AL,Baldwin County,182265,186540,190143,194886,199189,202995,207712,212619,218022
2,1005,AL,Barbour County,27457,27350,27174,26944,26758,26294,25819,25158,24881
3,1007,AL,Bibb County,22915,22747,22664,22516,22541,22562,22576,22555,22400
4,1009,AL,Blount County,57322,57554,57570,57611,57521,57522,57517,57827,57840


In [None]:
# shape
print(f'Shape of df_fda_county: {df_fda_county.shape}')

print('')

# columns
print(f'Columns in df_fda_county: {df_fda_county.columns}')

Shape of df_fda_county: (3142, 12)

Columns in df_fda_county: Index(['FIPS', 'State', 'County', '2010_Census_Population',
       'Population_Estimate_2011', 'Population_Estimate_2012',
       'Population_Estimate_2013', 'Population_Estimate_2014',
       'Population_Estimate_2015', 'Population_Estimate_2016',
       'Population_Estimate_2017', 'Population_Estimate_2018'],
      dtype='object', name='Variable_Code')


In [None]:
# info
df_fda_county.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3142 entries, 0 to 3141
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   FIPS                      3142 non-null   int64 
 1   State                     3142 non-null   object
 2   County                    3142 non-null   object
 3   2010_Census_Population    3142 non-null   int64 
 4   Population_Estimate_2011  3142 non-null   int64 
 5   Population_Estimate_2012  3142 non-null   int64 
 6   Population_Estimate_2013  3142 non-null   int64 
 7   Population_Estimate_2014  3142 non-null   int64 
 8   Population_Estimate_2015  3142 non-null   int64 
 9   Population_Estimate_2016  3142 non-null   int64 
 10  Population_Estimate_2017  3142 non-null   int64 
 11  Population_Estimate_2018  3142 non-null   int64 
dtypes: int64(10), object(2)
memory usage: 294.7+ KB


In [None]:
# describe
df_fda_county.describe()

Variable_Code,FIPS,2010_Census_Population,Population_Estimate_2011,Population_Estimate_2012,Population_Estimate_2013,Population_Estimate_2014,Population_Estimate_2015,Population_Estimate_2016,Population_Estimate_2017,Population_Estimate_2018
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0
mean,30383.649268,98262.04,99166.14,99896.31,100591.3,101332.4,102082.3,102823.5,103484.1,104127.1
std,15162.508374,312946.7,316395.4,319509.6,322424.7,325335.0,328194.3,330539.5,332099.3,333486.3
min,1001.0,82.0,90.0,86.0,89.0,89.0,88.0,88.0,88.0,88.0
25%,18177.5,11114.5,11087.0,11005.25,11017.0,10985.5,10948.0,10935.25,10889.0,10926.5
50%,29176.0,25872.0,25794.5,25788.5,25710.5,25642.0,25632.5,25716.5,25784.0,25758.5
75%,45080.5,66780.0,66992.25,67112.0,67408.75,67607.0,67488.25,67306.25,67815.75,67820.5
max,56045.0,9818605.0,9876727.0,9938436.0,9998105.0,10048410.0,10097040.0,10120540.0,10118760.0,10105520.0


In [None]:
df_fda_county.isnull().sum()

Variable_Code
FIPS                        0
State                       0
County                      0
2010_Census_Population      0
Population_Estimate_2011    0
Population_Estimate_2012    0
Population_Estimate_2013    0
Population_Estimate_2014    0
Population_Estimate_2015    0
Population_Estimate_2016    0
Population_Estimate_2017    0
Population_Estimate_2018    0
dtype: int64

In [None]:
df_fda_county

Variable_Code,FIPS,State,County,2010_Census_Population,Population_Estimate_2011,Population_Estimate_2012,Population_Estimate_2013,Population_Estimate_2014,Population_Estimate_2015,Population_Estimate_2016,Population_Estimate_2017,Population_Estimate_2018
0,1001,AL,Autauga County,54571,55208,54936,54713,54876,54838,55242,55443,55601
1,1003,AL,Baldwin County,182265,186540,190143,194886,199189,202995,207712,212619,218022
2,1005,AL,Barbour County,27457,27350,27174,26944,26758,26294,25819,25158,24881
3,1007,AL,Bibb County,22915,22747,22664,22516,22541,22562,22576,22555,22400
4,1009,AL,Blount County,57322,57554,57570,57611,57521,57522,57517,57827,57840
...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,WY,Sweetwater County,43806,43981,45005,45162,44957,44754,44275,43547,43051
3138,56039,WY,Teton County,21294,21414,21625,22318,22777,23016,23161,23261,23081
3139,56041,WY,Uinta County,21118,20893,20994,20953,20827,20770,20691,20456,20299
3140,56043,WY,Washakie County,8533,8448,8408,8414,8275,8280,8168,8035,7885


In [None]:
col_fda_county = df_fda_county.columns.tolist()
for col in col_fda_county:
    print(col)

FIPS
State
County
2010_Census_Population
Population_Estimate_2011
Population_Estimate_2012
Population_Estimate_2013
Population_Estimate_2014
Population_Estimate_2015
Population_Estimate_2016
Population_Estimate_2017
Population_Estimate_2018


#### df_fda_full

In [None]:
# shape
print(f'Shape of df_fda_full: {df_fda_full.shape}')

print('')

# columns
print(f'Columns in df_fda_full: {df_fda.columns}')

Shape of df_fda: (852810, 5)

Columns in df_fda: Index(['FIPS', 'State', 'County', 'Variable_Code', 'Value'], dtype='object')


In [None]:
# display
df_fda_full.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,18428.43969
1,1001,AL,Autauga,LACCESS_POP15,17496.69304
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026
3,1001,AL,Autauga,PCT_LACCESS_POP10,33.769657
4,1001,AL,Autauga,PCT_LACCESS_POP15,32.062255


In [None]:
# pivot
df_fda_full = df_fda_full.pivot_table(values='Value', index=['FIPS', 'State', 'County'], columns='Variable_Code')
df_fda_full.reset_index(inplace=True)

In [None]:
# display
df_fda_full.head()

Variable_Code,FIPS,State,County,2010_Census_Population,AGRITRSM_OPS07,AGRITRSM_OPS12,AGRITRSM_RCT07,AGRITRSM_RCT12,BERRY_ACRES07,BERRY_ACRES12,...,WICS16,WICSPTH11,WICSPTH16,WIC_PART_2012,WIC_PART_2013,WIC_PART_2014,WIC_PART_2015,WIC_PART_2016,WIC_PART_2017,WIC_PART_2018
0,1,AL,Total,,,,,,,,...,,,,141899.8,139000.5,131046.2,132132.8,129159.9,123992.6,120604.8
1,2,AK,Total,,,,,,,,...,,,,24969.33,23054.67,19604.67,19682.0,19120.92,18188.17,17092.42
2,4,AZ,Total,,,,,,,,...,,,,193214.1,182500.8,173020.2,167072.2,163997.8,153510.1,149513.3
3,5,AR,Total,,,,,,,,...,,,,94292.75,89776.83,83288.58,84219.58,80554.67,76518.5,73606.75
4,6,CA,Total,,,,,,,,...,,,,1472468.0,1431881.0,1348939.0,1265005.0,1174875.0,1080241.0,1009492.0


In [None]:
# shape
print(f'Shape of df_fda_full: {df_fda_full.shape}')

print('')

# columns
print(f'Columns in df_fda_full: {df_fda_full.columns}')

Shape of df_fda: (6336, 335)

Columns in df_fda: Index(['FIPS', 'State', 'County', '2010_Census_Population', 'AGRITRSM_OPS07',
       'AGRITRSM_OPS12', 'AGRITRSM_RCT07', 'AGRITRSM_RCT12', 'BERRY_ACRES07',
       'BERRY_ACRES12',
       ...
       'WICS16', 'WICSPTH11', 'WICSPTH16', 'WIC_PART_2012', 'WIC_PART_2013',
       'WIC_PART_2014', 'WIC_PART_2015', 'WIC_PART_2016', 'WIC_PART_2017',
       'WIC_PART_2018'],
      dtype='object', name='Variable_Code', length=335)


In [None]:
# info
df_fda_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6336 entries, 0 to 6335
Columns: 335 entries, FIPS to WIC_PART_2018
dtypes: float64(332), int64(1), object(2)
memory usage: 16.2+ MB


In [None]:
# describe
df_fda_full.describe()

Variable_Code,FIPS,2010_Census_Population,AGRITRSM_OPS07,AGRITRSM_OPS12,AGRITRSM_RCT07,AGRITRSM_RCT12,BERRY_ACRES07,BERRY_ACRES12,BERRY_ACRESPTH07,BERRY_ACRESPTH12,...,WICS16,WICSPTH11,WICSPTH16,WIC_PART_2012,WIC_PART_2013,WIC_PART_2014,WIC_PART_2015,WIC_PART_2016,WIC_PART_2017,WIC_PART_2018
count,6336.0,3142.0,3080.0,3080.0,1974.0,2119.0,2305.0,2334.0,2305.0,2333.0,...,2985.0,3008.0,2982.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,30142.671559,98262.04,7.579545,10.764935,210831.3,276643.7,108.856833,120.325621,1.728031,1.972856,...,14.577889,0.24519,0.21724,170407.3,165720.4,158089.6,153672.7,147452.4,139746.0,132147.7
std,15342.995807,312946.7,10.407053,15.438169,542363.4,775018.2,924.730823,975.864547,21.152581,22.98552,...,48.084899,0.211686,0.191315,246827.1,241114.1,228757.5,218048.8,206549.3,193299.8,179797.6
min,1.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.004029,0.004938,12601.58,11980.17,11628.92,10995.42,10782.83,10158.58,9062.0
25%,18139.0,11114.5,2.0,3.0,7000.0,13000.0,0.0,0.0,0.0,0.0,...,2.0,0.128155,0.114667,43239.08,41721.46,40317.25,39053.63,38567.21,36983.21,34564.5
50%,29151.0,25872.0,5.0,6.0,42000.0,65000.0,5.0,6.0,0.101541,0.132127,...,5.0,0.18686,0.165353,118584.9,113762.8,108900.5,105503.6,100151.9,95958.17,92152.5
75%,45069.0,66780.0,9.0,13.0,168750.0,233000.0,22.0,24.0,0.400727,0.466003,...,10.0,0.27721,0.247699,168306.8,163698.5,159186.1,158074.5,151595.5,145955.8,142649.0
max,56045.0,9818605.0,161.0,236.0,8464000.0,23723000.0,28904.0,29218.0,864.767832,898.490114,...,1188.0,3.04878,3.262643,1472468.0,1431881.0,1348939.0,1265005.0,1174875.0,1080241.0,1009492.0


In [None]:
df_fda_full.isnull().sum()

Variable_Code
FIPS                         0
State                        0
County                       0
2010_Census_Population    3194
AGRITRSM_OPS07            3256
                          ... 
WIC_PART_2014             6285
WIC_PART_2015             6285
WIC_PART_2016             6285
WIC_PART_2017             6285
WIC_PART_2018             6285
Length: 335, dtype: int64

In [50]:
df_fda_full

Variable_Code,FIPS,State,County,2010_Census_Population,AGRITRSM_OPS07,AGRITRSM_OPS12,AGRITRSM_RCT07,AGRITRSM_RCT12,BERRY_ACRES07,BERRY_ACRES12,...,WICS16,WICSPTH11,WICSPTH16,WIC_PART_2012,WIC_PART_2013,WIC_PART_2014,WIC_PART_2015,WIC_PART_2016,WIC_PART_2017,WIC_PART_2018
0,1001.0,AL,Autauga County,54571.0,,,,,,,...,,,,1.418998e+05,1.390005e+05,1.310462e+05,1.321328e+05,1.291599e+05,1.239926e+05,1.206048e+05
1,1003.0,AL,Baldwin County,182265.0,,,,,,,...,,,,2.496933e+04,2.305467e+04,1.960467e+04,1.968200e+04,1.912092e+04,1.818817e+04,1.709242e+04
2,1005.0,AL,Barbour County,27457.0,,,,,,,...,,,,1.932141e+05,1.825008e+05,1.730202e+05,1.670722e+05,1.639978e+05,1.535101e+05,1.495133e+05
3,1007.0,AL,Bibb County,22915.0,,,,,,,...,,,,9.429275e+04,8.977683e+04,8.328858e+04,8.421958e+04,8.055467e+04,7.651850e+04,7.360675e+04
4,1009.0,AL,Blount County,57322.0,,,,,,,...,,,,1.472468e+06,1.431881e+06,1.348939e+06,1.265005e+06,1.174875e+06,1.080241e+06,1.009492e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6331,56041.0,WY,Uinta,,5.0,9.0,105000.0,,0.0,0.0,...,3.0,0.143589,0.144991,,,,,,,
6332,56043.0,WY,Washakie County,8533.0,,,,,,,...,,,,,,,,,,
6333,56043.0,WY,Washakie,,8.0,6.0,70000.0,62000.0,0.0,,...,2.0,0.236742,0.244858,,,,,,,
6334,56045.0,WY,Weston County,7208.0,,,,,,,...,,,,,,,,,,


In [None]:
col_fda = df_fda_full.columns.tolist()
for col in col_fda:
    print(col)

In [None]:
df_fda_full.update(df_fda_county)

In [51]:
6336-3256 

3080

In [None]:
count = 0
for col in df_fda_full.columns:
    if df_fda_full[col].isnull().sum() >= 3256:
        print(col)
        count +=1
    else:
        pass
        # print('not working')
print(count)
# all_cats[cat_list[i]] = df[df.Category_Code == cat_list[i]].Variable_Code.tolist()

#### df_fda

Here I'll create a new working dataframe, 'df_fda', which will feature only the relevant categories saved in 'cat_select'.

In [52]:
cat_select

['ACCESS', 'ASSISTANCE', 'INSECURITY', 'HEALTH', 'SOCIOECONOMIC']