# US Poverty data by county for 2019
<hr>

In [2]:
import pandas as pd
import os
import re


# Data

### US Poverty data by county for 2019

Economic Research Service  
U.S. Department of Agriculture  
link: https://data.ers.usda.gov/reports.aspx?ID=17826

| Column               | Description                                                          |
| -------------------- | -------------------------------------------------------------------- |
| fips                 | FIPS code                                                            |
| county               | County within state                                                  |
| state                | State                                                                |
| ruc_code             | Rural-Urban Continuum (RUC) code                                     |
| total_all_people     | Total, all people in poverty (2019)                                  |
| total_all_people_min | 90% confidence interval of estimate, lower bound, all people         |
| total_all_people_max | 90% confidence interval of estimate, upper bound, all people         |
| under18              | Total, children ages 0-17 in poverty (2019)                          |
| under18_min          | 90% confidence interval of estimate, lower bound, children ages 0-17 |
| under18_max          | 90% confidence interval of estimate, upper bound, children ages 0-17 |
| type                 | Row type: `percent` or `numbers` (count)                             |

## Constants
<hr>

In [3]:
# Names expected to appear in the data: the 50 states plus
# District of Columbia and Puerto Rico (52 entries in total).
stats_master_list = [
    "Vermont", "Mississippi", "Maine", "Montana",
    "Washington", "District of Columbia", "Texas", "Alabama",
    "Michigan", "Maryland", "Rhode Island", "South Dakota",
    "Nebraska", "Virginia", "Florida", "Utah",
    "Louisiana", "Missouri", "Massachusetts", "South Carolina",
    "Pennsylvania", "Tennessee", "Minnesota", "Idaho",
    "Alaska", "Oklahoma", "North Dakota", "Arkansas",
    "Georgia", "New Hampshire", "Indiana", "Puerto Rico",
    "New Jersey", "Delaware", "West Virginia", "Colorado",
    "New York", "Kansas", "Arizona", "Ohio",
    "Hawaii", "Illinois", "Oregon", "North Carolina",
    "California", "Kentucky", "Wyoming", "Iowa",
    "Nevada", "Connecticut", "Wisconsin", "New Mexico",
]

In [9]:
# Column names applied to every raw report when it is read, in file order.
columns = [
    # report label columns (removed again during clean-up)
    "Textbox105",
    "Textbox106",
    # county identification
    "fips",
    "county",
    "ruc_code",
    # all people: estimate and its lower / upper bound
    "total_all_people",
    "total_all_people_min",
    "total_all_people_max",
    # children ages 0-17: estimate and its lower / upper bound
    "under18",
    "under18_min",
    "under18_max",
    # filled in while the master DataFrame is built
    "type",
    "state",
]

## File management
<hr>

In [10]:
#os.chdir('../codeathon/data-overlook/USDA_gov-Poverty/')

In [19]:
# List every entry in the raw USDA poverty report folder.
# NOTE: os.listdir returns names in arbitrary order, so anything built by
# iterating over `files` follows whatever order the filesystem reports.
files = os.listdir("../data_raw/USDA_gov-Poverty/")

In [20]:
# Keep only the CSV reports. Filtering by extension drops the macOS
# ".DS_Store" file (and any other stray entry) without raising ValueError
# when that file is not present, e.g. on a non-mac machine or on re-run.
files = [name for name in files if name.lower().endswith(".csv")]

In [21]:
files

['PovertyReport_Mississippi.csv',
 'PovertyReport_Massachusetts.csv',
 'PovertyReport_Florida.csv',
 'PovertyReport_Michigan.csv',
 'PovertyReport_Rhode_Island.csv',
 'PovertyReport_Idaho.csv',
 'PovertyReport_Oregon.csv',
 'PovertyReport_Kansas.csv',
 'PovertyReport_Wyoming.csv',
 'PovertyReport_Georgia.csv',
 'PovertyReport_Nevada.csv',
 'PovertyReport_Maryland.csv',
 'PovertyReport_Alaska.csv',
 'PovertyReport_Ohio.csv',
 'PovertyReport_Wisconsin.csv',
 'PovertyReport_District_of_Columbia.csv',
 'PovertyReport_North_Carolina.csv',
 'PovertyReport_Maine.csv',
 'PovertyReport_Minnesota.csv',
 'PovertyReport_Nebraska.csv',
 'PovertyReport_Indiana.csv',
 'PovertyReport_Arkansas.csv',
 'PovertyReport_Tennessee.csv',
 'PovertyReport_North_Dakota.csv',
 'PovertyReport_West_Virginia.csv',
 'PovertyReport_Montana.csv',
 'PovertyReport_New_Hampshire.csv',
 'PovertyReport_Virginia.csv',
 'PovertyReport_Oklahoma.csv',
 'PovertyReport_Utah.csv',
 'PovertyReport_Colorado.csv',
 'PovertyReport_Sou

# Create master DataFrame
<hr>

In [23]:
# Build the master DataFrame from the per-state report files.
#
# Each report is read with the shared `columns` names. Rows whose first cell
# contains "Textbox" are header rows that separate the tables stacked inside
# one file: the rows between the first and second header are labelled
# 'percent', the rows between the second and third header are labelled
# 'numbers'. The state name is read from the row right after the first
# header, in the 'county' column (position 3).

data_dir = '../data_raw/USDA_gov-Poverty/'

# Collect the pieces and concatenate once at the end; calling pd.concat
# inside the loop re-copies the whole accumulated frame on every file.
frames = []

for file in files:
    # read csv to dataframe
    df = pd.read_csv(os.path.join(data_dir, file), names=columns)

    # locate the header rows that break the file into tables;
    # na=False makes empty cells count as "no match"
    index_s = df[df['Textbox105'].str.contains('Textbox', na=False)].index
    if len(index_s) < 3:
        # the slicing below needs three header rows; fail with the file name
        # rather than an anonymous IndexError
        raise ValueError(
            f'{file}: expected at least 3 header rows, found {len(index_s)}'
        )

    # find the name of the state
    state_name = df.iloc[index_s[0] + 1, 3]
    print(f' File name: {file} for the state {state_name} has {len(index_s)} inside.')

    # create dataframe with percent
    percent_df = df.iloc[index_s[0] + 2:index_s[1], :].copy()
    percent_df['type'] = 'percent'
    percent_df['state'] = state_name

    # create dataframe with numbers
    numbers_df = df.iloc[index_s[1] + 2:index_s[2], :].copy()
    numbers_df['type'] = 'numbers'
    numbers_df['state'] = state_name

    frames.extend([percent_df, numbers_df])

# create master file; fall back to an empty frame with the expected columns
# when there were no input files (pd.concat rejects an empty list)
master_df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    

 File name: PovertyReport_Mississippi.csv for the state Mississippi has 4 inside.
 File name: PovertyReport_Massachusetts.csv for the state Massachusetts has 4 inside.
 File name: PovertyReport_Florida.csv for the state Florida has 4 inside.
 File name: PovertyReport_Michigan.csv for the state Michigan has 4 inside.
 File name: PovertyReport_Rhode_Island.csv for the state Rhode Island has 4 inside.
 File name: PovertyReport_Idaho.csv for the state Idaho has 4 inside.
 File name: PovertyReport_Oregon.csv for the state Oregon has 4 inside.
 File name: PovertyReport_Kansas.csv for the state Kansas has 4 inside.
 File name: PovertyReport_Wyoming.csv for the state Wyoming has 4 inside.
 File name: PovertyReport_Georgia.csv for the state Georgia has 4 inside.
 File name: PovertyReport_Nevada.csv for the state Nevada has 4 inside.
 File name: PovertyReport_Maryland.csv for the state Maryland has 4 inside.
 File name: PovertyReport_Alaska.csv for the state Alaska has 4 inside.
 File name: Pove

In [24]:
master_df.shape

(6284, 13)

In [26]:
master_df.head(3)

Unnamed: 0,Textbox105,Textbox106,fips,county,ruc_code,total_all_people,total_all_people_min,total_all_people_max,under18,under18_min,under18_max,type,state
139,All people in poverty (2019),Children ages 0-17 in poverty (2019),28001,Adams,5,27.9,22.2,33.6,40.8,30.3,51.3,percent,Mississippi
140,All people in poverty (2019),Children ages 0-17 in poverty (2019),28003,Alcorn,7,17.3,13.6,21.0,23.7,16.5,30.9,percent,Mississippi
141,All people in poverty (2019),Children ages 0-17 in poverty (2019),28005,Amite,8,20.9,16.4,25.4,29.1,20.3,37.9,percent,Mississippi


## Clean up DataFrame
<hr>

In [28]:
# Clean up master_df: drop the two report-label columns, which only repeat
# the table captions on every row.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
master_df.drop(columns=['Textbox105', 'Textbox106'], inplace=True, errors='ignore')
master_df

Unnamed: 0,fips,county,ruc_code,total_all_people,total_all_people_min,total_all_people_max,under18,under18_min,under18_max,type,state
139,28001,Adams,5,27.9,22.2,33.6,40.8,30.3,51.3,percent,Mississippi
140,28003,Alcorn,7,17.3,13.6,21.0,23.7,16.5,30.9,percent,Mississippi
141,28005,Amite,8,20.9,16.4,25.4,29.1,20.3,37.9,percent,Mississippi
142,28007,Attala,6,24.1,19.3,28.9,35.0,25.9,44.1,percent,Mississippi
143,28009,Benton,1,20.7,15.4,26.0,31.1,20.9,41.3,percent,Mississippi
...,...,...,...,...,...,...,...,...,...,...,...
78,09007,Middlesex,1,11024,9322,12726,1996,1466,2526,numbers,Connecticut
79,09009,New Haven,2,99423,91433,107413,31191,27785,34597,numbers,Connecticut
80,09011,New London,2,19739,16173,23305,5774,4340,7208,numbers,Connecticut
81,09013,Tolland,1,10559,8739,12379,1614,1130,2098,numbers,Connecticut


# Sanity Check
<hr>

In [29]:
# Reduce stats_master_list to the expected names that never show up in the
# data. Filtering in place (instead of calling .remove() once per state)
# gives the same result but does not raise ValueError when the cell is run a
# second time or when the data contains a state missing from the list.
states_in_data = set(master_df['state'].unique())
stats_master_list[:] = [
    state for state in stats_master_list if state not in states_in_data
]

In [30]:
# items missing from dataset
stats_master_list

['Puerto Rico']

In [23]:
# number of distinct states included in the data set
master_df['state'].nunique()

51

# Write to CSV
<hr>

In [32]:
# Save the combined data; index=False leaves out the row index, which only
# holds each row's position inside its original report file.
# NOTE(review): fips values carry leading zeros (e.g. 09007) - presumably the
# file must be read back with fips as a string to keep them; confirm in readers.
master_df.to_csv('../data/USDA/USDA_poverty_2019.csv', index=False)

<br>

# EDA
<hr>

In [31]:
master_df.shape

(6284, 11)

In [30]:
master_df.head()

Unnamed: 0,fips,county,ruc_code,total_all_people,total_all_people_min,total_all_people_max,under18,under18_min,under18_max,type,state
139,28001,Adams,5,27.9,22.2,33.6,40.8,30.3,51.3,percent,Mississippi
140,28003,Alcorn,7,17.3,13.6,21.0,23.7,16.5,30.9,percent,Mississippi
141,28005,Amite,8,20.9,16.4,25.4,29.1,20.3,37.9,percent,Mississippi
142,28007,Attala,6,24.1,19.3,28.9,35.0,25.9,44.1,percent,Mississippi
143,28009,Benton,1,20.7,15.4,26.0,31.1,20.9,41.3,percent,Mississippi
