In [400]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup as BS
import requests
import re
import json
import pprint 
import calendar
import datetime

%matplotlib inline

#### Households that are single parent (count) by race/ethnicity 2009-2017
https://data.diversitydatakids.org/dataset/11001_2_c-households-that-are-single-parent--count--by-race-ethnicity

The number of households with a single parent (male householder and no wife present or female householder and no husband present), for the total population and by race/ethnicity.

By using this data, you agree to abide by diversitydatakids.org's Terms and Conditions.
Suggested citation: diversitydatakids.org. 2021. “Households that are single parent (count) by race/ethnicity”, retrieved from https://data.diversitydatakids.org/dataset/11001_2_c-households-that-are-single-parent--count--by-race-ethnicity?_external=True on May 02 2023, calculated from American Community Survey Summary Files

#### Households that are single parent (percent) by race/ethnicity 2009-2017
https://data.diversitydatakids.org/dataset/11001_2_p-households-that-are-single-parent--percent--by-race-ethnicity

The number of households with a single parent (male householder and no wife present or female householder and no husband present) divided by the number of households, times 100, for the total population and by race/ethnicity.

By using this data, you agree to abide by diversitydatakids.org's Terms and Conditions.
Suggested citation: diversitydatakids.org. 2021. “Households that are single parent (percent) by race/ethnicity”, retrieved from https://data.diversitydatakids.org/dataset/11001_2_p-households-that-are-single-parent--percent--by-race-ethnicity?_external=True on May 02 2023, calculated from American Community Survey Summary Files

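#### Single mother families with children aged 0-17 years (count) by race/ethnicity
https://data.diversitydatakids.org/dataset/17010_3_c-single-mother-families-with-children-aged-0-17-years--count--by-race-ethnicity
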
The number of single mother (female householder and no husband present) family households with children aged 0-17 years related to the householder, for the total population and by race/ethnicity.

By using this data, you agree to abide by diversitydatakids.org's Terms and Conditions.
Suggested citation: diversitydatakids.org. 2021. “Single mother families with children aged 0-17 years (count) by race/ethnicity”, retrieved from https://data.diversitydatakids.org/dataset/17010_3_c-single-mother-families-with-children-aged-0-17-years--count--by-race-ethnicity?_external=True on May 02 2023, calculated from American Community Survey Summary Files

In [401]:
# Read the national single-parent household counts (total and by race), 2009-2017.
sp_count_country = pd.read_csv(
    '../data/diversitydatakids/nation_sp_hh/sp_hh_count_nation_one_yr.csv'
)
sp_count_country

Unnamed: 0,geoid,name,year,total_est,total_se,aian_se,aian_est,api_se,api_est,asian_se,...,nnhwhite_se,nnhwhite_est,other_se,other_est,othermore_se,othermore_est,twomore_se,twomore_est,white_se,white_est
0,01000US,United States,2009,19719269,39827.410866,4567.398767,227518,8250.974338,627700,8013.363063,...,49354.934929,9473570,11707.99865,1225328,13507.738476,1633612,6736.599019,408284,33035.428737,12394013
1,01000US,United States,2010,20384264,44892.732675,4047.531133,241503,7346.118545,677530,7147.211847,...,53999.574682,9945994,10702.272289,1276885,13072.512443,1740849,7506.793551,463964,36302.725425,12810720
2,01000US,United States,2011,20564358,44596.597001,4424.383847,241672,8987.311749,692336,8836.553514,...,54828.27754,10084383,11190.665989,1266791,13340.554905,1743738,7262.189745,476947,36602.700124,12964214
3,01000US,United States,2012,20754812,41742.136341,3739.180254,243523,8282.686001,732246,8069.477139,...,51198.71346,10244401,12219.236181,1271764,13772.636048,1788048,6354.193172,516284,34266.844003,13079009
4,01000US,United States,2013,20821655,42805.144026,3586.443072,232786,8005.99779,728977,7828.536578,...,53325.67098,10282566,11608.696785,1281287,13733.889194,1800252,7338.792227,518965,33167.890046,13139067
5,01000US,United States,2014,21037401,40666.259189,4097.107885,251320,7945.972757,770051,7710.251954,...,50363.135261,10534167,10794.83295,1286324,13377.194795,1818075,7900.691247,531751,33881.045237,13244910
6,01000US,United States,2015,20814961,42483.686267,4292.328951,243543,8167.554251,767882,7857.9986,...,50842.751204,10502035,12383.823911,1299266,14111.067279,1814799,6764.844794,515533,31732.932193,13054222
7,01000US,United States,2016,20818164,48217.287917,4267.676321,242678,8841.251486,788597,8628.385326,...,59081.191668,10551826,12652.399533,1345798,14548.638541,1891751,7181.898736,545953,36055.59144,12982408
8,01000US,United States,2017,20783589,44924.673966,3887.362642,257928,8232.236076,807557,7931.103894,...,54844.391554,10645007,12143.618402,1356986,14323.357701,1911883,7595.466275,554897,35898.82172,12928220


In [402]:
# Remove the *_se columns (presumably standard errors), keeping geoid/name/year
# and the *_est columns. Reassign instead of mutating with inplace=True so the
# step reads as an explicit transformation of the loaded frame.
sp_count_country = sp_count_country.drop(columns=[
    'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se', 'nhisp_se',
    'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se', 'othermore_se', 'twomore_se',
    'white_se',
])
sp_count_country.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,01000US,United States,2009,19719269,227518,627700,593200,4836426,3587984,16131285,34500,10245699,9473570,1225328,1633612,408284,12394013
1,01000US,United States,2010,20384264,241503,677530,647136,4913662,3905078,16479186,30394,10438270,9945994,1276885,1740849,463964,12810720
2,01000US,United States,2011,20564358,241672,692336,658767,4922398,4013022,16551336,33569,10479975,10084383,1266791,1743738,476947,12964214


In [403]:
# Give the estimate columns readable race/ethnicity labels; 'name' becomes 'country'.
# NOTE(review): in the rows displayed for this frame, nnhwhite_est == total_est - nhwhite_est
# and othermore_est == other_est + twomore_est, so the labels 'non_Hispanic_Whites' and
# 'Some_Other_Race_Alone' look misleading - confirm against the source data dictionary.
sp_count_country = sp_count_country.rename(
    columns={
        'name': 'country',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    }
)
sp_count_country

Unnamed: 0,geoid,country,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,01000US,United States,2009,19719269,227518,627700,593200,4836426,3587984,16131285,34500,10245699,9473570,1225328,1633612,408284,12394013
1,01000US,United States,2010,20384264,241503,677530,647136,4913662,3905078,16479186,30394,10438270,9945994,1276885,1740849,463964,12810720
2,01000US,United States,2011,20564358,241672,692336,658767,4922398,4013022,16551336,33569,10479975,10084383,1266791,1743738,476947,12964214
3,01000US,United States,2012,20754812,243523,732246,695285,4911986,4111806,16643006,36961,10510411,10244401,1271764,1788048,516284,13079009
4,01000US,United States,2013,20821655,232786,728977,697692,4920573,4136065,16685590,31285,10539089,10282566,1281287,1800252,518965,13139067
5,01000US,United States,2014,21037401,251320,770051,732356,4953045,4298398,16739003,37695,10503234,10534167,1286324,1818075,531751,13244910
6,01000US,United States,2015,20814961,243543,767882,730043,4934515,4298685,16516276,37839,10312926,10502035,1299266,1814799,515533,13054222
7,01000US,United States,2016,20818164,242678,788597,747992,4912730,4340948,16477216,40605,10266338,10551826,1345798,1891751,545953,12982408
8,01000US,United States,2017,20783589,257928,807557,765471,4878001,4431071,16352518,42086,10138582,10645007,1356986,1911883,554897,12928220


In [404]:
# Wide -> long: every non-identifier column becomes a (race, count_country) row.
id_vars = ['geoid', 'country', 'year']
sp_count_country_long = sp_count_country.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='count_country',
)
sp_count_country_long

Unnamed: 0,geoid,country,year,race,count_country
0,01000US,United States,2009,All,19719269
1,01000US,United States,2010,All,20384264
2,01000US,United States,2011,All,20564358
3,01000US,United States,2012,All,20754812
4,01000US,United States,2013,All,20821655
...,...,...,...,...,...
121,01000US,United States,2013,White,13139067
122,01000US,United States,2014,White,13244910
123,01000US,United States,2015,White,13054222
124,01000US,United States,2016,White,12982408


In [405]:
# Read the national single-parent household percentages (same layout as the count file).
sp_perc_country = pd.read_csv(
    '../data/diversitydatakids/nation_sp_hh/sp_hh_percent_nation_one_yr.csv'
)
sp_perc_country

Unnamed: 0,geoid,name,year,total_est,total_se,aian_est,aian_se,api_est,api_se,asian_est,...,nnhwhite_est,nnhwhite_se,other_est,other_se,othermore_est,othermore_se,twomore_est,twomore_se,white_est,white_se
0,01000US,United States,2009,17.356031,0.031689,28.879255,0.475158,14.150786,0.179763,13.763057,...,29.105686,0.112265,31.451828,0.258391,29.069965,0.199695,23.686501,0.307669,13.886603,0.035575
1,01000US,United States,2010,17.792374,0.036026,29.696947,0.359956,14.373991,0.149159,14.121069,...,29.503918,0.126728,33.332108,0.227294,30.260378,0.182314,24.138351,0.320369,14.305787,0.039169
2,01000US,United States,2011,17.883337,0.034871,29.672375,0.420019,14.500409,0.182514,14.184734,...,29.396454,0.119115,32.976486,0.246212,30.031368,0.179101,24.273458,0.262016,14.450139,0.039108
3,01000US,United States,2012,17.896778,0.033107,29.519764,0.34083,14.722599,0.161499,14.39463,...,29.204094,0.117299,33.2514,0.269149,30.463181,0.19139,25.248074,0.228082,14.491941,0.036619
4,01000US,United States,2013,17.904781,0.034339,29.101524,0.331333,14.236302,0.150756,13.990604,...,28.890228,0.127806,33.098991,0.245887,30.059299,0.181344,24.503452,0.263324,14.556652,0.035551
5,01000US,United States,2014,17.940905,0.031874,30.006351,0.387928,14.332034,0.143175,14.024188,...,28.779802,0.112017,32.43718,0.223735,29.627934,0.173993,24.495974,0.289824,14.622648,0.036008
6,01000US,United States,2015,17.608721,0.033081,29.550774,0.424849,13.919462,0.143315,13.598124,...,28.109261,0.109593,31.87882,0.25861,28.628986,0.182904,22.777071,0.226973,14.341585,0.033336
7,01000US,United States,2016,17.514853,0.038129,28.823566,0.403143,13.914927,0.151614,13.562786,...,27.756002,0.134531,31.40254,0.264743,28.347229,0.18369,22.863684,0.222638,14.259237,0.038242
8,01000US,United States,2017,17.310596,0.03465,29.22028,0.30641,13.688169,0.134369,13.335874,...,27.301233,0.117568,30.959976,0.228167,27.870193,0.166051,22.402674,0.234782,14.110628,0.037878


In [406]:
# Remove the *_se columns (presumably standard errors), keeping geoid/name/year
# and the *_est columns. Reassign instead of mutating with inplace=True so the
# step reads as an explicit transformation of the loaded frame.
sp_perc_country = sp_perc_country.drop(columns=[
    'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se', 'nhisp_se',
    'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se', 'othermore_se', 'twomore_se',
    'white_se',
])
sp_perc_country.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,01000US,United States,2009,17.356031,28.879255,14.150786,13.763057,35.768578,28.199253,15.98858,27.444771,12.6385,29.105686,31.451828,29.069965,23.686501,13.886603
1,01000US,United States,2010,17.792374,29.696947,14.373991,14.121069,35.765656,29.429907,16.267973,23.234516,12.909598,29.503918,33.332108,30.260378,24.138351,14.305787
2,01000US,United States,2011,17.883337,29.672375,14.500409,14.184734,35.465519,29.427132,16.330132,25.743298,12.988436,29.396454,32.976486,30.031368,24.273458,14.450139


In [407]:
# Give the estimate columns readable race/ethnicity labels; 'name' becomes 'country'.
# NOTE(review): nhwhite_est and nnhwhite_est receive near-synonymous labels
# ('White_Not_Hispanic_or_Latino' vs 'non_Hispanic_Whites') although their values differ;
# confirm what nnhwhite_est and othermore_est mean in the source data dictionary.
sp_perc_country = sp_perc_country.rename(
    columns={
        'name': 'country',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    }
)
sp_perc_country.head(3)

Unnamed: 0,geoid,country,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,01000US,United States,2009,17.356031,28.879255,14.150786,13.763057,35.768578,28.199253,15.98858,27.444771,12.6385,29.105686,31.451828,29.069965,23.686501,13.886603
1,01000US,United States,2010,17.792374,29.696947,14.373991,14.121069,35.765656,29.429907,16.267973,23.234516,12.909598,29.503918,33.332108,30.260378,24.138351,14.305787
2,01000US,United States,2011,17.883337,29.672375,14.500409,14.184734,35.465519,29.427132,16.330132,25.743298,12.988436,29.396454,32.976486,30.031368,24.273458,14.450139


In [408]:
# Wide -> long: every non-identifier column becomes a (race, perc_country) row.
id_vars = ['geoid', 'country', 'year']
sp_perc_country_long = sp_perc_country.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='perc_country',
)
sp_perc_country_long

Unnamed: 0,geoid,country,year,race,perc_country
0,01000US,United States,2009,All,17.356031
1,01000US,United States,2010,All,17.792374
2,01000US,United States,2011,All,17.883337
3,01000US,United States,2012,All,17.896778
4,01000US,United States,2013,All,17.904781
...,...,...,...,...,...
121,01000US,United States,2013,White,14.556652
122,01000US,United States,2014,White,14.622648
123,01000US,United States,2015,White,14.341585
124,01000US,United States,2016,White,14.259237


In [409]:
# Join counts and percentages on the shared identifier columns (default inner join).
sphh_ar_count_perc_country = sp_count_country_long.merge(
    sp_perc_country_long,
    on=['geoid', 'country', 'year', 'race'],
)
sphh_ar_count_perc_country

Unnamed: 0,geoid,country,year,race,count_country,perc_country
0,01000US,United States,2009,All,19719269,17.356031
1,01000US,United States,2010,All,20384264,17.792374
2,01000US,United States,2011,All,20564358,17.883337
3,01000US,United States,2012,All,20754812,17.896778
4,01000US,United States,2013,All,20821655,17.904781
...,...,...,...,...,...,...
121,01000US,United States,2013,White,13139067,14.556652
122,01000US,United States,2014,White,13244910,14.622648
123,01000US,United States,2015,White,13054222,14.341585
124,01000US,United States,2016,White,12982408,14.259237


In [500]:
# Reorder the columns so year and race come right after geoid.
sphh_ar_count_perc_country = sphh_ar_count_perc_country.reindex(
    ['geoid', 'year', 'race', 'country', 'count_country', 'perc_country'],
    axis='columns',
)
sphh_ar_count_perc_country

Unnamed: 0,geoid,year,race,country,count_country,perc_country
0,01000US,2009,All,United States,19719269,17.356031
1,01000US,2010,All,United States,20384264,17.792374
2,01000US,2011,All,United States,20564358,17.883337
3,01000US,2012,All,United States,20754812,17.896778
4,01000US,2013,All,United States,20821655,17.904781
...,...,...,...,...,...,...
121,01000US,2013,White,United States,13139067,14.556652
122,01000US,2014,White,United States,13244910,14.622648
123,01000US,2015,White,United States,13054222,14.341585
124,01000US,2016,White,United States,12982408,14.259237


In [410]:
# Inspect dtypes and non-null counts of the merged national count/percent frame.
# The previous name (country_sp_hh_count_perc) is never assigned in this notebook,
# so the cell failed with NameError on a fresh kernel.
sphh_ar_count_perc_country.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 0 to 125
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   geoid          126 non-null    object 
 1   country        126 non-null    object 
 2   year           126 non-null    int64  
 3   race           126 non-null    object 
 4   count_country  126 non-null    int64  
 5   perc_country   126 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 6.9+ KB


In [411]:
# Read the region-level single-parent household counts.
sp_count_region = pd.read_csv(
    '../data/diversitydatakids/region_sp_hh/sp_hh_count_region_one_yr.csv'
)
sp_count_region.head(3)

Unnamed: 0,geoid,name,year,total_est,total_se,aian_se,aian_est,api_se,api_est,asian_se,...,nnhwhite_se,nnhwhite_est,other_se,other_est,othermore_se,othermore_est,twomore_se,twomore_est,white_se,white_est
0,02000US1,Northeast Region,2009,3591526,18178.579359,1187.196846,13910,3229.575612,113644,3214.168947,...,22656.667018,1626352,5062.295933,307014,5839.363182,379982,2910.553599,72968,14443.141129,2257205
1,02000US1,Northeast Region,2010,3722961,18379.573293,903.884521,14944,3296.57858,119921,3275.570957,...,22264.351172,1733466,6335.502799,337922,7120.531921,429534,3250.135247,91612,13538.509509,2314898
2,02000US1,Northeast Region,2011,3758465,17859.248241,1161.000829,15779,3559.694081,129340,3546.48203,...,22335.507663,1755845,6162.372209,358377,6978.889705,449116,3275.678598,90739,14269.60895,2325839


In [412]:
# Remove the *_se columns (presumably standard errors), keeping geoid/name/year
# and the *_est columns. Reassign instead of mutating with inplace=True so the
# step reads as an explicit transformation of the loaded frame.
sp_count_region = sp_count_region.drop(columns=[
    'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se', 'nhisp_se',
    'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se', 'othermore_se', 'twomore_se',
    'white_se',
])
sp_count_region.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,02000US1,Northeast Region,2009,3591526,13910,113644,112746,826785,667909,2923617,898,1965174,1626352,307014,379982,72968,2257205
1,02000US1,Northeast Region,2010,3722961,14944,119921,118934,843664,746164,2976797,987,1989495,1733466,337922,429534,91612,2314898
2,02000US1,Northeast Region,2011,3758465,15779,129340,128351,838391,760934,2997531,989,2002620,1755845,358377,449116,90739,2325839


In [413]:
# Give the estimate columns readable race/ethnicity labels; 'name' becomes 'region'.
# NOTE(review): in the rows displayed for this frame, nnhwhite_est == total_est - nhwhite_est
# and othermore_est == other_est + twomore_est, so the labels 'non_Hispanic_Whites' and
# 'Some_Other_Race_Alone' look misleading - confirm against the source data dictionary.
sp_count_region = sp_count_region.rename(
    columns={
        'name': 'region',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    }
)
sp_count_region.head(3)

Unnamed: 0,geoid,region,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,02000US1,Northeast Region,2009,3591526,13910,113644,112746,826785,667909,2923617,898,1965174,1626352,307014,379982,72968,2257205
1,02000US1,Northeast Region,2010,3722961,14944,119921,118934,843664,746164,2976797,987,1989495,1733466,337922,429534,91612,2314898
2,02000US1,Northeast Region,2011,3758465,15779,129340,128351,838391,760934,2997531,989,2002620,1755845,358377,449116,90739,2325839


In [414]:
# Remove the ' Region' text from the names (e.g. 'Northeast Region' -> 'Northeast').
sp_count_region = sp_count_region.assign(
    region=sp_count_region['region'].str.replace(' Region', '')
)
sp_count_region.head(3)

Unnamed: 0,geoid,region,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,02000US1,Northeast,2009,3591526,13910,113644,112746,826785,667909,2923617,898,1965174,1626352,307014,379982,72968,2257205
1,02000US1,Northeast,2010,3722961,14944,119921,118934,843664,746164,2976797,987,1989495,1733466,337922,429534,91612,2314898
2,02000US1,Northeast,2011,3758465,15779,129340,128351,838391,760934,2997531,989,2002620,1755845,358377,449116,90739,2325839


In [415]:
# Wide -> long: every non-identifier column becomes a (race, count_region) row.
id_vars = ['geoid', 'region', 'year']
sp_count_region_long = sp_count_region.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='count_region',
)
sp_count_region_long

Unnamed: 0,geoid,region,year,race,count_region
0,02000US1,Northeast,2009,All,3591526
1,02000US1,Northeast,2010,All,3722961
2,02000US1,Northeast,2011,All,3758465
3,02000US1,Northeast,2012,All,3768942
4,02000US1,Northeast,2013,All,3745531
...,...,...,...,...,...
499,02000US4,West,2013,White,2999194
500,02000US4,West,2014,White,3015983
501,02000US4,West,2015,White,2972244
502,02000US4,West,2016,White,2925727


In [454]:
# Read the region-level single-parent household percentages.
sp_perc_region = pd.read_csv(
    '../data/diversitydatakids/region_sp_hh/sp_hh_percent_region_one_yr.csv'
)
sp_perc_region.head(3)

Unnamed: 0,geoid,name,year,total_est,total_se,aian_est,aian_se,api_est,api_se,asian_est,...,nnhwhite_est,nnhwhite_se,other_est,other_se,othermore_est,othermore_se,twomore_est,twomore_se,white_est,white_se
0,02000US1,Northeast Region,2009,17.291521,0.086303,27.875195,2.095928,12.776525,0.353503,12.734467,...,31.625832,0.418986,39.839146,0.536228,36.9342,0.464701,28.263109,0.963842,13.600248,0.086071
1,02000US1,Northeast Region,2010,17.809656,0.086173,28.157442,1.349333,12.510615,0.336882,12.470837,...,31.932354,0.381739,41.745564,0.684314,38.035793,0.547671,28.645849,0.881811,14.028399,0.080619
2,02000US1,Northeast Region,2011,17.970419,0.083569,29.105564,1.784015,13.309747,0.358457,13.273647,...,31.946665,0.378811,41.400906,0.590933,37.914482,0.47856,28.451607,0.828179,14.137472,0.085581


In [455]:
# Remove the *_se columns (presumably standard errors), keeping geoid/name/year
# and the *_est columns. Reassign instead of mutating with inplace=True so the
# step reads as an explicit transformation of the loaded frame.
sp_perc_region = sp_perc_region.drop(columns=[
    'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se', 'nhisp_se',
    'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se', 'othermore_se', 'twomore_se',
    'white_se',
])
sp_perc_region.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,02000US1,Northeast Region,2009,17.291521,27.875195,12.776525,12.734467,37.487911,35.224964,15.489917,21.827906,12.574726,31.625832,39.839146,36.9342,28.263109,13.600248
1,02000US1,Northeast Region,2010,17.809656,28.157442,12.510615,12.470837,37.301495,36.875214,15.76636,20.321186,12.855678,31.932354,41.745564,38.035793,28.645849,14.028399
2,02000US1,Northeast Region,2011,17.970419,29.105564,13.309747,13.273647,37.218647,36.585865,15.914792,20.569883,12.988375,31.946665,41.400906,37.914482,28.451607,14.137472


In [456]:
# Give the estimate columns readable race/ethnicity labels; 'name' becomes 'region'.
# NOTE(review): nhwhite_est and nnhwhite_est receive near-synonymous labels
# ('White_Not_Hispanic_or_Latino' vs 'non_Hispanic_Whites') although their values differ;
# confirm what nnhwhite_est and othermore_est mean in the source data dictionary.
sp_perc_region = sp_perc_region.rename(
    columns={
        'name': 'region',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    }
)
sp_perc_region.head(3)

Unnamed: 0,geoid,region,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,02000US1,Northeast Region,2009,17.291521,27.875195,12.776525,12.734467,37.487911,35.224964,15.489917,21.827906,12.574726,31.625832,39.839146,36.9342,28.263109,13.600248
1,02000US1,Northeast Region,2010,17.809656,28.157442,12.510615,12.470837,37.301495,36.875214,15.76636,20.321186,12.855678,31.932354,41.745564,38.035793,28.645849,14.028399
2,02000US1,Northeast Region,2011,17.970419,29.105564,13.309747,13.273647,37.218647,36.585865,15.914792,20.569883,12.988375,31.946665,41.400906,37.914482,28.451607,14.137472


In [457]:
# Remove the ' Region' text from the names (e.g. 'Northeast Region' -> 'Northeast').
sp_perc_region = sp_perc_region.assign(
    region=sp_perc_region['region'].str.replace(' Region', '')
)
sp_perc_region.head(3)

Unnamed: 0,geoid,region,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,02000US1,Northeast,2009,17.291521,27.875195,12.776525,12.734467,37.487911,35.224964,15.489917,21.827906,12.574726,31.625832,39.839146,36.9342,28.263109,13.600248
1,02000US1,Northeast,2010,17.809656,28.157442,12.510615,12.470837,37.301495,36.875214,15.76636,20.321186,12.855678,31.932354,41.745564,38.035793,28.645849,14.028399
2,02000US1,Northeast,2011,17.970419,29.105564,13.309747,13.273647,37.218647,36.585865,15.914792,20.569883,12.988375,31.946665,41.400906,37.914482,28.451607,14.137472


In [458]:
# Wide -> long: every non-identifier column becomes a (race, perc_region) row.
id_vars = ['geoid', 'region', 'year']
sp_perc_region_long = sp_perc_region.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='perc_region',
)
sp_perc_region_long

Unnamed: 0,geoid,region,year,race,perc_region
0,02000US1,Northeast,2009,All,17.291521
1,02000US1,Northeast,2010,All,17.809656
2,02000US1,Northeast,2011,All,17.970419
3,02000US1,Northeast,2012,All,17.929022
4,02000US1,Northeast,2013,All,17.889442
...,...,...,...,...,...
499,02000US4,West,2013,White,15.429990
500,02000US4,West,2014,White,15.437226
501,02000US4,West,2015,White,15.111566
502,02000US4,West,2016,White,14.891267


In [459]:
# Join counts and percentages on the shared identifier columns (default inner join).
sphh_ar_count_perc_region = sp_count_region_long.merge(
    sp_perc_region_long,
    on=['geoid', 'region', 'year', 'race'],
)
sphh_ar_count_perc_region

Unnamed: 0,geoid,region,year,race,count_region,perc_region
0,02000US1,Northeast,2009,All,3591526,17.291521
1,02000US1,Northeast,2010,All,3722961,17.809656
2,02000US1,Northeast,2011,All,3758465,17.970419
3,02000US1,Northeast,2012,All,3768942,17.929022
4,02000US1,Northeast,2013,All,3745531,17.889442
...,...,...,...,...,...,...
499,02000US4,West,2013,White,2999194,15.429990
500,02000US4,West,2014,White,3015983,15.437226
501,02000US4,West,2015,White,2972244,15.111566
502,02000US4,West,2016,White,2925727,14.891267


In [499]:
# Reorder the columns so year and race come right after geoid.
sphh_ar_count_perc_region = sphh_ar_count_perc_region.reindex(
    ['geoid', 'year', 'race', 'region', 'count_region', 'perc_region'],
    axis='columns',
)
sphh_ar_count_perc_region

Unnamed: 0,geoid,year,race,region,count_region,perc_region
0,02000US1,2009,All,Northeast,3591526,17.291521
1,02000US1,2010,All,Northeast,3722961,17.809656
2,02000US1,2011,All,Northeast,3758465,17.970419
3,02000US1,2012,All,Northeast,3768942,17.929022
4,02000US1,2013,All,Northeast,3745531,17.889442
...,...,...,...,...,...,...
499,02000US4,2013,White,West,2999194,15.429990
500,02000US4,2014,White,West,3015983,15.437226
501,02000US4,2015,White,West,2972244,15.111566
502,02000US4,2016,White,West,2925727,14.891267


In [460]:
# Inspect dtypes and non-null counts of the merged region count/percent frame.
sphh_ar_count_perc_region.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504 entries, 0 to 503
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   geoid         504 non-null    object 
 1   region        504 non-null    object 
 2   year          504 non-null    int64  
 3   race          504 non-null    object 
 4   count_region  504 non-null    int64  
 5   perc_region   504 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 27.6+ KB


In [469]:
# Read the division-level single-parent household counts.
sp_count_division = pd.read_csv(
    '../data/diversitydatakids/division_sp_hh/sp_hh_count_division_one_yr.csv'
)
sp_count_division.head(3)

Unnamed: 0,geoid,name,year,total_est,total_se,aian_se,aian_est,api_se,api_est,asian_se,...,nnhwhite_se,nnhwhite_est,other_se,other_est,othermore_se,othermore_est,twomore_se,twomore_est,white_se,white_est
0,03000US1,New England Division,2009,857357,9074.387087,474.276734,3162,,,1294.852006,...,11963.094117,269180,2181.227628,62934,2652.241831,83697,1508.851472,20763,8235.383187,647260
1,03000US1,New England Division,2010,913062,8423.434174,551.199859,3822,,,1643.895396,...,10798.124408,299727,2577.739158,65701,3180.283582,93636,1862.649858,27935,7288.388046,679653
2,03000US1,New England Division,2011,914468,8234.200536,472.506247,3778,,,1629.282062,...,10973.851114,306394,2290.432353,70637,2749.141471,93664,1520.427067,23027,7743.269699,675777


In [470]:
# Remove the *_se columns (presumably standard errors), keeping geoid/name/year
# and the *_est columns. Reassign instead of mutating with inplace=True so the
# step reads as an explicit transformation of the loaded frame.
sp_count_division = sp_count_division.drop(columns=[
    'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se', 'nhisp_se',
    'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se', 'othermore_se', 'twomore_se',
    'white_se',
])
sp_count_division.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,03000US1,New England Division,2009,857357,3162,,20418,102673,132382,724975,,588177,269180,62934,83697,20763,647260
1,03000US1,New England Division,2010,913062,3822,,24270,111230,147163,765899,,613335,299727,65701,93636,27935,679653
2,03000US1,New England Division,2011,914468,3778,,25109,116060,153308,761160,,608074,306394,70637,93664,23027,675777


In [471]:
# Give the estimate columns readable race/ethnicity labels; 'name' becomes 'division'.
# NOTE(review): in the rows displayed for this frame, nnhwhite_est == total_est - nhwhite_est
# and othermore_est == other_est + twomore_est, so the labels 'non_Hispanic_Whites' and
# 'Some_Other_Race_Alone' look misleading - confirm against the source data dictionary.
sp_count_division = sp_count_division.rename(
    columns={
        'name': 'division',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    }
)
sp_count_division.head(3)

Unnamed: 0,geoid,division,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,03000US1,New England Division,2009,857357,3162,,20418,102673,132382,724975,,588177,269180,62934,83697,20763,647260
1,03000US1,New England Division,2010,913062,3822,,24270,111230,147163,765899,,613335,299727,65701,93636,27935,679653
2,03000US1,New England Division,2011,914468,3778,,25109,116060,153308,761160,,608074,306394,70637,93664,23027,675777


In [472]:
# Remove the ' Division' text from the names (e.g. 'New England Division' -> 'New England').
sp_count_division = sp_count_division.assign(
    division=sp_count_division['division'].str.replace(' Division', '')
)
sp_count_division

Unnamed: 0,geoid,division,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,03000US1,New England,2009,857357,3162,,20418,102673,132382,724975,,588177,269180,62934,83697,20763,647260
1,03000US1,New England,2010,913062,3822,,24270,111230,147163,765899,,613335,299727,65701,93636,27935,679653
2,03000US1,New England,2011,914468,3778,,25109,116060,153308,761160,,608074,306394,70637,93664,23027,675777
3,03000US1,New England,2012,927449,4262,25395.0,24919,117914,153616,773833,476.0,618584,308865,62621,86231,23610,693647
4,03000US1,New England,2013,924813,3734,,23390,118264,157344,767469,,610957,313856,69436,94782,25346,684165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,03000US9,Pacific,2013,3226069,45901,348385.0,327299,305403,1169411,2056658,21086.0,1282720,1943349,426303,565200,138897,1961180
77,03000US9,Pacific,2014,3277463,52157,371587.0,345519,302234,1204014,2073449,26068.0,1271238,2006225,429757,573272,143515,1978213
78,03000US9,Pacific,2015,3239227,49055,365340.0,339586,310392,1196576,2042651,25754.0,1249310,1989917,436101,566076,129975,1948364
79,03000US9,Pacific,2016,3226837,49685,372305.0,347230,291229,1200306,2026531,25075.0,1240114,1986723,466539,611745,145206,1901873


In [473]:
# Fill missing estimates with 0, then cast the two columns that contained
# NaN (and were therefore read as float) to integers.
# NOTE(review): after this step a missing estimate is indistinguishable from
# a true count of zero -- confirm that is acceptable downstream.
sp_count_division = sp_count_division.fillna(0).astype({
    'Asian_Pacific_Islander': 'int64',
    'Native_Hawaiian_Other_Pacific_Islander': 'int64',
})

In [474]:
# Columns that identify a row (one division in one year); every other
# column is a race/ethnicity category and gets unpivoted.
id_vars = ['geoid', 'division', 'year']
# Wide -> long: one row per (division, year, race) with the count as value.
sp_count_division_long = sp_count_division.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='count_division',
)
sp_count_division_long

Unnamed: 0,geoid,division,year,race,count_division
0,03000US1,New England,2009,All,857357
1,03000US1,New England,2010,All,913062
2,03000US1,New England,2011,All,914468
3,03000US1,New England,2012,All,927449
4,03000US1,New England,2013,All,924813
...,...,...,...,...,...
1129,03000US9,Pacific,2013,White,1961180
1130,03000US9,Pacific,2014,White,1978213
1131,03000US9,Pacific,2015,White,1948364
1132,03000US9,Pacific,2016,White,1901873


In [489]:
# Load the single-parent household percentages at census-division level.
sp_perc_division = pd.read_csv(
    '../data/diversitydatakids/division_sp_hh/sp_hh_percent_division_one_yr.csv'
)
sp_perc_division.head(3)

Unnamed: 0,geoid,name,year,total_est,total_se,aian_est,aian_se,api_est,api_se,asian_est,...,nnhwhite_est,nnhwhite_se,other_est,other_se,othermore_est,othermore_se,twomore_est,twomore_se,white_est,white_se
0,03000US1,New England Division,2009,15.556921,0.163094,24.944778,3.435948,,,12.746273,...,31.365868,1.336397,41.051231,1.031691,37.288322,0.933391,29.18078,1.883887,13.404658,0.169093
1,03000US1,New England Division,2010,16.306738,0.147957,27.366463,3.456036,,,13.61777,...,32.253193,1.077112,42.577557,1.380146,39.169556,1.126422,32.963985,1.941049,13.968344,0.147519
2,03000US1,New England Division,2011,16.266369,0.143124,26.441771,2.473632,,,13.91273,...,31.758341,1.046565,42.652618,1.02052,37.095242,0.792017,26.502544,1.370497,13.910892,0.157276


In [490]:
# Keep only the *_est columns; the *_se companions (presumably standard
# errors -- confirm against the data dictionary) are discarded.
# The result is rebound instead of dropped in place, and errors='ignore'
# makes the cell safe to re-run: the original in-place drop raised KeyError
# on a second execution because the columns were already gone.
sp_perc_division = sp_perc_division.drop(
    columns=[
        'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se',
        'nhisp_se', 'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se',
        'othermore_se', 'twomore_se', 'white_se',
    ],
    errors='ignore',
)
sp_perc_division.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,03000US1,New England Division,2009,15.556921,24.944778,,12.746273,36.100094,37.904758,14.044873,,12.641075,31.365868,41.051231,37.288322,29.18078,13.404658
1,03000US1,New England Division,2010,16.306738,27.366463,,13.61777,36.971546,39.106541,14.664022,,13.133514,32.253193,42.577557,39.169556,32.963985,13.968344
2,03000US1,New England Division,2011,16.266369,26.441771,,13.91273,36.771019,38.848148,14.561529,,13.057022,31.758341,42.652618,37.095242,26.502544,13.910892


In [491]:
# Give the estimate columns readable race/ethnicity labels and rename
# 'name' to 'division' so the frame lines up with the count table.
# NOTE(review): in the division count table, nhwhite + nnhwhite equals the
# total and other + twomore equals othermore, so 'non_Hispanic_Whites' and
# 'Some_Other_Race_Alone' may not describe those columns accurately --
# confirm against the source data dictionary before relying on the labels.
sp_perc_division = sp_perc_division.rename(
    {
        'name': 'division',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    },
    axis='columns',
)
sp_perc_division.head(3)

Unnamed: 0,geoid,division,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,03000US1,New England Division,2009,15.556921,24.944778,,12.746273,36.100094,37.904758,14.044873,,12.641075,31.365868,41.051231,37.288322,29.18078,13.404658
1,03000US1,New England Division,2010,16.306738,27.366463,,13.61777,36.971546,39.106541,14.664022,,13.133514,32.253193,42.577557,39.169556,32.963985,13.968344
2,03000US1,New England Division,2011,16.266369,26.441771,,13.91273,36.771019,38.848148,14.561529,,13.057022,31.758341,42.652618,37.095242,26.502544,13.910892


In [493]:
# Shorten the labels, e.g. "New England Division" -> "New England", so they
# match the division names used in the count table.
# The pattern is a plain literal, so it is matched without regex handling.
sp_perc_division['division'] = sp_perc_division['division'].str.replace(
    pat=' Division', repl='', regex=False
)
sp_perc_division

Unnamed: 0,geoid,division,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,03000US1,New England,2009,15.556921,24.944778,,12.746273,36.100094,37.904758,14.044873,,12.641075,31.365868,41.051231,37.288322,29.180780,13.404658
1,03000US1,New England,2010,16.306738,27.366463,,13.617770,36.971546,39.106541,14.664022,,13.133514,32.253193,42.577557,39.169556,32.963985,13.968344
2,03000US1,New England,2011,16.266369,26.441771,,13.912730,36.771019,38.848148,14.561529,,13.057022,31.758341,42.652618,37.095242,26.502544,13.910892
3,03000US1,New England,2012,16.486549,26.048161,13.203113,13.026613,37.160156,37.765018,14.828020,45.419849,13.326962,31.392197,41.656242,36.416504,27.306479,14.264700
4,03000US1,New England,2013,16.506521,25.149860,,12.009592,37.453407,37.517258,14.806507,,13.273819,31.385883,41.280579,36.963577,28.732077,14.194995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,03000US9,Pacific,2013,18.418806,27.328041,16.380619,16.038235,32.162888,29.778433,15.135792,24.498663,12.860727,25.769922,32.747799,29.992264,23.836384,15.833499
77,03000US9,Pacific,2014,18.543783,31.269184,16.859756,16.408522,31.823620,29.886808,15.194991,26.529884,12.781513,25.959543,32.119499,29.474262,23.643406,15.942156
78,03000US9,Pacific,2015,18.122303,30.029324,16.207727,15.704532,32.243996,29.060438,14.848394,28.064861,12.528231,25.181501,31.342312,28.042694,20.722757,15.617482
79,03000US9,Pacific,2016,17.938564,30.162577,16.180962,15.723668,30.518255,28.866714,14.652972,27.091705,12.422379,24.817390,31.379852,28.337374,21.606588,15.325830


In [494]:
# Fill missing percentage estimates with 0.
# NOTE(review): a missing estimate then reads as "0 percent" -- confirm that
# is the intended meaning for downstream charts and aggregates.
sp_perc_division = sp_perc_division.fillna(value=0)

In [495]:
# Columns that identify a row (one division in one year); every other
# column is a race/ethnicity category and gets unpivoted.
id_vars = ['geoid', 'division', 'year']
# Wide -> long: one row per (division, year, race) with the percent as value.
sp_perc_division_long = sp_perc_division.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='perc_division',
)
sp_perc_division_long

Unnamed: 0,geoid,division,year,race,perc_division
0,03000US1,New England,2009,All,15.556921
1,03000US1,New England,2010,All,16.306738
2,03000US1,New England,2011,All,16.266369
3,03000US1,New England,2012,All,16.486549
4,03000US1,New England,2013,All,16.506521
...,...,...,...,...,...
1129,03000US9,Pacific,2013,White,15.833499
1130,03000US9,Pacific,2014,White,15.942156
1131,03000US9,Pacific,2015,White,15.617482
1132,03000US9,Pacific,2016,White,15.325830


In [496]:
# Print per-column dtypes and non-null counts for the wide percent table
# (a quick check that no missing values remain after the fillna step).
sp_perc_division.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   geoid                                   81 non-null     object 
 1   division                                81 non-null     object 
 2   year                                    81 non-null     int64  
 3   All                                     81 non-null     float64
 4   American_Indian_Alaska_Native           81 non-null     float64
 5   Asian_Pacific_Islander                  81 non-null     float64
 6   Asian                                   81 non-null     float64
 7   Black                                   81 non-null     float64
 8   Hispanic                                81 non-null     float64
 9   Not_Hispanic_or_Latino                  81 non-null     float64
 10  Native_Hawaiian_Other_Pacific_Islander  81 non-null     float64


In [497]:
# Combine counts and percentages into one long table keyed by
# (geoid, division, year, race).
# how='inner' spells out the default join type. validate='one_to_one' makes
# pandas raise MergeError if either side has duplicate keys, so a bad input
# cannot silently multiply rows in the merged table.
sphh_ar_count_perc_division = pd.merge(
    sp_count_division_long,
    sp_perc_division_long,
    on=['geoid', 'division', 'year', 'race'],
    how='inner',
    validate='one_to_one',
)
sphh_ar_count_perc_division

Unnamed: 0,geoid,division,year,race,count_division,perc_division
0,03000US1,New England,2009,All,857357,15.556921
1,03000US1,New England,2010,All,913062,16.306738
2,03000US1,New England,2011,All,914468,16.266369
3,03000US1,New England,2012,All,927449,16.486549
4,03000US1,New England,2013,All,924813,16.506521
...,...,...,...,...,...,...
1129,03000US9,Pacific,2013,White,1961180,15.833499
1130,03000US9,Pacific,2014,White,1978213,15.942156
1131,03000US9,Pacific,2015,White,1948364,15.617482
1132,03000US9,Pacific,2016,White,1901873,15.325830


In [498]:
# Put the columns in presentation order: keys first, then the two measures.
sphh_ar_count_perc_division = sphh_ar_count_perc_division.reindex(
    ['geoid', 'year', 'race', 'division', 'count_division', 'perc_division'],
    axis='columns',
)
sphh_ar_count_perc_division

Unnamed: 0,geoid,year,race,division,count_division,perc_division
0,03000US1,2009,All,New England,857357,15.556921
1,03000US1,2010,All,New England,913062,16.306738
2,03000US1,2011,All,New England,914468,16.266369
3,03000US1,2012,All,New England,927449,16.486549
4,03000US1,2013,All,New England,924813,16.506521
...,...,...,...,...,...,...
1129,03000US9,2013,White,Pacific,1961180,15.833499
1130,03000US9,2014,White,Pacific,1978213,15.942156
1131,03000US9,2015,White,Pacific,1948364,15.617482
1132,03000US9,2016,White,Pacific,1901873,15.325830


In [515]:
# Load the single-parent household counts at state level.
sp_count_state = pd.read_csv(
    '../data/diversitydatakids/state_sp_hh/sp_hh_count_state_one_yr.csv'
)
sp_count_state

Unnamed: 0,geoid,name,year,total_est,total_se,aian_se,aian_est,api_se,api_est,asian_se,...,nnhwhite_se,nnhwhite_est,other_se,other_est,othermore_se,othermore_est,twomore_se,twomore_est,white_se,white_est
0,04000US01,Alabama,2009,361523,6188.886850,322.195323,1664.0,,,457.848023,...,7604.138648,193119,471.383392,2937.0,784.297017,7007.0,626.832918,4070,4499.157121,175911
1,04000US01,Alabama,2010,358938,5333.869228,321.601237,1514.0,,,580.502552,...,6659.354224,185201,617.065594,3759.0,850.906853,7452.0,585.894637,3693,4113.921115,180034
2,04000US01,Alabama,2011,361985,6612.301384,312.602699,1703.0,,,527.656776,...,8070.492074,188697,758.789757,4338.0,881.811708,7170.0,449.254930,2832,4564.151303,179956
3,04000US01,Alabama,2012,350646,4688.940169,342.798944,1311.0,,,226.649918,...,6040.263831,181482,574.699614,3136.0,834.691163,7418.0,605.334362,4282,3812.586297,175783
4,04000US01,Alabama,2013,364703,5365.358302,533.178184,2278.0,,,371.964958,...,6714.656487,187956,698.448240,3736.0,862.759637,8095.0,506.482229,4359,4206.699906,184823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,04000US56,Wyoming,2013,30539,1831.468167,309.451754,1490.0,,,117.325226,...,2456.735399,5950,468.362137,1293.0,522.616781,1728.0,231.873260,435,1653.521917,26772
455,04000US56,Wyoming,2014,31992,1585.478459,301.731086,1216.0,,,211.652325,...,2078.058899,7248,315.506799,1224.0,414.544652,1772.0,268.891666,548,1459.591531,27982
456,04000US56,Wyoming,2015,28225,1457.397549,284.243762,1441.0,,,,...,2011.801159,6470,364.500938,913.0,437.043596,1539.0,241.135171,626,1496.409197,24935
457,04000US56,Wyoming,2016,28486,1589.632510,208.599234,1266.0,,,,...,2136.539073,6085,,,,,304.277645,1047,1540.149918,25809


In [516]:
# Keep only the *_est columns; the *_se companions (presumably standard
# errors -- confirm against the data dictionary) are discarded.
# The result is rebound instead of dropped in place, and errors='ignore'
# makes the cell safe to re-run: the original in-place drop raised KeyError
# on a second execution because the columns were already gone.
sp_count_state = sp_count_state.drop(
    columns=[
        'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se',
        'nhisp_se', 'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se',
        'othermore_se', 'twomore_se', 'white_se',
    ],
    errors='ignore',
)
sp_count_state.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,04000US01,Alabama,2009,361523,1664.0,,1851.0,175090.0,10651,350872,,168404,193119,2937.0,7007.0,4070,175911
1,04000US01,Alabama,2010,358938,1514.0,,2633.0,167305.0,10926,348012,,173737,185201,3759.0,7452.0,3693,180034
2,04000US01,Alabama,2011,361985,1703.0,,2601.0,170094.0,11689,350296,,173288,188697,4338.0,7170.0,2832,179956


In [517]:
# Give the estimate columns readable race/ethnicity labels and rename
# 'name' to 'state'.
# NOTE(review): in the rows shown for this table, nhwhite + nnhwhite equals
# the total and other + twomore equals othermore, so 'non_Hispanic_Whites'
# and 'Some_Other_Race_Alone' may not describe those columns accurately --
# confirm against the source data dictionary before relying on the labels.
sp_count_state = sp_count_state.rename(
    {
        'name': 'state',
        'total_est': 'All',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    },
    axis='columns',
)
sp_count_state.head(3)

Unnamed: 0,geoid,state,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,04000US01,Alabama,2009,361523,1664.0,,1851.0,175090.0,10651,350872,,168404,193119,2937.0,7007.0,4070,175911
1,04000US01,Alabama,2010,358938,1514.0,,2633.0,167305.0,10926,348012,,173737,185201,3759.0,7452.0,3693,180034
2,04000US01,Alabama,2011,361985,1703.0,,2601.0,170094.0,11689,350296,,173288,188697,4338.0,7170.0,2832,179956


In [518]:
# Fill missing count estimates with 0.
# NOTE(review): a missing estimate then reads as a true count of zero --
# confirm that is acceptable downstream.
sp_count_state = sp_count_state.fillna(value=0)
sp_count_state

Unnamed: 0,geoid,state,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,04000US01,Alabama,2009,361523,1664.0,0.0,1851.0,175090.0,10651,350872,0.0,168404,193119,2937.0,7007.0,4070,175911
1,04000US01,Alabama,2010,358938,1514.0,0.0,2633.0,167305.0,10926,348012,0.0,173737,185201,3759.0,7452.0,3693,180034
2,04000US01,Alabama,2011,361985,1703.0,0.0,2601.0,170094.0,11689,350296,0.0,173288,188697,4338.0,7170.0,2832,179956
3,04000US01,Alabama,2012,350646,1311.0,0.0,731.0,165127.0,10673,339973,0.0,169164,181482,3136.0,7418.0,4282,175783
4,04000US01,Alabama,2013,364703,2278.0,0.0,1688.0,167708.0,12522,352181,0.0,176747,187956,3736.0,8095.0,4359,184823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,04000US56,Wyoming,2013,30539,1490.0,0.0,0.0,549.0,3882,26657,0.0,24589,5950,1293.0,1728.0,435,26772
455,04000US56,Wyoming,2014,31992,1216.0,0.0,406.0,616.0,4549,27443,0.0,24744,7248,1224.0,1772.0,548,27982
456,04000US56,Wyoming,2015,28225,1441.0,0.0,0.0,0.0,4383,23842,0.0,21755,6470,913.0,1539.0,626,24935
457,04000US56,Wyoming,2016,28486,1266.0,0.0,0.0,344.0,4006,24480,0.0,22401,6085,0.0,0.0,1047,25809


In [519]:
# These count columns were displayed as floats after loading because they
# contained missing values; now that the gaps are filled, store them as
# integers.
int_columns = [
    'American_Indian_Alaska_Native',
    'Asian_Pacific_Islander',
    'Asian',
    'Black',
    'Native_Hawaiian_Other_Pacific_Islander',
    'Other_Race',
    'Some_Other_Race_Alone',
]
for column in int_columns:
    sp_count_state[column] = sp_count_state[column].astype('int64')

In [520]:
# Print per-column dtypes and non-null counts to confirm the integer casts
# took effect, then display the frame itself.
sp_count_state.info()
sp_count_state

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   geoid                                   459 non-null    object
 1   state                                   459 non-null    object
 2   year                                    459 non-null    int64 
 3   All                                     459 non-null    int64 
 4   American_Indian_Alaska_Native           459 non-null    int64 
 5   Asian_Pacific_Islander                  459 non-null    int64 
 6   Asian                                   459 non-null    int64 
 7   Black                                   459 non-null    int64 
 8   Hispanic                                459 non-null    int64 
 9   Not_Hispanic_or_Latino                  459 non-null    int64 
 10  Native_Hawaiian_Other_Pacific_Islander  459 non-null    int64 
 11  White_

Unnamed: 0,geoid,state,year,All,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,04000US01,Alabama,2009,361523,1664,0,1851,175090,10651,350872,0,168404,193119,2937,7007,4070,175911
1,04000US01,Alabama,2010,358938,1514,0,2633,167305,10926,348012,0,173737,185201,3759,7452,3693,180034
2,04000US01,Alabama,2011,361985,1703,0,2601,170094,11689,350296,0,173288,188697,4338,7170,2832,179956
3,04000US01,Alabama,2012,350646,1311,0,731,165127,10673,339973,0,169164,181482,3136,7418,4282,175783
4,04000US01,Alabama,2013,364703,2278,0,1688,167708,12522,352181,0,176747,187956,3736,8095,4359,184823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,04000US56,Wyoming,2013,30539,1490,0,0,549,3882,26657,0,24589,5950,1293,1728,435,26772
455,04000US56,Wyoming,2014,31992,1216,0,406,616,4549,27443,0,24744,7248,1224,1772,548,27982
456,04000US56,Wyoming,2015,28225,1441,0,0,0,4383,23842,0,21755,6470,913,1539,626,24935
457,04000US56,Wyoming,2016,28486,1266,0,0,344,4006,24480,0,22401,6085,0,0,1047,25809


In [521]:
# Columns that identify a row (one state in one year); every other column
# is a race/ethnicity category and gets unpivoted.
id_vars = ['geoid', 'state', 'year']
# Wide -> long: one row per (state, year, race) with the count as value.
sp_count_state_long = sp_count_state.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='count_state',
)
sp_count_state_long

Unnamed: 0,geoid,state,year,race,count_state
0,04000US01,Alabama,2009,All,361523
1,04000US01,Alabama,2010,All,358938
2,04000US01,Alabama,2011,All,361985
3,04000US01,Alabama,2012,All,350646
4,04000US01,Alabama,2013,All,364703
...,...,...,...,...,...
6421,04000US56,Wyoming,2013,White,26772
6422,04000US56,Wyoming,2014,White,27982
6423,04000US56,Wyoming,2015,White,24935
6424,04000US56,Wyoming,2016,White,25809


In [445]:
# Load the single-parent household percentages at state level.
sp_perc_state = pd.read_csv(
    '../data/diversitydatakids/state_sp_hh/sp_hh_perc_state_one_yr.csv'
)
sp_perc_state

Unnamed: 0,geoid,name,year,total_est,total_se,aian_est,aian_se,api_est,api_se,asian_est,...,nnhwhite_est,nnhwhite_se,other_est,other_se,othermore_est,othermore_se,twomore_est,twomore_se,white_est,white_se
0,04000US01,Alabama,2009,19.562393,0.328247,17.225672,2.984325,,,10.841044,...,35.083904,1.291527,27.074116,3.728433,24.538609,2.448262,22.985260,3.215997,13.329803,0.337884
1,04000US01,Alabama,2010,19.774542,0.284784,17.902330,3.433076,,,15.123492,...,34.901505,1.128904,31.382534,4.366893,27.499168,2.751047,24.422989,3.470938,13.723975,0.308734
2,04000US01,Alabama,2011,19.624611,0.350792,18.679390,3.032340,,,14.613180,...,33.813332,1.354768,28.584608,4.385672,22.348980,2.472261,16.751450,2.395414,13.687282,0.343209
3,04000US01,Alabama,2012,19.003464,0.243738,13.076003,3.155318,,,3.941126,...,32.448639,0.954762,26.499916,4.309245,23.838293,2.424431,22.204937,2.864224,13.361351,0.284935
4,04000US01,Alabama,2013,20.011808,0.284754,25.126848,5.398319,,,9.580566,...,33.504879,1.076848,27.410126,4.563322,24.555603,2.276811,22.543442,2.146574,14.299652,0.319210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,04000US56,Wyoming,2013,13.633299,0.808866,43.276211,5.483419,,,0.000000,...,23.912868,9.500072,34.251656,10.803617,23.923578,6.438733,12.616010,6.162814,12.803137,0.779555
455,04000US56,Wyoming,2014,13.754439,0.673140,31.099745,6.402376,,,24.037891,...,24.436129,6.695575,31.050228,6.704297,22.125109,4.563561,13.474305,6.206021,12.936243,0.665404
456,04000US56,Wyoming,2015,12.328719,0.625178,39.961178,6.697563,,,,...,24.075314,6.985679,30.362488,11.203427,24.959455,6.466478,19.816397,6.804710,11.534369,0.682709
457,04000US56,Wyoming,2016,12.738631,0.696628,46.424641,5.825798,,,,...,23.317749,7.615441,,,,,21.968107,5.468230,12.255569,0.716772


In [446]:
# The state percent table is loaded above as `sp_perc_state`, while this and
# the following cells work on `sp_percent_state`. Build the latter from the
# former so the notebook runs on a fresh kernel instead of relying on a
# variable left over from an earlier session.
# Only the *_est columns are kept; the *_se companions (presumably standard
# errors -- confirm against the data dictionary) are discarded. The loaded
# frame is not mutated, so this cell can be re-run safely.
sp_percent_state = sp_perc_state.drop(
    columns=[
        'total_se', 'aian_se', 'api_se', 'asian_se', 'black_se', 'hisp_se',
        'nhisp_se', 'nhopi_se', 'nhwhite_se', 'nnhwhite_se', 'other_se',
        'othermore_se', 'twomore_se', 'white_se',
    ]
)
sp_percent_state.head(3)

Unnamed: 0,geoid,name,year,total_est,aian_est,api_est,asian_est,black_est,hisp_est,nhisp_est,nhopi_est,nhwhite_est,nnhwhite_est,other_est,othermore_est,twomore_est,white_est
0,04000US01,Alabama,2009,19.562393,17.225672,,10.841044,37.083866,30.581717,19.350737,,12.978093,35.083904,27.074116,24.538609,22.98526,13.329803
1,04000US01,Alabama,2010,19.774542,17.90233,,15.123492,37.161877,26.6924,19.614941,,13.525516,34.901505,31.382534,27.499168,24.422989,13.723975
2,04000US01,Alabama,2011,19.624611,18.67939,,14.61318,36.17905,25.251129,19.479774,,13.469818,33.813332,28.584608,22.34898,16.75145,13.687282


In [447]:
# Give the estimate columns readable race/ethnicity labels and rename
# 'name' to 'state'.
# NOTE(review): here 'total_est' becomes 'sp_est_total', whereas the state
# count table and the division tables rename it to 'All'. If this frame is
# later melted and merged on 'race', the total rows will not line up --
# confirm whether the different label is intentional.
sp_percent_state = sp_percent_state.rename(
    {
        'name': 'state',
        'total_est': 'sp_est_total',
        'aian_est': 'American_Indian_Alaska_Native',
        'api_est': 'Asian_Pacific_Islander',
        'asian_est': 'Asian',
        'black_est': 'Black',
        'hisp_est': 'Hispanic',
        'nhisp_est': 'Not_Hispanic_or_Latino',
        'nhopi_est': 'Native_Hawaiian_Other_Pacific_Islander',
        'nhwhite_est': 'White_Not_Hispanic_or_Latino',
        'nnhwhite_est': 'non_Hispanic_Whites',
        'other_est': 'Other_Race',
        'othermore_est': 'Some_Other_Race_Alone',
        'twomore_est': 'Two_or_More_Races',
        'white_est': 'White',
    },
    axis='columns',
)
sp_percent_state.head(3)

Unnamed: 0,geoid,state,year,sp_est_total,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,04000US01,Alabama,2009,19.562393,17.225672,,10.841044,37.083866,30.581717,19.350737,,12.978093,35.083904,27.074116,24.538609,22.98526,13.329803
1,04000US01,Alabama,2010,19.774542,17.90233,,15.123492,37.161877,26.6924,19.614941,,13.525516,34.901505,31.382534,27.499168,24.422989,13.723975
2,04000US01,Alabama,2011,19.624611,18.67939,,14.61318,36.17905,25.251129,19.479774,,13.469818,33.813332,28.584608,22.34898,16.75145,13.687282


In [448]:
# Print per-column dtypes and non-null counts to see which race/ethnicity
# columns have missing estimates before they are filled.
sp_percent_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   geoid                                   459 non-null    object 
 1   state                                   459 non-null    object 
 2   year                                    459 non-null    int64  
 3   sp_est_total                            459 non-null    float64
 4   American_Indian_Alaska_Native           423 non-null    float64
 5   Asian_Pacific_Islander                  132 non-null    float64
 6   Asian                                   446 non-null    float64
 7   Black                                   442 non-null    float64
 8   Hispanic                                459 non-null    float64
 9   Not_Hispanic_or_Latino                  459 non-null    float64
 10  Native_Hawaiian_Other_Pacific_Islander  132 non-null    float6

In [449]:
# Fill missing percentage estimates with 0.
# NOTE(review): a missing estimate then reads as "0 percent" -- confirm that
# is the intended meaning for downstream charts and aggregates.
sp_percent_state = sp_percent_state.fillna(value=0)
sp_percent_state

Unnamed: 0,geoid,state,year,sp_est_total,American_Indian_Alaska_Native,Asian_Pacific_Islander,Asian,Black,Hispanic,Not_Hispanic_or_Latino,Native_Hawaiian_Other_Pacific_Islander,White_Not_Hispanic_or_Latino,non_Hispanic_Whites,Other_Race,Some_Other_Race_Alone,Two_or_More_Races,White
0,04000US01,Alabama,2009,19.562393,17.225672,0.0,10.841044,37.083866,30.581717,19.350737,0.0,12.978093,35.083904,27.074116,24.538609,22.985260,13.329803
1,04000US01,Alabama,2010,19.774542,17.902330,0.0,15.123492,37.161877,26.692400,19.614941,0.0,13.525516,34.901505,31.382534,27.499168,24.422989,13.723975
2,04000US01,Alabama,2011,19.624611,18.679390,0.0,14.613180,36.179050,25.251129,19.479774,0.0,13.469818,33.813332,28.584608,22.348980,16.751450,13.687282
3,04000US01,Alabama,2012,19.003464,13.076003,0.0,3.941126,35.191135,23.807718,18.883831,0.0,13.155514,32.448639,26.499916,23.838293,22.204937,13.361351
4,04000US01,Alabama,2013,20.011808,25.126848,0.0,9.580566,35.682858,26.752981,19.834110,0.0,14.011327,33.504879,27.410126,24.555603,22.543442,14.299652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,04000US56,Wyoming,2013,13.633299,43.276211,0.0,0.000000,26.483356,26.067686,12.747775,0.0,12.348773,23.912868,34.251656,23.923578,12.616010,12.803137
455,04000US56,Wyoming,2014,13.754439,31.099745,0.0,24.037891,25.101873,24.085350,12.841413,0.0,12.193187,24.436129,31.050228,22.125109,13.474305,12.936243
456,04000US56,Wyoming,2015,12.328719,39.961178,0.0,0.000000,0.000000,24.122181,11.312017,0.0,10.766444,24.075314,30.362488,24.959455,19.816397,11.534369
457,04000US56,Wyoming,2016,12.738631,46.424641,0.0,0.000000,15.067893,24.202515,11.822260,0.0,11.340958,23.317749,0.000000,0.000000,21.968107,12.255569


In [None]:
# Columns that identify a row; every other column is unpivoted.
id_vars = ['geoid', 'country', 'year']
# Wide -> long: one row per (country, year, race) with the value stored
# under 'total_hh'.
sm_count_country_long = sm_count_country.melt(
    id_vars=id_vars,
    var_name='race',
    value_name='total_hh',
)
sm_count_country_long

In [501]:
# Put the columns in presentation order: keys first, then the two measures.
sphh_ar_count_perc_country = sphh_ar_count_perc_country.reindex(
    ['geoid', 'year', 'race', 'country', 'count_country', 'perc_country'],
    axis='columns',
)
sphh_ar_count_perc_country

Unnamed: 0,geoid,year,race,country,count_country,perc_country
0,01000US,2009,All,United States,19719269,17.356031
1,01000US,2010,All,United States,20384264,17.792374
2,01000US,2011,All,United States,20564358,17.883337
3,01000US,2012,All,United States,20754812,17.896778
4,01000US,2013,All,United States,20821655,17.904781
...,...,...,...,...,...,...
121,01000US,2013,White,United States,13139067,14.556652
122,01000US,2014,White,United States,13244910,14.622648
123,01000US,2015,White,United States,13054222,14.341585
124,01000US,2016,White,United States,12982408,14.259237


sp_percent_state.info()

### NOTE: The compiled shapefiles described in the document linked below don't seem to match the geoid syntax used in the files above (for example, 02000US1 for the Northeast region); they also only cover states, counties, or individual census tracts.

https://www2.census.gov/programs-surveys/acs/summary_file/2014/documentation/tech_docs/ACS_SF_TIGERLine_Shapefiles.pdf

sp_count_country.to_csv('../data/cleaned/one_yr_sp_count_country_wide.csv', index = False)

sp_count_region.to_csv('../data/cleaned/one_yr_sp_count_region_wide.csv', index = False)

sp_count_division.to_csv('../data/cleaned/sp_count_division_wide.csv', index = False)

sp_count_state.to_csv('../data/cleaned/sp_count_state_wide.csv', index = False)

sp_percent_country.to_csv('../data/cleaned/sp_percent_country_wide.csv', index = False)

sp_percent_region.to_csv('../data/cleaned/sp_percent_region_wide.csv', index = False)

sp_percent_division.to_csv('../data/cleaned/sp_percent_division_wide.csv', index = False)

sp_percent_state.to_csv('../data/cleaned/sp_percent_state_wide.csv', index = False)

#### The total households file is from CPS; these files are calculated from ACS
https://www.census.gov/programs-surveys/acs/data/summary-file/sequence-based.html

Python and ACS
cenpy is an interface to explore and query the Census Bureau APIs and return Pandas data frames. This package is intended for exploratory data analysis.
datamade is a simple wrapper for the Census Bureau APIs.