In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

%matplotlib inline

#### hh1 - Households by Type: 1940 to Present (Numbers in thousands)
Source: U.S. Census Bureau, Decennial Census, 1940, and Current Population Survey, March and Annual Social and Economic Supplements, 1947 to 2022.
https://www.census.gov/data/tables/time-series/demo/families/households.html

In [2]:
# Display every row of any frame shown from here on (global pandas option).
pd.set_option('display.max_rows', None)

# Load the HH-1 "Households by Type" workbook: skip the first 9 sheet rows
# and parse 84 rows after them.
hh1 = pd.read_excel(
    '../data/single_parent/census/historical_households/hh1_by_type_1940_pres.xls',
    skiprows=9,
    nrows=84,
)
hh1

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Male householder,Female householder,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,,,,,,,,,,,
1,2022,131202.0,84265.0,61435.0,,7212.0,15618.0,,46937.0,22716.0,24221.0
2,2021r,129224.0,83711.0,61288.0,,6963.0,15461.0,,45533.0,21674.0,23859.0
3,2021,129931.0,83907.0,61454.0,,6963.0,15490.0,,46024.0,21781.0,24244.0
4,2020,128451.0,83677.0,62342.0,,6503.0,14832.0,,44774.0,21304.0,23470.0
5,2019,128579.0,83482.0,61959.0,,6480.0,15043.0,,45096.0,21582.0,23515.0
6,2018,127586.0,83088.0,61241.0,,6424.0,15423.0,,44498.0,21017.0,23481.0
7,2017,126224.0,82827.0,60804.0,,6452.0,15572.0,,43396.0,20539.0,22858.0
8,2016,125819.0,82184.0,60251.0,,6310.0,15622.0,,43635.0,20542.0,23093.0
9,2015,124587.0,81716.0,60010.0,,6162.0,15544.0,,42871.0,20143.0,22728.0


In [3]:
# Tidy the raw HH-1 frame in a single chained assignment, so that `hh1` is either
# fully cleaned or (if any step raises) left exactly as it was.
# NOTE: this cell expects the freshly loaded frame; run it once, right after the load cell.
hh1 = (
    hh1
    # rename columns
    .rename(columns={
        'Unnamed: 0': 'year',
        'Unnamed: 1': 'total_hh',
        'Unnamed: 2': 'total_fam_hh',
        'Unnamed: 3': 'married_fam_hh',
        'Male householder': 'male_hh_fam_hh',
        'Female householder': 'female_hh_fam_hh',
    })
    # drop empty columns or columns not needed
    .drop(['Unnamed: 4', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10'], axis=1)
    # drop prior index rows, drop original rows and keep revised:
    # label 0 is the blank row under the header and label 3 is the original 2021 row
    # that the revised '2021r' row replaces; the remaining labels are presumably the
    # other superseded rows -- TODO confirm against the spreadsheet if its layout changes.
    .drop(labels=[0, 3, 14, 33, 39, 44, 49])
    .reset_index(drop=True)
    # remove extra spaces in column headers
    .rename(columns=lambda x: x.strip())
    # remove extra spaces over all strings (non-string cells pass through unchanged)
    .applymap(lambda x: x.strip() if isinstance(x, str) else x)
)

hh1.head(2)

Unnamed: 0,year,total_hh,total_fam_hh,married_fam_hh,male_hh_fam_hh,female_hh_fam_hh
0,2022,131202.0,84265.0,61435.0,7212.0,15618.0
1,2021r,129224.0,83711.0,61288.0,6963.0,15461.0


In [4]:
# Fix the column dtypes: 'year' becomes text (some labels carry a trailing letter,
# e.g. '2021r'), and every household-count column becomes a 64-bit integer.
hh1 = hh1.astype(
    {
        'year': 'str',
        'total_hh': 'int64',
        'total_fam_hh': 'int64',
        'married_fam_hh': 'int64',
        'male_hh_fam_hh': 'int64',
        'female_hh_fam_hh': 'int64',
    }
)
hh1

Unnamed: 0,year,total_hh,total_fam_hh,married_fam_hh,male_hh_fam_hh,female_hh_fam_hh
0,2022,131202,84265,61435,7212,15618
1,2021r,129224,83711,61288,6963,15461
2,2020,128451,83677,62342,6503,14832
3,2019,128579,83482,61959,6480,15043
4,2018,127586,83088,61241,6424,15423
5,2017,126224,82827,60804,6452,15572
6,2016,125819,82184,60251,6310,15622
7,2015,124587,81716,60010,6162,15544
8,2014s,123229,81353,59629,6304,15420
9,2013,122459,80902,59204,6229,15469


In [5]:
#locate additional letters in year column
hh1_by_type = hh1[hh1['year'].str.match('\d{4}[a-zA-Z]?', na=False)]
#extract only the year
hh1_by_type['year'] = hh1_by_type['year'].str.extract('(\d{4})', expand=False).astype(int)
#sort by descending
hh1_by_type = hh1_by_type.sort_values('year', ascending=False)

hh1_by_type

Unnamed: 0,year,total_hh,total_fam_hh,married_fam_hh,male_hh_fam_hh,female_hh_fam_hh
0,2022,131202,84265,61435,7212,15618
1,2021,129224,83711,61288,6963,15461
2,2020,128451,83677,62342,6503,14832
3,2019,128579,83482,61959,6480,15043
4,2018,127586,83088,61241,6424,15423
5,2017,126224,82827,60804,6452,15572
6,2016,125819,82184,60251,6310,15622
7,2015,124587,81716,60010,6162,15544
8,2014,123229,81353,59629,6304,15420
9,2013,122459,80902,59204,6229,15469


In [6]:
# Sanity check: print the row count, per-column non-null counts and dtypes.
hh1_by_type.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 0 to 76
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   year              77 non-null     int32
 1   total_hh          77 non-null     int64
 2   total_fam_hh      77 non-null     int64
 3   married_fam_hh    77 non-null     int64
 4   male_hh_fam_hh    77 non-null     int64
 5   female_hh_fam_hh  77 non-null     int64
dtypes: int32(1), int64(5)
memory usage: 3.9 KB


In [None]:
# Write the cleaned table to CSV next to the source workbook, without the index column.
hh1_by_type.to_csv(
    '../data/single_parent/census/historical_households/hh1_by_type.csv',
    index=False,
)