In [1]:
import os, re, struct
import pandas as pd
import numpy as np
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from collections import Counter

## 01. Download data dictionary to extract variable positions

In [2]:
# Location of the 2020 Basic CPS public-use record layout (plain text).
data_dict_url = 'https://www2.census.gov/programs-surveys/cps/datasets/2020/basic/2020_Basic_CPS_Public_Use_Record_Layout_plus_IO_Code_list.txt'

# The context manager closes the HTTP response even when read() raises,
# which a trailing fp.close() call cannot guarantee.
with urlopen(data_dict_url) as fp:
    mybytes = fp.read()

# ISO-8859-1 maps every byte to a character, so this decode cannot fail.
data_dict = mybytes.decode("ISO-8859-1")

## 02. Download data

`parse_raw()` looks up the requested column's character positions in the data dictionary and slices that field out of every raw record.

`get_data()` downloads each month's raw data file and calls `parse_raw()` on it, returning the parsed values keyed by month.

In [11]:
def parse_raw(col, raw_data, layout=None):
    """Extract one fixed-width column from raw records.

    The record layout is searched for the first entry describing ``col``;
    the start and end positions found there are used to slice the field
    out of every record.

    Parameters
    ----------
    col : str
        Variable name as written in the record layout (e.g. ``'PESEX'``).
    raw_data : iterable of bytes
        Raw fixed-width records, one per element.
    layout : str, optional
        Text of the record layout. Defaults to the notebook-level
        ``data_dict``.

    Returns
    -------
    list of str
        The field from every record, with surrounding whitespace removed.

    Raises
    ------
    IndexError
        If ``col`` has no entry in the layout.
    """
    if layout is None:
        layout = data_dict

    # Raw string so that \s, \d and \t reach the regex engine untouched
    # (a plain f-string treats \s and \d as invalid escape sequences).
    # re.escape keeps any regex metacharacters in `col` literal.
    pattern = rf'({re.escape(col)})\s+(\d+)\s+.*?\t+.*?(\d\d*).*?(\d\d+)'
    match = re.search(pattern, layout)
    if match is None:
        # IndexError is kept so existing callers see the same exception type.
        raise IndexError(f"column {col!r} not found in the record layout")

    # The layout's start position is shifted down by one to become a
    # 0-based slice start; the end position is used as the slice stop as is.
    start = int(match.group(3)) - 1
    end = int(match.group(4))

    # ISO-8859-1 is a one-byte-per-character encoding, so slicing the bytes
    # before decoding selects exactly the same characters.
    return [row[start:end].decode("ISO-8859-1").strip() for row in raw_data]

In [27]:
def get_data(col, months):
    """Download the 2020 basic monthly files and extract one column from each.

    Parameters
    ----------
    col : str
        Variable name to extract; passed through to ``parse_raw``.
    months : iterable of str
        Month prefixes used in the file names (e.g. ``'jan'``).

    Returns
    -------
    dict
        Maps each month to the list returned by ``parse_raw`` for that
        month's records.
    """
    result_dict = {}
    for mon in months:
        print(f"loading data for {mon}")
        url = f'https://www2.census.gov/programs-surveys/cps/datasets/2020/basic/{mon}20pub.zip'

        # Context managers release the HTTP connection, the archive and the
        # member file even if a later step raises.
        with urlopen(url) as resp:
            payload = BytesIO(resp.read())
        with ZipFile(payload) as archive:
            # Only the first member of the archive is read.
            first_member = archive.infolist()[0]
            with archive.open(first_member.filename) as raw_file:
                raw_data = raw_file.readlines()

        result_dict[mon] = parse_raw(col, raw_data)
    return result_dict

In [32]:
# Month prefixes to download. The visible notebook never defines `months`,
# so it is set here to the months listed in this cell's recorded output;
# extend the list to load more of the year.
months = ['jan', 'feb', 'mar', 'apr']

# Each call returns a dict keyed by month for the named variable.
PWCMPWGT = get_data('PWCMPWGT', months)
PWSSWGT = get_data('PWSSWGT', months)

loading data for jan
loading data for feb
loading data for mar
loading data for apr


In [None]:
# get_data() already returns a dict keyed by month, so each result is bound
# directly to its variable name. Indexing with `[mon]` here would fail:
# `mon` only exists inside get_data() and these dicts do not exist yet.
HRMONTH = get_data('HRMONTH', months)
PREMPNOT = get_data('PREMPNOT', months)
PESEX = get_data('PESEX', months)
PEEDUCA = get_data('PEEDUCA', months)
PTDTRACE = get_data('PTDTRACE', months)

In [None]:
def convert_int(d):
    """Convert every record in every list of ``d`` to ``int``, in place.

    ``d`` maps keys to iterables of values accepted by ``int()``. Each
    value is replaced by a new list of integers; nothing is returned.
    """
    for key in d:
        # Reassigning an existing key does not resize the dict, so this is
        # safe to do while iterating over it.
        d[key] = list(map(int, d[key]))

## 03. Store data in MongoDB

In [30]:
import pymongo
from pymongo import MongoClient

# MongoClient() with no arguments uses the driver's default connection
# settings (the recorded output shows localhost:27017).
client = MongoClient()

# Item access selects the same database handle as `client.CPS`.
CPS = client["CPS"]
CPS

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'CPS')

In [33]:
# insert_one() adds an `_id` key to the dict it receives. Inserting a shallow
# copy keeps the notebook variables free of that key, so they stay pure
# month -> records mappings and re-running this cell does not raise
# DuplicateKeyError on a reused `_id`.
CPS.PWSSWGT.insert_one(dict(PWSSWGT))
CPS.PWCMPWGT.insert_one(dict(PWCMPWGT))

<pymongo.results.InsertOneResult at 0x117a92fa0>

In [None]:
# insert_one() adds an `_id` key to the dict it receives. Inserting a shallow
# copy keeps the notebook variables free of that key, so they stay pure
# month -> records mappings and re-running this cell does not raise
# DuplicateKeyError on a reused `_id`.
CPS.PREMPNOT.insert_one(dict(PREMPNOT))
CPS.PESEX.insert_one(dict(PESEX))
CPS.PEEDUCA.insert_one(dict(PEEDUCA))
CPS.PTDTRACE.insert_one(dict(PTDTRACE))