API limitations:
 - 50 series allowed per pull
 - 20 years allowed per pull
 - approx 5000 PPI series with some series having data starting in the 1960s means approx 300 pulls required - 100 for all series, * 3 for 20 year selections out to the 60s

In [1]:
import requests
import json
import pandas as pd
from bls_definitions import bls_ppi_codes
import datetime

In [2]:
key = "9648e84bbf0f4c38b01689279269a05e"
url = "https://api.bls.gov/publicAPI/v2/timeseries/data/"
startyear = "2004"
endyear = "2023"

In [3]:
selection_dict = {"seriesid": ['WPUFD4'], "startyear":startyear, "endyear":endyear, "registrationkey" : key}
selection = json.dumps(selection_dict)

In [4]:
headers = {'Content-type' : 'application/json'}

In [5]:
p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)

In [6]:
p = p.json()['Results']['series']

In [7]:
date_list = [f"{i['year']}-{i['period'][1:]}-01" for i in p[0]['data']]

In [8]:
df = pd.DataFrame(index = pd.to_datetime(date_list[::-1]))

In [9]:
for i in p:
    df[i['seriesID']] = pd.Series([j['value'] for j in i['data']]).astype(float).iloc[::-1].values


Attempt with first 50 codes from bls list

In [10]:
codes = list(bls_ppi_codes.keys())[:50]

In [11]:
selection_dict = {"seriesid": codes, "startyear":startyear, "endyear":endyear, "registrationkey" : key}
selection = json.dumps(selection_dict)

In [12]:
p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)

In [13]:
p = p.json()['Results']['series']

In [14]:
# df = pd.DataFrame()
data = {}
index = {}

In [15]:
for i in p:
    data[i['seriesID']] = pd.Series([j['value'] for j in i['data']]).astype(float).iloc[::-1].values
    index[i['seriesID']] = [f"{j['year']}-{j['period'][1:]}-01" for j in i['data']][::-1]


  data[i['seriesID']] = pd.Series([j['value'] for j in i['data']]).astype(float).iloc[::-1].values


In [16]:
missing_codes = []
for i in p:
    if len(i['data']) == 0:
        missing_codes.append(i['seriesID'])


In [17]:
dfs = {}
for i in data.keys():
    dfs[i] = pd.DataFrame(data = data[i], index = pd.to_datetime(index[i]))

In [18]:
final = pd.concat(dfs,axis=1)

In [19]:
final.columns = final.columns.get_level_values(0)

50 codes over three 20 year windows

In [20]:
endyear = datetime.datetime.now().year

date_ranges = {0 : (endyear-19, endyear),
               1 : (endyear-39, endyear-20),
               2 : (endyear-59, endyear-40)}

codes = list(bls_ppi_codes.keys())[:50]

In [21]:
dfs_diff_periods = {}
missing_series = {}

for i in date_ranges:
    # create variable selection dictionary
    selection_dict = {"seriesid": codes, "startyear":date_ranges[i][0], "endyear":date_ranges[i][1], "registrationkey" : key}
    selection = json.dumps(selection_dict)
    
    # send request to BLS
    p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)
    p = p.json()['Results']['series']
    
    # create dictionaries to hold values and index
    data = {}
    index = {}
    
    # save time series and index for each variable
    for j in p:
        data[j['seriesID']] = pd.Series([k['value'] for k in j['data']]).astype(float).iloc[::-1].values
        index[j['seriesID']] = [f"{k['year']}-{k['period'][1:]}-01" for k in j['data']][::-1]
    
    # identify where codes are missing in each period window
    missing_codes = []
    for j in p:
        if len(j['data']) == 0:
            missing_codes.append(j['seriesID'])
    missing_series[i] = missing_codes

    # convert series into dataframes and then combine into one dataframe for the period window
    dfs = {}
    for j in data.keys():
        dfs[j] = pd.DataFrame(data = data[j], index = pd.to_datetime(index[j]))

    final = pd.concat(dfs,axis=1)
    final.columns = final.columns.get_level_values(0)

    dfs_diff_periods[i] = final


  data[j['seriesID']] = pd.Series([k['value'] for k in j['data']]).astype(float).iloc[::-1].values


In [22]:
full_df = pd.concat(dfs_diff_periods)

In [23]:
full_df.index = full_df.index.droplevel(0)

In [24]:
full_df = full_df.sort_index()

In [25]:
full_df.to_csv('test.csv')

In [26]:
missing_sets = {}
for i in missing_series.keys():
    missing_sets[i] = set(missing_series[i])

variables_no_data = missing_sets[0].intersection(missing_sets[1]).intersection(missing_sets[2])

In [None]:
variables_no_data