API limitations:
 - 50 series allowed per pull
 - 20 years allowed per pull
 - approx 5000 PPI series with some series having data starting in the 1960s means approx 300 pulls required - 100 for all series, * 3 for 20 year selections out to the 60s

In [None]:
import requests
import json
import pandas as pd
from bls_definitions import bls_ppi_codes
import datetime

In [None]:
key = "9648e84bbf0f4c38b01689279269a05e"
url = "https://api.bls.gov/publicAPI/v2/timeseries/data/"
startyear = "2004"
endyear = "2023"

In [None]:
selection_dict = {"seriesid": ['WPUFD4'], "startyear":startyear, "endyear":endyear, "registrationkey" : key}
selection = json.dumps(selection_dict)

In [None]:
headers = {'Content-type' : 'application/json'}

In [None]:
p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)

In [None]:
p = p.json()['Results']['series']

In [None]:
date_list = [f"{i['year']}-{i['period'][1:]}-01" for i in p[0]['data']]

In [None]:
df = pd.DataFrame(index = pd.to_datetime(date_list[::-1]))

In [None]:
for i in p:
    df[i['seriesID']] = pd.Series([j['value'] for j in i['data']]).astype(float).iloc[::-1].values


Attempt with first 50 codes from bls list

In [None]:
codes = list(bls_ppi_codes.keys())[:50]

In [None]:
selection_dict = {"seriesid": codes, "startyear":startyear, "endyear":endyear, "registrationkey" : key}
selection = json.dumps(selection_dict)

In [None]:
p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)

In [None]:
p = p.json()['Results']['series']

In [None]:
# df = pd.DataFrame()
data = {}
index = {}

In [None]:
for i in p:
    data[i['seriesID']] = pd.Series([j['value'] for j in i['data']]).astype(float).iloc[::-1].values
    index[i['seriesID']] = [f"{j['year']}-{j['period'][1:]}-01" for j in i['data']][::-1]


In [None]:
missing_codes = []
for i in p:
    if len(i['data']) == 0:
        missing_codes.append(i['seriesID'])


In [None]:
dfs = {}
for i in data.keys():
    dfs[i] = pd.DataFrame(data = data[i], index = pd.to_datetime(index[i]))

In [None]:
final = pd.concat(dfs,axis=1)

In [None]:
final.columns = final.columns.get_level_values(0)

50 codes over three 20 year windows

In [None]:
endyear = datetime.datetime.now().year

date_ranges = {0 : (endyear-19, endyear),
               1 : (endyear-39, endyear-20),
               2 : (endyear-59, endyear-40)}

codes = list(bls_ppi_codes.keys())[:50]

In [None]:
dfs_diff_periods = {}
missing_series = {}

for i in date_ranges:
    # create variable selection dictionary
    selection_dict = {"seriesid": codes, "startyear":date_ranges[i][0], "endyear":date_ranges[i][1], "registrationkey" : key}
    selection = json.dumps(selection_dict)
    
    # send request to BLS
    p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)
    p = p.json()['Results']['series']
    
    # create dictionaries to hold values and index
    data = {}
    index = {}
    
    # save time series and index for each variable
    for j in p:
        data[j['seriesID']] = pd.Series([k['value'] for k in j['data']]).astype(float).iloc[::-1].values
        index[j['seriesID']] = [f"{k['year']}-{k['period'][1:]}-01" for k in j['data']][::-1]
    
    # identify where codes are missing in each period window
    missing_codes = []
    for j in p:
        if len(j['data']) == 0:
            missing_codes.append(j['seriesID'])
    missing_series[i] = missing_codes

    # convert series into dataframes and then combine into one dataframe for the period window
    dfs = {}
    for j in data.keys():
        dfs[j] = pd.DataFrame(data = data[j], index = pd.to_datetime(index[j]))

    final = pd.concat(dfs,axis=1)
    final.columns = final.columns.get_level_values(0)

    dfs_diff_periods[i] = final


In [None]:
full_df = pd.concat(dfs_diff_periods)

In [None]:
full_df.index = full_df.index.droplevel(0)

In [None]:
full_df = full_df.sort_index()

In [None]:
full_df.to_csv('test.csv')

In [None]:
missing_sets = {}
for i in missing_series.keys():
    missing_sets[i] = set(missing_series[i])

variables_no_data = missing_sets[0].intersection(missing_sets[1]).intersection(missing_sets[2])

In [None]:
variables_no_data

# Looping all codes in sets of 50 over three 20 year periods

In [None]:
# create date ranges to loop on
endyear = datetime.datetime.now().year

date_ranges = {0 : (endyear-19, endyear),
               1 : (endyear-39, endyear-20),
               2 : (endyear-59, endyear-40)}

In [None]:
bls_ppi_code_segments = [list(bls_ppi_codes.keys())[x:x+50]for x in range(0, len(list(bls_ppi_codes.keys())), 50)]

In [None]:
master_dfs = {}
master_missing_series = {}
segment_no = 0

# loop over sets of 50 codes
for i in bls_ppi_code_segments:
    dfs_diff_periods = {}
    missing_series = {}
    for j in date_ranges:
        # create variatble selection dictionary
        selection_dict = {"seriesid": i, "startyear":date_ranges[j][0], "endyear":date_ranges[j][1], "registrationkey" : key}
        selection = json.dumps(selection_dict)

        # send request to BLS
        p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=selection, headers=headers)
        p = p.json()['Results']['series']

        # create dictionaries to hold values and index
        data = {}
        index = {}

        # save time series and index for each variable
        for k in p:
            data[k['seriesID']] = pd.Series([l['value'] for l in k['data']]).astype(float).iloc[::-1].values
            index[k['seriesID']] = [f"{l['year']}-{l['period'][1:]}-01" for l in k['data']][::-1]

        # identify where codes are missing in each period window
        missing_codes = []
        for k in p:
            if len(k['data']) == 0:
                missing_codes.append(k['seriesID'])
        missing_series[j] = missing_codes

        # convert series into dataframes and then combine into one dataframe for the period window
        dfs = {}
        for k in data.keys():
            dfs[k] = pd.DataFrame(data = data[k], index = pd.to_datetime(index[k]))

        final = pd.concat(dfs,axis=1)
        final.columns = final.columns.get_level_values(0)

        dfs_diff_periods[j] = final
    
    # combine dfs of series into one dataframe
    segment_df = pd.concat(dfs_diff_periods)
    segment_df.index = segment_df.index.droplevel(0)
    segment_df = segment_df.sort_index()
    
    # save segment df to master dictionary
    master_dfs[segment_no] = segment_df

    # identify codes with no values across all periods
    missing_sets = {}
    for j in missing_series.keys():
        missing_sets[j] = set(missing_series[j])

    variables_no_data = missing_sets[0].intersection(missing_sets[1]).intersection(missing_sets[2])

    master_missing_series[segment_no] = variables_no_data

    print(segment_no)
    segment_no += 1
    

In [None]:
# # merge master dataframes into 
# all_ppi = pd.concat(master_dfs)
# all_ppi.index = all_ppi.index.droplevel(0)
# all_ppi = all_ppi.sort_index()

# # remove duplicate index values
# all_ppi = all_ppi[~all_ppi.index.duplicated(keep='first')]

In [None]:
# all_ppi.to_csv('test.csv')

In [75]:
all_ppi = pd.DataFrame(index=pd.date_range(start='1/1/1960', end =  str(datetime.datetime.now().month) + '/1/' + str(datetime.datetime.now().year), freq='ms'))

MemoryError: Unable to allocate 14.5 TiB for an array with shape (1995926400001,) and data type int64

In [None]:
master_dfs

In [None]:
for i in master_dfs.keys():
    for j in master_dfs[i].columns:
        all_ppi[j] = master_dfs[i][j]

In [None]:
all_ppi.to_csv('test.csv')

In [None]:
all_ppi