In [1]:
import pandas as pd
import datetime
import numpy as np

Convert API pulls to a dataframe

In [2]:
bls_dfs = {}

for i in range(0,100):
    bls_dfs[i] = pd.read_parquet('C:\\DSWG_PPI\\api_pulls\\'+str(i)+'api.gzip')

In [3]:
all_ppi = pd.DataFrame(index=pd.date_range(start='1/1/1960', end =  str(datetime.datetime.now().month) + '/1/' + str(datetime.datetime.now().year), freq='MS'))

In [4]:
for i in bls_dfs.keys():
    for j in bls_dfs[i].columns:
        all_ppi[j] = bls_dfs[i][j]

  all_ppi[j] = bls_dfs[i][j]


In [5]:
all_ppi_q = all_ppi.resample('Q').mean()

In [6]:
all_ppi_q.to_csv('all_ppi_q.csv')

Remove series with no data

In [7]:
ppi_no_data = list(all_ppi_q.loc[:,all_ppi_q.isna().all()].columns)

In [8]:
# remove series with no data
ppi_clean = all_ppi_q.loc[:,~all_ppi_q.isna().all()]

Calculate endpoints to identify discontinued series

In [9]:
endpoints = {}
for i in ppi_clean.columns:
    endpoints[i] = ppi_clean[i].last_valid_index()

In [10]:
pd.Series(endpoints).to_csv('endpoints.csv')

In [11]:
# count each instance of the endpoint dates
endpoints_count = {}

for i in endpoints.values():
    # Timestamp formatted to string and time removed
    i = str(i).split(" ")[0]
    if i not in endpoints_count.keys():
        endpoints_count[i] = 1
    else:
        endpoints_count[i] += 1

In [12]:
# sort the series into lists by endpoint
endpoints_list = {}

for i in endpoints.keys():
    # Timestamp formatted to string and time removed
    date = str(endpoints[i]).split(" ")[0]
    if date not in endpoints_list.keys():
        endpoints_list[date] = [i]
    else:
        endpoints_list[date].append(i)

In [13]:
# check there are no differences between list and count dictionaries - will print eroneous series if so
for i in endpoints_count.keys():
    if len(endpoints_list[i]) != endpoints_count[i]:
        print(i)

Remove discontinued series

In [14]:
# create a cutoff value - anything prior to the start of the previous year?
cutoff = str(datetime.datetime.now().year - 1) + '-01-01'

In [15]:
to_cut = {}

for i in endpoints_list.keys():
    if i < cutoff:
        to_cut[i] = endpoints_list[i]

In [16]:
for i in to_cut.keys():
    ppi_clean = ppi_clean.drop(to_cut[i], axis=1)

Map codes to digit levels

In [17]:
ppi_codes =  pd.DataFrame(all_ppi_q.columns, columns=['START_CODE'])

In [18]:
# remove leading 'PCU' part of the code
ppi_codes['CODE2'] = ppi_codes['START_CODE'].str.slice(3)

In [19]:
# split codes including '-'
ppi_codes['CODE3'] = ppi_codes['CODE2'].str.split('-').str[0]

In [20]:
# identify digit level
ppi_codes['DIGIT'] = np.nan

for i in ppi_codes.index:
   if len(ppi_codes.loc[i, 'CODE3']) == 3:
      ppi_codes.loc[i, 'DIGIT'] = 3
   elif len(ppi_codes.loc[i, 'CODE3']) == 4:
      ppi_codes.loc[i, 'DIGIT'] = 4
   elif len(ppi_codes.loc[i, 'CODE3']) == 5:
      ppi_codes.loc[i, 'DIGIT'] = 5
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 6:
      ppi_codes.loc[i, 'DIGIT'] = 6
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 6.5:
      ppi_codes.loc[i, 'DIGIT'] = 7
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 7:
      ppi_codes.loc[i, 'DIGIT'] = 8
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 7.5:
      ppi_codes.loc[i, 'DIGIT'] = 9
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 8:
      ppi_codes.loc[i, 'DIGIT'] = 10
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 8.5:
      ppi_codes.loc[i, 'DIGIT'] = 11
   elif len(ppi_codes.loc[i, 'CODE3']) / 2 == 9:
      ppi_codes.loc[i, 'DIGIT'] = 12
   

In [24]:
# set aggregate series digit to 'AGG'
aggs = ['PCUAMUM--AMUM--', 'PCUOMIN--OMIN--', 'PCUOMFG--OMFG--', 'PCUATRADEATRADE', 'PCUAWHLTRAWHLTR', 'PCUARETTRARETTR', 'PCUATRNWRATRNWR', 'PCUATRANSATRANS', 'PCUADLVWRADLVWR', 'PCUATTDSVATTDSV', 'PCUAINFO-AINFO-', 'PCUASHC--ASHC--', 'PCUASTDSVASTDSV']

for i in aggs:
    ppi_codes.loc[ppi_codes['START_CODE'] == i, 'DIGIT'] = 'AGG'

In [25]:
codes_by_digit = {}

for i in ppi_codes['DIGIT'].unique():
    codes_by_digit[i] = list(ppi_codes[ppi_codes['DIGIT'] == i]['START_CODE'])

In [26]:
codes_by_digit_clean = {}

for i in ppi_codes['DIGIT'].unique():
    codes_by_digit_clean[i] = []
    for j in ppi_clean.columns:
        if j in ppi_codes[ppi_codes['DIGIT'] == i]['START_CODE'].values:
            codes_by_digit_clean[i].append(j)



Sort PPI dataframe into multi digit sections

In [27]:
# all series
all_digit_dfs = {}

for i in codes_by_digit.keys():
    all_digit_dfs[i] = all_ppi_q[codes_by_digit[i]]


In [28]:
# create a excel writer object
with pd.ExcelWriter("C:\\DSWG_PPI\\all_ppi_q_by_digit.xlsx") as writer:
   for i in all_digit_dfs.keys():
      all_digit_dfs[i].to_excel(writer, sheet_name=str(i))

In [29]:
# clean series
all_digit_dfs_clean = {}

for i in codes_by_digit_clean.keys():
    all_digit_dfs_clean[i] = ppi_clean[codes_by_digit_clean[i]]


# Create Parent-Child Matrix

In [30]:
parent_child_matrices = {}

for i in range(4,13):
    # start with 4 digit (3 digit series are all headlines by definition)
    if i == 4:
        # create matrix df with index of all digit level codes
        matrix = pd.DataFrame(columns = ppi_codes['START_CODE'], index = all_digit_dfs[i].columns)

        for j in matrix.index:
            parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (i-1)]['START_CODE'] if j[3:(3+i-1)] in col[3:(3+i-1)]]
            matrix.loc[j, parents] = 1

        parent_child_matrices[i] = matrix
    #5 to 7 digit level
    elif i < 8:
        # create matrix df with index of all digit level codes
        matrix = pd.DataFrame(columns = ppi_codes['START_CODE'], index = all_digit_dfs[i].columns)
        for j in matrix.index:
            k = i 
            parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (k-1)]['START_CODE'] if j[3:(3+k-1)] in col[3:(3+k-1)]]
            # while loop entered if parent in immediate digit level down is not found. Will then check the next level
            # i.e. if 5 digit level code does not find a parent in the 4 digit level, then the 3 digit level is checked
            # if no parent found, nothing is assigned and code assumed to be at the highest level available
            while len(parents)==0 and k>4:
                k = k-1
                parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (k-1)]['START_CODE'] if j[3:(3+k-1)] in col[3:(3+k-1)]]
            matrix.loc[j, parents] = 1

        parent_child_matrices[i] = matrix
    #8 and beyond digit level - focus needs to be on the product codes
    else:
        # create matrix df with index of all digit level codes
        matrix = pd.DataFrame(columns = ppi_codes['START_CODE'], index = all_digit_dfs[i].columns)
        for j in matrix.index:
            k = i
            parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (k-1)]['START_CODE'] if j[9:(9+k-1)] in col[9:(9+k-1)]]
            while len(parents)==0 and k>4:
                k = k-1
                if k < 8:
                    parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (k-1)]['START_CODE'] if j[3:(3+k-1)] in col[3:(3+k-1)]]
                else:
                    parents = [col for col in ppi_codes[ppi_codes['DIGIT'] == (k-1)]['START_CODE'] if j[9:(9+k-1)] in col[9:(9+k-1)]]
            matrix.loc[j, parents] = 1
        parent_child_matrices[i] = matrix

In [31]:
for i in parent_child_matrices.keys():
    parent_child_matrices[i]['sum'] = parent_child_matrices[i].sum(axis=1)

In [32]:
# create a excel writer object
with pd.ExcelWriter("C:\\DSWG_PPI\\parent_child_matrices.xlsx") as writer:
   for i in parent_child_matrices.keys():
      parent_child_matrices[i].to_excel(writer, sheet_name=str(i))

Identify series with breaks in the time series

In [33]:
missing_time_series_all = {}
# loop over digit level
for i in all_digit_dfs.keys():
    missing_time_series_all[i] = []
    # loop over codes at each level
    for j in all_digit_dfs[i].columns:
        # if any nan values between the first and last datapoints there are missing values, append to list
        if all_digit_dfs[i].loc[all_digit_dfs[i][j].first_valid_index():all_digit_dfs[i][j].last_valid_index(),j].isna().any():
            missing_time_series_all[i].append(j)

In [34]:
missing_time_series_clean = {}
# loop over digit level
for i in all_digit_dfs_clean.keys():
    missing_time_series_clean[i] = []
    # loop over codes at each level
    for j in all_digit_dfs_clean[i].columns:
        # if any nan values between the first and last datapoints there are missing values, append to list
        if all_digit_dfs_clean[i].loc[all_digit_dfs_clean[i][j].first_valid_index():all_digit_dfs_clean[i][j].last_valid_index(),j].isna().any():
            missing_time_series_clean[i].append(j)

Fill missing series

In [35]:
for i in all_digit_dfs_clean.keys():
    for j in missing_time_series_clean[i]:
        all_digit_dfs_clean[i].loc[:,j] = all_digit_dfs_clean[i].loc[:,j].interpolate(method='linear', limit_area = 'inside') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


In [36]:
# create a excel writer object
with pd.ExcelWriter("C:\\DSWG_PPI\\clean_ppi_q_by_digit.xlsx") as writer:
   for i in all_digit_dfs_clean.keys():
      all_digit_dfs_clean[i].to_excel(writer, sheet_name=str(i))