In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
import time
import os

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Root folder that every read/write in the cells below is built from.
# NOTE(review): hard-coded, machine-specific Windows path; the later cells append
# backslash-separated suffixes to it, so adjust it before running elsewhere.
path = r'D:\РЭШ\Research\PostThesis\data'

## 1. Prepare yearly CPI data
(This part is the same as the one used when preparing the yearly CPI-adjusted data.)

In [7]:
def yearmonth_to_quarter(df, col_year, col_month):
    """Combine year and month columns into quarter labels of the form YYYYqN.

    Example: year 2003 and month 9 give '2003q3'.

    df -- dataframe holding the year and month columns
    col_year -- name of the column with year numbers
    col_month -- name of the column with month numbers (1-12)

    Returns a Series of labels aligned with df's index.
    """
    # Cover all twelve months (consistent with date_to_quarter). With only the
    # quarter-end months mapped, any other month stays an integer and the
    # string concatenation below raises TypeError.
    month_to_quarter = {month: 'q' + str((month - 1) // 3 + 1) for month in range(1, 13)}
    quarters = df[col_year].astype(str) + df[col_month].replace(month_to_quarter)
    return quarters

def quarter_to_date(a):
    """Turn quarter labels such as '2003q1' into datetimes at the quarter's first day.

    a -- iterable of strings whose first four characters are the year and
         whose last character is the quarter number ('1'-'4')

    Returns a list of datetime objects, e.g. '2003q1' -> 2003-01-01.
    """
    first_month = {'1': '01', '2': '04', '3': '07', '4': '10'}
    return [
        datetime.strptime(f"{label[:4]}-{first_month[label[-1]]}-01", '%Y-%m-%d')
        for label in a
    ]

In [3]:
def dates_to_years(dates):
    """Extract the calendar year of every date (2003-09-01 -> 2003).

    dates -- iterable of objects exposing a ``year`` attribute

    Returns a list with one year per input element, in input order.
    """
    return [moment.year for moment in dates]

In [4]:
def date_to_quarter(date):
    """Build the YYYYqN label of the quarter a date falls in (2003-09-01 -> '2003q3').

    date -- object exposing ``year`` and ``month`` attributes

    Returns the label as a string.
    """
    # Months 1-3 -> q1, 4-6 -> q2, 7-9 -> q3, 10-12 -> q4.
    suffix_by_month = {m: f'q{(m - 1) // 3 + 1}' for m in range(1, 13)}
    return f'{date.year}{suffix_by_month[date.month]}'

In [5]:
# Load the quarterly CPI table from the 'Q2Q' sheet, indexed by its 'quarter' column.
df_CPI = pd.read_excel(path+r'\original\CPI_quarterly.xlsx', sheet_name='Q2Q', index_col='quarter')
df_CPI

Unnamed: 0_level_0,End Q to end Q,Q to Q,Q to last year Q
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005q1,105.27,105.09,113.09
2005q2,102.58,103.20,113.76
2005q3,100.57,101.15,112.67
2005q4,102.12,101.44,111.28
2006q1,104.98,104.67,110.84
...,...,...,...
2022q4,101.33,100.55,112.18
2023q1,101.67,101.92,108.62
2023q2,101.06,101.12,102.69
2023q3,101.79,101.47,105.15


In [8]:
# Derive a 'date' column (first day of each quarter) from the quarter labels in
# the index, then a 'year' column from those dates.
df_CPI['date'] = quarter_to_date(df_CPI.index)
df_CPI['year'] = dates_to_years(df_CPI['date'])
df_CPI

Unnamed: 0_level_0,End Q to end Q,Q to Q,Q to last year Q,date,year
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005q1,105.27,105.09,113.09,2005-01-01,2005
2005q2,102.58,103.20,113.76,2005-04-01,2005
2005q3,100.57,101.15,112.67,2005-07-01,2005
2005q4,102.12,101.44,111.28,2005-10-01,2005
2006q1,104.98,104.67,110.84,2006-01-01,2006
...,...,...,...,...,...
2022q4,101.33,100.55,112.18,2022-10-01,2022
2023q1,101.67,101.92,108.62,2023-01-01,2023
2023q2,101.06,101.12,102.69,2023-04-01,2023
2023q3,101.79,101.47,105.15,2023-07-01,2023


In [9]:
def yearly_cpi_accumulation(df, column):
    """Aggregate a quarterly CPI index into one value per year using the geometric mean.

    For every year the values of ``column`` in that year's rows are multiplied
    together and the n-th root is taken, where n is the number of rows
    available for the year (so an incomplete year is averaged over the
    quarters it has).

    df -- dataframe with CPI indices; must contain a 'year' column
    column -- the column name for CPI index

    Returns a dataframe with one row per year, in order of first appearance,
    and the columns 'year', ``column`` (the geometric mean) and 'obs' (the
    number of rows that entered the mean).
    """
    # Each row's own value is used directly, so the function needs neither a
    # 'date' column nor an index made of quarter labels.
    records = []
    for year, values in df.groupby('year', sort=False)[column]:
        count = len(values)
        # NumPy's product propagates NaN (Series.prod would skip it silently).
        product = values.to_numpy().prod()
        records.append({'year': year, column: product ** (1 / count), 'obs': count})
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['year', column, 'obs'])

In [10]:
# Yearly geometric-mean CPI, re-indexed by year; the group-wise mean also turns
# the 'obs' counts into floats.
df_CPI_yearly = (
    yearly_cpi_accumulation(df_CPI, 'Q to last year Q')
    .groupby(['year'])
    .mean()
)
df_CPI_yearly

Unnamed: 0_level_0,Q to last year Q,obs
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2005,112.696337,4.0
2006,109.685389,4.0
2007,108.980338,4.0
2008,114.099294,4.0
2009,111.687674,4.0
2010,106.846404,4.0
2011,108.451093,4.0
2012,105.05277,4.0
2013,106.754304,4.0
2014,107.804001,4.0


In [11]:
# Write the yearly CPI table to CPI_yearly.xlsx in the data folder.
df_CPI_yearly.to_excel(path+r'\CPI_yearly.xlsx')

Compare the computed yearly CPI with the FRED series

In [12]:
# Load the yearly CPI table together with the FRED series.
# NOTE(review): no cell shown here writes this file — presumably CPI_yearly.xlsx
# with a CPI_FRED column added by hand; confirm its provenance.
df = pd.read_excel(path+r'\CPI_yearly+FRED.xlsx')

In [13]:
# Display the loaded comparison table.
df

Unnamed: 0,year,CPI,obs,CPI_FRED
0,2005,112.696337,4,12.685304
1,2006,109.685389,4,9.668655
2,2007,108.980338,4,9.007299
3,2008,114.099294,4,14.110768
4,2009,111.687674,4,11.64733
5,2010,106.846404,4,6.849392
6,2011,108.451093,4,8.440465
7,2012,105.05277,4,5.074743
8,2013,106.754304,4,6.75371
9,2014,107.804001,4,7.823412


In [17]:
# Correlate our yearly CPI with the FRED series over the first 17 rows.
# Our CPI is an index with base 100 (e.g. 112.7) while CPI_FRED appears to be a
# percent change (e.g. 12.7), so the offset that puts them on the same scale is
# 100, not 1. The correlation coefficient itself is unaffected by the shift.
cpi_pct_change = np.array(df['CPI'][:17] - 100)
fred_pct_change = np.array(df['CPI_FRED'][:17])
print(f"{np.corrcoef(cpi_pct_change, fred_pct_change)[0,1]:.10f}")

0.9999901802


## 2. Prepare GDP growth data

In [18]:
# Load the real GDP series.
# NOTE: this rebinds `df`, which held the CPI/FRED comparison table in the previous section.
df = pd.read_excel(path+r'\Real_GDP_FRED_refined.xls')

In [22]:
# Sanity check: count the row-over-previous-row GDP ratios (one fewer than the number of rows).
len(np.array(df['Real GDP'][1:])/np.array(df['Real GDP'][:-1]))

33

In [23]:
# Number of rows in the GDP table.
len(df)

34

In [25]:
# Year-over-year growth factor: Real GDP divided by the previous row's Real GDP.
# shift(1) keeps the column numeric (NaN for the first row, which has no
# predecessor) instead of the object dtype produced by prepending None.
df['GDP growth'] = df['Real GDP'] / df['Real GDP'].shift(1)

In [27]:
# Add a 'year' column taken from each observation date.
df['year'] = dates_to_years(df['observation_date'])

In [28]:
# Display the GDP table with the added 'GDP growth' and 'year' columns.
df

Unnamed: 0,observation_date,Real GDP,GDP growth,year
0,1990-01-01,72910700.0,,1990
1,1991-01-01,69265160.0,0.95,1991
2,1992-01-01,59221710.0,0.855,1992
3,1993-01-01,54069420.0,0.913,1993
4,1994-01-01,47202610.0,0.873,1994
5,1995-01-01,45267300.0,0.959,1995
6,1996-01-01,43634000.0,0.963919,1996
7,1997-01-01,44237000.0,1.013819,1997
8,1998-01-01,41872400.0,0.946547,1998
9,1999-01-01,44531900.0,1.063514,1999


In [29]:
# Write the GDP table to GDP_growth.xlsx in the data folder.
# Raw string, like the other path suffixes in this notebook: '\G' is an invalid
# escape sequence in a normal string literal and triggers a warning on newer Pythons.
df.to_excel(path+r'\GDP_growth.xlsx')