# Data Collection and Storage

1) Preliminary Setup
2) Download Data from FRED API
3) Save Data into Google BigQuery

# 1. Preliminary Setup

In [1]:
# install api for access financial data from FRED website https://fred.stlouisfed.org/
!pip install pyfredapi --upgrade --quiet

# install api for access google bigquery
!pip install google-cloud-bigquery --quiet

In [139]:
# import api for access financial data 
import pyfredapi as pf
# import api for access google bigquery
from google.cloud import bigquery as bq
# import for getting details for authentication and project id
import google.auth
# import pandas for data manipulation
import pandas as pd
# import numpy for data manipulation
import numpy as np
# import data list from another notebook
import Data_List

In [3]:
# Apply for a FRED API key at the website below
# https://fred.stlouisfed.org/docs/api/api_key.html

# 2. Download Data from FRED API

In [4]:
# Download one FRED series and apply a simple cleanse
def get(series_id):
    """Return a FRED series as a one-column dataframe indexed by date.

    Parameters
    ----------
    series_id : str
        FRED series identifier; it is also used as the name of the value column.

    Returns
    -------
    pandas.DataFrame
        The "value" column of the downloaded series, renamed to `series_id`,
        indexed by the "date" column, with rows containing missing values dropped.
    """
    observations = pf.get_series(series_id)[["date", "value"]]
    return (
        observations
        .set_index("date")
        .rename({"value": series_id}, axis=1)
        .dropna()
    )

# Define a function to change data frequency
def change_freq(df, freq='M'):
    """Regroup a date-indexed dataframe to `freq` and label rows by month start.

    The frame is grouped into `freq` bins and the maximum of each bin is kept.
    Bins without observations become NaN and are filled by linear interpolation,
    and values are rounded to 3 decimals. Each row is then relabelled with the
    first day of the month its bin label falls in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex.
    freq : str, default 'M'
        Offset alias handed to ``pd.Grouper``.
        NOTE(review): pandas 2.2 deprecated the 'M' alias in favour of 'ME';
        the default is kept for backward compatibility - confirm against the
        installed pandas version.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as `df` and an index named 'date'.
    """
    resampled = df.groupby(pd.Grouper(freq=freq)).max().interpolate().round(3)
    # Relabel on the index directly. Going through temporary 'year', 'month' and
    # 'date' columns would overwrite (and then drop) input columns with those names.
    month_start = resampled.index.to_period('M').to_timestamp()
    resampled.index = month_start.rename('date')
    return resampled

In [10]:
# The series ids to download are declared in the companion Data_List module
dict_data = Data_List.dict_data

# Map every series id to its cleaned single-column dataframe
dict_df = {series_id: get(series_id) for series_id in dict_data}


In [None]:
# Data Cleansing
# Convert every series that FRED does not report as monthly to a monthly series.
# The check needs its own loop: as a bare indented `if` the cell cannot run, and
# `data` would only ever refer to the last series id of the download loop.
for data in dict_df:
    if pf.get_series_info(series_id=data).frequency_short != 'M':
        dict_df[data] = change_freq(dict_df[data])

In [209]:
# Left-join all series on the index of the first non-empty frame
combine = pd.DataFrame()

for series_df in dict_df.values():
    # the first frame seeds `combine`; later frames are joined onto its index
    combine = series_df.copy() if combine.empty else combine.join(series_df)

In [210]:
combine

Unnamed: 0_level_0,FEDFUNDS,TB3MS,CPIAUCSL,UNRATE,GDP
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1954-07-01,0.80,0.72,26.860,5.8,390.996
1954-08-01,1.22,0.92,26.850,6.0,
1954-09-01,1.07,1.01,26.810,6.1,
1954-10-01,0.85,0.98,26.720,5.7,399.734
1954-11-01,0.83,0.93,26.780,5.3,
...,...,...,...,...,...
2023-11-01,5.33,5.27,308.024,3.7,
2023-12-01,5.33,5.24,308.742,3.7,
2024-01-01,5.33,5.22,309.685,3.7,
2024-02-01,5.33,5.24,311.054,3.9,


In [781]:
combine.tail(20)

Unnamed: 0_level_0,FEDFUNDS,TB3MS,CPIAUCSL,UNRATE,GDP
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-08-01,2.33,2.63,295.209,3.6,
2022-09-01,2.56,3.13,296.341,3.5,
2022-10-01,3.08,3.72,297.863,3.6,26408.405
2022-11-01,3.78,4.15,298.648,3.6,
2022-12-01,4.1,4.25,298.812,3.5,
2023-01-01,4.33,4.54,300.356,3.4,26813.601
2023-02-01,4.57,4.65,301.509,3.6,
2023-03-01,4.65,4.69,301.744,3.5,
2023-04-01,4.83,4.92,303.032,3.4,27063.012
2023-05-01,5.06,5.14,303.365,3.7,


In [188]:
# FEDFUNDS is available on the 1st of the following month
# TB3MS is available on the 1st of the following month
# CPIAUCSL is available around the 12th of the following month
# UNRATE is available around the 8th of the following month
# GDP (advance estimate) is available around the 28th of the first month after the quarter
# GDP (second estimate) is available around the 28th of the second month after the quarter
# GDP (third estimate) is available around the 28th of the third month after the quarter
In [787]:
import requests
from bs4 import BeautifulSoup

In [786]:
result = requests.get('https://www.atlantafed.org/cqer/research/gdpnow/archives')

In [789]:
soup = BeautifulSoup(result.text, 'html.parser')

In [873]:
GDP_forecast_archives = soup.select_one('div[class="row GDPNowAllUpdates"] > p:nth-of-type(2)').contents

In [995]:
soup.select('div[class="row GDPNowAllUpdates"] > p:nth-of-type(2)')[0].contents

['The GDPNow model estimate for real GDP growth (seasonally adjusted annual rate) in the first quarter of 2024 is ',
 <strong>2.5 percent</strong>,
 ' on April 4, down from 2.8 percent on April 1. After this morning’s international release from the US Census Bureau and the US Bureau of Economic Analysis, the nowcast of the contribution of net exports to annualized first-quarter real GDP growth decreased from -0.48 percentage points to -0.57 percentage points.']

In [907]:
dict_quarter_to_month = {
    'first': '01',
    'second': '04',
    'third': '07',
    'fourth': '10'}

In [906]:
GDP_forecast_date_pre = GDP_forecast_archives[0]
word = GDP_forecast_date_pre.split()
quarter_index = word.index('quarter')
quarter_num = word[quarter_index-1]
year = word[quarter_index+2]

In [908]:
GDP_forecast_date = pd.to_datetime(year+'-'+dict_quarter_to_month[quarter_num]+'-01')

In [915]:
GDP_forecast_percent_pre = GDP_forecast_archives[1]
GDP_forecast_percent = float(GDP_forecast_percent_pre.text.split()[0])

In [917]:
GDP_forecast_date, GDP_forecast_percent

(Timestamp('2024-01-01 00:00:00'), 2.5)

In [1010]:
soup.select('div[class="row GDPNowAllUpdates"] > p:nth-of-type(1)')[0].text

'April 04, 2024'

In [None]:
    # NOTE(review): these statements are indented although nothing encloses them, so this
    # cell raises IndentationError if run as-is. They repeat the download and parse steps
    # of the earlier cells that build `result` and `soup` - dedent or delete this cell.
    # Download the Atlanta Fed GDPNow archive page.
    result = requests.get('https://www.atlantafed.org/cqer/research/gdpnow/archives')
    # Parse the returned HTML with Python's built-in 'html.parser'.
    soup = BeautifulSoup(result.text, 'html.parser')

In [1028]:
def GDP_forecast(order=0, page=None):
    """Extract one GDPNow update from the parsed archive page.

    Parameters
    ----------
    order : int, default 0
        Position of the update among the matched paragraphs (0 is the first match).
    page : object, optional
        Parsed page exposing ``select(css)`` whose results expose ``.text``
        (e.g. a BeautifulSoup document). Defaults to the notebook-level `soup`,
        so existing calls ``GDP_forecast(i)`` keep working.

    Returns
    -------
    tuple
        ``(quarter_start, growth_percent, publish_date)`` where `quarter_start`
        is a pandas Timestamp for the first day of the forecast quarter,
        `growth_percent` is a float and `publish_date` is the date text as
        shown on the page.

    Raises
    ------
    IndexError
        If `order` is beyond the number of updates on the page.
    ValueError
        If the quarter/year or the percent figure cannot be found in the text.
    """
    import re  # local import: only this function needs it

    if page is None:
        page = soup  # fall back to the document parsed earlier in the notebook

    dict_quarter_to_month = {'first': '01', 'second': '04', 'third': '07', 'fourth': '10'}
    css_GDP_forecast_archives = 'div[class="row GDPNowAllUpdates"] > p:nth-of-type(2)'
    css_publish_date = 'div[class="row GDPNowAllUpdates"] > p:nth-of-type(1)'

    # Collapse all whitespace so the patterns below can rely on single spaces.
    summary = ' '.join(page.select(css_GDP_forecast_archives)[order].text.split())

    quarter_match = re.search(r'\b(first|second|third|fourth) quarter of (\d{4})', summary)
    if quarter_match is None:
        raise ValueError(f'no "<n-th> quarter of <year>" phrase in: {summary!r}')
    quarter_num, year = quarter_match.groups()
    GDP_forecast_date = pd.to_datetime(year + '-' + dict_quarter_to_month[quarter_num] + '-01')

    # Take the token right before the first "percent"; the word boundary tolerates
    # trailing punctuation ("percent,") but rejects "percentage".
    percent_match = re.search(r'(\S+) percent\b', summary)
    if percent_match is None:
        raise ValueError(f'no "<number> percent" phrase in: {summary!r}')
    GDP_forecast_percent = float(percent_match.group(1))

    publish_date = page.select(css_publish_date)[order].text

    return GDP_forecast_date, GDP_forecast_percent, publish_date

In [1036]:
for i in range(40):
    print(GDP_forecast(i))

(Timestamp('2024-01-01 00:00:00'), 2.5, 'April 04, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.8, 'April 01, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.3, 'March 29, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.1, 'March 26, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.1, 'March 19, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.3, 'March 14, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.5, 'March 07, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.5, 'March 06, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.1, 'March 01, 2024')
(Timestamp('2024-01-01 00:00:00'), 3.0, 'February 29, 2024')
(Timestamp('2024-01-01 00:00:00'), 3.2, 'February 27, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.9, 'February 16, 2024')
(Timestamp('2024-01-01 00:00:00'), 2.9, 'February 15, 2024')
(Timestamp('2024-01-01 00:00:00'), 3.4, 'February 08, 2024')
(Timestamp('2024-01-01 00:00:00'), 3.4, 'February 07, 2024')
(Timestamp('2024-01-01 00:00:00'), 4.2, 'February 01, 2024')
(Timestamp('2024-01-01 00:00:00'), 3.0, 'January 26

In [1041]:
combine['GDP'].interpolate(limit_direction='backward').shift(2)

date
1954-07-01             NaN
1954-08-01             NaN
1954-09-01      390.996000
1954-10-01      393.908667
1954-11-01      396.821333
                  ...     
2023-11-01    27841.374667
2023-12-01    27956.998000
2024-01-01             NaN
2024-02-01             NaN
2024-03-01             NaN
Name: GDP, Length: 837, dtype: float64

In [192]:
# Manually transcribed nowcast inputs. Each value is a percent that the fill
# helpers defined below apply to the last observed value of the matching series.

# https://www.clevelandfed.org/indicators-and-data/inflation-nowcasting
# Month-over-month percent change nowcast for March 2024 is 0.34%
CPIAUCSL_percent = 0.34

# https://zh.tradingeconomics.com/united-states/unemployment-rate
# Predicted unemployment rate for March 2024 is 3.9% (change vs. last value = 0%)
UNRATE_percent = 0

# https://www.atlantafed.org/cqer/research/gdpnow
# GDP growth is 2.8%
# NOTE(review): the scraped GDPNow text above describes this figure as a seasonally
# adjusted annual rate of real GDP growth, while process_gdp applies it as a single
# percent step on the last GDP level - confirm this is the intended conversion.
GDP_percent = 2.8

In [247]:
def fillna_with_percent_of_last_value(series, percent=0):
    """Fill a missing final element from the last observed value grown by `percent`.

    Only the final element is written; any other missing values are left as
    they are. The input is never modified.

    Parameters
    ----------
    series : pandas.Series
        Series whose tail may be missing.
    percent : float, default 0
        Percent change applied to the last non-missing value.

    Returns
    -------
    pandas.Series
        Copy of `series`. If its last element was missing and at least one
        value is present, the last element is
        ``round(last_present * (1 + percent / 100), 3)``. An empty or entirely
        missing series is returned unchanged instead of raising IndexError.
    """
    filled = series.copy()
    present = filled.notna().to_numpy()
    if not present.any() or present[-1]:
        # nothing to grow from, or nothing to fill
        return filled
    # position of the last observed value, counted from the start
    last_pos = len(present) - 1 - int(present[::-1].argmax())
    filled.iloc[-1] = round(filled.iloc[last_pos] * (1 + percent / 100), 3)
    return filled

In [248]:
fillna_with_percent_of_last_value(combine['CPIAUCSL'], CPIAUCSL_percent)

date
1954-07-01     26.860
1954-08-01     26.850
1954-09-01     26.810
1954-10-01     26.720
1954-11-01     26.780
               ...   
2023-11-01    308.024
2023-12-01    308.742
2024-01-01    309.685
2024-02-01    311.054
2024-03-01    312.112
Name: CPIAUCSL, Length: 837, dtype: float64

In [250]:
fillna_with_percent_of_last_value(combine['UNRATE'], UNRATE_percent)

date
1954-07-01    5.8
1954-08-01    6.0
1954-09-01    6.1
1954-10-01    5.7
1954-11-01    5.3
             ... 
2023-11-01    3.7
2023-12-01    3.7
2024-01-01    3.7
2024-02-01    3.9
2024-03-01    3.9
Name: UNRATE, Length: 837, dtype: float64

In [776]:
def check_na(series):
    """Count the consecutive missing values at the end of `series`.

    Parameters
    ----------
    series : pandas.Series

    Returns
    -------
    int
        Number of trailing missing values; 0 when the last element is present.
        For an empty or entirely missing series the length of the series is
        returned instead of raising IndexError.
    """
    missing = pd.isna(series).to_numpy()
    if missing.all():
        # also covers the empty series (length 0)
        return len(missing)
    # scanning from the end, the first non-missing position equals the trailing count
    return int(missing[::-1].argmin())

In [777]:
def fillna_with_two_adj_value(series):
    """Fill a missing last and/or first element by linear extrapolation.

    For each end of the series (last element first, then the first element),
    scan inwards to the nearest non-missing value `a` and take its inward
    neighbour `b`; the end element is then set to ``2 * a - b``. Only the end
    element itself is written, so any further missing values between the end
    and `a` stay missing. The input series is not modified.
    """
    patched = series.copy()
    # (end position, inward scan direction); the tail is handled before the head
    for edge, step in ((-1, -1), (0, 1)):
        anchor = edge
        while pd.isna(patched.iloc[anchor]):
            anchor += step
        if anchor != edge:
            patched.iloc[edge] = patched.iloc[anchor] * 2 - patched.iloc[anchor + step]
    return patched

In [778]:
def process_gdp(series, percent = 0):
    """Fill the gaps of a GDP series and extend its tail using `percent`.

    Parameters
    ----------
    series : pandas.Series
        GDP series that may contain missing values, including at the end.
    percent : float, default 0
        Percent change passed to fillna_with_percent_of_last_value.

    Returns
    -------
    pandas.Series
        Result of the interpolate / shift / fill sequence below, with the two
        end elements finished by fillna_with_two_adj_value.
    """
    series = series.copy()
    # number of missing values at the end of the input
    na = check_na(series)
    # evaluates to na - 4 when there are trailing gaps, otherwise to -3
    shift = na - 3 - (na > 0)
    
    # interpolate with limit_direction='backward', then move every value one row later
    series_intrp_1 = series.interpolate(limit_direction='backward').shift(1)
    # realign by `shift` rows (a negative shift moves values towards the start)
    series_shift = series_intrp_1.shift(shift)
    # if the last row is missing, set it to the last present value grown by `percent`
    series_percent = fillna_with_percent_of_last_value(series_shift, percent)
    # interpolate forward up to that row, then undo the realignment
    series_intrp_2 = series_percent.interpolate(limit_direction='forward').shift(-shift)
    # rows still missing fall back to the first interpolation
    series_final = series_intrp_2.fillna(value=series_intrp_1)
    
    # extrapolate a missing first/last element from its two neighbours
    return fillna_with_two_adj_value(series_final)

In [779]:
process_gdp(combine['GDP'].iloc[:-5], GDP_percent)

date
1954-07-01      388.083333
1954-08-01      390.996000
1954-09-01      393.908667
1954-10-01      396.821333
1954-11-01      399.734000
                  ...     
2023-06-01    27245.384000
2023-07-01    27427.756000
2023-08-01    27610.128000
2023-09-01    27725.751333
2023-10-01    27841.374667
Name: GDP, Length: 832, dtype: float64

In [652]:
process_gdp(combine['GDP'], GDP_percent)

date
1954-07-01      388.083333
1954-08-01      390.996000
1954-09-01      393.908667
1954-10-01      396.821333
1954-11-01      399.734000
                  ...     
2023-11-01    27956.998000
2023-12-01    28217.930000
2024-01-01    28478.862000
2024-02-01    28739.794000
2024-03-01    29000.726000
Name: GDP, Length: 837, dtype: float64

In [764]:
combine['GDP'].iloc[:-5]

date
1954-07-01      390.996
1954-08-01          NaN
1954-09-01          NaN
1954-10-01      399.734
1954-11-01          NaN
                ...    
2023-06-01          NaN
2023-07-01    27610.128
2023-08-01          NaN
2023-09-01          NaN
2023-10-01    27956.998
Name: GDP, Length: 832, dtype: float64

In [655]:
def concat_series(series, na, percent):
    """Run process_gdp on `series` and on progressively shorter copies of it.

    Parameters
    ----------
    series : pandas.Series
        Series to process.
    na : int
        Number of versions to build: the full series plus the series with its
        last 1 .. na - 1 rows removed.
    percent : float
        Percent passed through to process_gdp.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The processed versions side by side (one column per version, full
        series first). When `na` is 1 or less only the full series is
        processed and that single Series is returned, as before.
    """
    versions = [process_gdp(series, percent)]
    versions.extend(process_gdp(series.iloc[:-i], percent) for i in range(1, na))
    if len(versions) == 1:
        return versions[0]
    # concatenate once instead of re-copying the growing frame on every iteration
    return pd.concat(versions, axis=1)

In [765]:
result = concat_series(combine['GDP'], 5, GDP_percent)

In [766]:
sum(result.mean(axis=1) - result.iloc[:,0])

-9.265477274311706e-12

In [767]:
result[~result.eq(result.iloc[:, 0], axis=0).all(1)]

Unnamed: 0_level_0,GDP,GDP,GDP,GDP,GDP
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-12-01,28217.93,28217.93,28217.93,28217.93,
2024-01-01,28478.862,28478.862,28478.862,,
2024-02-01,28739.794,28739.794,,,
2024-03-01,29000.726,,,,


In [263]:
GDP_intrp_1 = combine['GDP'].interpolate(limit_direction='backward').shift(2)

In [264]:
GDP_intrp_1_fill_1 = fillna_with_percent_of_last_value(GDP_intrp_1, GDP_percent)

In [265]:
GDP_intrp_2_fill_1 = GDP_intrp_1_fill_1.interpolate(limit_direction='forward').shift(-1)

In [266]:
GDP_intrp_2_fill_1

date
1954-07-01             NaN
1954-08-01      390.996000
1954-09-01      393.908667
1954-10-01      396.821333
1954-11-01      399.734000
                  ...     
2023-11-01    27956.998000
2023-12-01    28217.930000
2024-01-01    28478.862000
2024-02-01    28739.794000
2024-03-01             NaN
Name: GDP, Length: 837, dtype: float64

In [769]:
check = fillna_with_two_adj_value(GDP_intrp_2_fill_1)

In [773]:
check

date
1954-07-01      388.083333
1954-08-01      390.996000
1954-09-01      393.908667
1954-10-01      396.821333
1954-11-01      399.734000
                  ...     
2023-11-01    27956.998000
2023-12-01    28217.930000
2024-01-01    28478.862000
2024-02-01    28739.794000
2024-03-01    29000.726000
Name: GDP, Length: 837, dtype: float64

In [780]:
sum(~(check == process_gdp(combine['GDP'], GDP_percent)))

0

# 3. Save Data into Google BigQuery

In [6]:
# A service-account key is generated in Google Cloud and downloaded in json format.
# The environment variable "GOOGLE_APPLICATION_CREDENTIALS" must hold the path of
# that json file, e.g. '/Users/.../project_id.json'; google.auth.default() reads it.

# get the default credentials and the project id they belong to
credentials, project_id = google.auth.default()
database_id = 'data'

# save financial data into Google BigQuery, one table per series
for df_name, frame in dict_df.items():
    table_id = df_name
    # reset_index() turns the date index into a regular column so it is uploaded.
    # NOTE(review): pandas-gbq presumably does not upload the index - confirm.
    frame.reset_index().to_gbq(
        f'{database_id}.{table_id}',
        project_id=project_id,
        credentials=credentials,  # use the credentials fetched above explicitly
        if_exists='replace',
    )

100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2182.26it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 8542.37it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 8338.58it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 6326.25it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 9078.58it/s]


In [7]:
# Check that every uploaded table can be read back
client = bq.Client(project=project_id, credentials=credentials)
for df_name in dict_df:
    table_id = df_name
    # Backtick-quote the full table path so that project, dataset or table names
    # containing characters such as '-' are still parsed as a single identifier.
    sql = f'select * from `{project_id}.{database_id}.{table_id}`'
    # load the query result into a dataframe and show it
    display(client.query(sql).to_dataframe())

Unnamed: 0,date,FEDFUNDS
0,1961-08-01,2.00
1,2008-06-01,2.00
2,2008-08-01,2.00
3,1956-02-01,2.50
4,1956-03-01,2.50
...,...,...
832,1983-10-01,9.48
833,1980-05-01,10.98
834,2008-11-01,0.39
835,2016-07-01,0.39


Unnamed: 0,date,TB3MS
0,1956-03-01,2.25
1,1960-12-01,2.25
2,2018-10-01,2.25
3,2008-01-01,2.75
4,1993-01-01,3.00
...,...,...
1078,1948-08-01,1.06
1079,1950-10-01,1.31
1080,1951-07-01,1.56
1081,1951-11-01,1.56


Unnamed: 0,date,CPIAUCSL
0,1947-03-01,22.000
1,1947-04-01,22.000
2,1957-05-01,28.000
3,1959-02-01,29.000
4,1967-12-01,34.000
...,...,...
921,2023-09-01,307.288
922,2022-09-01,296.341
923,2014-11-01,236.983
924,2015-10-01,237.733


Unnamed: 0,date,UNRATE
0,1953-05-01,2.5
1,1953-06-01,2.5
2,1951-05-01,3.0
3,1952-05-01,3.0
4,1952-06-01,3.0
...,...,...
909,2010-03-01,9.9
910,2010-04-01,9.9
911,1982-10-01,10.4
912,1983-01-01,10.4


Unnamed: 0,date,GDP
0,1947-01-01,243.164
1,1947-02-01,244.099
2,1947-03-01,245.033
3,1947-04-01,245.968
4,1947-05-01,247.174
...,...,...
917,2023-06-01,27427.756
918,2023-07-01,27610.128
919,2023-08-01,27725.751
920,2023-09-01,27841.375
