# Preamble

In [1]:
%matplotlib inline

## Notebook parameters

In [2]:
# Notebook-level settings, defined before the preamble script is executed.
# NOTE(review): presumably consumed by preamble.py (which is run with
# `%run -i`, i.e. inside this namespace) — confirm against that script.
NAME = 'a_4_calculate_calendar_bias'
PROJECT = 'covid-empirical'
PYTHON_VERSION = '3.9'
USER = 'Ties'
CONDA_ENVIRONMENT = 'covid-empirical'
# NOTE(review): looks like this toggles where the `pipeline` folder used when
# saving the output lives — verify in preamble.py.
USE_EXTERNAL_PIPELINE = True

## Run preamble script

In [3]:
%run -i preamble.py 

----------------------------------------------------------------------------------
The following utility functions are loaded and available through `functions.<..>`:
----------------------------------------------------------------------------------

extract_data_edgar_link, fast_load_json, fast_store_json, flatten_multiindex_column, inDB, recreate_edgar_link

----------------------------------------------------------------
The following modules and functions are imported by preamble.py:
----------------------------------------------------------------

copy, delayed, importlib, json, math, np, orjson, os, pd, plt, pqdm_p, pqdm_t, random, re, requests, sys, time, yaml


## Notebook specific imports

In [4]:
import seaborn as sns
from scipy.stats import ks_2samp

In [5]:
# Render and save every figure at 300 dpi.
sns.set_theme(rc={"figure.dpi": 300, "savefig.dpi": 300})

# Switch to the plain white style and hide the top/right axes spines.
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(rc=custom_params, style="white")

-------
# Create bias estimate for every calendar day
-------

## Create starting dataframe

In [28]:
# One row per calendar day from 1 Jan 2015 through 31 Dec 2022 (inclusive),
# held in a single `date` column.
all_date_df = pd.DataFrame(
    {'date': pd.date_range(start='1-1-2015', end='12-31-2022')}
)

In [29]:
# Derive the calendar attributes of every date; the cells below group on and
# look up by these columns.
all_date_df['day_of_week'] = all_date_df['date'].dt.day_name()      # e.g. 'Thursday'
all_date_df['day_of_week_i'] = all_date_df['date'].dt.day_of_week   # 0 = Monday ... 6 = Sunday
all_date_df['year'] = all_date_df['date'].dt.year
all_date_df['quarter'] = all_date_df['date'].dt.quarter
all_date_df['month'] = all_date_df['date'].dt.month_name()          # e.g. 'December'
all_date_df['day'] = all_date_df['date'].dt.day                     # day of the month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# `dt.isocalendar().week` returns the same ISO week number; it comes back as
# UInt32, so cast it to a plain integer column as before.
all_date_df['week'] = all_date_df['date'].dt.isocalendar().week.astype(int)

In [30]:
# week_rank = how many times this weekday has occurred so far in its month
# (1 = first Thursday of the month, 2 = second Thursday, ...).
#
# Computed in one vectorised pass: number the rows of every
# (year, month, weekday) group in date order. This replaces a loop that grew a
# frame with `DataFrame.append`, which was removed in pandas 2.0 and copied
# the whole frame on every iteration.
all_date_df = all_date_df.sort_values('date')  # cumcount numbers rows in frame order
all_date_df['week_rank'] = (
    all_date_df.groupby(['year', 'month', 'day_of_week']).cumcount() + 1
)

### Identify the previous year's equivalent day

In [31]:
# Two-level lookup table:
#   '<year>_<month name>_<weekday name>' -> {week_rank: full row as a dict}
# e.g. lookup_dict['2019_December_Thursday'][4] is the 4th Thursday of Dec 2019.
lookup_dict = {}
for name, df in all_date_df.groupby(['year', 'month', 'day_of_week']):
    key = '_'.join(map(str, name))
    lookup_dict[key] = {
        record['week_rank']: record.to_dict()
        for _, record in df.iterrows()
    }

In [40]:
# For every date, find the day-of-month of the "equivalent" day one year
# earlier: same month, same weekday, same occurrence within the month.
#
# Matching rules (by week_rank of the current date):
#   * 1-3            -> the occurrence with the same rank last year.
#   * last in month  -> the last occurrence last year (the 5th if that month
#                       had one, otherwise the 4th).
#   * 4th of 5       -> the 4th occurrence last year.
# Dates whose previous year is not in `lookup_dict` (the first year of the
# range) get NaN.
all_date_df = all_date_df.sort_values('date')

# Highest week_rank each weekday reaches within its own month, computed once
# for the whole frame instead of re-filtering the frame for every row.
max_week_ranks = (
    all_date_df
    .groupby(['year', 'month', 'day_of_week'])['week_rank']
    .transform('max')
)

tmp_list = []
for year, month, day_of_week, week_rank, max_week_rank in zip(
    all_date_df['year'],
    all_date_df['month'],
    all_date_df['day_of_week'],
    all_date_df['week_rank'],
    max_week_ranks,
):
    ## Relative to last year
    lookup_list = lookup_dict.get(f'{year - 1}_{month}_{day_of_week}')
    if lookup_list is None:
        # No data for the previous year.
        tmp_list.append(np.nan)
        continue

    if week_rank not in (1, 2, 3, 4, 5):
        # A weekday occurs at most five times in a month. The original code
        # raised a plain string here, which is itself a TypeError in Python 3.
        raise ValueError(f'Impossible week_rank: {week_rank}')

    is_last_in_month = week_rank == max_week_rank
    if week_rank >= 4 and is_last_in_month and 5 in lookup_list:
        prev_rank = 5  # last occurrence maps onto last year's 5th occurrence
    elif week_rank == 5:
        prev_rank = 4  # last year had no 5th occurrence: fall back to the 4th
    else:
        prev_rank = week_rank

    tmp_list.append(lookup_list[prev_rank]['day'])

assert len(tmp_list) == len(all_date_df.index), 'Observations changed?'

# Fresh 0..n-1 index in chronological order, with the new column last.
new_df = all_date_df.reset_index(drop=True)
new_df['same_day_last_year'] = tmp_list
all_date_df = new_df

In [42]:
# Discard every row that contains a missing value in any column (this removes
# the dates for which no previous-year match was found).
all_date_df = all_date_df.dropna(axis=0, how='any')

In [43]:
# With the missing values gone, store the matched day-of-month as an integer.
all_date_df = all_date_df.astype({'same_day_last_year': int})

### Calculate calendar bias

In [44]:
# Calendar bias = this date's day-of-month minus the day-of-month of the
# matched date one year earlier.
all_date_df['calendar_bias'] = all_date_df['day'].sub(all_date_df['same_day_last_year'])

### Quick debug to make sure it worked

In [48]:
# Frequency of each distinct bias value, most common first.
all_date_df.loc[:, 'calendar_bias'].value_counts(sort=True)

-1    1585
-2     540
 6     242
 5     190
Name: calendar_bias, dtype: int64

In [45]:
# Spot check: every Thursday in December of 2019-2021 with its matched day.
all_date_df[
    all_date_df['year'].isin([2019, 2020, 2021])
    & all_date_df['month'].eq('December')
    & all_date_df['day_of_week'].eq('Thursday')
]

Unnamed: 0,date,day_of_week,day_of_week_i,year,quarter,month,day,week,week_rank,same_day_last_year,calendar_bias
1799,2019-12-05,Thursday,3,2019,4,December,5,49,1,6,-1
1806,2019-12-12,Thursday,3,2019,4,December,12,50,2,13,-1
1813,2019-12-19,Thursday,3,2019,4,December,19,51,3,20,-1
1820,2019-12-26,Thursday,3,2019,4,December,26,52,4,27,-1
2163,2020-12-03,Thursday,3,2020,4,December,3,49,1,5,-2
2170,2020-12-10,Thursday,3,2020,4,December,10,50,2,12,-2
2177,2020-12-17,Thursday,3,2020,4,December,17,51,3,19,-2
2184,2020-12-24,Thursday,3,2020,4,December,24,52,4,26,-2
2191,2020-12-31,Thursday,3,2020,4,December,31,53,5,26,5
2527,2021-12-02,Thursday,3,2021,4,December,2,48,1,3,-1


### Save bias

In [46]:
# Write the per-day bias table to the pipeline's `out` folder.
# NOTE(review): `pipeline` is not defined in this notebook — presumably a
# path object provided by preamble.py; confirm there.
output_path = pipeline / 'out' / 'calendar_bias_per_day.xlsx'
all_date_df.to_excel(output_path)

In [47]:
# Preview five random days (transposed so each sampled day is a column).
# A fixed `random_state` makes the preview identical on every
# Restart & Run All instead of changing with each execution.
all_date_df[['date', 'calendar_bias', 'week_rank']].sample(5, random_state=0).T

Unnamed: 0,1686,539,1817,1779,801
date,2019-08-14 00:00:00,2016-06-23 00:00:00,2019-12-23 00:00:00,2019-11-15 00:00:00,2017-03-12 00:00:00
calendar_bias,6,-2,-1,-1,-1
week_rank,2,4,4,3,2
