In [None]:
# SCRIPT 0: IMPORT LIBRARIES
import os

import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import yfinance as yf
from linearmodels.panel import PanelOLS
from scipy.stats import ttest_ind
from stargazer.stargazer import Stargazer
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# SCRIPT 1: SETUP - DEFINE CONSTANTS AND TICKERS

# Ticker universe passed to yf.download() in main(), 30 symbols per bucket.
# The dict key is what main() stores in the 'Cap_Group' column for each ticker.
TICKERS = {
    "Large Cap": [
        'EQNR.OL', 'DNB.OL', 'KOG.OL', 'MOWI.OL', 'TEL.OL', 'NHY.OL', 'AKRBP.OL',
        'ORK.OL', 'STB.OL', 'YAR.OL', 'SUBC.OL', 'GJF.OL', 'SALM.OL', 'TGS.OL',
        'TOM.OL', 'VAR.OL', 'NEL.OL', 'FRO.OL', 'BWLPG.OL', 'HAUTO.OL',
        'NOD.OL', 'WAWI.OL', 'NAS.OL', 'BAKKA.OL', 'WWI.OL', 'AFK.OL',
        'AUSS.OL', 'SCATC.OL', 'MPCC.OL', 'HAFNI.OL'
    ],
    "Mid Cap": [
        'AKER.OL', 'LSG.OL', 'KIT.OL', 'AKSO.OL', 'PARB.OL', 'BONHR.OL',
        'BOUV.OL', 'DNO.OL', 'ENTRA.OL', 'FLNG.OL', 'MING.OL', 'NAPA.OL',
        'NORBT.OL', 'OLT.OL', 'PCIB.OL', 'REACH.OL', 'WSTEP.OL', 'KOA.OL',
        'HSPG.OL', 'SOFF.OL', 'ABG.OL', 'BGBIO.OL', 'EMGS.OL', 'EXTX.OL',
        'HAVI.OL', 'HELG.OL', 'IDEX.OL', 'JIN.OL', 'MULTI.OL', 'NYKD.OL'
    ],
    "Small Cap": [
        'QEC.OL', 'RECSI.OL', 'SPOL.OL', 'AZT.OL', 'KID.OL', 'SATS.OL',
        'AURG.OL', 'PEN.OL', 'LINK.OL', 'PROT.OL', 'IOX.OL', 'ACC.OL',
        'TECH.OL', 'CONTX.OL', 'NONG.OL', 'BEWI.OL', 'ELO.OL', 'GSF.OL',
        'PRS.OL', 'AIRX.OL', 'OBSRV.OL', 'HUNT.OL', 'AKVA.OL', 'HEX.OL',
        'SOFTX.OL', 'ASA.OL', 'NORTH.OL', 'CAPSL.OL', 'LYTIX.OL', 'VOW.OL'
    ]
}

# Download window handed to yf.download(start=..., end=...) in main().
# NOTE(review): yfinance appears to treat `end` as exclusive, so 2024-12-31
# itself would not be requested - confirm that is intended.
START_DATE = "2014-01-01"
END_DATE = "2024-12-31"
# File written by main(); the analysis cells further down read this same filename.
OUTPUT_CSV = "oslo_bors_labelled_data.csv"


In [None]:
# SCRIPT 1.1: CORWIN-SCHULTZ AND ABDI-RANALDO SPREAD ESTIMATORS
def corwin_schultz_spread(group):
    """
    Corwin-Schultz (2012) high-low spread estimator, one estimate per row.

    After sorting by 'Date', each row is paired with the row before it:

        beta  = ln(H_t / L_t)^2 + ln(H_{t-1} / L_{t-1})^2
        gamma = ln(max(H_t, H_{t-1}) / min(L_t, L_{t-1}))^2
        k     = 3 - 2*sqrt(2)
        alpha = (sqrt(2*beta) - sqrt(beta)) / k - sqrt(gamma / k)
        S     = 2 * (exp(alpha) - 1) / (1 + exp(alpha))

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single instrument with 'Date', 'High' and 'Low' columns.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with an added float 'Spread' column.
        'Spread' is NaN for the first row (no previous day), for any pair in
        which one of the four prices is missing or not strictly positive, and
        whenever the estimate comes out negative.

    Reference: Corwin, S. A., & Schultz, P. (2012). A simple way to estimate
    bid-ask spreads from daily high and low prices. The Journal of Finance, 67(2), 719-760.
    """
    group = group.sort_values('Date')

    high = group['High'].to_numpy(dtype=float)
    low = group['Low'].to_numpy(dtype=float)

    # Default everything to NaN; only valid, non-negative estimates overwrite it.
    spreads = np.full(len(group), np.nan)

    if len(group) > 1:
        h0, l0 = high[1:], low[1:]      # current day
        h1, l1 = high[:-1], low[:-1]    # previous day

        # NaN compares False here, so missing prices are excluded as well.
        usable = (h0 > 0) & (l0 > 0) & (h1 > 0) & (l1 > 0)

        # Restrict to usable pairs so log() never sees zero/negative input.
        h0, l0, h1, l1 = h0[usable], l0[usable], h1[usable], l1[usable]

        beta = np.log(h0 / l0) ** 2 + np.log(h1 / l1) ** 2
        gamma = np.log(np.maximum(h0, h1) / np.minimum(l0, l1)) ** 2

        k = 3 - 2 * np.sqrt(2)
        alpha = (np.sqrt(2 * beta) - np.sqrt(beta)) / k - np.sqrt(gamma / k)
        estimate = 2 * (np.exp(alpha) - 1) / (1 + np.exp(alpha))

        # Negative estimates are treated as missing rather than clipped to zero.
        estimate = np.where(estimate >= 0, estimate, np.nan)

        paired = spreads[1:]            # view: writes land in `spreads`
        paired[usable] = estimate

    group['Spread'] = spreads
    return group


def abdi_ranaldo_spread(group):
    """
    Per-row relative spread labelled Abdi and Ranaldo (2017) "BAR" in this notebook.

    Computation, after sorting by 'Date':

        dP_t      = Close_t - Close_{t-1}
        absolute  = 2 * sqrt(max(0, -(dP_t * dP_{t-1})))
        relative  = absolute / mean(Close_t, Close_{t-1})

    NOTE(review): this uses only consecutive Close changes. The published
    Abdi-Ranaldo estimator is built from close and high-low mid-range log
    prices - confirm that this close-only variant is what the study intends.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single instrument with 'Date' and 'Close' columns.

    Returns
    -------
    pandas.DataFrame
        Date-sorted copy of ``group`` with an added 'Spread' column. The first
        two rows are always NaN (two consecutive price changes are needed), and
        groups with fewer than two price changes get an all-NaN column. A
        DataFrame is returned in every case so per-ticker results can be
        concatenated safely.
    """
    group = group.sort_values('Date')

    delta_p_t = group['Close'].diff()
    delta_p_t_minus_1 = delta_p_t.shift(1)

    if delta_p_t.count() < 2:
        # Too short to form a single product of consecutive changes. Return the
        # same shape as the normal path instead of a bare Series.
        group['Spread'] = np.nan
        return group

    cov_term = delta_p_t * delta_p_t_minus_1

    # Absolute spread in currency units; positive products are floored at zero.
    absolute_spreads = 2 * np.sqrt(np.maximum(0, -cov_term))

    # Normalise by the average of today's and yesterday's close so the value is
    # comparable across price levels (min_periods=1 keeps the first row defined).
    avg_price = group['Close'].rolling(window=2, min_periods=1).mean()

    # .values: divide positionally, no index alignment involved.
    group['Spread'] = absolute_spreads / avg_price.values
    return group


In [None]:
# SCRIPT 1.2: ADD PERIOD LABELS FOR HOLIDAY ANALYSIS
def add_period_labels(df):
    """
    Tag every row with the holiday window its 'Date' falls into.

    Adds three columns to a copy of ``df``: 'week' (ISO week number), 'month'
    and 'period_label'. Labels are assigned in this order, later ones
    overwriting earlier ones where windows overlap:

    * ``control_year``        - default for every row
    * ``pre_christmas``       - 5 business days before the first business day
                                on/after 15 December
    * ``christmas``           - business days from 15 to 31 December
    * ``post_christmas``      - 5 business days starting 2 January
    * ``pre_easter``          - 5 business days before the first Easter holiday
    * ``post_easter``         - 5 business days after the last Easter holiday
    * ``summer_holiday``      - ISO weeks 28-30
    * ``summer_excl_holiday`` - June-August outside ISO weeks 28-30

    Business days are Monday-Friday excluding the Norwegian public holidays
    supplied by ``holidays.Norway``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Labelled copy; the input frame is not modified.
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df['week'] = df['Date'].dt.isocalendar().week.astype(int)
    df['month'] = df['Date'].dt.month
    df['period_label'] = 'control_year'

    if df.empty:
        # Nothing to label, and min()/max() of an empty year column is NaN.
        return df

    first_year = int(df['Date'].dt.year.min())
    last_year = int(df['Date'].dt.year.max())
    years = range(first_year, last_year + 1)

    # One year of padding on each side so windows that spill across a year
    # boundary still see the relevant public holidays.
    norway_holidays = holidays.Norway(years=range(first_year - 1, last_year + 2))

    # Built once and reused by every bdate_range call below.
    bday_kwargs = {
        'freq': 'C',
        'holidays': list(norway_holidays.keys()),
        'weekmask': 'Mon Tue Wed Thu Fri',
    }

    # Start one year early: the post-Christmas window of year N-1 falls in
    # January of year N, so the first sample year needs it as well.
    for yr in range(first_year - 1, last_year + 1):
        christmas_days = pd.bdate_range(start=pd.Timestamp(f'{yr}-12-15'),
                                        end=pd.Timestamp(f'{yr}-12-31'),
                                        **bday_kwargs)
        if len(christmas_days) == 0:
            continue

        pre_christmas = pd.bdate_range(end=christmas_days[0] - pd.Timedelta(days=1),
                                       periods=5, **bday_kwargs)
        try:
            post_christmas = pd.bdate_range(start=pd.Timestamp(f'{yr+1}-01-02'),
                                            periods=5, **bday_kwargs)
        except ValueError:
            # Only reachable if the January date cannot be represented; fall
            # back to the days right after the Christmas window.
            post_christmas = pd.bdate_range(start=christmas_days[-1] + pd.Timedelta(days=1),
                                            periods=5, **bday_kwargs)

        df.loc[df['Date'].isin(pre_christmas), 'period_label'] = 'pre_christmas'
        df.loc[df['Date'].isin(christmas_days), 'period_label'] = 'christmas'
        df.loc[df['Date'].isin(post_christmas), 'period_label'] = 'post_christmas'

    # Case-insensitive so both "Påske..." and "... påskedag" spellings match.
    easter_keywords = ('påske', 'skjærtorsdag', 'langfredag')
    for yr in years:
        easter_days = [
            day for day, name in norway_holidays.items()
            if day.year == yr and any(key in name.lower() for key in easter_keywords)
        ]
        if not easter_days:
            continue

        easter_start = min(easter_days)
        easter_end = max(easter_days)
        pre_easter = pd.bdate_range(end=easter_start - pd.Timedelta(days=1),
                                    periods=5, **bday_kwargs)
        post_easter = pd.bdate_range(start=easter_end + pd.Timedelta(days=1),
                                     periods=5, **bday_kwargs)
        df.loc[df['Date'].isin(pre_easter), 'period_label'] = 'pre_easter'
        df.loc[df['Date'].isin(post_easter), 'period_label'] = 'post_easter'

    in_holiday_weeks = df['week'].between(28, 30)
    df.loc[in_holiday_weeks, 'period_label'] = 'summer_holiday'
    df.loc[df['month'].between(6, 8) & ~in_holiday_weeks, 'period_label'] = 'summer_excl_holiday'

    return df


In [None]:
# SCRIPT 1.3: CHOOSE SPREAD ESTIMATOR (Simplified)
# Supported estimators: short code -> display name used in titles and logs.
_SPREAD_METHOD_NAMES = {
    'CS': 'Corwin-Schultz (2012) Spread',
    'AR': 'Abdi & Ranaldo (2017) Spread',
}

user_input = input("Which spread estimator do you want to use? (Enter 'CS' or 'AR'): ")

# Normalise the answer so stray whitespace or lower case still selects the
# intended estimator. Anything unrecognised falls back to 'CS', but is now
# reported instead of being accepted silently.
METHOD_CHOICE = user_input.strip().upper()
if METHOD_CHOICE not in _SPREAD_METHOD_NAMES:
    print(f"Unrecognised choice {user_input!r}; defaulting to 'CS'.")
    METHOD_CHOICE = 'CS'
SPREAD_METHOD_NAME = _SPREAD_METHOD_NAMES[METHOD_CHOICE]

print(f"The analysis will use the '{SPREAD_METHOD_NAME}'")


In [None]:
# SCRIPT 1.4: MAIN DATA PROCESSING FUNCTION
def _apply_per_ticker(frame, estimator):
    """Run ``estimator`` on each ticker's rows and stack the returned frames.

    Explicit iteration (instead of ``groupby(...).apply``) hands every group to
    the estimator with its 'Ticker' column intact, independent of how the
    installed pandas version treats grouping columns inside ``apply``.
    """
    pieces = [estimator(rows) for _, rows in frame.groupby('Ticker')]
    if not pieces:
        return frame.assign(Spread=np.nan)
    return pd.concat(pieces)


def _roll_spread_fallback(group):
    """Roll (1984)-style relative spread for one ticker from daily Close returns.

    Returns ``2 * sqrt(-cov(r_t, r_{t-1}))``, or NaN when there are fewer than
    three returns or the first-order autocovariance is not negative.
    """
    returns = group.sort_values('Date')['Close'].pct_change().dropna()
    if len(returns) < 3:
        return np.nan
    autocov = returns.cov(returns.shift(1))
    return 2 * np.sqrt(-autocov) if autocov < 0 else np.nan


def main():
    """
    Build the labelled Oslo Børs data set and write it to ``OUTPUT_CSV``.

    Steps: download OHLCV data for every ticker in ``TICKERS``, reshape it to
    one row per (Date, Ticker), estimate the daily spread with the estimator
    selected in ``METHOD_CHOICE``, attach a per-ticker Roll spread, add the
    holiday period labels and export the result.

    Relies on the module-level names ``TICKERS``, ``START_DATE``, ``END_DATE``,
    ``OUTPUT_CSV``, ``METHOD_CHOICE`` and ``SPREAD_METHOD_NAME``.

    Returns None. Returns early (after printing an error) when the download is
    empty. Raises ValueError when ``METHOD_CHOICE`` is neither 'CS' nor 'AR'.
    """
    print("-" * 80)
    print("OSLO BØRS HOLIDAY EFFECTS ANALYSIS - DATA PREPARATION")
    print(f"Using spread estimator: {SPREAD_METHOD_NAME}")
    print("-" * 80)

    all_tickers = [ticker for sublist in TICKERS.values() for ticker in sublist]
    print(f"\nDownloading data for {len(all_tickers)} tickers...")
    print(f"Date range: {START_DATE} to {END_DATE}")

    raw_data = yf.download(all_tickers, start=START_DATE, end=END_DATE, progress=True, auto_adjust=True)

    if raw_data.empty:
        print("ERROR: No data downloaded")
        return

    print("\n" + "-"*80)
    print("TRANSFORMING DATA STRUCTURE")
    print("-"*80)
    # Wide (field, ticker) columns -> long format with one row per (Date, Ticker).
    df = raw_data.stack(future_stack=True).reset_index()
    # Only has an effect when the ticker level came through unnamed.
    df = df.rename(columns={'level_1': 'Ticker'})

    ticker_to_cap = {ticker: cap for cap, tickers in TICKERS.items() for ticker in tickers}
    df['Cap_Group'] = df['Ticker'].map(ticker_to_cap)

    initial_len = len(df)
    df = df.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume', 'Cap_Group'])
    print(f"Removed {initial_len - len(df):,} rows with missing price/volume data")

    # Spread Calculation
    print("\n" + "-"*80)
    if METHOD_CHOICE == 'CS':
        print("CALCULATING CORWIN-SCHULTZ SPREADS (DAILY ESTIMATE)")
        df = _apply_per_ticker(df, corwin_schultz_spread)
    elif METHOD_CHOICE == 'AR':
        print("CALCULATING ABDI & RANALDO (2017) 'BAR' SPREADS")
        df = _apply_per_ticker(df, abdi_ranaldo_spread)
    else:
        raise ValueError("Invalid method choice. Please choose 'CS' or 'AR'.")
    print("Spread calculation complete.")

    # Data Quality Check and Cleaning
    print("\n" + "-"*80)
    print(f"DATA QUALITY CHECK FOR METHOD: {METHOD_CHOICE}")
    print("-" * 80)

    # Leading rows of every ticker cannot have an estimate by construction
    # (CS needs one previous day, AR needs two previous price changes), so they
    # are left out of the denominator.
    leading_rows = 1 if METHOD_CHOICE == 'CS' else 2
    structural_nans = int(df.groupby('Ticker').size().clip(upper=leading_rows).sum())
    total_spreads = len(df) - structural_nans
    valid_spreads = int(df['Spread'].notna().sum())
    invalid_spreads = total_spreads - valid_spreads

    def _share(count):
        # Guard the percentage against an empty denominator.
        return count / total_spreads * 100 if total_spreads else 0.0

    print(f"\nCalculation summary for '{METHOD_CHOICE}':")
    print(f"  Total observations:     {total_spreads:,}")
    print(f"  Valid spreads:          {valid_spreads:,} ({_share(valid_spreads):.2f}%)")
    print(f"  Invalid/missing:        {invalid_spreads:,} ({_share(invalid_spreads):.2f}%)")

    spread_before = len(df)
    df = df.dropna(subset=['Spread'])
    print(f"\nRemoved {spread_before - len(df):,} rows with missing spreads")

    # Roll's Spread Calculation
    print("\n" + "-" * 80)
    print("CALCULATING ROLL'S SPREADS (PER-STOCK ROBUSTNESS CHECK)")
    print("-" * 80)
    # get_roll_spread() is not defined in the cells above this one. Use it when
    # the session provides it, otherwise fall back to the local estimator so a
    # fresh-kernel run does not die with a NameError after the download.
    roll_estimator = globals().get('get_roll_spread')
    if roll_estimator is None:
        print("NOTE: get_roll_spread() is not defined; using the built-in Roll (1984) fallback.")
        roll_estimator = _roll_spread_fallback
    roll_spreads = pd.Series(
        {ticker: roll_estimator(rows) for ticker, rows in df.groupby('Ticker')},
        dtype=float,
    )
    df['RollsSpread'] = df['Ticker'].map(roll_spreads)

    valid_rolls = roll_spreads.notna().sum()
    print(f"Roll's spread calculation summary:")
    print(f"  Tickers with valid spread: {valid_rolls} / {df['Ticker'].nunique()}")
    print(f"  Average calculated Roll's spread: {roll_spreads.mean():.6f}")

    # Period Labeling and Final Export
    print("\n" + "-"*80)
    print("ADDING PERIOD LABELS")
    print("-"*80)
    df = add_period_labels(df)

    print("\n" + "-"*80)
    print("FINAL DATA EXPORT")
    print("-"*80)

    final_cols = ['Date', 'Ticker', 'Cap_Group', 'Open', 'High', 'Low', 'Close', 'Volume', 'Spread', 'RollsSpread', 'period_label']
    df_final = df[[col for col in final_cols if col in df.columns]]
    df_final.to_csv(OUTPUT_CSV, index=False)

    print(f"Data saved to: {OUTPUT_CSV}")
    print(f"Total observations: {len(df_final):,}")

    print(f"\n'{METHOD_CHOICE}' Spread ('Spread') statistics:")
    print(f"  Mean:   {df_final['Spread'].mean():.6f}")

    print(f"\nRoll's Spread ('RollsSpread') statistics:")
    print(f"  Mean:   {df_final['RollsSpread'].mean():.6f}")
    print("-"*80)


if __name__ == "__main__":
    main()


In [None]:
# SCRIPT 2: DESCRIPTIVE STATISTICS AND VISUALIZATION (OPTIMIZED)

# Load the labelled panel written by SCRIPT 1.
df = pd.read_csv('oslo_bors_labelled_data.csv', parse_dates=['Date'])

if not os.path.exists('output'):
    os.makedirs('output')

sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.2)

print("-"*80, "DESCRIPTIVE STATISTICS AND VISUALIZATION", "-"*80, sep="\n")

# Dataset overview
print(f"\nTotal observations: {len(df):,}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
print(f"Unique tickers: {df['Ticker'].nunique()}")
print(f"Trading days: {df['Date'].nunique():,}")

# Summary by cap group (named aggregation yields the flat column names directly)
print("", "-"*80, "SUMMARY BY CAP GROUP", "-"*80, sep="\n")
summary_cap = (
    df.groupby('Cap_Group')
    .agg(
        Spread_Mean=('Spread', 'mean'),
        Spread_Median=('Spread', 'median'),
        Spread_Std=('Spread', 'std'),
        Volume_Mean=('Volume', 'mean'),
        Volume_Median=('Volume', 'median'),
        N_Stocks=('Ticker', 'nunique'),
    )
    .round(6)
)
print(summary_cap)

# Summary by period
print("", "-"*80, "SUMMARY BY PERIOD", "-"*80, sep="\n")
summary_period = (
    df.groupby('period_label')
    .agg(
        Spread_Mean=('Spread', 'mean'),
        Spread_Median=('Spread', 'median'),
        Volume_Mean=('Volume', 'mean'),
        Volume_Median=('Volume', 'median'),
        N_Obs=('Date', 'count'),
    )
    .round(6)
)
print(summary_period)

# Percentage difference of each period's mean relative to the control rows
control_spread = summary_period.loc['control_year', 'Spread_Mean']
control_volume = summary_period.loc['control_year', 'Volume_Mean']
for metric, baseline in (('Spread', control_spread), ('Volume', control_volume)):
    relative_change = (summary_period[f'{metric}_Mean'] - baseline) / baseline * 100
    summary_period[f'{metric}_Change_%'] = relative_change.round(2)

print("", "-"*80, "CHANGES VS CONTROL PERIOD", "-"*80, sep="\n")
print(summary_period[['Spread_Mean', 'Spread_Change_%', 'Volume_Mean', 'Volume_Change_%']])

summary_cap.to_csv('output/summary_by_cap.csv')
summary_period.to_csv('output/summary_by_period.csv')

# FIGURES 1 & 2: MEAN SPREAD AND MEAN VOLUME BY PERIOD
_CAP_GROUP_ORDER = ['Large Cap', 'Mid Cap', 'Small Cap']
_CI_SEED = 42  # fixes the bootstrap behind errorbar='ci' so figures are reproducible


def _finish_period_figure(fig, ax, title, ylabel, tick_rotation, filename):
    """Apply the shared title/axis/legend styling, save under output/ and close."""
    ax.set_title(title, fontsize=18, fontweight='bold')
    ax.set_xlabel('Period', fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.tick_params(axis='x', labelrotation=tick_rotation)
    ax.legend(title='Cap Group', title_fontsize=12, fontsize=11)
    fig.tight_layout()
    fig.savefig(f"output/{filename}", dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved: {filename}")


def _plot_mean_spread(data, title, filename, figsize, tick_rotation):
    """Point plot of mean 'Spread' per period and cap group with bootstrap CIs."""
    fig, ax = plt.subplots(figsize=figsize)
    sns.pointplot(x='period_label', y='Spread', hue='Cap_Group', data=data,
                  hue_order=_CAP_GROUP_ORDER, dodge=True, palette='viridis',
                  errorbar='ci', capsize=0.1, seed=_CI_SEED, ax=ax)
    _finish_period_figure(fig, ax, title, 'Mean Spread', tick_rotation, filename)


def _plot_mean_volume(data, title, filename, figsize, tick_rotation):
    """Bar plot of mean daily 'Volume' (millions) per period and cap group.

    The row-level data is passed to seaborn so it computes both the mean and
    its confidence interval. Plotting pre-aggregated means leaves one value per
    bar, from which no interval can be drawn.
    """
    plot_data = data.assign(Volume_millions=data['Volume'] / 1e6)
    fig, ax = plt.subplots(figsize=figsize)
    sns.barplot(x='period_label', y='Volume_millions', hue='Cap_Group', data=plot_data,
                hue_order=_CAP_GROUP_ORDER, palette='magma',
                errorbar='ci', seed=_CI_SEED, ax=ax)
    _finish_period_figure(fig, ax, title, 'Mean Volume (millions)', tick_rotation, filename)


# FIGURE 1: CHRISTMAS & EASTER ANALYSIS
print("\n" + "-"*80)
print("GENERATING FIGURE 1: CHRISTMAS & EASTER")
print("-"*80)

christmas_easter_periods = ['control_year', 'pre_easter', 'post_easter',
                            'pre_christmas', 'christmas', 'post_christmas']
df_ce = df[df['period_label'].isin(christmas_easter_periods)].copy()
# Ordered categorical fixes the left-to-right order of periods on the x axis.
df_ce['period_label'] = pd.Categorical(df_ce['period_label'],
                                       categories=christmas_easter_periods,
                                       ordered=True)

# Figure 1A: Spread
_plot_mean_spread(
    df_ce,
    f'Figure 1A: Mean Spread (Christmas & Easter Periods) | {SPREAD_METHOD_NAME}',
    "Figure_1A_CE_Spread.png", figsize=(14, 8), tick_rotation=45)

# Figure 1B: Volume (the aggregated table is kept for inspection)
volume_ce = df_ce.groupby(['period_label', 'Cap_Group'], observed=False)['Volume'].mean().reset_index()
volume_ce['Volume_millions'] = volume_ce['Volume'] / 1e6
_plot_mean_volume(
    df_ce,
    f'Figure 1B: Mean Daily Volume (Christmas & Easter Periods) | {SPREAD_METHOD_NAME}',
    "Figure_1B_CE_Volume.png", figsize=(14, 8), tick_rotation=45)

# FIGURE 2: SUMMER ANALYSIS
print("\n" + "-"*80)
print("GENERATING FIGURE 2: SUMMER HOLIDAY")
print("-"*80)

summer_periods = ['control_year', 'summer_excl_holiday', 'summer_holiday']
df_summer = df[df['period_label'].isin(summer_periods)].copy()
df_summer['period_label'] = pd.Categorical(df_summer['period_label'],
                                           categories=summer_periods,
                                           ordered=True)

# Figure 2A: Spread
_plot_mean_spread(
    df_summer,
    f'Figure 2A: Mean Spread (Summer Holiday) | {SPREAD_METHOD_NAME}',
    "Figure_2A_Summer_Spread.png", figsize=(10, 7), tick_rotation=0)

# Figure 2B: Volume (the aggregated table is kept for inspection)
volume_summer = df_summer.groupby(['period_label', 'Cap_Group'], observed=False)['Volume'].mean().reset_index()
volume_summer['Volume_millions'] = volume_summer['Volume'] / 1e6
_plot_mean_volume(
    df_summer,
    f'Figure 2B: Mean Daily Volume (Summer Holiday) | {SPREAD_METHOD_NAME}',
    "Figure_2B_Summer_Volume.png", figsize=(10, 7), tick_rotation=0)

# FIGURE 3: P-VALUE HEATMAPS WITH MULTIPLE TESTING CORRECTION
print("\n" + "-"*80)
print("GENERATING FIGURE 3: P-VALUE HEATMAPS (WITH MULTIPLE CORRECTIONS)")
print("-"*80)

p_values_spread = {}
p_values_volume = {}

# Raw p-values in test order, plus the (cap group, period) each one belongs to,
# so the corrected values can be mapped back onto the heatmap grid.
all_p_spread = []
all_p_volume = []
test_info = []

for group in ['Large Cap', 'Mid Cap', 'Small Cap']:
    p_values_spread[group] = {}
    p_values_volume[group] = {}

    control = df_ce[(df_ce['period_label'] == 'control_year') & (df_ce['Cap_Group'] == group)]

    for period in [p for p in christmas_easter_periods if p != 'control_year']:
        event = df_ce[(df_ce['period_label'] == period) & (df_ce['Cap_Group'] == group)]

        if len(event) > 1 and len(control) > 1:
            # Welch's t-test (unequal variances): event window vs. control rows.
            _, p_spread = ttest_ind(event['Spread'].dropna(),
                                    control['Spread'].dropna(),
                                    equal_var=False)
            _, p_volume = ttest_ind(event['Volume'].dropna(),
                                    control['Volume'].dropna(),
                                    equal_var=False)

            all_p_spread.append(p_spread)
            all_p_volume.append(p_volume)
            test_info.append((group, period))

            p_values_spread[group][period] = p_spread
            p_values_volume[group][period] = p_volume

if not test_info:
    # Without this guard the alpha computation below fails with an opaque
    # ZeroDivisionError.
    raise ValueError("No event-vs-control t-tests could be run: every cap group "
                     "needs more than one control and one event observation.")

# Multiple Testing Correction

# Method 1: Bonferroni Correction (Conservative)
print("\n--- Applying Bonferroni Correction ---")
print(f'Total tests performed: {len(all_p_spread)}')
print(f'Bonferroni-corrected alpha level: {0.05 / len(all_p_spread):.5f}\n')

reject_spread_bonf, pvals_corrected_spread_bonf, _, _ = multipletests(
    all_p_spread, alpha=0.05, method='bonferroni'
)
reject_volume_bonf, pvals_corrected_volume_bonf, _, _ = multipletests(
    all_p_volume, alpha=0.05, method='bonferroni'
)

# Method 2: Benjamini-Hochberg (FDR) Correction (Less Conservative)
print("--- Applying Benjamini-Hochberg (FDR) Correction ---")
reject_spread_fdr, pvals_corrected_spread_fdr, _, _ = multipletests(
    all_p_spread, alpha=0.05, method='fdr_bh'
)
reject_volume_fdr, pvals_corrected_volume_fdr, _, _ = multipletests(
    all_p_volume, alpha=0.05, method='fdr_bh'
)
print("FDR correction applied.\n")

# Raw p-values as a cap-group x period grid (used for the heatmap annotation)
p_values_spread_df = pd.DataFrame(p_values_spread).T
p_values_volume_df = pd.DataFrame(p_values_volume).T


def _corrected_pvalue_frame(template, corrected):
    """Lay out ``corrected`` (in ``test_info`` order) on the grid of ``template``.

    Cells for which no test was run stay NaN. The frame is float-typed so it
    can be compared and rounded directly.
    """
    frame = pd.DataFrame(np.nan, index=template.index, columns=template.columns, dtype=float)
    for (cap_group, period_name), value in zip(test_info, corrected):
        frame.loc[cap_group, period_name] = value
    return frame


p_values_corrected_df_spread_bonf = _corrected_pvalue_frame(p_values_spread_df, pvals_corrected_spread_bonf)
p_values_corrected_df_volume_bonf = _corrected_pvalue_frame(p_values_volume_df, pvals_corrected_volume_bonf)
p_values_corrected_df_spread_fdr = _corrected_pvalue_frame(p_values_spread_df, pvals_corrected_spread_fdr)
p_values_corrected_df_volume_fdr = _corrected_pvalue_frame(p_values_volume_df, pvals_corrected_volume_fdr)

# --- Print Comparison Tables ---
print("\n" + "-"*80)
print("COMPARISON OF CORRECTED P-VALUES (SPREAD)")
print("-"*80)
print("\n--- Bonferroni Corrected P-Values (Spread) ---")
print(p_values_corrected_df_spread_bonf.round(4))
print("\n--- FDR Corrected P-Values (Spread) ---")
print(p_values_corrected_df_spread_fdr.round(4))

print("\n" + "-"*80)
print("COMPARISON OF CORRECTED P-VALUES (VOLUME)")
print("-"*80)
print("\n--- Bonferroni Corrected P-Values (Volume) ---")
print(p_values_corrected_df_volume_bonf.round(4))
print("\n--- FDR Corrected P-Values (Volume) ---")
print(p_values_corrected_df_volume_fdr.round(4))


def _plot_pvalue_heatmap(raw_df, bonferroni_df, cmap, title, filename):
    """Heatmap of uncorrected p-values; cells significant after Bonferroni get a red border."""
    fig, ax = plt.subplots(figsize=(12, 7))
    sns.heatmap(raw_df, annot=True, cmap=cmap, fmt=".3f",
                linewidths=.5, ax=ax, cbar_kws={'label': 'P-value (uncorrected)'})

    # NaN cells compare False, so untested combinations are never highlighted.
    significant = bonferroni_df.to_numpy(dtype=float) < 0.05
    for row, col in zip(*np.nonzero(significant)):
        ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=False,
                                   edgecolor='red', lw=3))

    ax.set_title(title, fontsize=16, fontweight='bold')
    fig.tight_layout()
    fig.savefig(f"output/{filename}", dpi=300, bbox_inches='tight')
    plt.close(fig)


# Figure 3A: Spread p-values with Bonferroni-corrected significance
_plot_pvalue_heatmap(
    p_values_spread_df, p_values_corrected_df_spread_bonf, 'viridis_r',
    f'Figure 3A: P-values for Spread (vs. Control) | {SPREAD_METHOD_NAME}\nRed border = p < 0.05 (Bonferroni-corrected)',
    "Figure_3A_PValues_Spread.png")
print("\nSaved: Figure_3A_PValues_Spread.png")

# Figure 3B: Volume p-values with Bonferroni-corrected significance
_plot_pvalue_heatmap(
    p_values_volume_df, p_values_corrected_df_volume_bonf, 'plasma_r',
    f'Figure 3B: P-values for Volume (vs. Control) | {SPREAD_METHOD_NAME}\nRed border = p < 0.05 (Bonferroni-corrected)',
    "Figure_3B_PValues_Volume.png")
print("Saved: Figure_3B_PValues_Volume.png")

# FIGURE 4: HEATMAP - SPREAD BY CAP AND PERIOD
print("", "-"*80, "GENERATING FIGURE 4: HEATMAP", "-"*80, sep="\n")

# Preferred column order, restricted to the labels that occur in the data.
present_labels = set(df['period_label'].unique())
period_order = [
    label
    for label in ('control_year', 'pre_easter', 'post_easter',
                  'pre_christmas', 'christmas', 'post_christmas',
                  'summer_excl_holiday', 'summer_holiday')
    if label in present_labels
]

# Mean spread per (cap group, period), pivoted so periods become columns.
mean_spread_by_cell = df.groupby(['Cap_Group', 'period_label'])['Spread'].mean()
heatmap_data = mean_spread_by_cell.unstack().loc[:, period_order]

fig, ax = plt.subplots(figsize=(16, 6))

heatmap_style = {
    'annot': True,
    'fmt': '.5f',
    'cmap': 'YlOrRd',
    'linewidths': 0.5,
    'cbar_kws': {'label': 'Avg Spread'},
    'annot_kws': {'fontsize': 11},
}
sns.heatmap(heatmap_data, ax=ax, **heatmap_style)

ax.set_title(
    f'Figure 4: Average Spread by Cap Group and Period | {SPREAD_METHOD_NAME}',
    fontweight='bold', fontsize=18, pad=15,
)

for set_label, text in ((ax.set_xlabel, 'Period'), (ax.set_ylabel, 'Cap Group')):
    set_label(text, fontsize=14, fontweight='bold')

ax.tick_params(axis='y', labelsize=13)
ax.tick_params(axis='x', labelsize=12)

# Keep the cap-group names horizontal.
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

plt.tight_layout()
plt.savefig('output/Figure_4_Heatmap_Spread.png', dpi=300, bbox_inches='tight')
plt.close()
print("Saved: Figure_4_Heatmap_Spread.png")

# FIGURE 5: TIME SERIES - MONTHLY AVERAGE SPREAD
print("\n" + "-"*80)
print(f"GENERATING FIGURE 5: TIME SERIES | {SPREAD_METHOD_NAME}")
print("-"*80)

# Average the spread across all tickers and days within each calendar month.
df['YearMonth'] = df['Date'].dt.to_period('M')
monthly_spread = df.groupby('YearMonth')['Spread'].mean()

# Take the year range for the title from the data, so it stays correct when the
# sample window changes.
first_year = df['Date'].dt.year.min()
last_year = df['Date'].dt.year.max()
year_span = f"{first_year}-{last_year}" if first_year != last_year else f"{first_year}"

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(monthly_spread.index.to_timestamp(), monthly_spread.values,
        color='darkblue', linewidth=1.5, alpha=0.8)
ax.set_title(f'Figure 5: Monthly Average Spread ({year_span}) | {SPREAD_METHOD_NAME}', fontweight='bold', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Average Spread', fontsize=14)
ax.grid(alpha=0.3)

# Shade March-December 2020 as the COVID-19 period.
covid_start = pd.Timestamp('2020-03-01')
covid_end = pd.Timestamp('2020-12-31')
ax.axvspan(covid_start, covid_end, alpha=0.2, color='red', label='COVID-19')
ax.legend(fontsize=12)

fig.tight_layout()
fig.savefig('output/Figure_5_Timeseries_Monthly.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("Saved: Figure_5_Timeseries_Monthly.png")

print("\n" + "-"*80)
print("COMPLETE - 5 essential figures created")
print("-"*80)
print("\nAll outputs saved to 'output/' folder")


In [None]:
# SCRIPT 3: OLS REGRESSION ANALYSIS WITH DYNAMIC DESCRIPTIVE TITLES

print("-" * 80)
print("SCRIPT 3: OLS REGRESSION ANALYSIS")
# SPREAD_METHOD_NAME comes from the estimator-selection cell (SCRIPT 1.3).
print(f"Using Dependent Variable: {SPREAD_METHOD_NAME}")
print("-" * 80)

# Defined here because this cell runs before the later cell that also sets
# them; without these a fresh-kernel run fails with NameError.
INPUT_CSV = "oslo_bors_labelled_data.csv"
output_dir = 'output'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Load data
try:
    df = pd.read_csv(INPUT_CSV, parse_dates=['Date'])
    print(f"Successfully loaded data from {INPUT_CSV}")
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_CSV}' was not found. Please ensure SCRIPT 1 has been run successfully.")
    # Re-raise so the run stops here; exit() would shut down the kernel.
    raise

# Data Preparation
print("\n--- Preparing data for regression analysis ---")
df = df.sort_values(['Ticker', 'Date'])
df['Return'] = df.groupby('Ticker')['Close'].pct_change()
# Rolling std of returns over the previous 20 rows (at least 10), shifted by
# one row so the current day's return is not part of its own control.
df['Volatility'] = df.groupby('Ticker')['Return'].transform(lambda x: x.shift(1).rolling(window=20, min_periods=10).std())
df['Log_Volume'] = np.log1p(df['Volume'])
df['Log_Volume_lag1'] = df.groupby('Ticker')['Log_Volume'].shift(1)

# Holiday dummies: regressor name -> period_label value it flags.
# Rows labelled 'control_year' or 'summer_excl_holiday' form the baseline.
holiday_dummy_labels = {
    'Pre_Easter': 'pre_easter',
    'Post_Easter': 'post_easter',
    'Christmas': 'christmas',
    'Pre_Christmas': 'pre_christmas',
    'Post_Christmas': 'post_christmas',
    'Summer': 'summer_holiday',
}
for dummy_name, label in holiday_dummy_labels.items():
    df[dummy_name] = (df['period_label'] == label).astype(int)

# Market cap dummies ('Small Cap' is the omitted category)
df['Large_Cap'] = (df['Cap_Group'] == 'Large Cap').astype(int)
df['Mid_Cap'] = (df['Cap_Group'] == 'Mid Cap').astype(int)

holiday_regressors = ['Pre_Easter', 'Post_Easter', 'Christmas', 'Pre_Christmas', 'Post_Christmas', 'Summer']

# Create regression sample
df_reg = df.dropna(subset=['Spread', 'Log_Volume_lag1', 'Volatility']).copy()
print(f"Regression sample created with {len(df_reg):,} observations.")


def _save_regression_table(model, title_text, filename, figsize):
    """Render ``model.summary()`` as monospace text on a blank figure and save it as PNG."""
    fig, ax = plt.subplots(figsize=figsize)
    ax.axis('off')
    ax.text(0.5, 0.98, title_text, ha='center', va='top', fontsize=12, fontweight='bold', transform=ax.transAxes)
    ax.text(0.01, 0.93, str(model.summary()), fontdict={'fontname': 'monospace', 'fontsize': 9},
            va='top', ha='left', transform=ax.transAxes)
    fig.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"SUCCESS: Saved model summary to {filename}")


# Models 1A-C: Baseline Holiday Effects by Market Cap
print("\n--- MODELS 1A-1C: BASELINE HOLIDAY EFFECTS BY MARKET CAP ---")
baseline_models = {}
for i, cap in enumerate(['Large Cap', 'Mid Cap', 'Small Cap']):
    model_letter = chr(65 + i)  # 0, 1, 2 -> 'A', 'B', 'C'
    print(f"\nProcessing Model 1{model_letter}: {cap}")
    df_cap = df_reg[df_reg['Cap_Group'] == cap].copy()

    y = df_cap['Spread']
    X = sm.add_constant(df_cap[holiday_regressors])

    # Standard errors clustered by ticker.
    model = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': df_cap['Ticker']})
    baseline_models[cap] = model

    title_text = f"Table 1{model_letter}: Baseline Spread Regression - {cap} Stocks\nDependent Variable: {SPREAD_METHOD_NAME}"
    filename = f"{output_dir}/Table_1{model_letter}_Baseline_{cap.replace(' ', '_')}.png"
    _save_regression_table(model, title_text, filename, figsize=(12, 10))

# Model 2: Full Sample with Controls
print("\n--- MODEL 2: SPREAD WITH CONTROL VARIABLES ---")
X2 = df_reg[holiday_regressors + ['Log_Volume_lag1', 'Volatility', 'Large_Cap', 'Mid_Cap']]
X2 = sm.add_constant(X2)
y2 = df_reg['Spread']
model2 = sm.OLS(y2, X2).fit(cov_type='cluster', cov_kwds={'groups': df_reg['Ticker']})

title_text = (f'Table 2: Spread Determinants with Control Variables (Full Sample)\n'
              f'Dependent Variable: {SPREAD_METHOD_NAME} | Controls: Lagged Volume, Volatility, Market Cap')
filename_2 = f'{output_dir}/Table_2_Spread_Full_Controls.png'
_save_regression_table(model2, title_text, filename_2, figsize=(12, 11))

print("\n" + "-"*80)
print("Script finished. All regression tables have been saved as PNG images in the 'output' folder.")
print("-" * 80)


In [None]:
# SCRIPT 4: ROBUSTNESS CHECK - EXCLUDE COVID PERIOD
print("-" * 80)
print("SCRIPT 5: ROBUSTNESS CHECK - EXCLUDING COVID PERIOD")
print("-" * 80)

# Define the input file and output directory
INPUT_CSV = "oslo_bors_labelled_data.csv"
output_dir = 'output'

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Load the data
try:
    df = pd.read_csv(INPUT_CSV, parse_dates=['Date'])
    print(f"Successfully loaded data from '{INPUT_CSV}'")
except FileNotFoundError:
    print(f"FATAL ERROR: The input file '{INPUT_CSV}' was not found. Please ensure it's in the correct directory.")
    # Re-raise so the run stops here; exit() would shut down the kernel.
    raise

# Exclude the COVID-19 Period
print("\n--- Excluding the entire year 2020 for robustness check ---")
is_covid_year = df['Date'].dt.year == 2020
print(f"Original sample size: {len(df):,} observations.")
print(f"Robust sample size (2020 excluded): {int((~is_covid_year).sum()):,} observations.")
print(f"Removed {int(is_covid_year.sum()):,} observations.")

# Step 2: Re-create all necessary variables
print("\n--- Re-creating analysis variables for the robust sample ---")
# Build returns, volatility and lags on the FULL history first and only then
# drop 2020. Computing them after the cut would treat the last 2019 row and the
# first 2021 row as consecutive days, producing a bogus one-year "daily" return
# that contaminates the 20-row volatility window and the lagged volume.
df_features = df.sort_values(['Ticker', 'Date']).copy()
df_features['Return'] = df_features.groupby('Ticker')['Close'].pct_change()
df_features['Volatility'] = df_features.groupby('Ticker')['Return'].transform(lambda x: x.shift(1).rolling(window=20, min_periods=10).std())
df_features['Log_Volume'] = np.log1p(df_features['Volume'])
df_features['Log_Volume_lag1'] = df_features.groupby('Ticker')['Log_Volume'].shift(1)
df_features['Large_Cap'] = (df_features['Cap_Group'] == 'Large Cap').astype(int)
df_features['Mid_Cap'] = (df_features['Cap_Group'] == 'Mid Cap').astype(int)
df_features['Summer'] = (df_features['period_label'] == 'summer_holiday').astype(int)
df_features['Christmas'] = (df_features['period_label'] == 'christmas').astype(int)
# Only the Summer and Christmas dummies are built for this model; the Easter
# and pre/post windows are left in the baseline.

df_robust = df_features[df_features['Date'].dt.year != 2020].copy()

# Drop rows with missing values for the regression
initial_len = len(df_robust)
df_robust = df_robust.dropna(subset=['Spread', 'Log_Volume_lag1', 'Volatility']).copy()
print(f"Removed {initial_len - len(df_robust):,} rows with missing values needed for regression.")
print(f"Final robust sample size: {len(df_robust):,} observations.")

# Step 3: Run the Single Robustness Regression Model
print("\n--- Running the main robustness regression model (OLS) ---")

# Define dependent and independent variables
y = df_robust['Spread']
X = df_robust[['Summer', 'Christmas', 'Log_Volume_lag1', 'Volatility', 'Large_Cap', 'Mid_Cap']]
X = sm.add_constant(X)

# Fit the OLS model with clustered standard errors
model_robust = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': df_robust['Ticker']})

# Step 4: Save the Regression Output as a PNG Image
print("\n--- Saving the regression summary as a PNG image ---")
fig, ax = plt.subplots(figsize=(12, 11))
ax.axis('off')

# Set a descriptive title
title_text = ('Table 5: Robustness Check - Excluding COVID-19 Period (2020)\n'
              f'Dependent Variable: {SPREAD_METHOD_NAME} | Controls: Lagged Volume, Volatility, Market Cap')
ax.text(0.5, 0.98, title_text, ha='center', va='top', fontsize=12, fontweight='bold', transform=ax.transAxes)

# Add the model summary text to the figure
summary_text = str(model_robust.summary())
ax.text(0.01, 0.93, summary_text, fontdict={'fontname': 'monospace', 'fontsize': 9}, va='top', ha='left', transform=ax.transAxes)

# Define the output filename and save the figure
output_filename_png = 'output/Table_5_Robustness_No_COVID.png'
plt.savefig(output_filename_png, dpi=300, bbox_inches='tight')
plt.close() # Close the figure to free up memory
print(f"SUCCESS: Regression summary saved to '{output_filename_png}'")

# Step 5: Create and Save a Coefficient Summary Table
print("\n--- Creating and saving a summary of key coefficients ---")

# Extract key coefficients into a DataFrame
coef_table = pd.DataFrame({
    'Variable': ['Summer', 'Christmas', 'Large_Cap', 'Mid_Cap'],
    'Coefficient': model_robust.params[['Summer', 'Christmas', 'Large_Cap', 'Mid_Cap']],
    'Std_Error': model_robust.bse[['Summer', 'Christmas', 'Large_Cap', 'Mid_Cap']],
    'P_value': model_robust.pvalues[['Summer', 'Christmas', 'Large_Cap', 'Mid_Cap']]
}).reset_index(drop=True)

# Add significance stars
coef_table['Significance'] = coef_table['P_value'].apply(lambda x: '***' if x < 0.01 else ('**' if x < 0.05 else ('*' if x < 0.1 else '')))
print("\nKey Coefficients (Robust Sample):")
print(coef_table.to_string(index=False))

# Save the coefficient table to a CSV file
output_filename_csv = 'output/Robustness_Coefficients_Summary.csv'
coef_table.to_csv(output_filename_csv, index=False)
print(f"\nSUCCESS: Coefficient summary saved to '{output_filename_csv}'")


print("\n" + "-"*80)
print("ROBUSTNESS CHECK SCRIPT COMPLETE")
print("-" * 80)
