# Main thesis regressions

In [None]:
# Make sure you're in the right directory
%cd "/Users/simon/code/thesis/"

In [None]:
from collections import OrderedDict
from pathlib import Path
from pprint import pprint
import warnings

import linearmodels
import numpy as np
import pandas as pd
import plotly_express as px
import statsmodels.api as sm
from scipy.stats import anderson_ksamp
from tqdm.notebook import tqdm

from combine_stats_and_frag.load_daily_data import load_frag_data, load_market_quality_statistics, load_copustat

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Load data 

## Fragmentation data

In [None]:
frag = load_frag_data()

# filter
print(frag.shape)
print("First date: \t", frag.index.min())
print("Last date: \t", frag.index.max())

frag.set_index("isin", append=True, inplace=True)

## Compustat data

In [None]:
compustat = load_copustat()

## Market quality data

In [None]:
filename = "20200129_09-02-47_liquidity_stats.csv"
filepath = Path(f"statistics/daily_liquidity/{filename}")
assert filepath.is_file()

In [None]:
# load stats
daily_stats = load_market_quality_statistics(filepath=filepath)

# append "isin" to index
daily_stats.set_index("isin", append=True, inplace=True)

print(daily_stats.shape)
print("First date: \t", daily_stats.index.get_level_values("date").min())
print("Last date: \t", daily_stats.index.get_level_values("date").max())

In [None]:
daily_stats.rename(columns={"num_transactions": "num_orders_aggr"}, inplace=True)
daily_stats.rename(columns={"num_orders_total": "num_orders_passive"}, inplace=True)

In [None]:
daily_stats["quoted_rel_spread_bps_time_weighted"] *= 100
daily_stats["eff_rel_spread_bps_weighted"] *= 100

## Combine the three dataframes into one

In [None]:
# combine
stats = daily_stats.join(frag, how="left", lsuffix="_IMI", sort=False)
stats = stats.join(compustat, how="left", rsuffix="_compu", sort=False)

# first level of index needs to be entity variable
stats = stats.reset_index("date").set_index("date", append=True)

print("First date: \t", stats.index.get_level_values("date").min())
print("Last date: \t", stats.index.get_level_values("date").max())
print(stats.shape)

# Create quartiles

### By turnover

In [None]:
# condition = stats.index.get_level_values("date") < pd.Timestamp("2019-07-01")
# One value per stock: the median of its daily turnover over the whole sample.
turnover_stats = stats["turnover"].reset_index("isin").groupby("isin").median()

# Cut-offs are taken across stocks (cross-section of the per-stock medians).
lower_quartile = turnover_stats["turnover"].quantile(0.25)
median = turnover_stats["turnover"].median()
upper_quartile = turnover_stats["turnover"].quantile(0.75)

# Boolean masks over `turnover_stats`; the numeric prefix of each label makes
# the categories sort from highest ("0 ...") to lowest ("3 ...") turnover.
conditions = {"3 bottom turnover": turnover_stats["turnover"] < lower_quartile,
              "2 low turnover": (lower_quartile <= turnover_stats["turnover"]) & (turnover_stats["turnover"] < median),
              "1 high turnover": (median <= turnover_stats["turnover"]) & (turnover_stats["turnover"] < upper_quartile),
              "0 top turnover": upper_quartile <= turnover_stats["turnover"]
             }

# Move "date" out of the index for the labelling loop below
# (presumably so that `.loc[isins, ...]` can select rows by isin alone).
stats.reset_index("date", inplace=True)

for quartile, condition in conditions.items():
    # isins whose median turnover falls into this quartile
    isins = turnover_stats[condition].index
    stats.loc[isins, "turnover_category"] = quartile 
    
# Restore "date" as the second index level.
stats.set_index("date", append=True, inplace=True)

In [None]:
num_stocks = stats["turnover_category"].reset_index().groupby("turnover_category")["isin"].nunique()
print(f"Total number of stocks {num_stocks.sum()}")
num_stocks

### Excluding low turnover stocks?  

In [None]:
# exclude the two lowest turnover quartiles ("3 bottom" and "2 low") from the sample
stats = stats[~stats["turnover_category"].isin(["3 bottom turnover", "2 low turnover"])]

In [None]:
num_stocks = stats["turnover_category"].reset_index().groupby("turnover_category")["isin"].nunique()
print(f"Total number of stocks {num_stocks.sum()}")
num_stocks

In [None]:
relevant_isins = stats.index.get_level_values("isin").unique()
relevant_isins = relevant_isins.to_frame().reset_index(drop=True)
# # Export isins to csv?
# relevant_isins.to_csv("relevant_isins.csv", index=False)

### Market share quartiles

In [None]:
frag_measure = "market_share"  # "non_fragmentation_index" 

In [None]:
frag_per_isin = stats.groupby(["after_nonequivalence", "isin"])[frag_measure].quantile(0.5)
frag_per_isin = frag_per_isin.unstack("after_nonequivalence")
frag_per_isin[frag_measure] = frag_per_isin[True] - frag_per_isin[False]
frag_per_isin.drop(columns=[False, True], inplace=True)

In [None]:
condition = stats.index.get_level_values("date") < pd.Timestamp("2019-07-01")
frag_per_isin = stats.loc[condition, [frag_measure]].reset_index("isin")
frag_per_isin = frag_per_isin.groupby(["isin"]).quantile(0.50)

In [None]:
# # Option 1: simple 
# # a stock is not fragmented, if on more than 50% of all trading days, there was no trading on other venues (see cell above)
# nonfragmentation = frag_per_isin[frag_measure] == 1
# frag_per_isin.loc[nonfragmentation, "fragmentation"] = "not fragmented"
# frag_per_isin.loc[~nonfragmentation, "fragmentation"] = "fragmented"

# Option 2: by quartiles
lower_quartile = frag_per_isin[frag_measure].quantile(0.25)
median = frag_per_isin[frag_measure].median()
upper_quartile = frag_per_isin[frag_measure].quantile(0.75)
conditions = {
    "Q1": frag_per_isin[frag_measure] < lower_quartile,
    "Q2": (lower_quartile <= frag_per_isin[frag_measure]) & (frag_per_isin[frag_measure] < median),
    "Q3": (median <= frag_per_isin[frag_measure]) & (frag_per_isin[frag_measure] < upper_quartile),
    "Q4": upper_quartile <= frag_per_isin[frag_measure],
}
for fragmentation, condition in conditions.items():
    frag_per_isin.loc[condition, "fragmentation"] = fragmentation 

In [None]:
frag_per_isin["fragmentation"].value_counts()

# left join to stats
stats = stats.join(frag_per_isin["fragmentation"], on="isin")

In [None]:
# showing those isin's that did not have 375 observations
num_dates = stats.reset_index().groupby(["fragmentation", "isin"])["date"].nunique()
num_dates[num_dates != 375]

In [None]:
condition = stats.index.get_level_values("date") < pd.Timestamp("2019-07-01")
num_stocks = stats.reset_index().groupby(["fragmentation"])[["isin"]].nunique()  # .describe()
print(f"Total number of stocks {num_stocks['isin'].sum()}")
num_stocks

In [None]:
# remember: groups can change over time, that's why there are more stocks than total above
stats.reset_index().groupby(["group", "fragmentation"])[["isin"]].nunique()

In [None]:
stats.reset_index().groupby(["fragmentation", "turnover_category", "group"])[["isin"]].nunique()

In [None]:
stats[condition].reset_index().groupby(["fragmentation"])[[frag_measure]].describe()

In [None]:
stats.reset_index().groupby(["after_nonequivalence"])[["isin"]].describe()

## Market Cap variable

In [None]:
stats["market_cap"] = stats["shares_outstanding"] * stats["price_close"]
market_cap_average_log = np.log(stats.groupby("isin")["market_cap"].mean())
market_cap_average_log.name = "market_cap_average_log"
stats = stats.join(market_cap_average_log)

In [None]:
(stats.reset_index().groupby(["fragmentation"])[["market_cap_average_log"]].describe()).round(2)

## Fragmentation table

In [None]:
table = list()
for measure in ("market_share", "lit_frag", "market_cap", "turnover"):
    descriptive = stats.reset_index().groupby(["fragmentation"])[[measure]].describe()
    if measure == "market_cap":
        descriptive /= 1e6
        descriptive = descriptive.applymap("{:.0f}".format)
    elif measure == "turnover":
        descriptive /= 1e6
        descriptive = descriptive.applymap("{:.1f}".format)
    else:
        descriptive = descriptive.applymap("{:.2f}".format)
        
    descriptive = descriptive.loc[:, pd.IndexSlice[: , ["mean", "50%", "std"]]]
    table.append(descriptive)
    
table = pd.concat(table, axis=1)
table.rename(
    columns={
        "market_share": "SIX market share",
        "lit_frag": "LitFrag",
        "market_cap": "Market Cap",
        "turnover": "Turnover",
        "mean": "Mean",
        "std": "StDev",
        "50%": "Median"
    }, 
    inplace=True,
)
table = table.T.reindex(["Mean", "Median", "StDev"], level=1).T

num_stocks = stats.reset_index().groupby("fragmentation")["isin"].nunique()
num_stocks = num_stocks.rename("Num stocks").to_frame()
num_stocks.columns = pd.MultiIndex.from_product([num_stocks.columns, ['']])
table = table.join(num_stocks)

for idx in range(4):
    idx += 1
    table.loc[f"Q{idx}", "Fragmentation"] = f"Quartile {idx}"
table.set_index("Fragmentation", inplace=True)

table = table[["Num stocks", "SIX market share", "LitFrag", "Turnover", "Market Cap"]]

In [None]:
table

In [None]:
print(table.to_latex())

## Time variables & dummies

In [None]:
# stats.loc[stats["fragmentation"].isin(["3_little_fragmented", "4_not_fragmented"]), "frag_dummy"] = 0
# stats["frag_dummy"].fillna(value=1, inplace=True)
# stats["frag_dummy"] = stats["frag_dummy"].astype(int)
# stats.reset_index().groupby(["frag_dummy"])[["isin"]].describe()

In [None]:
# stats[stats["frag_dummy"] == 1].index.get_level_values("isin").unique().to_frame().reset_index(drop=True).to_csv("frag_isins.csv", index=False)

In [None]:
# Label every observation with its calendar half-year and a "<year>_<half>" semester key.
dates = stats.index.get_level_values("date")
# Assign both halves in a single vectorised step. The previous
# `stats["half_year"].fillna(..., inplace=True)` was a chained in-place call on a
# column selection, which does not write back to `stats` under pandas Copy-on-Write.
stats["half_year"] = np.where(dates.month >= 7, "H2", "H1")
stats["semester"] = dates.year.astype("str") + "_" + stats["half_year"]

In [None]:
stats["dummy_2019"] = dates.year == 2019

## Calculate daily returns & Amihud 2002

In [None]:
stats.sort_index(inplace=True)

stats["abs_simple_returns"] = np.abs(stats["price_close"] / stats["price_close"].groupby("isin").shift(1) - 1)
stats["amihud"] = stats["abs_simple_returns"] / stats["turnover"] * 1e9  # _simple_simple

stats[["amihud", "semester", "fragmentation"]].groupby(["fragmentation", "semester"]).mean()

In [None]:
# plot single measure for a quartile
measure = "eff_rel_spread_bps_weighted"
plot_data = stats.loc[stats["fragmentation"] == "Q4", measure].reset_index().dropna()
# px.scatter(plot_data, x="date", y=measure, color="isin")

In [None]:
isin = "CH0012549785"

# measures = ["price_mean", "price_close", "price_log", "price_reciprocal"]
measures = ["quoted_rel_spread_bps_time_weighted", "eff_rel_spread_bps_weighted", "min_tick_size"]
# measures = ["market_cap", "market_cap_average_log", "price_close", "shares_outstanding"]

plot_data = stats.loc[isin, measures]
plot_data = plot_data.stack().reset_index().rename(columns={"level_1": "measure", 0: "value"})
# px.scatter(plot_data, x="date", y="value", color="measure")

# Panel Regressions

## Define regressions

In [None]:
def run_panel_regression(
        data: pd.DataFrame, 
        measures: list,
        control_variables: list,
        entity_effects: bool,
        time_effects: bool
    ):
    """Fit one PanelOLS model per measure and return the fitted results.

    Parameters
    ----------
    data : panel DataFrame holding every measure and every control variable
        as a column.
    measures : names of the dependent variables, one regression each.
    control_variables : names of the right-hand-side variables. A constant is
        added automatically. The list itself is never modified.
    entity_effects, time_effects : forwarded to ``linearmodels.PanelOLS``.

    Returns
    -------
    OrderedDict mapping measure name -> fitted result. Measures whose fit
    raises are reported on stdout and left out of the mapping.

    Notes
    -----
    Measures whose name starts with "time", "depth", "num", "message_counts"
    or "value" (and does not end with "percent") enter in logs.
    """
    # Use the argument rather than a notebook-level global, so the function
    # behaves the same regardless of what `exog_vars` currently holds.
    base_variables = list(control_variables)
    log_prefixes = ("time", "depth", "num", "message_counts", "value")
    detailed_results = OrderedDict()

    for measure in measures:

        if measure.startswith(log_prefixes) and not measure.endswith("percent"):
            dependent = np.log(data[measure])
        else:
            dependent = data[measure]

        # Drop regressors that must not appear alongside this dependent variable.
        if measure == "amihud":
            excluded = {"log_turnover", "RV_slow"}
        elif measure in ("RV_slow", "VSMI"):
            excluded = {"VSMI", "RV_slow"}
        else:
            # Never regress a variable on itself (no-op if it is not a control).
            excluded = {measure}
        regressors = [var for var in base_variables if var not in excluded]

        exogenous = sm.add_constant(data[regressors])

        model = linearmodels.PanelOLS(dependent=dependent,
                                      exog=exogenous,
                                      entity_effects=entity_effects,
                                      time_effects=time_effects,
                                     )
        try:
            # Standard errors clustered by both entity and time.
            result = model.fit(cov_type='clustered',
                               cluster_entity=True,
                               cluster_time=True,
                              )
        except Exception as exception:
            # Best effort: report the failing measure and carry on with the rest.
            print(measure)
            print(exception)
            continue

        # store the result
        detailed_results[measure] = result

    return detailed_results
    

def deep_dive_coef(detailed_results, variable: str):
    """Tabulate the estimate of a single regressor across fitted models.

    For every (measure, result) pair in ``detailed_results`` the row labelled
    by the measure holds the coefficient of ``variable``, its confidence
    bounds, its t-statistic and its p-value.
    """
    column_names = ["param", "lower", "upper", "tstat", "pvalue"]
    summary = pd.DataFrame(columns=column_names)

    for name, fit in detailed_results.items():
        estimate = fit.params[variable]
        # conf_int() has one row per regressor; the row unpacks into (lower, upper)
        low, high = fit.conf_int().loc[variable]
        t_value = fit.tstats[variable]
        p_value = fit.pvalues[variable]
        summary.loc[name] = (estimate, low, high, t_value, p_value)

    return summary



In [None]:
def run_ols(data, measures, exog_vars):
    """Fit a pooled OLS per measure with entity- and time-clustered errors.

    Returns an OrderedDict mapping each measure name to its fitted result.
    Regressors come from ``exog_vars`` (plus a constant), minus those that
    must not sit next to the given dependent variable.
    """
    log_prefixes = ("time", "depth", "num", "message_counts", "value")
    fitted = OrderedDict()

    for measure in measures:

        # Which regressors have to be left out for this dependent variable?
        if measure == "amihud":
            excluded = {"log_turnover", "RV_slow"}
        elif measure in {"RV_slow", "VSMI"}:
            excluded = {"VSMI", "RV_slow"}
        elif measure in exog_vars:
            excluded = {measure}
        else:
            excluded = None

        if excluded is None:
            regressors = exog_vars
        else:
            regressors = [name for name in exog_vars if name not in excluded]

        exog = sm.add_constant(data[regressors])

        # Count-, depth-, time- and value-type measures enter in logs.
        take_log = measure.startswith(log_prefixes) and not measure.endswith("percent")
        endog = np.log(data[measure]) if take_log else data[measure]

        fitted[measure] = linearmodels.PooledOLS(endog, exog).fit(
            cov_type='clustered',
            cluster_entity=True,
            cluster_time=True,
        )

    return fitted

In [None]:
def highlight_lower_than(pvalue):
    """Return the CSS background rule for a p-value's significance band.

    < 0.01 -> navajowhite, < 0.05 -> blanchedalmond, < 0.1 -> cornsilk.
    Anything else (including NaN) gets an empty string, i.e. no styling,
    instead of the invalid rule "background-color: None".
    """
    if pvalue < 0.01:
        color = "navajowhite"  # "darkgrey"
    elif pvalue < 0.05:
        color = "blanchedalmond"  # "silver"
    elif pvalue < 0.1:
        color = "cornsilk"  # "gainsboro"
    else:
        return ""
    return f"background-color: {color}"

def highlight_significance(data, pvalues):
    """Return a frame of CSS background rules, one per cell of ``pvalues``.

    ``data`` is part of the signature but is not read; the colouring is
    driven by ``pvalues`` alone.
    """
    return pvalues.applymap(highlight_lower_than)

def font_color(value):
    """Return the CSS font colour rule: red for negative values, black otherwise."""
    if value < 0:
        return "color: red"
    return "color: black"

def display_results(combined_results):
    """Return a Styler of the coefficients, coloured by sign and significance.

    Coefficients ("param") are rounded to three decimals; negative values are
    printed in red and the cell background reflects the matching "pvalue".
    """
    coefficients = combined_results["param"]
    significance = combined_results["pvalue"]

    styler = coefficients.round(3).style
    styler = styler.applymap(font_color)
    # axis=None: the highlighter receives the whole frame at once
    return styler.apply(highlight_significance, pvalues=significance, axis=None)

In [None]:
def convert_to_significance(pvalue):
    """Translate a p-value into significance stars.

    "***" below 1%, "**" below 5%, "*" below 10%, otherwise an empty string
    (NaN also yields an empty string because every comparison is False).
    """
    if pvalue < 0.01:
        return "***"
    elif pvalue < 0.05:
        return "**"
    elif pvalue < 0.1:
        # was a second `< 0.05` test, which made the single star unreachable
        return "*"
    else:
        return ""
    
def format_pvalues(series):
    """Replace every p-value by its significance stars, one column at a time."""
    def _stars_for_column(column):
        return column.apply(convert_to_significance)

    return series.apply(_stars_for_column)

def format_stars(table, precision=3):
    """Turn a table of regression statistics into printable strings.

    ``table`` is indexed by its first column level with the keys "param",
    "pvalue", "tstat", "lower" and "upper" (assumes two-level columns whose
    second level identifies the sample — TODO confirm against the caller).

    Per measure the result has three rows, labelled by "coef_type":
    "coef" (significance stars followed by the coefficient), "tstat"
    (t-statistic in parentheses) and "conf" ("[lower,  upper]").
    Numbers are rendered with ``precision`` decimals.
    """

    # Confidence interval as text; the top column level is dropped so the two
    # string frames line up column by column before they are glued together.
    lower = table[["lower"]].round(precision).astype(str)
    lower.columns = lower.columns.droplevel()
    upper = table[["upper"]].round(precision).astype(str)
    upper.columns = upper.columns.droplevel()
    confidence = "[" + lower + ",  " + upper + "]"
    confidence.columns = pd.MultiIndex.from_product([['conf'], confidence.columns])

    # Fixed-point format string, e.g. "{:.3f}" for precision=3.
    format_num = "{:." + f"{precision}" + "f}"
    params = table["param"].applymap(lambda num: format_num.format(num))
    pvalues = table["pvalue"]
    tstats = table[["tstat"]].applymap(lambda num: "(" + format_num.format(num) + ")")

    # The stars are prepended to the formatted coefficient.
    params = pvalues.applymap(convert_to_significance) + params
    params.columns = pd.MultiIndex.from_product([['coef'], params.columns])

    # NOTE(review): the three frames are concatenated row-wise although their
    # top-level column labels differ, so each row is NaN outside its own block;
    # the stack below appears to rely on NaN entries being dropped — confirm
    # on the pandas version in use.
    formatted = pd.concat([params, tstats, confidence])
    formatted.columns.rename("coef_type", level=0, inplace=True)
    formatted = formatted.stack("coef_type")
    formatted.columns.rename("frag_quartile", inplace=True)

    formatted = formatted.reindex(sorted(formatted.columns), axis=1)
    # Sorts by index level names; assumes the row index level is called "measure".
    formatted.sort_values(by=["measure", "coef_type"], ascending=True, inplace=True)

    return formatted 

In [None]:
liquidity_measures = [
    'quoted_rel_spread_bps_time_weighted',
    'eff_rel_spread_bps_weighted',
    'depth_time_weighted_average',
]
amihud_turnover_measures = ["log_turnover", "RV_slow", "amihud"]

counts_measures = measures = [
    'AT_proxy',
    'num_orders_aggr',
    'num_orders_passive',
    'num_orders_deleted',
    'num_orders_filled',
    'value_entered_mean',
    'value_entered_median',
    'value_entered_total',
    'value_filled_total',
]


all_measures = liquidity_measures + amihud_turnover_measures + counts_measures
measures = all_measures

control_vars = [
#     "RV_slow",
    "VSMI",  # Riordan & Storkenmaier 2012 JFM, p.427, quotes Hendershott & Moulton 2011 JFM, p.583
    "min_tick_size",
    "price_log",
]


explaining_variable = "after_nonequivalence"  # "dummy_2019"

exog_vars = [explaining_variable] + control_vars
exog_vars

## Run the regression

In [None]:
detailed_results = dict()
coef_results = dict()

conditions = {
    "": pd.Series(True, index=stats.index),  # all_
#     "2019_only_": stats.index.get_level_values("date").year == 2019,
#     "H2_only_": stats["half_year"] == "H2",
#     "before_": stats.index.get_level_values("date") < pd.Timestamp("2019-07-01")
}

for condition_name, condition in conditions.items():
    
    subset = stats[condition]
    
#     # Full sample
#     regression_name = f"{condition_name}Full sample"
#     detailed_result = run_panel_regression(subset, measures, exog_vars, entity_effects=True, time_effects=False)
#     detailed_results[regression_name] = detailed_result
#     coef_result = deep_dive_coef(detailed_result, explaining_variable)
#     coef_results[regression_name] = coef_result
    
    # Per fragmentation quartile
    for frag_dummy, data in tqdm(subset.groupby("fragmentation")):

        regression_name = f"{condition_name}{frag_dummy}"
        detailed_result = run_panel_regression(data, measures, exog_vars, entity_effects=True, time_effects=False)
        detailed_results[regression_name] = detailed_result
        coef_result = deep_dive_coef(detailed_result, explaining_variable)
        coef_results[regression_name] = coef_result

### Create the tables

In [None]:
combined = pd.concat(coef_results)
combined.index.set_names(["fragmentation", "measure"], inplace=True)
combined = combined.unstack("fragmentation")
combined.columns.set_names(["coef_type", "fragmentation"], inplace=True)
combined = combined.reindex(combined.columns.sortlevel(level="fragmentation")[0], axis=1)

In [None]:
# Define here which variables we'd like to see
subset = liquidity_measures + amihud_turnover_measures   # counts_measures / liquidity_measures / amihud_turnover_measures

subset = combined.loc[subset].copy()
export_this = format_stars(subset, precision=2)
export_this.reset_index("coef_type", inplace=True)

In [None]:
export_this["coef_type"] = export_this["coef_type"].astype("category")
export_this["coef_type"] = export_this["coef_type"].cat.reorder_categories(["coef", "tstat", "conf"], ordered=True)
export_this = export_this.sort_values(["measure", "coef_type"]).drop(columns="coef_type")

In [None]:
export_this.rename(
    index={
        "quoted_rel_spread_bps_time_weighted": "QSpread",
        "eff_rel_spread_bps_weighted": "ESpread",
        "depth_time_weighted_average": "lnDepth",
        "AT_proxy": "AT_proxy",
        "num_orders_aggr":"Num aggressive Orders",
        "num_orders_deleted": "Num deleted Orders",
        "num_orders_filled": "Num filled Orders",
        "num_orders_passive": "Num passive Orders",
        "value_entered_total": "Log Volume Entered",
        "value_filled_total": "Log Volume Filled",
    },
    columns={col: "Quartile " + col[-1] for col in export_this.columns},
    inplace=True,
)

In [None]:
export_this

In [None]:
print(export_this.to_latex())

In [None]:
display_results(combined)

In [None]:
measure = measures[0]
pprint(measures)
print(f"\nSelected: {measure}")
samples = combined.columns.get_level_values("fragmentation").unique().tolist()
regr_table = linearmodels.panel.compare([detailed_results.get(sample).get(measure) for sample in samples], precision="pvalues")
regr_table

# OLS with stock-level controls
Riordan & Storkenmaier 2012, Hendershott & Moulton 2011

In [None]:
if "market_cap_average_log" not in control_vars:
    control_vars += ["market_cap_average_log"]

exog_vars = [explaining_variable] + control_vars

exog_vars

In [None]:
detailed_results = dict()
coef_results = dict()

conditions = {
    "": pd.Series(True, index=stats.index),  # all_
#     "2019_only_": stats.index.get_level_values("date").year == 2019,
#     "H2_only_": stats["half_year"] == "H2",
#     "before": stats.index.get_level_values("date") < pd.Timestamp("2019-07-01")
}

for condition_name, condition in tqdm(conditions.items()):
    
    subset = stats[condition]
    
#     # Full sample
#     regression_name = f"{condition_name}Full sample"
#     detailed_result = run_panel_regression(subset, measures, exog_vars, entity_effects=True, time_effects=False)
#     detailed_results[regression_name] = detailed_result
#     coef_result = deep_dive_coef(detailed_result, explaining_variable[0])
#     coef_results[regression_name] = coef_result
    
    # Per fragmentation quartile
    for frag_dummy, data in subset.groupby("fragmentation"):

        regression_name = f"{condition_name}{frag_dummy}"
        detailed_result = run_ols(data, measures, exog_vars)
        detailed_results[regression_name] = detailed_result
        coef_result = deep_dive_coef(detailed_result, explaining_variable)
        coef_results[regression_name] = coef_result

In [None]:
combined = pd.concat(coef_results)
combined.index.set_names(["fragmentation", "measure"], inplace=True)
combined = combined.unstack("fragmentation")
combined.columns.set_names(["coef_type", "fragmentation"], inplace=True)
combined = combined.reindex(combined.columns.sortlevel(level="fragmentation")[0], axis=1)

In [None]:
export_this = format_stars(combined, precision=3)
# print(export_this.to_latex(sparsify=True))
export_this

In [None]:
display_results(combined)

In [None]:
pprint(measures)
measure = measures[0]
print(f"\nSelected: {measure}")
samples = combined.columns.get_level_values("fragmentation").unique().tolist()
linearmodels.panel.compare([detailed_results.get(sample).get(measure) for sample in samples], precision="pvalues")

# Changes of averages

similar to Riordan & Storkenmaier JFM 2012 p.426

In [None]:
measures_subset = all_measures + ["turnover"]
frag_measure = "fragmentation"

averages = stats.groupby(["after_nonequivalence", frag_measure])[measures_subset].describe().sort_index(level=frag_measure)

# transform CHF cols to CHF 1000
depth_cols = [col for col in averages.columns if col[0].startswith("depth") or col[0].startswith("turnover")]
averages[depth_cols] = averages[depth_cols] / 1000

averages.columns = averages.columns.swaplevel()
averages = averages[["mean"]]  #, "std", "50%", 
averages.rename(columns={"50%": "median"}, inplace=True)

averages.columns = averages.columns.swaplevel()
averages = averages.unstack("fragmentation").sort_index(axis=1).round(3)

# averages.loc["diff"] = averages.diff().loc[True]
averages.loc["relative change"] = (averages.loc[True] - averages.loc[False]) / np.abs(averages.loc[False])

averages = averages.stack("fragmentation").sort_index(level="fragmentation")

In [None]:
averages = averages.round(2)
averages.reset_index("after_nonequivalence", inplace=True)
averages["after_nonequivalence"] = averages["after_nonequivalence"].replace({True: "post non-eq", False: "pre non-eq"})
averages["after_nonequivalence"] = averages["after_nonequivalence"].astype("category")
cat_order = ["pre non-eq", "post non-eq", "relative change"]
averages["after_nonequivalence"] = averages["after_nonequivalence"].cat.reorder_categories(cat_order, ordered=True)
averages.set_index("after_nonequivalence", append=True, inplace=True)

In [None]:
averages.columns = averages.columns.swaplevel().droplevel()

averages = averages.stack().unstack("fragmentation")
averages.index = averages.index.swaplevel()
averages.sort_index(inplace=True)
averages.style.format("{:.2f}")

In [None]:
def font_negative_positive(value):
    """Return the CSS font colour rule by sign: red (< 0), limegreen (> 0), else black."""
    if value < 0:
        return "color: red"
    if value > 0:
        return "color: limegreen"
    return "color: black"

In [None]:
idx = pd.IndexSlice
averages_table = averages.style.format("{:.2f}")
# averages_table

In [None]:
diffs = averages.loc[idx[:, "relative change"], :]
diffs.style.applymap(font_negative_positive).format("{:.2f}")

In [None]:
print(diffs.to_latex())

# Changes of sums

In [None]:
# sum of messages
stats["year"] = stats.index.get_level_values("date").year

measures_subset = ["turnover", "num_orders_passive"]

frag_measure = "fragmentation"

sums = stats.groupby(["date", frag_measure])[measures_subset].sum().sort_index(level=frag_measure)

sums.rename(columns={"num_orders_aggr": "num_orders_aggr"}, inplace=True)
sums = sums.stack().to_frame().reset_index()
sums.rename(columns={0: "value", "level_2": "measure"}, inplace=True)

In [None]:
fig = px.scatter(sums, x="date", y="value", color="fragmentation", facet_row="measure")
fig.update_yaxes(matches=None)  # free y-axis scale
fig

In [None]:

# NOTE(review): the preceding "Changes of sums" cell already turned `sums` into a
# long frame via `stack().to_frame().reset_index()` and renamed its columns to
# "measure"/"value". The statements below instead expect a wide `sums` that can be
# stacked twice and that carries "year" and "half_year" — this cell looks like it
# belongs to an earlier version of the aggregation; confirm before running.
# sums.loc["diff"] = sums.diff().loc[True]

# sums = sums.stack("fragmentation").sort_index(level="fragmentation")
# relative_difference = sums.loc[True] / sums.loc[False] - 1

# sums = sums.T / 1e6  # in millions
# sums.drop(["AT_proxy"], inplace=True)
# Reshape to long format with one row per (…, measure, fragmentation quartile).
sums = sums.stack().stack().to_frame().reset_index().rename(columns={0: "# mn", "level_3": "measure", "fragmentation": "Fragmentation quartile"})

# Semester key in the same "<year>_<half>" form used for `stats["semester"]`.
sums["semester"] = sums["year"].astype(str) + "_" + sums["half_year"]


In [None]:
# Grouped bar chart: one facet row per measure, one bar colour per semester.
color = "semester"
facet = "measure"
xaxis = "Fragmentation quartile"
# sort_values(inplace=True) returns None, so its result must not be assigned;
# sort first, then point `plot_data` at the sorted frame.
sums.sort_values([color, facet], inplace=True)
plot_data = sums
fig = px.bar(
    plot_data,
    x=xaxis,
    y="# mn",
    color=color,
    facet_row=facet,
    height=881,
    text="# mn",
    template='plotly_white',
    color_discrete_sequence=["#000000", "#E69F00", "#56B4E9", "#009E73",  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"],  # px.colors.qualitative.Prism,
)
fig.update_yaxes(matches=None)  # free y-axis scale
fig.update_layout(barmode='group', xaxis={'categoryorder':'category ascending'})
fig.update_traces(textposition='outside', texttemplate="%{value:.2f}")
# Strip the "semester=" prefix from legend entries.
fig.for_each_trace(
    lambda trace: trace.update(
        name=trace.name.replace(f"{color}=", ""),
    )
)
# Strip the "measure=" prefix from facet labels; [-1] keeps the label unchanged
# when it contains no "=" instead of raising an IndexError.
for annotation in fig.layout.annotations:
    annotation.text = annotation.text.split("=")[-1]
# fig

In [None]:
stats["num_orders_filled"].sum() / 1e6

# Archive 

## K-sample test

In [None]:
def calculate_ksample_test(series, measure):
    """Anderson-Darling k-sample test of ``measure`` before vs. after non-equivalence.

    Parameters
    ----------
    series : DataFrame containing the column ``measure`` and the flag column
        "after_nonequivalence" (cast to bool to split the sample).
    measure : name of the column to compare across the two periods.

    Returns
    -------
    pd.Series with the entries "statistic" and "significance_level". Both are
    NaN when either period has no non-missing observation or when the test
    itself rejects the input with a ValueError.
    """
    nan_result = pd.Series({"statistic": np.nan, "significance_level": np.nan})

    series = series[[measure, "after_nonequivalence"]]
    is_after = series["after_nonequivalence"].astype(bool)
    # Missing values carry no information for the test, so drop them up front.
    before = series.loc[~is_after, measure].dropna()
    after = series.loc[is_after, measure].dropna()

    # The test needs data on both sides of the event.
    if before.empty or after.empty:
        return nan_result

    try:
        result = anderson_ksamp([before.values, after.values])
    except ValueError as error:
        # e.g. too few distinct observations; report and return NaNs instead of
        # falling through to an unbound `result`.
        print(f"k-sample test not computable for {measure}: {error}")
        return nan_result

    return pd.Series({"statistic": result.statistic, "significance_level": result.significance_level})

In [None]:
measure = "time_to_removal_mean"
results = stats.groupby("isin").apply(lambda this_stock: calculate_ksample_test(this_stock, measure))
results.dropna(how="all", inplace=True)
results["significant 2.5%"] = results["significance_level"] < 0.025
# results = results.join(frag_before_nonequivalence)

In [None]:
results = results.join(stats["fragmentation"].reset_index(level="date"), on="isin")

In [None]:
results.groupby("fragmentation")["significant 2.5%"].value_counts()