# Distribution Shifts

+ Consider our stock data. 
+ We are interested in testing for changes in the distribution of returns in our sample data around the onset of the COVID-19 pandemic.

In [1]:
# Load the dotenv IPython extension and read environment variables from ../src/.env
# (presumably where FEATURES_DATA, read later via os.getenv, is defined - confirm).
%load_ext dotenv
%dotenv ../src/.env
import sys
# Extend the import path before importing `logger`, which presumably lives in ../src.
sys.path.append("../src")
from logger import get_logger
# Logger obtained from the project helper, named after this module.
_logs = get_logger(__name__)

In [2]:
import os
from glob import glob

import dask
# Set the query-planning option before importing dask.dataframe so that it takes effect.
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
import numpy as np
import pandas as pd
from scipy.stats import kstest

In [4]:
# Locate the feature parquet files; the directory comes from the FEATURES_DATA
# environment variable.
ft_dir = os.getenv("FEATURES_DATA")
if not ft_dir:
    # Fail early with a clear message instead of a TypeError on `None + str` below.
    raise RuntimeError(
        "Environment variable FEATURES_DATA is not set; cannot locate feature files."
    )

# Sort so the files are always read in the same order, independent of the file system.
ft_glob = sorted(glob(os.path.join(ft_dir, '*.parquet')))
if not ft_glob:
    raise FileNotFoundError(f"No parquet files found in {ft_dir!r}.")

# Read with dask, materialise the result with .compute(), and move the index into columns.
df = dd.read_parquet(ft_glob).compute().reset_index()

## Data Preparation

+ First, prepare five datasets, each with returns from March 1 of a given year (2018 through 2022) up to, but not including, March 1 of the following year.
+ For each data set, we can compute some descriptive statistics.
+ We observe that there may be some distribution changes.

In [26]:
def _march_to_march(frame, start_year):
    """Return the rows of `frame` whose 'Date' falls in [start_year-03-01, (start_year+1)-03-01)."""
    lower = f'{start_year}-03-01'
    upper = f'{start_year + 1}-03-01'
    in_window = (frame['Date'] >= lower) & (frame['Date'] < upper)
    return frame[in_window]


# One March-to-March window per starting year.
df_2018 = _march_to_march(df, 2018)
df_2019 = _march_to_march(df, 2019)
df_2020 = _march_to_march(df, 2020)
df_2021 = _march_to_march(df, 2021)
df_2022 = _march_to_march(df, 2022)

In [18]:
# Descriptive statistics of returns dated 2018-03-01 (inclusive) to 2019-03-01 (exclusive).
df_2018['returns'].describe()

count    122487.000000
mean          0.008255
std           0.322370
min          -0.973106
25%          -0.007951
50%           0.000911
75%           0.009274
max          50.656051
Name: returns, dtype: float64

In [20]:
# Descriptive statistics of returns dated 2019-03-01 (inclusive) to 2020-03-01 (exclusive).
df_2019['returns'].describe()

count    124327.000000
mean          0.008617
std           0.282145
min          -0.892941
25%          -0.007526
50%           0.001031
75%           0.008961
max          40.907243
Name: returns, dtype: float64

In [21]:
# Descriptive statistics of returns dated 2020-03-01 (inclusive) to 2021-03-01 (exclusive).
df_2020['returns'].describe()

count    124509.000000
mean          0.010303
std           0.475536
min          -0.876122
25%          -0.012946
50%           0.001299
75%           0.016166
max         136.020301
Name: returns, dtype: float64

In [22]:
# Descriptive statistics of returns dated 2021-03-01 (inclusive) to 2022-03-01 (exclusive).
df_2021['returns'].describe()

count    125752.000000
mean          0.013406
std           0.709492
min          -0.690219
25%          -0.008819
50%           0.000742
75%           0.010318
max         209.045513
Name: returns, dtype: float64

In [28]:
# Descriptive statistics of returns dated 2022-03-01 (inclusive) to 2023-03-01 (exclusive).
df_2022['returns'].describe()

count    125018.000000
mean          0.009317
std           0.338328
min          -0.846907
25%          -0.012281
50%           0.000163
75%           0.012641
max          68.399998
Name: returns, dtype: float64

# Kolmogorov-Smirnov Test

+ The KS test can be accessed via the scipy library: `scipy.stats.kstest`
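+ When both arguments are arrays of observations, this function performs a two-sample test.
+ The null hypothesis is that the two samples are drawn from the same distribution; a small p-value is evidence against it. With samples this large, even tiny differences are statistically significant, so compare the size of the test statistic as well as the p-value.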

In [23]:
# Two-sample KS test: returns from the 2018 window against returns from the 2019 window.
# `kstest` is imported in the notebook's import cell.
# Missing returns are dropped so that they do not enter the test.
returns_2018 = df_2018['returns'].dropna()
returns_2019 = df_2019['returns'].dropna()
kstest(returns_2018, returns_2019)

KstestResult(statistic=0.011831758881548748, pvalue=6.237843774936075e-08, statistic_location=0.017902198410837622, statistic_sign=-1)

In [24]:
# Two-sample KS test: returns from the 2019 window against returns from the 2020 window.
# Missing returns are dropped from each sample first.
returns_2019 = df_2019['returns'].dropna()
returns_2020 = df_2020['returns'].dropna()
kstest(returns_2019, returns_2020)

KstestResult(statistic=0.13601382991582422, pvalue=0.0, statistic_location=0.017462597314621187, statistic_sign=1)

In [25]:
# Two-sample KS test: returns from the 2020 window against returns from the 2021 window.
# Missing returns are dropped from each sample first.
returns_2020 = df_2020['returns'].dropna()
returns_2021 = df_2021['returns'].dropna()
kstest(returns_2020, returns_2021)

KstestResult(statistic=0.1004555916634241, pvalue=0.0, statistic_location=0.017596482114136558, statistic_sign=-1)

In [27]:
# Two-sample KS test: returns from the 2021 window against returns from the 2022 window.
# Missing returns are dropped from each sample first.
returns_2021 = df_2021['returns'].dropna()
returns_2022 = df_2022['returns'].dropna()
kstest(returns_2021, returns_2022)

KstestResult(statistic=0.06230430332505052, pvalue=5.26826597957707e-212, statistic_location=-0.011759943380979188, statistic_sign=-1)