# Stylized Facts of Cryptocurrency

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.patches as mpatches
import matplotlib.lines as mlines
import glob
import re
import math
from statsmodels.tsa.stattools import acf, pacf

# Default figure size, as (width, height), applied to every figure created after this point.
plt.rcParams['figure.figsize'] = (15.0, 5.0)

In [None]:
# Read from cryptocurrencies
#
# Builds two notebook-level objects from every '../data/cryptos/*price.csv':
#   crypto_dic - maps the coin name (the part of the file name before
#                '_price.csv') to its Series of log returns, indexed by Date.
#   data_sizes - one-row table ('size') holding the number of returns per coin.

files = sorted(glob.glob('../data/cryptos/*price.csv'))
name_pattern = re.compile(r'.*/(.*)_price\.csv')
crypto_names = [name_pattern.match(f).group(1) for f in files]
crypto_dic = dict()
data_sizes = pd.DataFrame(index=['size'], columns=crypto_names)
for idx, file in enumerate(files):
    name = crypto_names[idx]
    df = pd.read_csv(file)
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date')
    df.set_index('Date', inplace=True)
    prices = df['Close']
    # Log return ln(P_t / P_{t-1}); the first row has no predecessor, so its
    # NaN is dropped.
    returns = np.log(prices / prices.shift()).dropna()
    crypto_dic[name] = returns
    # Assign through a single .loc call. The chained form
    # `data_sizes[name][0] = ...` writes into a temporary column object and
    # is not guaranteed to reach the frame (it does not under pandas
    # copy-on-write), and `[0]` relies on deprecated positional fallback.
    data_sizes.loc['size', name] = returns.size
data_sizes

In [None]:
# Read from currencies
#
# Builds two notebook-level objects from every '../data/currencies/*.csv':
#   rate_dic   - maps the rate name (the part of the file name before
#                ' Historical Data.csv') to its Series of log returns.
#   data_sizes - one-row table ('size') holding the number of returns per rate.

files = sorted(glob.glob('../data/currencies/*.csv'))
name_pattern = re.compile(r'.*/(.*) Historical Data\.csv')
rate_names = [name_pattern.match(f).group(1) for f in files]
rate_dic = dict()
data_sizes = pd.DataFrame(index=['size'], columns=rate_names)
for idx, file in enumerate(files):
    name = rate_names[idx]
    df = pd.read_csv(file)
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values(by='Date')
    df.set_index('Date', inplace=True)

    prices = df['Price']
    # Put the quotes on a gap-free daily calendar and fill the missing days by
    # interpolation. `pd.date_range` replaces the removed
    # `pd.DatetimeIndex(start=, end=, freq=)` constructor, and `reindex`
    # replaces `prices[new_idx]`, which raises KeyError for dates that are
    # absent from the data.
    new_idx = pd.date_range(start=df.index[0], end=df.index[-1], freq='D')
    prices = prices.reindex(new_idx).interpolate()

    # Log return ln(P_t / P_{t-1}); the first row has no predecessor, so its
    # NaN is dropped.
    returns = np.log(prices / prices.shift()).dropna()
    rate_dic[name] = returns
    # Single .loc assignment instead of the chained `data_sizes[name][0] = ...`,
    # which is not guaranteed to write through to the frame.
    data_sizes.loc['size', name] = returns.size
data_sizes

## Basic Statistics
* Q1: What does A* mean? Is it another version of the central limit theorem?
* Q2: The z-score here might be meaningless, since it is used to assess the null hypothesis that the expected return is zero.

In [None]:
def do_basic_stats(returns_dic, decimals=None):
    """Tabulate summary statistics for every return series in ``returns_dic``.

    Parameters
    ----------
    returns_dic : dict
        Maps a label to a pandas Series of returns.
    decimals : int, optional
        When given, the table is rounded to this many decimals. The default
        (``None``) returns full precision, which is what this function has
        always produced: it used to call ``df.round(2)`` and discard the
        result, so no rounding ever took place.

    Returns
    -------
    pandas.DataFrame
        One row per label, with these columns:
        ``10^4 mean`` (mean x 10^4), ``10^2 std`` (std x 100), ``max``,
        ``min``, ``skew``, ``skew_abs`` (|skew|), ``kurtosis``,
        ``G%`` = 100 * (exp(N * mean) - 1),
        ``A*%`` = 100 * ((1 + G) * exp(N * std^2 / 2) - 1),
        ``z`` = mean * sqrt(n) / std, with N = 365 and n the series length.
    """
    columns = ['10^4 mean', '10^2 std', 'max', 'min', 'skew', 'skew_abs', 'kurtosis', 'G%', 'A*%', 'z']
    df = pd.DataFrame(index=list(returns_dic), dtype='float', columns=columns)
    N = 365  # number of periods the per-period mean/variance are scaled to
    for key, returns in returns_dic.items():
        # Compute each moment once instead of on every use.
        mean = returns.mean()
        std = returns.std()
        skew = returns.skew()

        G = math.exp(N * mean) - 1
        Astar = (1 + G) * math.exp(N * 0.5 * math.pow(std, 2.0)) - 1
        z = mean * math.sqrt(returns.size) / std

        df.loc[key] = [mean * 10000, std * 100,
                       returns.max(), returns.min(),
                       skew, abs(skew), returns.kurtosis(),
                       G * 100, Astar * 100,
                       z]
    return df if decimals is None else df.round(decimals)

def plot_stats_relation(df):
    """Draw five scatter plots relating columns of ``df``, one figure per pair.

    Every point is annotated with its row label from ``df.index``. ``df`` must
    have the columns 'kurtosis', 'skew_abs', '10^2 std', '10^4 mean', 'min'
    and 'max'.
    """
    # (x column, y column) for each figure, in drawing order.
    axis_pairs = (
        ('kurtosis', 'skew_abs'),
        ('kurtosis', '10^2 std'),
        ('kurtosis', '10^4 mean'),
        ('10^4 mean', '10^2 std'),
        ('min', 'max'),
    )
    for x_col, y_col in axis_pairs:
        ax = df.plot.scatter(x=x_col, y=y_col)
        for pos, label in enumerate(df.index):
            point = (df[x_col].iat[pos], df[y_col].iat[pos])
            ax.annotate(label, point)

In [None]:
# Basic statistics of the cryptocurrency returns; `df` is also consumed by the
# plotting cell that follows.
df = do_basic_stats(crypto_dic)
df

In [None]:
# Scatter plots for whatever statistics table `df` currently holds (set by the preceding cell).
plot_stats_relation(df)

In [None]:
# Basic statistics of the currency-rate returns; this overwrites the earlier `df`.
df = do_basic_stats(rate_dic)
df

In [None]:
# Scatter plots for whatever statistics table `df` currently holds (set by the preceding cell).
plot_stats_relation(df)

## Shape of Returns Distribution
Q: What are the thresholds of skewness and kurtosis for treating a distribution as normal?

In [None]:
def do_shape_stats(returns_dic, decimals=None):
    """Tabulate skewness and kurtosis with z-scores for every return series.

    Parameters
    ----------
    returns_dic : dict
        Maps a label to a pandas Series of returns.
    decimals : int, optional
        When given, the table is rounded to this many decimals. The default
        (``None``) returns full precision, which is what this function has
        always produced: it used to call ``df.round(2)`` and discard the
        result, so no rounding ever took place.

    Returns
    -------
    pandas.DataFrame
        One row per label, with these columns: ``std``, ``skew``,
        ``skew std`` = sqrt(6 / n), ``z-skew`` = |skew| / skew std,
        ``kurtosis``, ``kurtosis std`` = sqrt(24 / n),
        ``z-kurtosis`` = kurtosis / kurtosis std, where n is the series
        length. An empty series yields NaN standard errors instead of
        raising ZeroDivisionError.
    """
    columns = ['std', 'skew', 'skew std', 'z-skew', 'kurtosis', 'kurtosis std', 'z-kurtosis']
    df = pd.DataFrame(index=list(returns_dic), dtype='float', columns=columns)
    for key, returns in returns_dic.items():
        n = returns.size
        skew = returns.skew()
        kurtosis = returns.kurtosis()
        # Standard errors used to scale skewness and kurtosis into z-scores.
        skew_std = math.sqrt(6.0 / n) if n else float('nan')
        kurto_std = math.sqrt(24.0 / n) if n else float('nan')
        df.loc[key] = [returns.std(),
                       skew, skew_std, abs(skew) / skew_std,
                       kurtosis, kurto_std, kurtosis / kurto_std]
    return df if decimals is None else df.round(decimals)
    
def plot_shape(returns):
    """Plot a return series and compare its distribution with a normal curve.

    Two figures are shown: the series itself, then a kernel density estimate
    overlaid with the normal density that has the same mean and standard
    deviation (orange) and a 40-bin histogram on a secondary y axis.

    Parameters
    ----------
    returns : pandas.Series
        Return series to plot.
    """
    plt.title('Returns Time Series')
    returns.plot()
    plt.show()

    plt.title('Distribution Shape')
    # `kind` must be passed by keyword: current pandas rejects positional
    # arguments to Series.plot().
    returns.plot(kind='kde')
    x = np.linspace(returns.min(), returns.max(), returns.size)
    # Normal density written out explicitly; `matplotlib.mlab.normpdf`, which
    # this used to call, has been removed from Matplotlib.
    mu = returns.mean()
    sigma = returns.std()
    normal_pdf = np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
    plt.plot(x, normal_pdf, color='orange')
    returns.plot(kind='hist', bins=40, secondary_y=True, color='green', alpha=0.5)
    # Legend entry built by hand so that only the normal curve is labelled.
    line = mlines.Line2D([], [], color='orange', label="Normal distribution")
    plt.legend(handles=[line])
    plt.show()

In [None]:
# Skewness/kurtosis table, with z-scores, for the cryptocurrency returns.
do_shape_stats(crypto_dic)

In [None]:
# Skewness/kurtosis table, with z-scores, for the currency-rate returns.
do_shape_stats(rate_dic)

### Example shape graph

In [None]:
# Time-series and distribution plots for the 'bitcoin' returns.
returns = crypto_dic['bitcoin']
plot_shape(returns)

In [None]:
# Time-series and distribution plots for the 'EUR_USD' returns.
returns = rate_dic['EUR_USD']
plot_shape(returns)

## Calendar Effects
Q: How can the significance of calendar effects be evaluated correctly?

In [None]:
def do_dayofweek(returns_dic, stat):
    """Tabulate a per-weekday statistic for every return series.

    Parameters
    ----------
    returns_dic : dict
        Maps a label to a pandas Series of returns with a DatetimeIndex.
    stat : {'mean', 'std'}
        Statistic computed for each weekday. Means are scaled by 10^4 and
        standard deviations by 10^2.

    Returns
    -------
    pandas.DataFrame
        One row per label and one column per weekday (Monday..Sunday). A
        weekday that never occurs in a series is NaN; previously a missing
        weekday made the row assignment fail with a length mismatch.

    Raises
    ------
    ValueError
        If ``stat`` is neither 'mean' nor 'std' (previously this silently
        returned a table full of NaN).
    """
    columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    scales = {'mean': 10000, 'std': 100}
    if stat not in scales:
        raise ValueError("stat must be 'mean' or 'std', got %r" % (stat,))

    df = pd.DataFrame(index=list(returns_dic), dtype='float', columns=columns)
    for key, returns in returns_dic.items():
        grouped = returns.groupby(returns.index.dayofweek)
        by_day = grouped.mean() if stat == 'mean' else grouped.std()
        # dayofweek runs 0 (Monday) .. 6 (Sunday); reindexing keeps the values
        # aligned with `columns` even when some weekdays are absent.
        df.loc[key] = scales[stat] * by_day.reindex(range(7)).values

    return df

### Average percentage day-of-week returns

In [None]:
# Mean return per weekday (scaled by 10^4 inside do_dayofweek) for each cryptocurrency.
do_dayofweek(crypto_dic, 'mean')

In [None]:
# Mean return per weekday (scaled by 10^4 inside do_dayofweek) for each currency rate.
do_dayofweek(rate_dic, 'mean')

### Average percentage day-of-week std

In [None]:
# Standard deviation of returns per weekday (scaled by 10^2 inside do_dayofweek) for each cryptocurrency.
do_dayofweek(crypto_dic, 'std')

In [None]:
# Standard deviation of returns per weekday (scaled by 10^2 inside do_dayofweek) for each currency rate.
do_dayofweek(rate_dic, 'std')

###  Example

In [None]:
def do_dayofweek_stats(returns_dic, key='bitcoin', decimals=None):
    """Tabulate per-weekday summary statistics for one return series.

    Parameters
    ----------
    returns_dic : dict
        Maps a label to a pandas Series of returns with a DatetimeIndex.
    key : str, optional
        Label of the series to analyse. Defaults to 'bitcoin', the series
        this function was originally hard-wired to.
    decimals : int, optional
        When given, the table is rounded to this many decimals. The default
        (``None``) leaves full precision; the original ``df.round(2)`` call
        discarded its result, so the table was never rounded.

    Returns
    -------
    pandas.DataFrame
        Rows ``10^4 mean``, ``10^2 std``, ``max``, ``min``, ``skew``,
        ``kurtosis``, ``G%``, ``A*%`` and ``z``; one column per weekday
        (Monday..Sunday). ``G%`` and ``A*%`` are expressed in percent, as in
        ``do_basic_stats``; earlier these two rows held raw fractions despite
        their labels. A weekday absent from the series is NaN.
    """
    N = 365  # number of periods the per-period mean/variance are scaled to
    columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    returns = returns_dic[key]
    groups = returns.groupby(returns.index.dayofweek)

    def growth(x):
        return math.exp(N * x.mean()) - 1

    def astar(x):
        return (1 + growth(x)) * math.exp(N * 0.5 * math.pow(x.std(), 2.0)) - 1

    def zscore(x):
        return x.mean() * math.sqrt(x.size) / x.std()

    # One Series per output row, each indexed by dayofweek (0 = Monday).
    rows = {
        '10^4 mean': 10000 * groups.mean(),
        '10^2 std': 100 * groups.std(),
        'max': groups.max(),
        'min': groups.min(),
        'skew': groups.skew(),
        'kurtosis': groups.apply(lambda x: x.kurtosis()),
        'G%': 100 * groups.apply(growth),
        'A*%': 100 * groups.apply(astar),
        'z': groups.apply(zscore),
    }

    # Reindex to all seven weekdays so a missing weekday becomes NaN rather
    # than shifting or breaking the column alignment, then put statistics on
    # the rows and weekdays on the columns.
    df = pd.DataFrame(rows).reindex(range(7)).T.astype('float')
    df.columns = columns

    return df if decimals is None else df.round(decimals)

In [None]:
# Per-weekday statistics table; the function reads the 'bitcoin' entry of the dict it is given.
do_dayofweek_stats(crypto_dic)

## Autocorrelation
Q: The 95% confidence interval rule seems to be violated.

In [None]:
def plot_acf(returns, title):
    """Plot the ACF and PACF of ``returns`` side by side and print them.

    Both panels show the coefficients returned for ``nlags=20`` as bars with
    a connecting line, a dashed zero line, and dashed reference lines at
    +/- 1.96 / sqrt(len(returns)). The coefficients for lags 1..20 are then
    printed as a table rounded to four decimals.

    Parameters
    ----------
    returns : array-like
        Series passed to ``acf`` and ``pacf``.
    title : str
        Prefix for both panel titles.
    """
    count = 20
    lag_acf = acf(returns, nlags=count)
    lag_pacf = pacf(returns, nlags=count, method='ols')
    bound = 1.96 / np.sqrt(len(returns))

    # (subplot position, coefficients, title suffix) for the two panels.
    panels = (
        (121, lag_acf, ' -- autocorrelation'),
        (122, lag_pacf, ' -- partial autocorrelation'),
    )
    for position, coefficients, suffix in panels:
        plt.subplot(position)
        plt.bar(x=range(len(coefficients)), height=coefficients, alpha=0.3, color='green')
        plt.plot(coefficients)
        plt.axhline(y=0, linestyle='--', color='blue')
        plt.axhline(y=-bound, linestyle='--', color='pink')
        plt.axhline(y=bound, linestyle='--', color='blue')
        plt.title(title + suffix)
    plt.show()

    # Lag 0 is left out of the printed table.
    table = pd.DataFrame(columns=['acf', 'pacf'])
    for lag in range(1, count + 1):
        table.loc[lag] = [lag_acf[lag], lag_pacf[lag]]
    print(table.round(4))
    
def plot_acf_forall(returns):
    """Run ``plot_acf`` on ``returns`` and on three transformations of it.

    The transformations are the absolute values, the squares, and the
    logarithm of the absolute deviation from the mean.
    """
    centred = np.subtract(returns, returns.mean())
    # (series, plot title) pairs, in plotting order.
    variants = [
        (returns, 'returns'),
        (np.abs(returns), 'absolute returns'),
        (np.square(returns), 'square returns'),
        (np.log(np.abs(centred)), 'logarithms of absolute, mean-adjusted returns'),
    ]
    for series, label in variants:
        plot_acf(series, label)

In [None]:
# ACF/PACF plots for the 'bitcoin' returns and their transformations.
returns = crypto_dic['bitcoin']
plot_acf_forall(returns)

In [None]:
# ACF/PACF plots for the 'EUR_USD' returns and their transformations.
returns = rate_dic['EUR_USD']
plot_acf_forall(returns)