In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as pltdates
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import pathlib

# Load the autoreload extension but keep it switched off (mode 0): imported modules are NOT reloaded automatically
%load_ext autoreload
%autoreload 0

# this tells Jupyter to plot inline
%matplotlib inline

In [None]:
# Read the price history of every coin from its CSV file.
# Each file is expected at ../datasets/<SYMBOL>-USD.csv; its first column is
# used as the (date-parsed) index and it must provide a 'close' column.
DATASETS_DIR = pathlib.Path('../datasets').absolute()
COIN_SYMBOLS = ('btc', 'dash', 'etc', 'eth', 'lsk', 'ltc', 'nxt', 'pot', 'xem', 'xmr', 'xrp')


def load_coin_history(symbol):
    """Load the history of one coin as a DataFrame indexed by date.

    Parameters
    ----------
    symbol : str
        Coin ticker in any case (e.g. 'btc'); it is upper-cased to build
        the file name '<SYMBOL>-USD.csv' inside DATASETS_DIR.

    Returns
    -------
    pandas.DataFrame
        Content of the CSV file, with the first column parsed as dates and
        used as index.
    """
    csv_path = DATASETS_DIR / '{}-USD.csv'.format(symbol.upper())
    return pd.read_csv(csv_path, index_col=0, parse_dates=True)


coin_frames = {symbol: load_coin_history(symbol) for symbol in COIN_SYMBOLS}

# keep the per-coin names available for any cell that refers to them directly
btc, dash, etc, eth, lsk, ltc, nxt, pot, xem, xmr, xrp = (
    coin_frames[symbol] for symbol in COIN_SYMBOLS
)

# Combine the closing prices into a single DF, one column per coin.
# The frame is built from the Series only, so pandas aligns them on their own
# dates (missing days become NaN). Injecting btc.index as a 'day' column would
# instead pair dates with rows by position, which fails or mislabels rows as
# soon as BTC's index is not exactly the union of all the indexes.
coins = (
    pd.DataFrame({symbol: frame['close'] for symbol, frame in coin_frames.items()})
    .rename_axis('day')
    .sort_index()  # chronological order, needed by pct_change() and date slicing
)

In [None]:
# Work with log returns rather than raw prices: prices of different coins live
# on very different scales, while returns are relative changes and therefore
# comparable across coins (note that they are NOT bounded between 0 and 1).
# log return = log(1 + r), where r is the fractional change between two
# consecutive rows as computed by pct_change(); the first row has no previous
# price, so it is NaN.
coins_log_returns = np.log(coins.pct_change().add(1))

In [None]:
# Correlation matrix of the log returns over the whole history
# (Pearson is the default method of DataFrame.corr)
corr_matrix = coins_log_returns.corr()
corr_matrix

In [95]:
# Plot it
def plot_correlation_matrix(corr_matrix, plot_title, vmin=None, vmax=None):
    """Draw a correlation matrix as a heat map with a colorbar and show it.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Matrix to draw. Its index provides the Y tick labels and its columns
        provide the X tick labels; labels are upper-cased.
    plot_title : str
        Title written above the heat map.
    vmin, vmax : float, optional
        Limits of the color scale, forwarded to ``imshow``. With the default
        (None) the scale is fitted to the values of this matrix only, so the
        colors of two different plots are not directly comparable; pass the
        same limits (e.g. ``vmin=-1, vmax=1``) to every call to share a scale.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``.
    """
    fig_1, axis = plt.subplots(figsize=(8, 8))

    # plt.get_cmap is used instead of cm.get_cmap: the latter was deprecated
    # in Matplotlib 3.7 and removed in 3.9, the former works on both sides.
    color_map = plt.get_cmap('magma', 50)  # 'magma' resampled to 50 colors

    # get a plotted image
    axis_img = axis.imshow(corr_matrix, interpolation="nearest", cmap=color_map,
                           vmin=vmin, vmax=vmax)

    # one tick per cell: imshow puts columns along X and rows along Y
    column_names = [str(name).upper() for name in corr_matrix.columns]
    row_names = [str(name).upper() for name in corr_matrix.index]
    axis.set_xticks(np.arange(len(column_names)))
    axis.set_yticks(np.arange(len(row_names)))
    axis.set_xticklabels(column_names, fontsize=16, rotation=70)
    axis.set_yticklabels(row_names, fontsize=16)

    fig_1.colorbar(axis_img, ax=axis)  # colorbar legend, attached to the heat map

    # set the title on the heat-map axes explicitly rather than relying on
    # pyplot's notion of the "current" axes
    axis.set_title(plot_title)

    plt.show()

plot_correlation_matrix(corr_matrix, 'Correlation matrix on crypto log returns \n [full timescale]')

In [110]:
# Does the picture change when the same matrix is computed on a narrower
# time scale? Here the prices are restricted to [2017-11-01, 2017-12-01].
BOX_START, BOX_END = '2017-11-01', '2017-12-01'
coins_boxed = coins.loc[BOX_START:BOX_END]

# NOTE(review): the returns are computed on the already-sliced prices, so the
# first row of the window is NaN (its previous price lies outside the box).
coins_boxed_log_returns = np.log(coins_boxed.pct_change() + 1)

corr_matrix = coins_boxed_log_returns.corr(method='pearson')
corr_matrix

In [111]:
# let's plot it
# The window in the title is taken from BOX_START / BOX_END, so the caption
# always matches the box that was used to compute corr_matrix.
plot_correlation_matrix(
    corr_matrix,
    'Correlation matrix on crypto log returns \n [{}, {}]'.format(BOX_START, BOX_END)
)

In [112]:
# Same exercise on a much shorter window: from 2017-12-25 to 2018-01-01
BOX_START = '2017-12-25'
BOX_END = '2018-01-01'

# prices inside the window (both bounds are looked up as index labels)
coins_boxed = coins.loc[BOX_START:BOX_END]

# log returns inside the window; the first row is NaN because the returns are
# computed after slicing, so the previous price is not available
coins_boxed_log_returns = coins_boxed.pct_change().add(1).pipe(np.log)

corr_matrix = coins_boxed_log_returns.corr(method='pearson')
corr_matrix

In [118]:
# let's plot it
# The window in the title is taken from BOX_START / BOX_END, so the caption
# always matches the box that was used to compute corr_matrix.
plot_correlation_matrix(
    corr_matrix,
    'Correlation matrix on crypto log returns \n [{}, {}]'.format(BOX_START, BOX_END)
)

In [120]:
# woohaa! Looks like there is a high correlation among a subset of the coins!
# let's check out by plotting the log returns for those coins
HIGHLY_CORRELATED_COINS = ('btc', 'dash', 'etc', 'eth', 'lsk', 'ltc')

fig_3 = plt.figure(figsize=(8, 8))
axes = fig_3.add_subplot(1, 1, 1)

# one curve per coin, drawn against the dates of the returns themselves
for coin in HIGHLY_CORRELATED_COINS:
    coin_returns = coins_boxed_log_returns[coin]
    axes.plot(coin_returns.index, coin_returns,
              label='{} log returns'.format(coin.upper()))

axes.legend(loc='best')
axes.grid(True)
# the window shown in the title comes from the box used to slice the data
axes.set_title('Log returns for highly-correlated coins \n [{}, {}]'.format(BOX_START, BOX_END))
# the curves are log returns (dimensionless), not prices in dollars
axes.set_ylabel('Log return')
axes.xaxis.set_major_formatter(pltdates.DateFormatter('%Y-%m-%d'))
# Rotate the date labels through tick_params: re-assigning the current tick
# labels with set_xticklabels() would freeze their text before the figure is
# drawn and bypass the date formatter set just above.
axes.tick_params(axis='x', labelrotation=45)

plt.show()

In [121]:
## CASE STUDY: would a portfolio composed of BTC, ETH and LTC be a good investment over time?
## Let's find out, on a monthly basis

In [160]:
# This gives a DataFrame with the "rolling" correlation matrices of the
# BTC / LTC / ETH log returns, calculated on 30-days sliding windows:
# one pairwise matrix per date, stacked along a multi-level row index.
ROLLING_WINDOW = 30  # number of consecutive rows (days) in each window
PORTFOLIO_COINS = ['btc', 'ltc', 'eth']

# A dedicated name is used for the 3-coin returns so that coins_log_returns
# (the returns of ALL the coins) is not overwritten: re-running the cells that
# depend on it keeps giving the same result.
portfolio_log_returns = np.log(1 + coins[PORTFOLIO_COINS].pct_change())

# with no `other` argument, rolling corr() correlates the frame with itself
rolling_corr_matrices = portfolio_log_returns.rolling(window=ROLLING_WINDOW).corr(pairwise=True)
rolling_corr_matrices

In [169]:
# unstack the multi-index dataframe, so it's easier to handle:
# the innermost row level becomes a second column level, leaving one row per
# date and one column per (coin, coin) pair
unstacked_df = rolling_corr_matrices.unstack(level=-1)

# the three distinct pairs of the portfolio
COIN_PAIRS = [('btc', 'ltc'), ('btc', 'eth'), ('ltc', 'eth')]

# let's plot how correlation coefficients vary over time
fig_4 = plt.figure(figsize=(15, 9))
axes = fig_4.add_subplot(1, 1, 1)
for first_coin, second_coin in COIN_PAIRS:
    # a tuple key selects the column directly in the two-level column index
    axes.plot(unstacked_df.index, unstacked_df[(first_coin, second_coin)],
              label='{}-{} log returns correlation'.format(first_coin.upper(), second_coin.upper()))

# Title, legend, Y-axis label and X-axis ticker format
axes.set_title('Rolling correlation of log returns')
axes.set_ylabel('Correlation coefficient')
axes.grid(True)
axes.legend(loc='best')
axes.xaxis.set_major_formatter(pltdates.DateFormatter('%m/%y'))

plt.show()

In [None]:
# Final answer: the correlation analysis tells us that such a portfolio would have
# been well diversified in late August 2017 and in the second half of December 2017