In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Number of simulation runs whose output files are combined in this notebook.
number_of_runs = 10

# One mid-price CSV per run; the run number in the file name starts at 1.
# NOTE(review): the directory is an absolute, machine-specific path.
mid_price_files = [
    f"/Users/admin/Workspace/apxr/output/apxr_mid_prices{i+1}.csv"
    for i in range(number_of_runs)
]

In [None]:
# Average the mid-price series of all runs element-wise: start from the
# first run's array, add every remaining run in place, then divide by the
# number of runs.
first_path = mid_price_files[0]
remaining_paths = mid_price_files[1:]

mid_price_data = np.genfromtxt(first_path, delimiter=',')
for path in remaining_paths:
    mid_price_data += np.genfromtxt(path, delimiter=',')
mid_price_data /= len(mid_price_files)

# Averaged prices plus their one-step percentage change.
mid_prices = pd.DataFrame(mid_price_data, columns=['price'])
mid_prices['returns'] = mid_prices['price'].pct_change()

mid_prices.head(10)

In [None]:
# One trade-price CSV per run; the run number in the file name starts at 1.
trade_price_files = [
    f"/Users/admin/Workspace/apxr/output/apxr_trades{i+1}.csv"
    for i in range(number_of_runs)
]

In [None]:
# Read the trade-price series of every run into its own single-column frame.
trade_price_data = []

for filename in trade_price_files:
    tr = pd.read_csv(filename, header=None)
    # Assign the column name directly; a chained `tr.columns = columns = [...]`
    # would also create an unrelated module-level variable named `columns`.
    tr.columns = ['price']
    trade_price_data.append(tr)

# Put the runs side by side and average across them row by row.
trade_price_data_avg = pd.concat(trade_price_data, axis=1).mean(axis=1)
# `to_frame` names the column explicitly instead of relying on the
# DataFrame constructor's handling of an unnamed Series.
trade_prices = trade_price_data_avg.to_frame(name='price')

# One-step percentage change of the averaged trade price.
trade_prices['returns'] = trade_prices['price'].pct_change()

trade_prices.head(10)

## Fat-tailed distribution of returns

In [None]:
# Kurtosis of mid-price returns measured over increasing time scales.
# Lags start at 1: a zero-period percentage change compares every price with
# itself, so it carries no information and would only have to be discarded.
max_lag = 2500

# `time` holds the lags 1 .. max_lag - 1; `kurt` holds the kurtosis of the
# percentage change over the matching lag.
time = list(range(1, max_lag))
kurt = [mid_prices['price'].pct_change(lag).kurt() for lag in time]

In [None]:
# Kurtosis of returns plotted against the time scale (lag) it was measured at.
fig, ax = plt.subplots(figsize=(10, 7))

ax.plot(time, kurt, label='Kurt')
ax.legend()
ax.set_xlabel('Time scale')
ax.set_ylabel('Kurtosis')
ax.set_title('Kurtosis of returns and lags')
fig.tight_layout()

## Volatility clustering

In [None]:
# Install the third-party `hurst` package that the following cells import.
# pip is launched through `sys.executable`, i.e. the interpreter running this
# notebook, so the package is installed for that same interpreter.
# NOTE(review): no version is pinned, so the installed release may differ
# between runs.
import sys
!{sys.executable} -m pip install hurst

In [None]:
from hurst import compute_Hc, random_walk

# Fit the Hurst exponent on the running sum of absolute mid-price returns.
abs_return_path = np.cumsum(mid_prices['returns'].dropna().abs())
H, c, data = compute_Hc(abs_return_path, kind='random_walk', simplified=True)

# `data[0]` supplies the x values and `data[1]` the y values of the scatter;
# the line drawn through them is the fitted power law c * x**H.
intervals, rs_values = data[0], data[1]

f, ax = plt.subplots(figsize=(10, 7))
ax.plot(intervals, c*intervals**H, color='deepskyblue')
ax.scatter(intervals, rs_values, color='purple')
ax.set(xscale='log', yscale='log', xlabel='Time interval', ylabel='R/S ratio')
ax.grid(True)
plt.show()

# The rescaled range (R/S) is a statistical measure of the variability of a
# time series. It shows how the apparent variability of the series changes
# with the length of the time period considered. On the log-log axes used
# above, the slope of the fitted line is the Hurst exponent, H.

print("H={:.4f}, c={:.4f}".format(H,c))

In [None]:
# Hurst exponent of absolute mid-price returns at every time scale (lag).
# Each iteration uses the returns measured over `lag` steps; feeding the same
# one-step returns to every iteration would give one identical exponent for
# all lags.
H_list = []

for lag in range(1, 2500):
    abs_lagged_returns = mid_prices['price'].pct_change(lag).dropna().abs()
    if len(abs_lagged_returns) < 100:
        # NOTE(review): compute_Hc appears to reject series shorter than 100
        # points - confirm against the hurst documentation. NaN keeps
        # H_list aligned with the lags.
        H_list.append(np.nan)
        continue
    # Only the exponent is kept, and it goes into its own name so the H, c
    # and data of the single-lag fit are not overwritten.
    H_lag = compute_Hc(np.cumsum(abs_lagged_returns), kind='random_walk', simplified=True)[0]
    H_list.append(H_lag)

In [None]:
# Hurst exponent against the lag it was computed for.
f, ax = plt.subplots(figsize=(10, 7))
# H_list has one entry per lag and the lags start at 1, so the x values run
# from 1 to len(H_list) rather than from 0 over a hard-coded length.
ax.plot(np.arange(1, len(H_list) + 1), H_list)
ax.set_xlabel('Time interval')
ax.set_ylabel('Hurst exponent')
ax.grid(True)
ax.set_title('Volatility clustering')
plt.show()

## Autocorrelation of returns

In [None]:
def autocorrelation(x):
    """Return the raw autocorrelation of ``x`` for lags 0, 1, ..., len(x) - 1.

    The values are the plain sums of products produced by ``np.correlate``;
    they are neither mean-centred nor normalised. The first element is the
    lag-0 term.
    """
    full = np.correlate(x, x, mode='full')
    # The full correlation is symmetric around its middle element (lag 0);
    # keep that element and everything after it.
    zero_lag = len(full) // 2
    return full[zero_lag:]

In [None]:
# One-step returns of mid prices.
# Missing values are removed with dropna() rather than a fixed positional
# slice: np.correlate propagates any NaN into every output value, and a fixed
# offset either discards valid observations or leaves NaNs behind, depending
# on the data.
returns_first_lag = mid_prices['returns']
returns_first_lag_ac = autocorrelation(returns_first_lag.dropna())

# One-step returns of trade prices, treated the same way.
tp_first_lag = trade_prices['returns']
tp_first_lag_ac = autocorrelation(tp_first_lag.dropna())

In [None]:
# Descriptive statistics of the mid-price return autocorrelation values.
summary = pd.DataFrame({'Mid price': returns_first_lag_ac})
summary.describe()

In [None]:
# Descriptive statistics of the trade-price return autocorrelation values.
summary = pd.DataFrame({'Trade price': tp_first_lag_ac})
summary.describe()

## Long memory in order flow

In [None]:
# One order-side CSV per run; the run number in the file name starts at 1.
order_side_files = [
    f"/Users/admin/Workspace/apxr/output/apxr_order_sides{i+1}.csv"
    for i in range(number_of_runs)
]

In [None]:
# Per-run statistics of the order-side series: one autocorrelation summary
# and one Hurst exponent for each run.
order_side_stats = []
order_side_h = []

for filename in order_side_files:
    oss = pd.read_csv(filename, header=None)
    # Plain assignment; a chained `oss.columns = columns = [...]` would also
    # create an unrelated module-level variable named `columns`.
    oss.columns = ["side"]
    # Both statistics use the same NaN-free series.
    side = oss['side'].dropna()
    # Log of the mean raw autocorrelation taken over all non-negative lags.
    # NOTE(review): the log is undefined when that mean is not positive.
    avg = np.log(np.mean(autocorrelation(side)))
    order_side_stats.append(avg)
    # Hurst exponent of the order side itself. Taking the absolute value
    # first would discard the sign, which is the quantity whose persistence
    # is being measured.
    # NOTE(review): assumes the column encodes buy/sell as distinct numeric
    # values - confirm against the file format.
    H, c, data = compute_Hc(side, kind='change', simplified=True)
    order_side_h.append(H)

In [None]:
# Distribution of the per-run order-side autocorrelation statistic.
os_stats = pd.DataFrame({'Order side statistics': order_side_stats})
os_stats.describe()

In [None]:
# Distribution of the per-run order-side Hurst exponents.
os_h = pd.DataFrame({'H order sides': order_side_h})
os_h.describe()

## Concave price impact

In [None]:
# One price-impact CSV per run; the run number in the file name starts at 1.
price_impact_files = [
    f"/Users/admin/Workspace/apxr/output/apxr_price_impacts{i+1}.csv"
    for i in range(number_of_runs)
]

In [None]:
# Load the (volume, impact) observations of every run, with volume on a
# natural-log scale.
price_impact_data = []

for filename in price_impact_files:
    impact = pd.read_csv(filename, header=None)
    # Plain assignment; a chained `impact.columns = columns = [...]` would
    # also create an unrelated module-level variable named `columns`.
    impact.columns = ['volume', 'impact']
    # NOTE(review): np.log gives -inf for a zero volume and NaN for a
    # negative one - confirm that volumes are strictly positive.
    impact['volume'] = np.log(impact['volume'])
    price_impact_data.append(impact)

# Stack the runs on top of each other. Every frame already has exactly these
# two columns, so selecting them is enough; no re-wrapping is needed.
price_impact_data_all = pd.concat(price_impact_data)
price_impact = price_impact_data_all[['volume', 'impact']].copy()

price_impact.head(10)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares of impact on the (log-scaled) volume column.
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
log_volume = price_impact["volume"].to_numpy().reshape(-1, 1)
clf = LinearRegression().fit(log_volume, price_impact["impact"])
clf.coef_

In [None]:
# The regression was fitted on log(volume), so the fitted line has to be
# evaluated at log(volume) as well. Plotted against raw volume, this gives
# the concave curve; evaluating it at raw volume would draw a straight line.
volume = np.arange(1, 200000)
predicted_impact = clf.coef_[0] * np.log(volume) + clf.intercept_

f, ax = plt.subplots(figsize=(10, 7))
ax.plot(volume, predicted_impact)
ax.set_xlabel('Volume')
ax.set_ylabel('Price impact')
ax.grid(True)
ax.set_title('Concave price impact')
plt.show()

## Extreme price events

In [None]:
def is_extreme(position, df, n_ticks=10, threshold=0.008):
    """Return True when an extreme price event starts at ``position``.

    An event is a run of ``n_ticks`` consecutive price moves in the same
    direction (all strictly up or all strictly down), starting at
    ``position``, whose net price change exceeds ``threshold`` times the
    first price in ``df``. The defaults correspond to ten moves and 0.8%.

    Parameters
    ----------
    position : int
        Index of the price at which the candidate event starts.
    df : pandas.DataFrame
        Frame with a ``'price'`` column.
    n_ticks : int, optional
        Number of consecutive same-direction moves required.
    threshold : float, optional
        Minimum net change as a fraction of the first price in ``df``.

    Returns
    -------
    bool
        True if the window qualifies. False otherwise, including when the
        window ``position .. position + n_ticks`` does not fit in the data.
    """
    prices = df['price'].values
    last = position + n_ticks
    # A window that does not fit inside the series cannot be an event.
    if position < 0 or last >= len(prices):
        return False

    window = prices[position:last + 1]
    # Tick-to-tick changes: every move has to point the same way, which is
    # stricter than merely staying above (or below) the starting price.
    steps = np.diff(window)
    one_directional = bool((steps > 0).all() or (steps < 0).all())
    if not one_directional:
        return False

    # NOTE(review): the reference price is the first price of the whole
    # series, not the price at `position` - confirm which one is intended.
    initial_price = prices[0]
    net_change = abs(window[-1] - window[0])
    return bool(net_change > initial_price * threshold)

In [None]:
# Positions in the mid-price series at which is_extreme() reports an event.
# The last 10 positions are skipped because the check looks 10 prices ahead.
extreme_events = [
    position
    for position in range(len(mid_prices) - 10)
    if is_extreme(position, mid_prices)
]

In [None]:
# Turn the flagged positions into event durations and plot how often each
# duration occurs. x and y always have matching lengths, including when no
# event was found.
# NOTE(review): an event is taken to be a run of consecutive flagged
# positions and its duration the length of that run - confirm that this is
# the intended definition of "Event Duration".
positions = np.asarray(extreme_events, dtype=int)
if positions.size:
    # A new run starts wherever successive positions are not adjacent.
    run_starts = np.flatnonzero(np.diff(positions) != 1) + 1
    durations = np.array([len(run) for run in np.split(positions, run_starts)])
    duration_values, duration_counts = np.unique(durations, return_counts=True)
    relative_counts = duration_counts / duration_counts.sum()
else:
    duration_values = np.array([], dtype=int)
    relative_counts = np.array([], dtype=float)

f, ax = plt.subplots(figsize=(10, 7))
ax.plot(duration_values, relative_counts, marker='o')
ax.set_xlabel('Event Duration')
ax.set_ylabel('Relative number of events')
ax.set_xlim((0, 50))
ax.grid(True)
ax.set_title('Extreme price events')
plt.show()