In [1]:
import pandas as pd
import keras
from IPython.display import display, HTML
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import datetime

np.random.seed(42)

Using TensorFlow backend.


In [284]:
import matplotlib.pyplot as plt

In [2]:
from math import pi
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file

output_notebook()

In [3]:
a = pd.Series([1,2,3,4,5,6,7,8,9])

print(a.rolling(3).mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
dtype: float64


In [4]:
HOURS_IN_DAY = 24
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60
AGGREGATION_PERIOD = 30 #Model uses 30 minutes candles

DAY_WINDOW = int(HOURS_IN_DAY * MINUTES_IN_HOUR / AGGREGATION_PERIOD)

In [5]:
# Force CPU usage
import tensorflow as tf
from keras import backend as K

num_cores = 8

config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,\
        inter_op_parallelism_threads=num_cores, allow_soft_placement=True,\
        device_count = {'CPU' : 1, 'GPU' : 0})
session = tf.Session(config=config)
K.set_session(session)

In [6]:
# For plot

def prepare_standardplot(title, xlabel):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle(title)
    ax1.set_ylabel('categorical cross entropy')
    ax1.set_xlabel(xlabel)
    ax1.set_yscale('log')
    ax2.set_ylabel('accuracy [% correct]')
    ax2.set_xlabel(xlabel)
    return fig, ax1, ax2

def finalize_standardplot(fig, ax1, ax2):
    ax1handles, ax1labels = ax1.get_legend_handles_labels()
    if len(ax1labels) > 0:
        ax1.legend(ax1handles, ax1labels)
    ax2handles, ax2labels = ax2.get_legend_handles_labels()
    if len(ax2labels) > 0:
        ax2.legend(ax2handles, ax2labels)
    fig.tight_layout()
    plt.subplots_adjust(top=0.9)

def plot_history(history, title):
    fig, ax1, ax2 = prepare_standardplot(title, 'epoch')
    ax1.plot(history.history['loss'], label = "training")
    ax2.plot(history.history['binary_accuracy'], label = "training")
    finalize_standardplot(fig, ax1, ax2)
    return fig

In [7]:
def aggregate_market_values(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
    
    data = data.groupby(data.Timestamp // aggregation_factor).agg({
        'Open' : 'first',
        'High' : np.max,
        'Low' : np.min,
        'Close' : 'last',
        'Volume_(BTC)' : np.sum ,
        'Volume_(Currency)' : np.sum,
        'Weighted_Price' : np.mean,
    }).reset_index()
    
    data.Timestamp *= aggregation_factor
    
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data

def first_in_window(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
            
    data = data.groupby(data.Timestamp // aggregation_factor).first().reset_index(drop=True)
            
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data
    

In [518]:
df_raw_part1 = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2018-01-08.csv')
df_raw_part2 = pd.read_csv('Data/bitstampUSD_30-min_data_january.csv', date_parser=True)

# Aggregate first part of data into chunks of 30 mins, second part already aggregated
df_p1 = aggregate_market_values(df_raw_part1, 30, unix_time=True)
df_p1.Timestamp = pd.to_datetime(df_p1.Timestamp, unit='s')

df_p2 = df_raw_part2
df_p2.Timestamp = pd.to_datetime(df_p2.Timestamp)

df_raw = pd.concat([df_p1, df_p2]).reset_index(drop=True)

display(df_raw[105370:105372])
display(df_raw.tail())
print(df_raw.dtypes)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
105370,16166.78,16176.96,16102.05,16173.98,2018-01-08 00:00:00,25.633791,413278.4,16122.40664
105371,15956.66,16300.0,15954.16,16293.99,2018-01-08 00:30:00,221.65,3577715.0,16140.98


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
108615,8453.2,8505.0,8435.94,8503.41,2018-03-16 14:30:00,250.99,2126237.5,8471.47
108616,8511.47,8527.24,8450.1,8452.99,2018-03-16 15:00:00,321.19,2727121.08,8490.73
108617,8554.04,8613.06,8482.47,8510.0,2018-03-16 15:30:00,563.11,4813543.76,8548.16
108618,8541.17,8572.28,8472.24,8554.42,2018-03-16 16:00:00,305.72,2605758.19,8523.24
108619,8541.19,8553.83,8522.02,8547.41,2018-03-16 16:30:00,30.78,262893.41,8539.96


Close                       float64
High                        float64
Low                         float64
Open                        float64
Timestamp            datetime64[ns]
Volume_(BTC)                float64
Volume_(Currency)           float64
Weighted_Price              float64
dtype: object


In [9]:
df_plot = df_raw.copy()

aggregation_factor = 24 * 60 #24h candles

df_plot = aggregate_market_values(df_plot, 12 * 60)

inc = df_plot.Close >= df_plot.Open
dec = df_plot.Open > df_plot.Close
barWidth = 0.66 * aggregation_factor * 60 * 1000 # 30 minutes in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.segment(df_plot.Timestamp, df_plot.High, df_plot.Timestamp, df_plot.Low, color="black")
p.vbar(df_plot.Timestamp[inc], barWidth, df_plot.Open[inc], df_plot.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_plot.Timestamp[dec], barWidth, df_plot.Open[dec], df_plot.Close[dec], fill_color="#FF2828", line_color="black")

show(p)

In [200]:
def add_MA_n_days_age(num_days):
    num_days_str = str(num_days)
    
    #df[['Open_W_MA_'+num_days_str,'High_W_MA_'+num_days_str,'Low_W_MA_'+num_days_str,'Close_W_MA_'+num_days_str]] = df[['Open_W','High_W','Low_W','Close_W']].rolling(window=day_window * num_days).mean()
    df[['MA_'+num_days_str,'High_MA_'+num_days_str,'Low_MA_'+num_days_str,'Close_MA_'+num_days_str]] = df[['Open','High','Low','Close']].rolling(window=day_window * num_days).mean()
    #df[['Open_EMA_'+num_days_str,'High_EMA_'+num_days_str,'Low_EMA_'+num_days_str,'Close_EMA_'+num_days_str]] = df[['Open','High','Low','Close']].ewm(span=day_window * num_days).mean()


def add_prices_n_periods_ago(data, periods, step=DAY_WINDOW):
    return data['Smoothed_Price'].copy().shift(step * periods)

def dummy_increased_since(data, n, step=DAY_WINDOW):
    """
    1 if price increased since n * step, 0 otherwise
    """
    return 1 * (data['Smoothed_Price'] > data['Smoothed_Price'].shift(int(step * n)))

def dummy_piecewise_increased_since(data, n, interval, step=DAY_WINDOW):
    """
    1 if price increased between n * step - interval and n * step, 0 otherwise
    """
    return 1 * (data['Smoothed_Price'].shift(int(step * n)) > data['Smoothed_Price'].shift(int(step * (n + interval))))

def mean_square_loss(predicted_labels, true_labels):
    assert len(predicted_labels) == len(true_labels)
    return np.mean((predicted_labels - true_labels)**2)

def accuracy(x,y):
    return np.mean(x==y)

## Oscillators

In [690]:
def RSI(df, gain_column='Close', period=14):
    data = pd.DataFrame(df[gain_column].copy())
    
    #Compute differences
    data['delta'] = data[gain_column] - data.shift(1)[gain_column]
    
    #Compute gains and losses
    data['gain'] = data[data['delta'] >= 0]['delta']
    data['loss'] = data[data['delta'] <= 0]['delta']
    data['loss'] = -data['loss']
    data['gain'].fillna(0.0, inplace = True)
    data['loss'].fillna(0.0, inplace = True)
    
    #Compute average of gains and losses
    data['avg_gain'] = data['gain'].rolling(period).mean()
    data['avg_loss'] = data['loss'].rolling(period).mean()
    
    #Compute relative strength
    data['RS'] = data['avg_gain'] / data['avg_loss']
    
    return 100 - 100.0/(1.0 + data.RS)
    
def EMA(data, s):    
    return data.ewm(span=s, adjust=False).mean()

def MACD(df, price_column='Close', period1=12, period2=26, smooth_factor=9):
    
    data = df.copy()
    
    data['EMA_'+str(period1)] = EMA(data[price_column], period1)
    data['EMA_'+str(period2)] = EMA(data[price_column], period2)
    
    data['MACD_Line'] = data['EMA_'+str(period1)] - data['EMA_'+str(period2)]
    
    data['Signal_Line'] = EMA(data['MACD_Line'], smooth_factor)
    
    data['MACD_Histogram'] = data['MACD_Line'] - data['Signal_Line']
    
    return data
    
def BB(dataframe, price_column='Close', period = 20):
    df = dataframe.copy()
    
    #Compute middle Band
    df['MA'] = df[price_column].rolling(period).mean()
    
    # Compute std
    df['std'] = df[price_column].rolling(period).std()

    #Compute Upper Band
    df['Upper'] = df['MA'] + 2*df['std']
    
    #Compute Lower Band
    df['Lower'] = df['MA'] - 2*df['std']
    return df

def ATR(df, period=14):
    data = df.copy()
    
    data['v1'] = data.High - data.Low
    data['v2'] = (data.High - data.shift(1).Close).apply(abs)
    data['v3'] = (data.Low - data.shift(1).Close).apply(abs)
    data['True_Range'] = data[['v1','v2','v3']].max(axis=1)
    
    
    return EMA(data.True_Range, period)


def DM(df, period_adx=14):
    data = df.copy()
    data['UpMove'] = data.High - data.shift(1).High
    data['DownMove'] = data.shift(1).Low - data.Low
    
    #Compute directional movement
    data['pDM'] = 0
    data['mDM'] = 0
    data.loc[(data['UpMove'] > data['DownMove']) & (data['UpMove'] > 0), 'pDM'] = data.loc[(data['UpMove'] > data['DownMove']) & (data['UpMove'] > 0), 'UpMove']
    data.loc[(data['UpMove'] < data['DownMove']) & (data['DownMove'] > 0), 'mDM'] = data.loc[(data['UpMove'] < data['DownMove']) & (data['DownMove'] > 0), 'DownMove']

    data['pDI'] = 100.0 * EMA(data['pDM'], period_adx) / data['ATR']
    data['mDI'] = 100.0 * EMA(data['mDM'], period_adx) / data['ATR']
    
    data['ADX'] = EMA(100.0 * ((data['pDI'] - data['mDI']).apply(abs) / (data['pDI'] + data['mDI'])), period_adx)
    
    return data

In [691]:
df_processed = df_processed.reset_index(drop=True)
df_processed['ATR'] = ATR(df_processed, 14.0)
display(df_processed[-100:-99])

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,ADX_tmp
107103,8673.48,8750.0,8673.48,8743.63,2018-03-14 15:00:00,227.85,1983204.35,8703.93,8952.69,0,...,21.228281,103.715694,-38.0,27.52,0.0,27.52,12.785629,29.602741,54.912213,54.912213


In [692]:
df_processed = DM(df_processed)
display(df_processed[-101:-99])

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,ADX_tmp
107102,8743.63,8788.0,8701.0,8783.8,2018-03-14 14:30:00,164.57,1438788.22,8742.67,8975.58625,0,...,23.820577,107.899646,4.2,-33.99,4.2,0.0,14.180595,28.908651,57.256571,57.256571
107103,8673.48,8750.0,8673.48,8743.63,2018-03-14 15:00:00,227.85,1983204.35,8703.93,8952.69,0,...,21.228281,103.715694,-38.0,27.52,0.0,27.52,12.785629,29.602741,54.912213,54.912213


In [693]:
p = make_plot_figure("test", 'test', 'test')
df_tmp = df_processed[-200:]
p.line(df_tmp.Timestamp[:], df_tmp['ADX'][:], line_color='red', line_width=2, line_alpha=1, legend='Signal line')
p.line(df_tmp.Timestamp[:], df_tmp['pDI'][:], line_color='blue', line_width=2, line_alpha=1, legend='Signal line')
p.line(df_tmp.Timestamp[:], df_tmp['mDI'][:], line_color='orange', line_width=2, line_alpha=1, legend='Signal line')
show(p)

In [333]:
def plot_df(df, column):
    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=950, title = "MSFT Candlestick")
    p.xaxis.major_label_orientation = pi/4
    p.grid.grid_line_alpha=0.3

    p.line(df.Timestamp[:], df[column][:], line_color='#ff0000', line_width=2, line_alpha=1, legend=column)
   

    show(p)

In [402]:
def make_plot_figure(plot_title, xlabel, ylabel):
    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=950, title = plot_title)
    p.xaxis.major_label_orientation = pi/4
    p.xaxis.axis_label = xlabel
    p.xaxis.axis_label_text_font_style='normal'
    p.yaxis.axis_label = ylabel
    p.yaxis.axis_label_text_font_style='normal'
    p.grid.grid_line_alpha=0.3
    
    return p

In [404]:
df_test = BB(df_processed)[-700:]

p = make_plot_figure("Feature: Bollinger Bands", 'Time', 'BTC price in USD')

p.line(df_test.Timestamp[:], df_test['MA'][:], line_color='#ffa02b', line_width=2, line_alpha=1, legend='SMA')
p.line(df_test.Timestamp[:], df_test['Upper'][:], line_color='#2b8aff', line_width=2, line_alpha=0.6, legend='Upper Band')
p.line(df_test.Timestamp[:], df_test['Lower'][:], line_color='#2b8aff', line_width=2, line_alpha=0.6, legend='Lower Band')

inc = df_test.Close >= df_test.Open
dec = df_test.Open > df_test.Close
barWidth = 0.66 * 30 * 60 * 1000 # 30 minutes in ms

p.segment(df_test.Timestamp, df_test.High, df_test.Timestamp, df_test.Low, color="black")
p.vbar(df_test.Timestamp[inc], barWidth, df_test.Open[inc], df_test.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_test.Timestamp[dec], barWidth, df_test.Open[dec], df_test.Close[dec], fill_color="#FF2828", line_color="black")


show(p)

In [631]:
df_test = df_processed.reset_index(drop=True)
df_test = MACD(df_test)[-500:]

In [632]:
p = make_plot_figure("Feature: MACD", "Time", "")
p2 = make_plot_figure("BTCUSD - Price and EMAs", "Time", "BTC Price in USD")


p2.line(df_test.Timestamp[:], df_test['Close'], line_color="black", line_alpha=0.5, legend='Closing Price')
p2.line(df_test.Timestamp[:], df_test['EMA_12'], line_color="red", line_alpha=0.8, line_width=2, legend='EMA 12')
p2.line(df_test.Timestamp[:], df_test['EMA_26'], line_color="green", line_alpha=0.8, line_width=2, legend='EMA 26')

p.line(df_test.Timestamp[:], df_test['Signal_Line'][:], line_color='#ffa02b', line_width=2, line_alpha=1, legend='Signal line')
p.line(df_test.Timestamp[:], df_test['MACD_Line'][:], line_color='#2b8aff', line_width=2, line_alpha=1, legend='MACD Line')
p.quad(top=df_test['MACD_Histogram'][:], bottom=0, left=df_test.Timestamp[:-1], right=df_test.Timestamp[1:], color='#ff2b8a', legend='MACD Histogram')

show(p2)
show(p)



# ==========================================================
# 
# ==========================================================

In [694]:
SMOOTHING_PERIOD = int(0.5 * 2 * 24) # 6h
SHIFTING_PERIOD = 0 #SMOOTHING_PERIOD

df_processed = df_raw.copy()
df_processed['Smoothed_Price'] = (0.5 * (df_processed.Close + df_processed.Open)).shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean()
#df_processed['Inc_1_day'] = dummy_increased_since(df_processed, 1)
#df_processed['Inc_12_hours'] = dummy_increased_since(df_processed, 0.5)
#df_processed['Inc_6_hour'] = dummy_increased_since(df_processed, 0.25)
#df_processed['Inc_1_hour_30min'] = dummy_increased_since(df_processed, 3, 1)
df_processed['Inc_1_hour'] = dummy_increased_since(df_processed, 2, 1)
df_processed['Inc_30_min'] = dummy_increased_since(df_processed, 1, 1)
#df_processed['PInc_30-60'] = dummy_piecewise_increased_since(df_processed, 1, 1, 1)
#df_processed['PInc_60-90'] = dummy_piecewise_increased_since(df_processed, 2, 1, 1)
df_processed = df_processed.reset_index(drop=True)
df_processed = MACD(df_processed)
df_processed = BB(df_processed)
df_processed['RSI'] = RSI(df_processed, gain_column='Close')
df_processed['ATR'] = ATR(df_processed)
df_processed = DM(df_processed)
df_processed = df_processed.dropna(how='any')
print(DAY_WINDOW*0.04167)
#display(df_processed[10090:10120:])

2.00016


In [695]:
display(df_processed)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,Lower,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX
23,4.58,4.58,4.58,4.58,2011-12-31 19:00:00,270.000,1.236600e+03,4.580000,4.433333,0,...,4.269356,100.000000,0.012487,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
24,4.58,4.58,4.58,4.58,2011-12-31 19:30:00,270.000,1.236600e+03,4.580000,4.441250,0,...,4.271392,100.000000,0.010822,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
25,4.58,4.58,4.58,4.58,2011-12-31 20:00:00,270.000,1.236600e+03,4.580000,4.449167,1,...,4.275711,100.000000,0.009379,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
26,4.58,4.58,4.58,4.58,2011-12-31 20:30:00,270.000,1.236600e+03,4.580000,4.457083,1,...,4.282127,100.000000,0.008129,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
27,4.58,4.58,4.58,4.58,2011-12-31 21:00:00,270.000,1.236600e+03,4.580000,4.465000,1,...,4.290541,100.000000,0.007045,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
28,4.58,4.58,4.58,4.58,2011-12-31 21:30:00,270.000,1.236600e+03,4.580000,4.472917,1,...,4.300920,100.000000,0.006105,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
29,4.58,4.58,4.58,4.58,2011-12-31 22:00:00,270.000,1.236600e+03,4.580000,4.480833,1,...,4.313290,100.000000,0.005291,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
30,4.58,4.58,4.58,4.58,2011-12-31 22:30:00,270.000,1.236600e+03,4.580000,4.488750,1,...,4.327743,100.000000,0.004586,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
31,4.58,4.58,4.58,4.58,2011-12-31 23:00:00,270.000,1.236600e+03,4.580000,4.496667,1,...,4.344455,100.000000,0.003974,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000
32,4.58,4.58,4.58,4.58,2011-12-31 23:30:00,270.000,1.236600e+03,4.580000,4.504583,1,...,4.363715,100.000000,0.003444,0.00,0.00,0.00,0.00,100.000000,0.000000,100.000000


In [696]:
last_n_points = 5000

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_processed.Timestamp[-last_n_points:], df_processed.Open[-last_n_points:], line_alpha=0.6, line_color="black", legend='bihourly price')
p.line(df_processed.Timestamp[-last_n_points::48], df_processed.Open[-last_n_points::48], line_width=2, line_alpha=1, line_color="black", legend='daily price')
p.line(df_processed.Timestamp[-last_n_points:], df_processed.Smoothed_Price[-last_n_points:], line_width=2, line_alpha=1, line_color="red", legend=("Rolling avg. "+str(SMOOTHING_PERIOD)))
#p.scatter(df_processed.Timestamp[-last_n_points:], 0.5*(df_processed.Open[-last_n_points:] + df_processed.Close[-last_n_points:]), line_alpha=1, color="#00FF00", legend="daily mean")


display(df_processed.head(1))
show(p)

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

#p.scatter(df_processed.Timestamp[-last_n_points::48], df_processed.Inc_1_day[-last_n_points::48], line_alpha=1, color="#0000FF", legend="increase since 24h")
#p.scatter(df_processed.Timestamp[-last_n_points::48], 0.1+df_processed.Inc_12_hours[-last_n_points::48], line_alpha=1, color="#FF0000", legend="increase since 12h")

display(df_processed.head(1))
show(p)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,Lower,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX
23,4.58,4.58,4.58,4.58,2011-12-31 19:00:00,270.0,1236.6,4.58,4.433333,0,...,4.269356,100.0,0.012487,0.0,0.0,0.0,0.0,100.0,0.0,100.0


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,Lower,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX
23,4.58,4.58,4.58,4.58,2011-12-31 19:00:00,270.0,1236.6,4.58,4.433333,0,...,4.269356,100.0,0.012487,0.0,0.0,0.0,0.0,100.0,0.0,100.0


# Objective 

We aim to predict price changes across intervals of 24 hours. More specifically, at the end of each day, the model should predict the price of Bitcoin in the following 24 hours.

In [697]:
display(df_processed.head())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,Lower,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX
23,4.58,4.58,4.58,4.58,2011-12-31 19:00:00,270.0,1236.6,4.58,4.433333,0,...,4.269356,100.0,0.012487,0.0,0.0,0.0,0.0,100.0,0.0,100.0
24,4.58,4.58,4.58,4.58,2011-12-31 19:30:00,270.0,1236.6,4.58,4.44125,0,...,4.271392,100.0,0.010822,0.0,0.0,0.0,0.0,100.0,0.0,100.0
25,4.58,4.58,4.58,4.58,2011-12-31 20:00:00,270.0,1236.6,4.58,4.449167,1,...,4.275711,100.0,0.009379,0.0,0.0,0.0,0.0,100.0,0.0,100.0
26,4.58,4.58,4.58,4.58,2011-12-31 20:30:00,270.0,1236.6,4.58,4.457083,1,...,4.282127,100.0,0.008129,0.0,0.0,0.0,0.0,100.0,0.0,100.0
27,4.58,4.58,4.58,4.58,2011-12-31 21:00:00,270.0,1236.6,4.58,4.465,1,...,4.290541,100.0,0.007045,0.0,0.0,0.0,0.0,100.0,0.0,100.0


## Simple model

To assess the performance of our model, we devise a simple naïve model as a benchmark. Our simple model looks at the price change in the last 24H and assume that this change will repeat in the next 24H. More specifically, let $P_p, P_c, P_f$ be respectively the previous, current and future price of Bitcoin (in intervals of 24 hours). We have:

$$ P_f = P_c + (P_c - P_p) $$

In [708]:
def naive_predictor(previous, current):
    return current.Smoothed_Price + ((current.Smoothed_Price - previous.Smoothed_Price) / previous.Smoothed_Price) * current.Smoothed_Price

def naive_predict_absolute_price(data):  
    P_c = data.Smoothed_Price
    P_p = data.Smoothed_Price.shift(1)
    
    return pd.concat([data.Timestamp, 2 * P_c +  - P_p], axis=1)

def naive_predict_price_diff(data):  
    ret = data.copy()
    P_c = ret.Smoothed_Price
    P_p = ret.Smoothed_Price.shift(1)
    
    ret['Simple_Predicted_Price'] = P_c - P_p
    return ret

def create_labels(data, step_size=1):
    return data.Smoothed_Price.shift(-step_size) - data.Smoothed_Price

In [699]:
def split_dataset(data, fraction):
    l = len(data)
    
    train = data[0 : int(l * fraction)]
    test = data[ int(l*fraction) :]
    
    return train, test

NORMALIZE = True
ADD_PRICE_N_AGO = False
ADD_MA_N_AGO = False
LOG = False

df = df_processed.copy()
df['Label'] = create_labels(df, step_size = DAY_WINDOW)
df = df.dropna()

if LOG:
    df.Smoothed_Price = df.Smoothed_Price.apply(lambda x: np.log(x))
if ADD_PRICE_N_AGO:
    for i in range(1,31):
        add_prices_n_periods_ago(i, step=4)
if ADD_MA_N_AGO:
    add_MA_n_days_age(5)
    add_MA_n_days_age(10)
    add_MA_n_days_age(15)

if NORMALIZE:
    timestamps = df.Timestamp
    labels = df.Label
    df = df.drop('Timestamp',1)
    df = (df-df.mean())/df.std()
    df.Label = labels
    df['Timestamp'] = timestamps
    print("Standardized input data")
    display(df.describe())


train, test = split_dataset(df, 0.9)
train_x = train.drop('Label',1)
train_y = train['Label']
train_bin = train_y > 0
test_x = test.drop('Label',1)
test_y = test['Label']
test_bin = test_y > 0


display(train.head())
display(test.head())

Standardized input data


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,RSI,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,Label
count,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,...,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0,107155.0
mean,6.662639e-15,6.577809e-15,-5.039545e-15,4.448713e-15,3.245739e-15,1.933192e-15,-8.00981e-16,-2.306545e-15,-1.921829e-14,3.754496e-14,...,2.716395e-14,-2.636316e-15,-1.9180110000000003e-17,1.062635e-16,9.210831e-14,-2.895746e-14,-1.537598e-14,1.205022e-14,8.247631e-15,3.665415
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,189.60967
min,-0.4515841,-0.450946,-0.4532425,-0.4515692,-0.5625096,-0.2997253,-0.4516021,-0.4514704,-1.088719,-1.086164,...,-3.074052,-0.3240466,-27.41318,-31.35799,-0.1552868,-0.1451161,-1.420284,-1.58785,-2.036601,-3113.298125
25%,-0.4092867,-0.4087736,-0.4098297,-0.4092618,-0.4277991,-0.2896677,-0.40929,-0.4092693,-1.088719,-1.086164,...,-0.6020853,-0.3080492,-0.01750025,-0.01835461,-0.1552868,-0.1451161,-0.6683736,-0.6691819,-0.7457231,-2.819375
50%,-0.3088493,-0.3085371,-0.3091444,-0.3088238,-0.2711934,-0.2614856,-0.3089086,-0.3086063,0.918502,0.9206625,...,-0.01940704,-0.2804579,-0.002729548,0.002293778,-0.1552868,-0.1451161,-0.2206111,-0.1962225,-0.1979552,0.121875
75%,-0.1952652,-0.1949199,-0.1950916,-0.1951581,0.05226555,-0.1610428,-0.1952169,-0.1953143,0.918502,0.9206625,...,0.6033438,-0.206762,0.008798802,0.01184751,-0.143692,-0.1356278,0.394084,0.4198853,0.5616833,5.693646
max,6.578558,6.55174,6.603871,6.588173,78.79565,37.44671,6.588216,6.492605,0.918502,0.9206625,...,2.887482,16.52033,31.3306,37.20484,43.69027,49.65404,5.760323,5.588721,4.182353,2658.07375


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,Label,Timestamp
23,-0.451426,-0.450789,-0.452128,-0.451411,-0.110512,-0.298803,-0.451444,-0.451409,-1.088719,-1.086164,...,-0.323798,-0.00273,0.002294,-0.155287,-0.145116,5.760323,-1.58785,4.182353,0.691042,2011-12-31 19:00:00
24,-0.451426,-0.450789,-0.452128,-0.451411,-0.110512,-0.298803,-0.451444,-0.451407,-1.088719,0.920663,...,-0.323832,-0.00273,0.002294,-0.155287,-0.145116,5.760323,-1.58785,4.182353,0.695208,2011-12-31 19:30:00
25,-0.451426,-0.450789,-0.452128,-0.451411,-0.110512,-0.298803,-0.451444,-0.451404,0.918502,0.920663,...,-0.323861,-0.00273,0.002294,-0.155287,-0.145116,5.760323,-1.58785,4.182353,0.699375,2011-12-31 20:00:00
26,-0.451426,-0.450789,-0.452128,-0.451411,-0.110512,-0.298803,-0.451444,-0.451401,0.918502,0.920663,...,-0.323886,-0.00273,0.002294,-0.155287,-0.145116,5.760323,-1.58785,4.182353,0.703542,2011-12-31 20:30:00
27,-0.451426,-0.450789,-0.452128,-0.451411,-0.110512,-0.298803,-0.451444,-0.451398,0.918502,0.920663,...,-0.323908,-0.00273,0.002294,-0.155287,-0.145116,5.760323,-1.58785,4.182353,0.707708,2011-12-31 21:00:00


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,Label,Timestamp
97856,0.578122,0.571527,0.578764,0.573347,-0.212102,0.148135,0.575707,0.553204,0.918502,0.920663,...,0.071568,0.177401,-0.340716,0.096774,-0.145116,1.531663,-1.337263,1.420637,260.954583,2017-08-04 11:00:00
97857,0.566509,0.570762,0.574206,0.57589,-0.177697,0.190312,0.572818,0.554095,0.918502,0.920663,...,0.107577,-0.080186,0.390607,-0.155287,0.374678,0.92463,-0.822119,1.357164,270.8875,2017-08-04 11:30:00
97858,0.56831,0.563867,0.573808,0.566473,-0.246498,0.101282,0.568301,0.554825,0.918502,0.920663,...,0.085346,-0.700195,0.036194,-0.155287,-0.099737,0.722328,-0.836022,1.278334,281.429583,2017-08-04 12:00:00
97859,0.563298,0.561555,0.571131,0.567,-0.361221,-0.044841,0.566774,0.555472,0.918502,0.920663,...,0.06854,-0.236539,0.230351,-0.155287,0.16016,0.516134,-0.542473,1.050734,291.74625,2017-08-04 12:30:00
97860,0.5701,0.563336,0.57112,0.563326,-0.241027,0.107904,0.56713,0.556337,0.918502,0.920663,...,0.067578,0.177401,0.003218,0.096774,-0.145116,0.509992,-0.679631,0.907173,301.554583,2017-08-04 13:00:00


## Simple model performance

In [709]:
predicted_labels = naive_predict_price_diff(df).loc[test_x.index]
predicted_labels['Binary'] = 1 * (predicted_labels.Simple_Predicted_Price > 0)
#display(predicted_labels[::2*24])

print(mean_square_loss(predicted_labels.Simple_Predicted_Price, test_y))
print(accuracy(predicted_labels.Binary, test_y > 0))

351125.07790508954
0.640910787607


In [710]:
last_n_points = -5000

points = predicted_labels

buy_points = points[predicted_labels.Binary == 1]
sell_points = points[predicted_labels.Binary == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::], buy_points.Smoothed_Price[::], color="#00ff00", legend="Buy periods")
p.scatter(sell_points.Timestamp[::], sell_points.Smoothed_Price[::], color="#ff0000", legend="Sell periods")
show(p)

## Neural network

In [702]:
display(df[1000:1010:])

Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,ATR,UpMove,DownMove,pDM,mDM,pDI,mDI,ADX,Label,Timestamp
1583,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.45087,0.918502,0.920663,...,-0.322906,0.006997,-0.006027,-0.141676,-0.145116,3.573223,0.148618,1.21996,0.269375,2012-02-02 07:00:00
1584,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450868,0.918502,0.920663,...,-0.323059,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,1.162453,0.253958,2012-02-02 07:30:00
1585,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450865,0.918502,0.920663,...,-0.323191,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,1.112613,0.240625,2012-02-02 08:00:00
1586,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450859,0.918502,0.920663,...,-0.323306,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,1.069419,0.22,2012-02-02 08:30:00
1587,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450853,0.918502,0.920663,...,-0.323405,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,1.031984,0.204792,2012-02-02 09:00:00
1588,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.45085,0.918502,0.920663,...,-0.323491,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,0.999541,0.201667,2012-02-02 09:30:00
1589,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450847,0.918502,0.920663,...,-0.323566,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,0.971423,0.193125,2012-02-02 10:00:00
1590,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450843,0.918502,0.920663,...,-0.323631,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,0.947054,0.175417,2012-02-02 10:30:00
1591,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450839,0.918502,0.920663,...,-0.323687,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,0.925934,0.161042,2012-02-02 11:00:00
1592,-0.450859,-0.450227,-0.451557,-0.450844,-0.45758,-0.299437,-0.450877,-0.450834,0.918502,0.920663,...,-0.323735,-0.00273,0.002294,-0.155287,-0.145116,3.573223,0.148618,0.907631,0.139583,2012-02-02 11:30:00


plusieurs y gains selon différents temps + vote

In [711]:
to_drop = ['Timestamp', 'Open', 'Close', 'High', 'Low', 'Weighted_Price', 'Volume_(Currency)',
          'pDM','mDM']
train_x_nn = train_x.copy().drop(to_drop,1)
test_x_nn = test_x.copy().drop(to_drop,1)

train_y_nn = (train_y > 0).astype(int)
test_y_nn = (test_y > 0).astype(int)


print("% of 1 in train: " + str(np.sum(train_y_nn == 1) / len(train_y)))
print("% of 1 in test: " + str(np.sum(test_y_nn == 1) / len(test_y)))
print(test_y_nn.dtypes)
display(train_x_nn.head())
display(train_y_nn.head())

% of 1 in train: 0.561868123892
% of 1 in test: 0.558324001493
int32


Unnamed: 0,Volume_(BTC),Smoothed_Price,Inc_1_hour,Inc_30_min,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,ATR,UpMove,DownMove,pDI,mDI,ADX
23,-0.110512,-0.451409,-1.088719,-1.086164,-0.451453,-0.451454,-0.014483,-0.016003,0.002001,-0.451425,-0.30142,-0.448301,-0.454552,2.887482,-0.323798,-0.00273,0.002294,5.760323,-1.58785,4.182353
24,-0.110512,-0.451407,-1.088719,0.920663,-0.451449,-0.451451,-0.014425,-0.015861,0.001747,-0.451422,-0.301352,-0.448295,-0.454551,2.887482,-0.323832,-0.00273,0.002294,5.760323,-1.58785,4.182353
25,-0.110512,-0.451404,0.918502,0.920663,-0.451445,-0.451448,-0.014398,-0.015741,0.001463,-0.451419,-0.301306,-0.44829,-0.45455,2.887482,-0.323861,-0.00273,0.002294,5.760323,-1.58785,4.182353
26,-0.110512,-0.451401,0.918502,0.920663,-0.451443,-0.451445,-0.014394,-0.015645,0.001171,-0.451415,-0.301278,-0.448286,-0.454547,2.887482,-0.323886,-0.00273,0.002294,5.760323,-1.58785,4.182353
27,-0.110512,-0.451398,0.918502,0.920663,-0.45144,-0.451443,-0.014409,-0.015571,0.000888,-0.451412,-0.301268,-0.448282,-0.454544,2.887482,-0.323908,-0.00273,0.002294,5.760323,-1.58785,4.182353


23    1
24    1
25    1
26    1
27    1
Name: Label, dtype: int32

In [721]:
model = Sequential()
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

opt = keras.optimizers.SGD(lr=0.01, momentum=0.01, decay=0.0, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
#model.add(Dense(1, kernel_initializer='normal'))
#model.compile(loss='mse', optimizer='adam', metrics=['mae'])


history = model.fit(train_x_nn, train_y_nn, epochs = 5, verbose=1, validation_data = (test_x_nn, test_y_nn))
scores = model.evaluate(test_x_nn, test_y_nn, verbose=0)

print(scores)

Train on 96439 samples, validate on 10716 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
[0.66764701679821736, 0.65341545359418218]


In [714]:
x = (model.predict_classes(test_x_nn)).squeeze()
print(x)
print(accuracy(x, test_bin))
print(np.sum(x==1))
print(np.sum(x==0))
print(np.sum(x==1)/np.sum(x==0))

[1 1 1 ..., 0 1 0]
0.665733482643
5789
4927
1.17495433327


In [707]:
last_n_points = -5000

points = test_x

buy_points = points[x == 1]
sell_points = points[x == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.4)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='red', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::], buy_points.Smoothed_Price[::], color="#00ff00")
p.scatter(sell_points.Timestamp[::], sell_points.Smoothed_Price[::], color="#ff0000")
show(p)

In [726]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier(n_estimators=25,max_depth=None)
rf.fit(train_x_nn, train_y_nn)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [727]:
predicted = rf.predict(test_x_nn)
accuracy = accuracy_score(test_y_nn, predicted)
print(accuracy)

0.450074654722


## Investing using the model
We use the model to apply the following simple investment strategy. At the end of each day, we take either a long or short position for $100. No matter what happens, the position is liquidated after 24H. Of course, the choice of the position is dependent on the price increase or decrease of the model

In [108]:
daily_amount = 100 #dollars

def compute_investment_return(prices, decisions):
    prices_in_24h = prices.shift(-DAY_WINDOW)
    return np.sum( ((prices_in_24h - prices)/prices) * daily_amount * decisions)

### The anarchist
The anarchist decide whether to invest or not based on a (bit)coin flip:

In [None]:
sum = 0
prices = df_raw.iloc[test2.index].Close
l = len(prices)
for i in range(10000):
    choices = 2 * np.random.choice(2, l) - 1
    sum += compute_investment_return(prices, choices)

print(sum / 10000.0)

### Simple model

In [None]:
#true_labels = pd.DataFrame(create_labels(df_24h).Label)
predicted_labels = naive_predict( df_raw.iloc[test2.index].copy())
predicted_labels.columns = ['Timestamp', 'Price_Prediction']

prices = df_raw.iloc[test2.index].Close

join = pd.concat([predicted_labels, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.Close) - 1 # Long: 1 Short: -1
display(join)

decisions_simple = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

### Neural net

In [None]:
prediction = pd.DataFrame(np.exp(model.predict(test_x2)).squeeze())
prediction.columns = ['Price_Prediction']
prediction['previous'] = prediction.shift(DAY_WINDOW)

prices = df_raw.iloc[test2.index][['Timestamp', 'Close']].reset_index(drop=True)

join = pd.concat([prediction, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.previous) - 1 # Long: 1 Short: -1
display(join)

decisions_nn = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

In [None]:
print(np.sum(decisions_simple.values == decisions_nn[1:].values))

In [None]:
from keras.models import load_model
 
    

with tf.device('/cpu:0'):

    model = load_model('my_model.h5')
    scores = model.evaluate(test_x, test_y, verbose=0)
    print(scores)
    
    del model
