In [1]:
import pandas as pd
import keras
from IPython.display import display, HTML
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import datetime

np.random.seed(42)

Using TensorFlow backend.


In [284]:
import matplotlib.pyplot as plt

In [2]:
from math import pi
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file

output_notebook()

In [3]:
a = pd.Series([1,2,3,4,5,6,7,8,9])

print(a.rolling(3).mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
dtype: float64


In [4]:
HOURS_IN_DAY = 24
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60
AGGREGATION_PERIOD = 30 #Model uses 30 minutes candles

DAY_WINDOW = int(HOURS_IN_DAY * MINUTES_IN_HOUR / AGGREGATION_PERIOD)

In [5]:
# Force CPU usage
import tensorflow as tf
from keras import backend as K

num_cores = 8

config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,\
        inter_op_parallelism_threads=num_cores, allow_soft_placement=True,\
        device_count = {'CPU' : 1, 'GPU' : 0})
session = tf.Session(config=config)
K.set_session(session)

In [6]:
# For plot

def prepare_standardplot(title, xlabel):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle(title)
    ax1.set_ylabel('categorical cross entropy')
    ax1.set_xlabel(xlabel)
    ax1.set_yscale('log')
    ax2.set_ylabel('accuracy [% correct]')
    ax2.set_xlabel(xlabel)
    return fig, ax1, ax2

def finalize_standardplot(fig, ax1, ax2):
    ax1handles, ax1labels = ax1.get_legend_handles_labels()
    if len(ax1labels) > 0:
        ax1.legend(ax1handles, ax1labels)
    ax2handles, ax2labels = ax2.get_legend_handles_labels()
    if len(ax2labels) > 0:
        ax2.legend(ax2handles, ax2labels)
    fig.tight_layout()
    plt.subplots_adjust(top=0.9)

def plot_history(history, title):
    fig, ax1, ax2 = prepare_standardplot(title, 'epoch')
    ax1.plot(history.history['loss'], label = "training")
    ax2.plot(history.history['binary_accuracy'], label = "training")
    finalize_standardplot(fig, ax1, ax2)
    return fig

In [7]:
def aggregate_market_values(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
    
    data = data.groupby(data.Timestamp // aggregation_factor).agg({
        'Open' : 'first',
        'High' : np.max,
        'Low' : np.min,
        'Close' : 'last',
        'Volume_(BTC)' : np.sum ,
        'Volume_(Currency)' : np.sum,
        'Weighted_Price' : np.mean,
    }).reset_index()
    
    data.Timestamp *= aggregation_factor
    
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data

def first_in_window(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
            
    data = data.groupby(data.Timestamp // aggregation_factor).first().reset_index(drop=True)
            
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data
    

In [518]:
df_raw_part1 = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2018-01-08.csv')
df_raw_part2 = pd.read_csv('Data/bitstampUSD_30-min_data_january.csv', date_parser=True)

# Aggregate first part of data into chunks of 30 mins, second part already aggregated
df_p1 = aggregate_market_values(df_raw_part1, 30, unix_time=True)
df_p1.Timestamp = pd.to_datetime(df_p1.Timestamp, unit='s')

df_p2 = df_raw_part2
df_p2.Timestamp = pd.to_datetime(df_p2.Timestamp)

df_raw = pd.concat([df_p1, df_p2]).reset_index(drop=True)

display(df_raw[105370:105372])
display(df_raw.tail())
print(df_raw.dtypes)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
105370,16166.78,16176.96,16102.05,16173.98,2018-01-08 00:00:00,25.633791,413278.4,16122.40664
105371,15956.66,16300.0,15954.16,16293.99,2018-01-08 00:30:00,221.65,3577715.0,16140.98


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
108615,8453.2,8505.0,8435.94,8503.41,2018-03-16 14:30:00,250.99,2126237.5,8471.47
108616,8511.47,8527.24,8450.1,8452.99,2018-03-16 15:00:00,321.19,2727121.08,8490.73
108617,8554.04,8613.06,8482.47,8510.0,2018-03-16 15:30:00,563.11,4813543.76,8548.16
108618,8541.17,8572.28,8472.24,8554.42,2018-03-16 16:00:00,305.72,2605758.19,8523.24
108619,8541.19,8553.83,8522.02,8547.41,2018-03-16 16:30:00,30.78,262893.41,8539.96


Close                       float64
High                        float64
Low                         float64
Open                        float64
Timestamp            datetime64[ns]
Volume_(BTC)                float64
Volume_(Currency)           float64
Weighted_Price              float64
dtype: object


In [9]:
df_plot = df_raw.copy()

aggregation_factor = 24 * 60 #24h candles

df_plot = aggregate_market_values(df_plot, 12 * 60)

inc = df_plot.Close >= df_plot.Open
dec = df_plot.Open > df_plot.Close
barWidth = 0.66 * aggregation_factor * 60 * 1000 # 30 minutes in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.segment(df_plot.Timestamp, df_plot.High, df_plot.Timestamp, df_plot.Low, color="black")
p.vbar(df_plot.Timestamp[inc], barWidth, df_plot.Open[inc], df_plot.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_plot.Timestamp[dec], barWidth, df_plot.Open[dec], df_plot.Close[dec], fill_color="#FF2828", line_color="black")

show(p)

In [200]:
def add_MA_n_days_age(num_days):
    num_days_str = str(num_days)
    
    #df[['Open_W_MA_'+num_days_str,'High_W_MA_'+num_days_str,'Low_W_MA_'+num_days_str,'Close_W_MA_'+num_days_str]] = df[['Open_W','High_W','Low_W','Close_W']].rolling(window=day_window * num_days).mean()
    df[['MA_'+num_days_str,'High_MA_'+num_days_str,'Low_MA_'+num_days_str,'Close_MA_'+num_days_str]] = df[['Open','High','Low','Close']].rolling(window=day_window * num_days).mean()
    #df[['Open_EMA_'+num_days_str,'High_EMA_'+num_days_str,'Low_EMA_'+num_days_str,'Close_EMA_'+num_days_str]] = df[['Open','High','Low','Close']].ewm(span=day_window * num_days).mean()


def add_prices_n_periods_ago(data, periods, step=DAY_WINDOW):
    return data['Smoothed_Price'].copy().shift(step * periods)

def dummy_increased_since(data, n, step=DAY_WINDOW):
    """
    1 if price increased since n * step, 0 otherwise
    """
    return 1 * (data['Smoothed_Price'] > data['Smoothed_Price'].shift(int(step * n)))

def dummy_piecewise_increased_since(data, n, interval, step=DAY_WINDOW):
    """
    1 if price increased between n * step - interval and n * step, 0 otherwise
    """
    return 1 * (data['Smoothed_Price'].shift(int(step * n)) > data['Smoothed_Price'].shift(int(step * (n + interval))))

def mean_square_loss(predicted_labels, true_labels):
    assert len(predicted_labels) == len(true_labels)
    return np.mean((predicted_labels - true_labels)**2)

def accuracy(x,y):
    return np.mean(x==y)

## Oscillators

In [535]:
def RSI(df, gain_column='Close', period=14):
    data = pd.DataFrame(df[gain_column].copy())
    
    #Compute differences
    data['delta'] = data[gain_column] - data.shift(1)[gain_column]
    
    #Compute gains and losses
    data['gain'] = data[data['delta'] >= 0]['delta']
    data['loss'] = data[data['delta'] <= 0]['delta']
    data['loss'] = -data['loss']
    data['gain'].fillna(0.0, inplace = True)
    data['loss'].fillna(0.0, inplace = True)
    
    #Compute average of gains and losses
    data['avg_gain'] = data['gain'].rolling(period).mean()
    data['avg_loss'] = data['loss'].rolling(period).mean()
    
    #Compute relative strength
    data['RS'] = data['avg_gain'] / data['avg_loss']
    
    return 100 - 100.0/(1.0 + data.RS)
    
def EMA(data, alpha):
    r = data.copy()
    
    display(data[-100:-99])

    
    for i in range(1, len(data)):
        r.iloc[i] = (alpha * data[i]) + ((1.0 - alpha) * r.iloc[i-1])
    
    display(r[-100:-99])
    return r

def MACD(df, price_column='Close', period1=12, period2=26, smooth_factor=9):
    
    data = df.copy()
    
    multiplier_p1 = 2.0 / (period1 + 1)
    multiplier_p2 = 2.0 / (period2 + 1)
    
    data['EMA_'+str(period1)] = EMA(data[price_column], multiplier_p1)
    data['EMA_'+str(period2)] = EMA(data[price_column], multiplier_p2)
    
    data['MACD_Line'] = data['EMA_'+str(period1)] - data['EMA_'+str(period2)]
    
    multiplier_smooth = 2.0 / (smooth_factor + 1)
    data['Signal_Line'] = EMA(data['MACD_Line'], multiplier_smooth)
    
    data['MACD_Histogram'] = data['MACD_Line'] - data['Signal_Line']
    
    return data
    
def BB(dataframe, price_column='Close', period = 20):
    df = dataframe.copy()
    
    #Compute middle Band
    df['MA'] = df[price_column].rolling(period).mean()
    
    # Compute std
    df['std'] = df[price_column].rolling(period).std()

    #Compute Upper Band
    df['Upper'] = df['MA'] + 2*df['std']
    
    #Compute Lower Band
    df['Lower'] = df['MA'] - 2*df['std']
    return df

def average_true_range(df, period):
    data = df.copy()
    
    data['v1'] = data.High - data.Low
    data['v2'] = (data.High - data.shift(1).Close).apply(abs)
    

    data['v3'] = (data.Low - data.shift(1).Close).apply(abs)
    data['True_Range'] = data[['v1','v2','v3']].max(axis=1)
    
    
    return EMA(data.True_Range, 2.0 / (period + 1))


def Directional_Movement(df, period_atr, period_adx):
    pass

In [539]:
df_processed = df_processed.reset_index(drop=True)
df_processed['ATR'] = average_true_range(df_processed, 14.0)


display(df_processed[-100:-99])


107107    76.52
Name: True_Range, dtype: float64

107107    103.715694
Name: True_Range, dtype: float64

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,ATR,ATR_tmp
107107,8673.48,8750.0,8673.48,8743.63,2018-03-14 15:00:00,227.85,1983204.35,8703.93,8708.555,0,...,-122.454707,-111.176855,-11.277852,8894.8485,212.809343,9320.467186,8469.229814,21.228281,,103.715694


In [540]:
p = make_plot_figure("test", 'test', 'test')
p.line(df_processed.Timestamp[:], df_processed['ATR'][:], line_color='#ffa02b', line_width=2, line_alpha=1, legend='Signal line')
show(p)

In [333]:
def plot_df(df, column):
    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=950, title = "MSFT Candlestick")
    p.xaxis.major_label_orientation = pi/4
    p.grid.grid_line_alpha=0.3

    p.line(df.Timestamp[:], df[column][:], line_color='#ff0000', line_width=2, line_alpha=1, legend=column)
   

    show(p)

In [402]:
def make_plot_figure(plot_title, xlabel, ylabel):
    p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=950, title = plot_title)
    p.xaxis.major_label_orientation = pi/4
    p.xaxis.axis_label = xlabel
    p.xaxis.axis_label_text_font_style='normal'
    p.yaxis.axis_label = ylabel
    p.yaxis.axis_label_text_font_style='normal'
    p.grid.grid_line_alpha=0.3
    
    return p

In [404]:
df_test = BB(df_processed)[-700:]

p = make_plot_figure("Feature: Bollinger Bands", 'Time', 'BTC price in USD')

p.line(df_test.Timestamp[:], df_test['MA'][:], line_color='#ffa02b', line_width=2, line_alpha=1, legend='SMA')
p.line(df_test.Timestamp[:], df_test['Upper'][:], line_color='#2b8aff', line_width=2, line_alpha=0.6, legend='Upper Band')
p.line(df_test.Timestamp[:], df_test['Lower'][:], line_color='#2b8aff', line_width=2, line_alpha=0.6, legend='Lower Band')

inc = df_test.Close >= df_test.Open
dec = df_test.Open > df_test.Close
barWidth = 0.66 * 30 * 60 * 1000 # 30 minutes in ms

p.segment(df_test.Timestamp, df_test.High, df_test.Timestamp, df_test.Low, color="black")
p.vbar(df_test.Timestamp[inc], barWidth, df_test.Open[inc], df_test.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_test.Timestamp[dec], barWidth, df_test.Open[dec], df_test.Close[dec], fill_color="#FF2828", line_color="black")


show(p)

In [491]:
df_test = df_processed.reset_index(drop=True)
df_test = MACD(df_test)[-500:]

In [492]:
p = make_plot_figure("Feature: MACD", "Time", "")
p2 = make_plot_figure("BTCUSD - Price and EMAs", "Time", "BTC Price in USD")


p2.line(df_test.Timestamp[:], df_test['Close'], line_color="black", line_alpha=0.5, legend='Closing Price')
p2.line(df_test.Timestamp[:], df_test['EMA_12'], line_color="red", line_alpha=0.8, line_width=2, legend='EMA 12')
p2.line(df_test.Timestamp[:], df_test['EMA_26'], line_color="green", line_alpha=0.8, line_width=2, legend='EMA 26')

p.line(df_test.Timestamp[:], df_test['Signal_Line'][:], line_color='#ffa02b', line_width=2, line_alpha=1, legend='Signal line')
p.line(df_test.Timestamp[:], df_test['MACD_Line'][:], line_color='#2b8aff', line_width=2, line_alpha=1, legend='MACD Line')
p.quad(top=df_test['MACD_Histogram'][:], bottom=0, left=df_test.Timestamp[:-1], right=df_test.Timestamp[1:], color='#ff2b8a', legend='MACD Histogram')

show(p2)
show(p)



# ==========================================================
# 
# ==========================================================

In [457]:
SMOOTHING_PERIOD = int(0.5 * 2 * 24) # 6h
SHIFTING_PERIOD = 0 #SMOOTHING_PERIOD

df_processed = df_raw.copy()
df_processed['Smoothed_Price'] = (0.5 * (df_processed.Close + df_processed.Open)).shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean()
#df_processed['Inc_1_day'] = dummy_increased_since(df_processed, 1)
#df_processed['Inc_12_hours'] = dummy_increased_since(df_processed, 0.5)
#df_processed['Inc_6_hour'] = dummy_increased_since(df_processed, 0.25)
#df_processed['Inc_1_hour_30min'] = dummy_increased_since(df_processed, 3, 1)
df_processed['Inc_1_hour'] = dummy_increased_since(df_processed, 2, 1)
df_processed['Inc_30_min'] = dummy_increased_since(df_processed, 1, 1)
#df_processed['PInc_30-60'] = dummy_piecewise_increased_since(df_processed, 1, 1, 1)
#df_processed['PInc_60-90'] = dummy_piecewise_increased_since(df_processed, 2, 1, 1)
df_processed = df_processed.reset_index(drop=True)
df_processed = MACD(df_processed)
df_processed = BB(df_processed)
df_processed['RSI'] = RSI(df_processed, gain_column='Close')
df_processed = df_processed.dropna(how='any')
print(DAY_WINDOW*0.04167)
#display(df_processed[10090:10120:])

2.00016


In [458]:
last_n_points = 5000

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_processed.Timestamp[-last_n_points:], df_processed.Open[-last_n_points:], line_alpha=0.6, line_color="black", legend='bihourly price')
p.line(df_processed.Timestamp[-last_n_points::48], df_processed.Open[-last_n_points::48], line_width=2, line_alpha=1, line_color="black", legend='daily price')
p.line(df_processed.Timestamp[-last_n_points:], df_processed.Smoothed_Price[-last_n_points:], line_width=2, line_alpha=1, line_color="red", legend=("Rolling avg. "+str(SMOOTHING_PERIOD)))
#p.scatter(df_processed.Timestamp[-last_n_points:], 0.5*(df_processed.Open[-last_n_points:] + df_processed.Close[-last_n_points:]), line_alpha=1, color="#00FF00", legend="daily mean")


display(df_processed.head(1))
show(p)

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

#p.scatter(df_processed.Timestamp[-last_n_points::48], df_processed.Inc_1_day[-last_n_points::48], line_alpha=1, color="#0000FF", legend="increase since 24h")
#p.scatter(df_processed.Timestamp[-last_n_points::48], 0.1+df_processed.Inc_12_hours[-last_n_points::48], line_alpha=1, color="#FF0000", legend="increase since 12h")

display(df_processed.head(1))
show(p)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI
19,4.58,4.58,4.58,4.58,2011-12-31 17:00:00,270.0,1236.6,4.58,4.58,1,...,4.442663,4.41642,0.026243,0.007546,0.018697,4.4085,0.056965,4.52243,4.29457,100.0


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI
19,4.58,4.58,4.58,4.58,2011-12-31 17:00:00,270.0,1236.6,4.58,4.58,1,...,4.442663,4.41642,0.026243,0.007546,0.018697,4.4085,0.056965,4.52243,4.29457,100.0


# Objective 

We aim to predict price changes across intervals of 24 hours. More specifically, at the end of each day, the model should predict the price of Bitcoin in the following 24 hours.

In [459]:
display(df_processed.head())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,...,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI
19,4.58,4.58,4.58,4.58,2011-12-31 17:00:00,270.0,1236.6,4.58,4.58,1,...,4.442663,4.41642,0.026243,0.007546,0.018697,4.4085,0.056965,4.52243,4.29457,100.0
20,4.58,4.58,4.58,4.58,2011-12-31 17:30:00,270.0,1236.6,4.58,4.58,1,...,4.463792,4.428537,0.035255,0.013088,0.022167,4.418,0.068411,4.554821,4.281179,100.0
21,4.58,4.58,4.58,4.58,2011-12-31 18:00:00,270.0,1236.6,4.58,4.58,0,...,4.48167,4.439756,0.041913,0.018853,0.023061,4.4275,0.076974,4.581448,4.273552,100.0
22,4.58,4.58,4.58,4.58,2011-12-31 18:30:00,270.0,1236.6,4.58,4.58,0,...,4.496797,4.450145,0.046653,0.024413,0.02224,4.437,0.083546,4.604093,4.269907,100.0
23,4.58,4.58,4.58,4.58,2011-12-31 19:00:00,270.0,1236.6,4.58,4.58,0,...,4.509598,4.459764,0.049834,0.029497,0.020337,4.4465,0.088572,4.623644,4.269356,100.0


## Simple model

To assess the performance of our model, we devise a simple naïve model as a benchmark. Our simple model looks at the price change in the last 24H and assume that this change will repeat in the next 24H. More specifically, let $P_p, P_c, P_f$ be respectively the previous, current and future price of Bitcoin (in intervals of 24 hours). We have:

$$ P_f = P_c + (P_c - P_p) $$

In [460]:
def naive_predictor(previous, current):
    return current.Smoothed_Price + ((current.Smoothed_Price - previous.Smoothed_Price) / previous.Smoothed_Price) * current.Smoothed_Price

def naive_predict_absolute_price(data):  
    P_c = data.Smoothed_Price
    P_p = data.Smoothed_Price.shift(DAY_WINDOW)
    
    return pd.concat([data.Timestamp, 2 * P_c +  - P_p], axis=1)

def naive_predict_price_diff(data):  
    ret = data.copy()
    P_c = ret.Smoothed_Price
    P_p = ret.Smoothed_Price.shift(DAY_WINDOW)
    
    ret['Simple_Predicted_Price'] = P_c - P_p
    return ret

def create_labels(data, step_size=1):
    return data.Smoothed_Price.shift(-step_size) - data.Smoothed_Price

In [461]:
def split_dataset(data, fraction):
    l = len(data)
    
    train = data[0 : int(l * fraction)]
    test = data[ int(l*fraction) :]
    
    return train, test

NORMALIZE = True
ADD_PRICE_N_AGO = False
ADD_MA_N_AGO = False
LOG = False

df = df_processed.copy()
df['Label'] = create_labels(df, step_size = DAY_WINDOW)
df = df.dropna()

if LOG:
    df.Smoothed_Price = df.Smoothed_Price.apply(lambda x: np.log(x))
if ADD_PRICE_N_AGO:
    for i in range(1,31):
        add_prices_n_periods_ago(i, step=4)
if ADD_MA_N_AGO:
    add_MA_n_days_age(5)
    add_MA_n_days_age(10)
    add_MA_n_days_age(15)

if NORMALIZE:
    timestamps = df.Timestamp
    labels = df.Label
    df = df.drop('Timestamp',1)
    df = (df-df.mean())/df.std()
    df.Label = labels
    df['Timestamp'] = timestamps
    print("Standardized input data")
    display(df.describe())


train, test = split_dataset(df, 0.9)
train_x = train.drop('Label',1)
train_y = train['Label']
train_bin = train_y > 0
test_x = test.drop('Label',1)
test_y = test['Label']
test_bin = test_y > 0


display(train.head())
display(test.head())

Standardized input data


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,Label
count,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,...,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0,107159.0
mean,-2.556065e-15,6.483693e-15,-1.313507e-16,7.218403e-15,2.615242e-15,-4.158129e-15,-2.384495e-16,4.906697e-15,4.46791e-14,-8.147968e-14,...,4.78223e-15,2.630286e-16,-1.102738e-16,-2.224404e-18,-2.817846e-15,-1.682611e-15,-3.275101e-15,1.911454e-15,2.690096e-14,3.68643
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,208.478597
min,-0.4515739,-0.4509359,-0.4532324,-0.451559,-0.5625158,-0.2997193,-0.4515919,-0.451575,-0.9914188,-0.9611,...,-0.4515006,-18.18004,-16.59669,-22.86473,-0.4514815,-0.3029753,-0.4483982,-0.4546417,-3.073739,-4018.075
25%,-0.4092831,-0.4087664,-0.4098189,-0.4092654,-0.4278026,-0.2896616,-0.4092813,-0.4092623,-0.9914188,-0.9611,...,-0.4093165,-0.02898824,-0.03025427,-0.02037856,-0.4092873,-0.2934564,-0.4068745,-0.4117423,-0.6020421,-3.18
50%,-0.308837,-0.3085249,-0.309143,-0.3088187,-0.2711832,-0.261483,-0.3089006,-0.3088624,-0.9914188,-0.9611,...,-0.3086833,-0.01532324,-0.01629998,-6.480664e-05,-0.308716,-0.2718821,-0.3077812,-0.3097017,-0.01948724,0.13
75%,-0.1952728,-0.1949059,-0.1950777,-0.1951693,0.0522581,-0.1610345,-0.1952226,-0.1952777,1.008646,1.040465,...,-0.1947821,0.009935266,0.009866488,0.01967779,-0.1954854,-0.1992694,-0.195687,-0.193775,0.6032535,6.0425
max,6.578673,6.551854,6.603986,6.588288,78.79711,37.44736,6.58833,6.569358,1.008646,1.040465,...,6.482765,14.88114,14.47247,23.84351,6.511395,14.10538,6.428435,6.663183,2.886978,3300.18


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,Label,Timestamp
19,-0.451416,-0.450779,-0.452118,-0.451401,-0.11051,-0.298797,-0.451434,-0.451417,1.008646,1.040465,...,-0.015192,-0.016702,0.001836,-0.451429,-0.301985,-0.448326,-0.454532,2.886978,0.62,2011-12-31 17:00:00
20,-0.451416,-0.450779,-0.452118,-0.451401,-0.11051,-0.298797,-0.451434,-0.451417,1.008646,-0.9611,...,-0.014921,-0.016525,0.002184,-0.451426,-0.301778,-0.448315,-0.454537,2.886978,0.68,2011-12-31 17:30:00
21,-0.451416,-0.450779,-0.452118,-0.451401,-0.11051,-0.298797,-0.451434,-0.451417,-0.991419,-0.9611,...,-0.014721,-0.016342,0.002274,-0.451422,-0.301624,-0.448306,-0.45454,2.886978,0.695,2011-12-31 18:00:00
22,-0.451416,-0.450779,-0.452118,-0.451401,-0.11051,-0.298797,-0.451434,-0.451417,-0.991419,-0.9611,...,-0.014578,-0.016164,0.002192,-0.451419,-0.301505,-0.448298,-0.454542,2.886978,0.71,2011-12-31 18:30:00
23,-0.451416,-0.450779,-0.452118,-0.451401,-0.11051,-0.298797,-0.451434,-0.451417,-0.991419,-0.9611,...,-0.014483,-0.016002,0.002001,-0.451415,-0.301414,-0.448291,-0.454542,2.886978,0.71,2011-12-31 19:00:00


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,Label,Timestamp
97856,0.578147,0.571553,0.57879,0.573372,-0.212101,0.148149,0.575733,0.575771,1.008646,1.040465,...,0.671829,0.485238,0.717305,0.555068,0.2987,0.548464,0.561865,1.749112,303.355,2017-08-04 11:00:00
97857,0.566534,0.570787,0.574231,0.575916,-0.177696,0.190326,0.572844,0.571236,-0.991419,-0.9611,...,0.639825,0.523973,0.488223,0.556082,0.304701,0.549676,0.562668,1.045768,292.96,2017-08-04 11:30:00
97858,0.568335,0.563892,0.573833,0.566498,-0.246498,0.101295,0.568327,0.567427,-0.991419,-0.9611,...,0.619309,0.550608,0.335692,0.557083,0.3146,0.551025,0.563297,1.270381,314.075,2017-08-04 12:00:00
97859,0.563323,0.56158,0.571156,0.567026,-0.361223,-0.044831,0.566799,0.565185,-0.991419,-0.9611,...,0.56251,0.559862,0.116606,0.558028,0.301574,0.551438,0.564809,0.646037,317.015,2017-08-04 12:30:00
97860,0.570125,0.563361,0.571145,0.563352,-0.241028,0.107917,0.567155,0.566749,-0.991419,1.040465,...,0.556864,0.566067,0.078174,0.559143,0.306094,0.55269,0.565776,1.014975,313.93,2017-08-04 13:00:00


## Simple model performance

In [462]:
predicted_labels = naive_predict_price_diff(df).loc[test_x.index]
predicted_labels['Binary'] = 1 * (predicted_labels.Simple_Predicted_Price > 0)
#display(predicted_labels[::2*24])

print(mean_square_loss(predicted_labels.Simple_Predicted_Price, test_y))
print(accuracy(predicted_labels.Binary, test_y > 0))

424564.22281654354
0.513717805151


In [463]:
last_n_points = -5000

points = predicted_labels

buy_points = points[predicted_labels.Binary == 1]
sell_points = points[predicted_labels.Binary == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::], buy_points.Smoothed_Price[::], color="#00ff00", legend="Buy periods")
p.scatter(sell_points.Timestamp[::], sell_points.Smoothed_Price[::], color="#ff0000", legend="Sell periods")
show(p)

## Neural network

In [464]:
display(df[1000:1010:])

Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_hour,Inc_30_min,...,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI,Label,Timestamp
1579,-0.450946,-0.450313,-0.451644,-0.450931,-0.552517,-0.299693,-0.450964,-0.450947,-0.991419,-0.9611,...,-0.014373,-0.014549,-0.002211,-0.450883,-0.301122,-0.447765,-0.454004,0.048541,0.285,2012-02-02 05:00:00
1580,-0.450946,-0.450313,-0.451644,-0.450931,-0.552517,-0.299693,-0.450964,-0.450947,-0.991419,-0.9611,...,-0.014615,-0.014738,-0.002426,-0.450885,-0.301135,-0.447767,-0.454005,0.048541,0.11,2012-02-02 05:30:00
1581,-0.450946,-0.450313,-0.451644,-0.450931,-0.552517,-0.299693,-0.450964,-0.450947,-0.991419,-0.9611,...,-0.014821,-0.014933,-0.002499,-0.450887,-0.301152,-0.44777,-0.454006,0.048541,0.11,2012-02-02 06:00:00
1582,-0.450946,-0.450313,-0.451644,-0.450931,-0.552517,-0.299693,-0.450964,-0.450947,-0.991419,-0.9611,...,-0.014995,-0.015126,-0.002474,-0.450881,-0.30206,-0.447799,-0.453962,0.048541,0.11,2012-02-02 06:30:00
1583,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,1.008646,1.040465,...,-0.014496,-0.015174,-0.000652,-0.450877,-0.301699,-0.447781,-0.453973,1.645162,-0.16,2012-02-02 07:00:00
1584,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,1.008646,-0.9611,...,-0.014121,-0.015133,0.000471,-0.450874,-0.301446,-0.447769,-0.453981,1.275974,-0.16,2012-02-02 07:30:00
1585,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,-0.991419,-0.9611,...,-0.013849,-0.015043,0.001096,-0.450869,-0.301276,-0.447757,-0.453982,1.275974,-0.16,2012-02-02 08:00:00
1586,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,-0.991419,-0.9611,...,-0.01366,-0.01493,0.001374,-0.450864,-0.301157,-0.447748,-0.453982,1.275974,-0.16,2012-02-02 08:30:00
1587,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,-0.991419,-0.9611,...,-0.013539,-0.014815,0.001416,-0.450859,-0.301079,-0.44774,-0.45398,1.275974,-0.015,2012-02-02 09:00:00
1588,-0.450849,-0.450216,-0.451547,-0.450834,-0.457584,-0.299431,-0.450867,-0.45085,-0.991419,-0.9611,...,-0.013471,-0.014708,0.001304,-0.450854,-0.301038,-0.447734,-0.453977,1.275974,0.13,2012-02-02 09:30:00


plusieurs y gains selon différents temps + vote

In [465]:
to_drop = ['Timestamp', 'Open', 'Close', 'High', 'Low', 'Weighted_Price', 'Volume_(Currency)',
          ]
train_x_nn = train_x.copy().drop(to_drop,1)
test_x_nn = test_x.copy().drop(to_drop,1)

train_y_nn = (train_y > 0).astype(int)
test_y_nn = (test_y > 0).astype(int)


print("% of 1 in train: " + str(np.sum(train_y_nn == 1) / len(train_y)))
print("% of 1 in test: " + str(np.sum(test_y_nn == 1) / len(test_y)))
print(test_y_nn.dtypes)
display(train_x_nn.head())
display(train_y_nn.head())

% of 1 in train: 0.555913855853
% of 1 in test: 0.554031354983
int32


Unnamed: 0,Volume_(BTC),Smoothed_Price,Inc_1_hour,Inc_30_min,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI
19,-0.11051,-0.451417,1.008646,1.040465,-0.451466,-0.451459,-0.015192,-0.016702,0.001836,-0.451429,-0.301985,-0.448326,-0.454532,2.886978
20,-0.11051,-0.451417,1.008646,-0.9611,-0.451459,-0.451455,-0.014921,-0.016525,0.002184,-0.451426,-0.301778,-0.448315,-0.454537,2.886978
21,-0.11051,-0.451417,-0.991419,-0.9611,-0.451452,-0.451451,-0.014721,-0.016342,0.002274,-0.451422,-0.301624,-0.448306,-0.45454,2.886978
22,-0.11051,-0.451417,-0.991419,-0.9611,-0.451447,-0.451447,-0.014578,-0.016164,0.002192,-0.451419,-0.301505,-0.448298,-0.454542,2.886978
23,-0.11051,-0.451417,-0.991419,-0.9611,-0.451442,-0.451444,-0.014483,-0.016002,0.002001,-0.451415,-0.301414,-0.448291,-0.454542,2.886978


19    1
20    1
21    1
22    1
23    1
Name: Label, dtype: int32

In [466]:
#train_x_nn['Label'] = train_y_nn
#test_x_nn['Label'] = test_y_nn

display(train_x_nn.head())
display(train_y_nn.head())

Unnamed: 0,Volume_(BTC),Smoothed_Price,Inc_1_hour,Inc_30_min,EMA_12,EMA_26,MACD_Line,Signal_Line,MACD_Histogram,MA,std,Upper,Lower,RSI
19,-0.11051,-0.451417,1.008646,1.040465,-0.451466,-0.451459,-0.015192,-0.016702,0.001836,-0.451429,-0.301985,-0.448326,-0.454532,2.886978
20,-0.11051,-0.451417,1.008646,-0.9611,-0.451459,-0.451455,-0.014921,-0.016525,0.002184,-0.451426,-0.301778,-0.448315,-0.454537,2.886978
21,-0.11051,-0.451417,-0.991419,-0.9611,-0.451452,-0.451451,-0.014721,-0.016342,0.002274,-0.451422,-0.301624,-0.448306,-0.45454,2.886978
22,-0.11051,-0.451417,-0.991419,-0.9611,-0.451447,-0.451447,-0.014578,-0.016164,0.002192,-0.451419,-0.301505,-0.448298,-0.454542,2.886978
23,-0.11051,-0.451417,-0.991419,-0.9611,-0.451442,-0.451444,-0.014483,-0.016002,0.002001,-0.451415,-0.301414,-0.448291,-0.454542,2.886978


19    1
20    1
21    1
22    1
23    1
Name: Label, dtype: int32

In [467]:
model = Sequential()
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

opt = keras.optimizers.SGD(lr=0.01, momentum=0.01, decay=0.0, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
#model.add(Dense(1, kernel_initializer='normal'))
#model.compile(loss='mse', optimizer='adam', metrics=['mae'])


history = model.fit(train_x_nn, train_y_nn, epochs = 5, verbose=1, validation_data = (test_x_nn, test_y_nn))
scores = model.evaluate(test_x_nn, test_y_nn, verbose=0)

print(scores)

Train on 96443 samples, validate on 10716 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[0.6870097010026186, 0.55403135498320266]


In [468]:
x = (model.predict_classes(test_x_nn)).squeeze()
print(x)
print(accuracy(x, test_bin))
print(np.sum(x==1))
print(np.sum(x==0))
print(np.sum(x==1)/np.sum(x==0))

[1 1 1 ..., 1 1 1]
0.554031354983
10716
0
inf




In [469]:
last_n_points = -5000

points = test_x

buy_points = points[x == 1]
sell_points = points[x == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.4)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SHIFTING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='red', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::], buy_points.Smoothed_Price[::], color="#00ff00")
p.scatter(sell_points.Timestamp[::], sell_points.Smoothed_Price[::], color="#ff0000")
show(p)

## Investing using the model
We use the model to apply the following simple investment strategy. At the end of each day, we take either a long or short position for $100. No matter what happens, the position is liquidated after 24H. Of course, the choice of the position is dependent on the price increase or decrease of the model

In [108]:
daily_amount = 100 #dollars

def compute_investment_return(prices, decisions):
    prices_in_24h = prices.shift(-DAY_WINDOW)
    return np.sum( ((prices_in_24h - prices)/prices) * daily_amount * decisions)

### The anarchist
The anarchist decide whether to invest or not based on a (bit)coin flip:

In [None]:
sum = 0
prices = df_raw.iloc[test2.index].Close
l = len(prices)
for i in range(10000):
    choices = 2 * np.random.choice(2, l) - 1
    sum += compute_investment_return(prices, choices)

print(sum / 10000.0)

### Simple model

In [None]:
#true_labels = pd.DataFrame(create_labels(df_24h).Label)
predicted_labels = naive_predict( df_raw.iloc[test2.index].copy())
predicted_labels.columns = ['Timestamp', 'Price_Prediction']

prices = df_raw.iloc[test2.index].Close

join = pd.concat([predicted_labels, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.Close) - 1 # Long: 1 Short: -1
display(join)

decisions_simple = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

### Neural net

In [None]:
prediction = pd.DataFrame(np.exp(model.predict(test_x2)).squeeze())
prediction.columns = ['Price_Prediction']
prediction['previous'] = prediction.shift(DAY_WINDOW)

prices = df_raw.iloc[test2.index][['Timestamp', 'Close']].reset_index(drop=True)

join = pd.concat([prediction, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.previous) - 1 # Long: 1 Short: -1
display(join)

decisions_nn = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

In [None]:
print(np.sum(decisions_simple.values == decisions_nn[1:].values))

In [None]:
from keras.models import load_model
 
    

with tf.device('/cpu:0'):

    model = load_model('my_model.h5')
    scores = model.evaluate(test_x, test_y, verbose=0)
    print(scores)
    
    del model
