In [12]:
import pandas as pd
from IPython.display import display, HTML
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import datetime

# Seed NumPy's global random generator so NumPy-based draws (np.random.choice is
# used further down) repeat from run to run.
# NOTE(review): this call only touches NumPy; presumably TensorFlow/Keras weight
# initialisation and dropout stay unseeded — confirm if reproducible training matters.
np.random.seed(42)

In [13]:
from math import pi
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file

# Send Bokeh output to the notebook so the show(p) calls below render in the cell output.
output_notebook()

In [14]:
# Quick check of how pandas rolling windows behave: with a window of 3 the first
# two entries have no complete window and come out as NaN.
a = pd.Series(range(1, 10))

print(a.rolling(window=3).mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
dtype: float64


In [15]:
# Time-unit constants used to convert between candles, minutes and seconds.
HOURS_IN_DAY = 24
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60
AGGREGATION_PERIOD = 30  # candle length in minutes (the model uses 30-minute candles)

# Number of candles that make up one day (48 for 30-minute candles).
DAY_WINDOW = (HOURS_IN_DAY * MINUTES_IN_HOUR) // AGGREGATION_PERIOD

In [16]:
# Force CPU usage
import tensorflow as tf
from keras import backend as K

# Build a TensorFlow session that exposes one CPU device and zero GPU devices,
# lets both thread pools use `num_cores` threads, and register it as the Keras
# backend session.
# NOTE(review): tf.ConfigProto / tf.Session look like the TensorFlow 1.x API — confirm
# the installed version before re-running.
num_cores = 8

config = tf.ConfigProto(
    intra_op_parallelism_threads=num_cores,
    inter_op_parallelism_threads=num_cores,
    allow_soft_placement=True,
    device_count={'CPU': 1, 'GPU': 0},
)
session = tf.Session(config=config)
K.set_session(session)

In [17]:
# For plot

def prepare_standardplot(title, xlabel):
    """Create a two-panel figure for a training curve plot.

    The left axis is labelled for the loss and uses a log y-scale; the right
    axis is labelled for the accuracy. Both share the same x-axis label.

    Args:
        title: text used as the figure's super-title.
        xlabel: label applied to the x-axis of both panels.

    Returns:
        Tuple ``(fig, ax1, ax2)``: the figure, the loss axis and the accuracy axis.
    """
    figure_, (loss_axis, accuracy_axis) = plt.subplots(1, 2)
    figure_.suptitle(title)

    panel_labels = (
        (loss_axis, 'categorical cross entropy'),
        (accuracy_axis, 'accuracy [% correct]'),
    )
    for axis, ylabel in panel_labels:
        axis.set_ylabel(ylabel)
        axis.set_xlabel(xlabel)

    loss_axis.set_yscale('log')
    return figure_, loss_axis, accuracy_axis

def finalize_standardplot(fig, ax1, ax2):
    """Add legends (where there is something to show) and tidy the layout.

    Args:
        fig: figure returned by ``prepare_standardplot``.
        ax1: first axis; gets a legend only if it has labelled artists.
        ax2: second axis; gets a legend only if it has labelled artists.
    """
    for axis in (ax1, ax2):
        handles, labels = axis.get_legend_handles_labels()
        if labels:
            axis.legend(handles, labels)

    fig.tight_layout()
    # Leave room at the top for the figure's super-title.
    plt.subplots_adjust(top=0.9)

def plot_history(history, title):
    """Plot the training loss and training accuracy recorded during a fit.

    Args:
        history: object exposing a ``history`` dict (as returned by
            ``model.fit``) with a 'loss' series and an accuracy series.
        title: figure title.

    Returns:
        The matplotlib figure holding both panels.

    Raises:
        KeyError: if 'loss' is missing, or no accuracy series is stored under
            any of the known keys.
    """
    fig, ax1, ax2 = prepare_standardplot(title, 'epoch')
    ax1.plot(history.history['loss'], label = "training")

    # The accuracy key depends on how the metric was requested at compile time:
    # 'binary_accuracy' when asked for explicitly, 'accuracy' or 'acc' when the
    # model is compiled with metrics=['accuracy'] (as the model in this notebook is).
    accuracy_keys = ('binary_accuracy', 'accuracy', 'acc')
    accuracy_key = next((key for key in accuracy_keys if key in history.history), None)
    if accuracy_key is None:
        raise KeyError(
            "no accuracy series in history; expected one of {} but found {}".format(
                accuracy_keys, sorted(history.history)))

    ax2.plot(history.history[accuracy_key], label = "training")
    finalize_standardplot(fig, ax1, ax2)
    return fig

In [18]:
def aggregate_market_values(dataframe, aggregation_period, unix_time=False):
    """
    Merge market rows into candles of `aggregation_period` minutes.

    Rows are bucketed by ``Timestamp // (aggregation_period * SECONDS_IN_MINUTE)``
    and every bucket is reduced to: first Open, highest High, lowest Low,
    last Close, summed Volume_(BTC) and Volume_(Currency), mean Weighted_Price.
    The input frame is copied, not modified.

    dataframe          -- frame with columns Timestamp / Open / High / Low / Close /
                          Volume_(BTC) / Volume_(Currency) / Weighted_Price
    aggregation_period -- candle length in minutes
    unix_time          -- True when Timestamp already holds Unix seconds; False when
                          it is a datetime column (converted to seconds and back)

    Returns a new frame with one row per bucket; Timestamp is the bucket start.
    """
    bucket_seconds = aggregation_period * SECONDS_IN_MINUTE
    candles = dataframe.copy()

    if not unix_time:
        # Nanosecond integer view of the datetimes, reduced to whole seconds.
        candles['Timestamp'] = candles['Timestamp'].astype(np.int64) // 10**9

    reducers = {
        'Open' : 'first',
        'High' : 'max',
        'Low' : 'min',
        'Close' : 'last',
        'Volume_(BTC)' : 'sum',
        'Volume_(Currency)' : 'sum',
        'Weighted_Price' : 'mean',
    }
    bucket_ids = candles['Timestamp'] // bucket_seconds
    candles = candles.groupby(bucket_ids).agg(reducers).reset_index()

    # After reset_index the Timestamp column holds bucket numbers; scale them
    # back to the bucket start expressed in seconds.
    candles['Timestamp'] = candles['Timestamp'] * bucket_seconds

    if not unix_time:
        candles['Timestamp'] = pd.to_datetime(candles['Timestamp'], unit='s')

    return candles

def first_in_window(dataframe, aggregation_period, unix_time=False):
    """
    Keep one row per window of `aggregation_period` minutes.

    Rows are bucketed by ``Timestamp // (aggregation_period * SECONDS_IN_MINUTE)``
    and each column takes its first value within the bucket. The input frame
    is copied, not modified, and the result gets a fresh 0..n-1 index.

    dataframe          -- frame with a Timestamp column
    aggregation_period -- window length in minutes
    unix_time          -- True when Timestamp already holds Unix seconds; False when
                          it is a datetime column (converted to seconds and back)
    """
    window_seconds = aggregation_period * SECONDS_IN_MINUTE
    rows = dataframe.copy()

    if not unix_time:
        # Nanosecond integer view of the datetimes, reduced to whole seconds.
        rows['Timestamp'] = rows['Timestamp'].astype(np.int64) // 10**9

    window_ids = rows['Timestamp'] // window_seconds
    # The group key is dropped from the index; the Timestamp column itself keeps
    # the first timestamp seen in each window.
    rows = rows.groupby(window_ids).first().reset_index(drop=True)

    if not unix_time:
        rows['Timestamp'] = pd.to_datetime(rows['Timestamp'], unit='s')

    return rows
    

In [19]:
# Load the two Bitstamp BTC/USD files: the first one holds 1-minute rows with Unix
# timestamps, the second one is already aggregated into 30-minute rows.
# (`date_parser=True` was dropped from the second read: that argument expects a
# callable, has no effect without `parse_dates`, and is rejected by recent pandas.
# The Timestamp column is converted explicitly below instead.)
df_raw_part1 = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2018-01-08.csv')
df_raw_part2 = pd.read_csv('Data/bitstampUSD_30-min_data_january.csv')

# Aggregate first part of data into candles of AGGREGATION_PERIOD (30) minutes,
# second part already aggregated
df_p1 = aggregate_market_values(df_raw_part1, AGGREGATION_PERIOD, unix_time=True)
df_p1.Timestamp = pd.to_datetime(df_p1.Timestamp, unit='s')

df_p2 = df_raw_part2
df_p2.Timestamp = pd.to_datetime(df_p2.Timestamp)

df_raw = pd.concat([df_p1, df_p2]).reset_index(drop=True)

# Rows 105370-105371 show the rows around the junction between the two files.
display(df_raw[105370:105372])
display(df_raw.tail())
print(df_raw.dtypes)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
105370,16166.78,16176.96,16102.05,16173.98,2018-01-08 00:00:00,25.633791,413278.4,16122.40664
105371,15956.66,16300.0,15954.16,16293.99,2018-01-08 00:30:00,221.65,3577715.0,16140.98


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
108615,8453.2,8505.0,8435.94,8503.41,2018-03-16 14:30:00,250.99,2126237.5,8471.47
108616,8511.47,8527.24,8450.1,8452.99,2018-03-16 15:00:00,321.19,2727121.08,8490.73
108617,8554.04,8613.06,8482.47,8510.0,2018-03-16 15:30:00,563.11,4813543.76,8548.16
108618,8541.17,8572.28,8472.24,8554.42,2018-03-16 16:00:00,305.72,2605758.19,8523.24
108619,8541.19,8553.83,8522.02,8547.41,2018-03-16 16:30:00,30.78,262893.41,8539.96


Close                       float64
High                        float64
Low                         float64
Open                        float64
Timestamp            datetime64[ns]
Volume_(BTC)                float64
Volume_(Currency)           float64
Weighted_Price              float64
dtype: object


In [20]:
# Candlestick chart of the whole price history.
df_plot = df_raw.copy()

aggregation_factor = 24 * 60 # candle length in minutes (24h candles)

# Aggregate with the same period that the bar width below is derived from.
# The call previously used 12 * 60 while the width assumed 24h candles, which
# made neighbouring bars overlap.
df_plot = aggregate_market_values(df_plot, aggregation_factor)

inc = df_plot.Close >= df_plot.Open
dec = df_plot.Open > df_plot.Close
# Bar width is two thirds of a candle, expressed in milliseconds (minutes * 60 * 1000).
barWidth = 0.66 * aggregation_factor * 60 * 1000

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "Bitcoin (Bitstamp, USD) 24h candlesticks")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Wick (high-low range), then green bodies for rising candles and red for falling ones.
p.segment(df_plot.Timestamp, df_plot.High, df_plot.Timestamp, df_plot.Low, color="black")
p.vbar(df_plot.Timestamp[inc], barWidth, df_plot.Open[inc], df_plot.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_plot.Timestamp[dec], barWidth, df_plot.Open[dec], df_plot.Close[dec], fill_color="#FF2828", line_color="black")

show(p)

In [21]:
SMOOTHING_PERIOD = 2 * 48 # window length in 30-minute slots

df_processed = df_raw.copy()
# Centred moving average of the candle mid-price (mean of Open and Close):
# shifting the series back by half a window before taking a trailing rolling
# mean makes row t average rows t-47 .. t+48.
# NOTE(review): Smoothed_Price at time t therefore includes the next
# SMOOTHING_PERIOD/2 slots of prices; anything derived from it sees future data — confirm
# this is acceptable for the evaluation.
df_processed['Smoothed_Price'] = (0.5 * (df_processed.Close + df_processed.Open)).shift(-(SMOOTHING_PERIOD // 2)).rolling(SMOOTHING_PERIOD).mean()

# Drops the incomplete windows at both ends (and any other row containing NaN).
df_processed = df_processed.dropna(how='any')

last_n_points = 10000

# The title previously read "MSFT Candlestick"; this is a line chart of Bitcoin prices.
p = figure(x_axis_type="datetime",  plot_width=990, title = "Bitcoin open price vs. smoothed price")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_processed.Timestamp[-last_n_points:], df_processed.Open[-last_n_points:], line_alpha=0.6, line_color="black")
p.line(df_processed.Timestamp[-last_n_points:], df_processed.Smoothed_Price[-last_n_points:], line_alpha=1, line_color="red")

show(p)

# Objective 

We aim to predict price changes across intervals of 24 hours. More specifically, at the end of each day, the model should predict the price of Bitcoin in the following 24 hours.

In [22]:
def add_MA_n_days_age(num_days):
    """Add moving-window indicator columns over `num_days` days to the global `df`.

    For each of Open/High/Low/Close this adds, with suffix ``_<num_days>``:
    ``_W_MA`` (rolling mean of the ``*_W`` columns), ``_MA`` (rolling mean),
    ``_EMA`` (exponentially weighted mean with span equal to the window),
    ``_MAX`` / ``_MIN`` (rolling extremes) and ``_TENKAN`` (midpoint of the
    rolling min and max). The window is ``DAY_WINDOW * num_days`` rows.

    Args:
        num_days: window length in days.

    Raises:
        KeyError: if `df` lacks any of the source columns.
    """
    # The original referenced an undefined lowercase `day_window`; the module
    # constant is DAY_WINDOW.
    window = DAY_WINDOW * num_days
    num_days_str = str(num_days)
    prices = ['Open', 'High', 'Low', 'Close']

    def column_names(tag):
        return ['{}_{}_{}'.format(price, tag, num_days_str) for price in prices]

    # NOTE(review): the Open_W/High_W/Low_W/Close_W source columns are not created
    # anywhere in the cells shown; presumably they come from another version of
    # the data — confirm before enabling ADD_MA_N_AGO.
    df[column_names('W_MA')] = df[[price + '_W' for price in prices]].rolling(window=window).mean()
    df[column_names('MA')] = df[prices].rolling(window=window).mean()
    df[column_names('EMA')] = df[prices].ewm(span=window).mean()

    df[column_names('MAX')] = df[prices].rolling(window=window).max()
    df[column_names('MIN')] = df[prices].rolling(window=window).min()

    # Midpoint between the rolling low and the rolling high of each price series.
    for price in prices:
        rolling_min = df['{}_MIN_{}'.format(price, num_days_str)]
        rolling_max = df['{}_MAX_{}'.format(price, num_days_str)]
        df['{}_TENKAN_{}'.format(price, num_days_str)] = 0.5 * (rolling_min + rolling_max)

def add_prices_n_periods_ago(periods, step=DAY_WINDOW):
    """Add a lagged copy of Smoothed_Price to the global `df`.

    The new column ``Prev_Smoothed_Price_<periods>`` holds the Smoothed_Price
    found ``step * periods`` rows earlier.

    Args:
        periods: number of steps to look back; also used in the column name.
        step: number of rows per step (defaults to one day of candles).
    """
    lag_rows = step * periods
    column_name = 'Prev_Smoothed_Price_{}'.format(periods)
    df[column_name] = df['Smoothed_Price'].shift(lag_rows)

def mean_square_loss(predicted_labels, true_labels):
    """Return the mean of the squared element-wise differences.

    Args:
        predicted_labels: predicted values.
        true_labels: reference values; must have the same length.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(predicted_labels) == len(true_labels)
    residuals = predicted_labels - true_labels
    return np.mean(residuals ** 2)

def accuracy(x,y):
    """Return the fraction of positions at which `x` and `y` are equal."""
    matches = (x == y)
    return np.mean(matches)
    

In [23]:
# Preview the processed frame. It starts at index 95 because the rows without a
# complete smoothing window were removed by the dropna above.
display(df_processed.head())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price
95,5.0,5.0,5.0,5.0,2012-01-02 07:00:00,303.0,1515.0,5.0,4.907188
96,5.0,5.0,5.0,5.0,2012-01-02 07:30:00,303.0,1515.0,5.0,4.911563
97,5.0,5.0,5.0,5.0,2012-01-02 08:00:00,303.0,1515.0,5.0,4.915938
98,5.0,5.0,5.0,5.0,2012-01-02 08:30:00,303.0,1515.0,5.0,4.920313
99,5.0,5.0,5.0,5.0,2012-01-02 09:00:00,303.0,1515.0,5.0,4.924688


## Simple model

To assess the performance of our model, we devise a simple naïve model as a benchmark. The naïve model looks at the price change over the last 24 hours and assumes that the same change will repeat over the next 24 hours. More specifically, let $P_p, P_c, P_f$ be the previous, current and future prices of Bitcoin, respectively (at intervals of 24 hours). We have:

$$ P_f = P_c + (P_c - P_p) $$

In [24]:
def naive_predictor(previous, current):
    """Extrapolate the next smoothed price from the last relative change.

    Args:
        previous: object exposing the earlier ``Smoothed_Price``.
        current: object exposing the latest ``Smoothed_Price``.

    Returns:
        The current smoothed price increased by the relative change observed
        between `previous` and `current`.
    """
    latest = current.Smoothed_Price
    relative_change = (latest - previous.Smoothed_Price) / previous.Smoothed_Price
    return latest + relative_change * latest

def naive_predict_absolute_price(data):
    """Predict the next-day price by repeating the last day's absolute change.

    Implements ``P_f = P_c + (P_c - P_p)``, where ``P_p`` is the Smoothed_Price
    DAY_WINDOW rows earlier.

    Args:
        data: frame with Timestamp and Smoothed_Price columns.

    Returns:
        Two-column frame: Timestamp and the extrapolated price.
    """
    current_price = data.Smoothed_Price
    previous_price = current_price.shift(DAY_WINDOW)
    extrapolated = 2 * current_price - previous_price

    return pd.concat([data.Timestamp, extrapolated], axis=1)

def naive_predict_price_diff(data):
    """Predict the next-day price change as the change over the last day.

    Args:
        data: frame with a Smoothed_Price column; it is not modified.

    Returns:
        Copy of `data` with an extra ``Simple_Predicted_Price`` column holding
        Smoothed_Price minus its value DAY_WINDOW rows earlier.
    """
    one_day_ago = data.Smoothed_Price.shift(DAY_WINDOW)
    return data.assign(Simple_Predicted_Price=data.Smoothed_Price - one_day_ago)

def create_labels(data, step_size=1):
    """Return the change in Smoothed_Price over the next `step_size` rows.

    Args:
        data: frame with a Smoothed_Price column.
        step_size: how many rows ahead to look.

    Returns:
        Series of future-minus-current differences; the last `step_size`
        entries are NaN because they have no future value.
    """
    future_price = data.Smoothed_Price.shift(-step_size)
    return future_price - data.Smoothed_Price

In [25]:
def split_dataset(data, fraction):
    """Split `data` in order into a leading and a trailing part.

    Args:
        data: sliceable sequence or frame.
        fraction: share of the rows (from the start) that goes to the first part.

    Returns:
        Tuple ``(train, test)``: the first ``int(len(data) * fraction)`` rows
        and the remaining rows.
    """
    cut = int(len(data) * fraction)
    return data[0:cut], data[cut:]

# Switches controlling how the feature frame is built.
NORMALIZE = False        # standardise the numeric feature columns
ADD_PRICE_N_AGO = True   # add lagged Smoothed_Price columns
ADD_MA_N_AGO = False     # add moving-window indicator columns
LOG = True               # replace Smoothed_Price by its natural logarithm

df = df_processed.copy()

if LOG:
    # From here on Smoothed_Price (and everything derived from it, labels
    # included) is a log-price, so differences are log-returns.
    df.Smoothed_Price = np.log(df.Smoothed_Price)
if ADD_PRICE_N_AGO:
    # 30 lags spaced 4 rows (2 hours of 30-minute candles) apart.
    for i in range(1,31):
        add_prices_n_periods_ago(i, step=4)
if ADD_MA_N_AGO:
    add_MA_n_days_age(5)
    add_MA_n_days_age(10)
    add_MA_n_days_age(15)

# Label: change of Smoothed_Price over the next day. Rows without a complete set of
# lags or without a label are dropped.
df['Label'] = create_labels(df, step_size = DAY_WINDOW)
df = df.dropna()

if NORMALIZE:
    # Standardise after the label exists (the branch used to read df.Label
    # before it was created) and only on numeric feature columns, leaving
    # Timestamp and Label untouched.
    # NOTE(review): the statistics are computed over train and test together —
    # confirm whether they should come from the training part only.
    feature_columns = df.select_dtypes(include=[np.number]).columns.drop('Label')
    df[feature_columns] = (df[feature_columns] - df[feature_columns].mean()) / df[feature_columns].std()
    df = df.reset_index(drop=True)
    display(df.describe())


# Chronological split: the first 90% of the rows train, the last 10% test.
train, test = split_dataset(df, 0.9)
# `axis` is passed by keyword; the positional form drop('Label', 1) is rejected
# by recent pandas.
train_x = train.drop('Label', axis=1)
train_y = train['Label']
train_bin = train_y > 0
test_x = test.drop('Label', axis=1)
test_y = test['Label']
test_bin = test_y > 0


display(train.head())
display(test.head())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Prev_Smoothed_Price_1,...,Prev_Smoothed_Price_22,Prev_Smoothed_Price_23,Prev_Smoothed_Price_24,Prev_Smoothed_Price_25,Prev_Smoothed_Price_26,Prev_Smoothed_Price_27,Prev_Smoothed_Price_28,Prev_Smoothed_Price_29,Prev_Smoothed_Price_30,Label
215,5.57,5.57,5.37,5.37,2012-01-04 19:00:00,1299.365873,7072.412081,5.442972,1.715167,1.706659,...,1.623413,1.620203,1.616486,1.612755,1.609042,1.603692,1.597808,1.594261,1.590701,0.121896
216,5.57,5.57,5.37,5.37,2012-01-04 19:30:00,1299.365873,7072.412081,5.442972,1.717283,1.708793,...,1.624008,1.62113,1.617416,1.613689,1.609948,1.605242,1.59903,1.595149,1.591592,0.121322
217,5.57,5.57,5.37,5.37,2012-01-04 20:00:00,1299.365873,7072.412081,5.442972,1.719394,1.710922,...,1.624604,1.622056,1.618346,1.614622,1.610885,1.606601,1.600586,1.596036,1.592482,0.120419
218,5.57,5.57,5.37,5.37,2012-01-04 20:30:00,1299.365873,7072.412081,5.442972,1.721501,1.713047,...,1.625198,1.622817,1.619275,1.615555,1.61182,1.607895,1.602141,1.596922,1.593372,0.119189
219,5.57,5.57,5.37,5.37,2012-01-04 21:00:00,1299.365873,7072.412081,5.442972,1.723603,1.715167,...,1.625793,1.623413,1.620203,1.616486,1.612755,1.609042,1.603692,1.597808,1.594261,0.117963


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Prev_Smoothed_Price_1,...,Prev_Smoothed_Price_22,Prev_Smoothed_Price_23,Prev_Smoothed_Price_24,Prev_Smoothed_Price_25,Prev_Smoothed_Price_26,Prev_Smoothed_Price_27,Prev_Smoothed_Price_28,Prev_Smoothed_Price_29,Prev_Smoothed_Price_30,Label
97693,2853.38,2860.02,2832.66,2836.72,2017-08-01 01:30:00,44.976771,128190.675185,2846.107321,7.924975,7.925077,...,7.898225,7.898386,7.898767,7.899472,7.900284,7.900817,7.901317,7.902217,7.903226,-0.010324
97694,2858.04,2858.04,2837.31,2846.27,2017-08-01 02:00:00,94.664494,269694.327408,2848.667243,7.925031,7.925051,...,7.898098,7.898372,7.898641,7.899323,7.900093,7.900707,7.901112,7.902048,7.90283,-0.01091
97695,2841.6,2857.53,2840.38,2848.72,2017-08-01 02:30:00,57.692913,164121.426077,2845.430197,7.925148,7.925014,...,7.897969,7.898324,7.898563,7.899144,7.899882,7.900582,7.901001,7.901802,7.902508,-0.011529
97696,2846.48,2855.22,2837.73,2841.84,2017-08-01 03:00:00,60.23578,171253.777946,2847.779256,7.925262,7.924947,...,7.897925,7.898282,7.898446,7.898945,7.899657,7.900428,7.900932,7.901563,7.902326,-0.012149
97697,2862.93,2863.88,2840.1,2846.39,2017-08-01 03:30:00,115.063544,328711.006652,2857.994752,7.92533,7.924975,...,7.8979,7.898225,7.898386,7.898767,7.899472,7.900284,7.900817,7.901317,7.902217,-0.012742


## Simple model performance

In [26]:
# Evaluate the naive benchmark on the test rows: its prediction for the next-day
# change is the change observed over the previous day.
predicted_labels = naive_predict_price_diff(df).loc[test_x.index]
# Direction of the predicted change: 1 when positive, 0 otherwise.
predicted_labels['Binary'] = 1 * (predicted_labels.Simple_Predicted_Price > 0)
#display(predicted_labels[::2*24])

# First value: mean squared error between predicted and actual change.
# Second value: share of test rows where the predicted direction matches the actual one.
print(mean_square_loss(predicted_labels.Simple_Predicted_Price, test_y))
print(accuracy(predicted_labels.Binary, test_y > 0))

0.0012057733485961245
0.729572523313


In [27]:
# Plot the naive model's buy/sell signals on top of the close price.
last_n_points = -5000

points = predicted_labels

buy_points = points[predicted_labels.Binary == 1]
sell_points = points[predicted_labels.Binary == 0]

# Smoothed_Price is stored as a log-price when LOG is set, while the lines below
# are in USD; convert back so the markers land on the price curve instead of
# near the bottom of the axis.
buy_prices = np.exp(buy_points.Smoothed_Price) if LOG else buy_points.Smoothed_Price
sell_prices = np.exp(sell_points.Smoothed_Price) if LOG else sell_points.Smoothed_Price

# The title previously read "MSFT Candlestick"; this is a Bitcoin line chart.
p = figure(x_axis_type="datetime",  plot_width=990, title = "Naive model: buy/sell signals on the Bitcoin close price")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Raw close price (faint) and its centred moving average (solid) over the last
# 2 * 5000 rows.
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SMOOTHING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
# One marker every 48 signals: green for buy, red for sell.
p.scatter(buy_points.Timestamp[::2*24], buy_prices[::2*24], color="#00ff00")
p.scatter(sell_points.Timestamp[::2*24], sell_prices[::2*24], color="#ff0000")
show(p)

## Neural network

In [28]:
# Inspect a run of consecutive feature rows (positions 1000 to 1019).
display(df.iloc[1000:1020])

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Prev_Smoothed_Price_1,...,Prev_Smoothed_Price_22,Prev_Smoothed_Price_23,Prev_Smoothed_Price_24,Prev_Smoothed_Price_25,Prev_Smoothed_Price_26,Prev_Smoothed_Price_27,Prev_Smoothed_Price_28,Prev_Smoothed_Price_29,Prev_Smoothed_Price_30,Label
1215,6.85,6.85,6.85,6.85,2012-01-25 15:00:00,15.0,102.75,6.85,1.864589,1.866234,...,1.879592,1.881372,1.882514,1.880601,1.879099,1.878065,1.877802,1.879393,1.880983,-0.05153
1216,6.85,6.85,6.85,6.85,2012-01-25 15:30:00,15.0,102.75,6.85,1.864137,1.865847,...,1.879147,1.880927,1.882704,1.88103,1.879314,1.878455,1.877794,1.878996,1.880586,-0.052847
1217,6.85,6.85,6.85,6.85,2012-01-25 16:00:00,15.0,102.75,6.85,1.863684,1.86546,...,1.878701,1.880482,1.88226,1.881459,1.879743,1.878884,1.877762,1.878598,1.880188,-0.054107
1218,6.85,6.85,6.85,6.85,2012-01-25 16:30:00,15.0,102.75,6.85,1.863232,1.865041,...,1.878288,1.880037,1.881816,1.881887,1.880172,1.879099,1.877714,1.8782,1.879791,-0.05531
1219,6.85,6.85,6.85,6.85,2012-01-25 17:00:00,15.0,102.75,6.85,1.862779,1.864589,...,1.877905,1.879592,1.881372,1.882514,1.880601,1.879099,1.878065,1.877802,1.879393,-0.056516
1220,6.85,6.85,6.85,6.85,2012-01-25 17:30:00,15.0,102.75,6.85,1.862326,1.864137,...,1.877523,1.879147,1.880927,1.882704,1.88103,1.879314,1.878455,1.877794,1.878996,-0.057724
1221,6.09,6.85,6.09,6.85,2012-01-25 18:00:00,26.920677,171.546925,6.596667,1.861873,1.863684,...,1.87714,1.878701,1.880482,1.88226,1.881459,1.879743,1.878884,1.877762,1.878598,-0.058283
1222,6.0,6.09,6.0,6.09,2012-01-25 18:30:00,48.828287,293.074635,6.002063,1.86142,1.863232,...,1.876758,1.878288,1.880037,1.881816,1.881887,1.880172,1.879099,1.877714,1.8782,-0.058113
1223,6.0,6.0,6.0,6.0,2012-01-25 19:00:00,45.919413,275.516478,6.0,1.860966,1.862779,...,1.876375,1.877905,1.879592,1.881372,1.882514,1.880601,1.879099,1.878065,1.877802,-0.057865
1224,6.0,6.0,6.0,6.0,2012-01-25 19:30:00,45.919413,275.516478,6.0,1.860513,1.862326,...,1.875992,1.877523,1.879147,1.880927,1.882704,1.88103,1.879314,1.878455,1.877794,-0.057618


Idea: compute several gain labels (y) over different time horizons, then combine the resulting predictions with a vote.

In [37]:
# Discard a model left over from an earlier run so the next cell starts fresh.
# On a fresh kernel no model exists yet, so a bare `del model` would raise
# NameError and stop "Run All".
try:
    del model
except NameError:
    pass  # nothing to delete

In [53]:
# Network inputs: everything except the timestamp, the raw candle prices and
# the BTC volume.
to_drop = ['Timestamp', 'Open', 'Close', 'High', 'Low', 'Volume_(BTC)']
# drop() already returns a new frame, so no explicit copy is needed; `axis` is
# passed by keyword because the positional form drop(to_drop, 1) is rejected by
# recent pandas.
train_x_nn = train_x.drop(to_drop, axis=1)
test_x_nn = test_x.drop(to_drop, axis=1)

# Binary target: 1 when the label (next-day change) is positive, 0 otherwise.
train_y_nn = (train_y > 0).astype(int)
test_y_nn = (test_y > 0).astype(int)


# Class balance of both splits (printed as a fraction between 0 and 1).
print("% of 1 in train: " + str(np.sum(train_y_nn == 1) / len(train_y)))
print("% of 1 in test: " + str(np.sum(test_y_nn == 1) / len(test_y)))
print(test_y_nn.dtypes)
display(train_x_nn.head())

% of 1 in train: 0.570005539712
% of 1 in test: 0.600775551657
int32


Unnamed: 0,Volume_(Currency),Weighted_Price,Smoothed_Price,Prev_Smoothed_Price_1,Prev_Smoothed_Price_2,Prev_Smoothed_Price_3,Prev_Smoothed_Price_4,Prev_Smoothed_Price_5,Prev_Smoothed_Price_6,Prev_Smoothed_Price_7,...,Prev_Smoothed_Price_21,Prev_Smoothed_Price_22,Prev_Smoothed_Price_23,Prev_Smoothed_Price_24,Prev_Smoothed_Price_25,Prev_Smoothed_Price_26,Prev_Smoothed_Price_27,Prev_Smoothed_Price_28,Prev_Smoothed_Price_29,Prev_Smoothed_Price_30
215,7072.412081,5.442972,1.715167,1.706659,1.69805,1.689048,1.680149,1.670758,1.664044,1.658069,...,1.625793,1.623413,1.620203,1.616486,1.612755,1.609042,1.603692,1.597808,1.594261,1.590701
216,7072.412081,5.442972,1.717283,1.708793,1.70023,1.691191,1.682611,1.672863,1.665562,1.659557,...,1.626387,1.624008,1.62113,1.617416,1.613689,1.609948,1.605242,1.59903,1.595149,1.591592
217,7072.412081,5.442972,1.719394,1.710922,1.702378,1.693492,1.684758,1.675187,1.667078,1.661042,...,1.626981,1.624604,1.622056,1.618346,1.614622,1.610885,1.606601,1.600586,1.596036,1.592482
218,7072.412081,5.442972,1.721501,1.713047,1.704521,1.695835,1.6869,1.677564,1.668797,1.662524,...,1.627575,1.625198,1.622817,1.619275,1.615555,1.61182,1.607895,1.602141,1.596922,1.593372
219,7072.412081,5.442972,1.723603,1.715167,1.706659,1.69805,1.689048,1.680149,1.670758,1.664044,...,1.628168,1.625793,1.623413,1.620203,1.616486,1.612755,1.609042,1.603692,1.597808,1.594261


In [None]:
# Small binary classifier: one hidden ReLU layer of 16 units, dropout of 0.25,
# and a single sigmoid output.
# NOTE(review): with NORMALIZE = False the inputs mix very different scales
# (Volume_(Currency) in the thousands next to log-prices around 1-8); the
# prediction cell below reports only class 1, which may be a consequence — confirm.
model = Sequential()
model.add(Dense(16, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# NOTE(review): the test split is passed as validation data and then evaluated
# again, so there is no separate held-out set — confirm this is intended.
history = model.fit(train_x_nn, train_y_nn, epochs = 5, verbose=1, validation_data = (test_x_nn, test_y_nn))
scores = model.evaluate(test_x_nn, test_y_nn, verbose=0)

print(scores)

Train on 97478 samples, validate on 10831 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [225]:
# Class predicted for every test row, flattened to a 1-D array.
# `x` is reused by the plotting cell that follows.
x = (model.predict_classes(test_x_nn)).squeeze()
print(x)
# Share of test rows whose predicted class matches the true direction.
print(accuracy(x, test_bin))
# Number of rows predicted as 1 and as 0, then their ratio.
# In the recorded output every row is predicted as 1, so the accuracy equals the
# share of 1s in the test split and the ratio divides by zero (inf).
print(np.sum(x==1))
print(np.sum(x==0))
print(np.sum(x==1)/np.sum(x==0))

[1 1 1 ..., 1 1 1]
0.600775551657
10831
0
inf




In [197]:
# Plot the network's buy/sell signals on top of the close price.
last_n_points = -5000

points = test_x

# `x` holds the class predicted for each test row (1 = buy, 0 = sell).
buy_points = points[x == 1]
sell_points = points[x == 0]

# Smoothed_Price is stored as a log-price when LOG is set, while the lines below
# are in USD; convert back so the markers land on the price curve instead of
# near the bottom of the axis.
buy_prices = np.exp(buy_points.Smoothed_Price) if LOG else buy_points.Smoothed_Price
sell_prices = np.exp(sell_points.Smoothed_Price) if LOG else sell_points.Smoothed_Price

# The title previously read "MSFT Candlestick"; this is a Bitcoin line chart.
p = figure(x_axis_type="datetime",  plot_width=990, title = "Neural network: buy/sell signals on the Bitcoin close price")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Raw close price (faint) and its centred moving average (solid) over the last
# 2 * 5000 rows.
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SMOOTHING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
# One marker every 48 signals: green for buy, red for sell.
p.scatter(buy_points.Timestamp[::2*24], buy_prices[::2*24], color="#00ff00")
p.scatter(sell_points.Timestamp[::2*24], sell_prices[::2*24], color="#ff0000")
show(p)

## Investing using the model
We use the model to apply the following simple investment strategy. At the end of each day, we take either a long or a short position worth $100. Whatever happens, the position is liquidated after 24 hours. The choice of position depends on whether the model predicts a price increase or a decrease.

In [None]:
daily_amount = 100 #dollars

def compute_investment_return(prices, decisions, horizon=DAY_WINDOW):
    """Total profit of taking a `daily_amount` position at every row.

    Each position is opened at the row's price and closed `horizon` rows
    later; its profit is the relative price change times `daily_amount`,
    signed by the decision.

    Args:
        prices: pandas Series of prices, in chronological order.
        decisions: per-row position, 1 for long and -1 for short, aligned
            with `prices`.
        horizon: holding period in rows. The default, DAY_WINDOW, is 24 hours
            for 30-minute candles; pass 1 when `prices` holds one row per day.

    Returns:
        The summed profit in dollars. Rows without an exit price (the last
        `horizon` rows) produce NaN terms, which the pandas sum skips.
    """
    exit_prices = prices.shift(-horizon)
    relative_change = (exit_prices - prices) / prices
    return np.sum(relative_change * daily_amount * decisions)

### The anarchist
The anarchist decides which position to take (long or short) based on a (bit)coin flip:

In [None]:
# Average return of a random long/short strategy over many simulated runs.
# The accumulator is no longer called `sum`, which shadowed the builtin for
# the rest of the notebook.
N_TRIALS = 10000
total_return = 0
# NOTE(review): `test2` is not defined in the cells shown; presumably it is the
# test split of an earlier version of this notebook — confirm before running.
prices = df_raw.iloc[test2.index].Close
l = len(prices)
for i in range(N_TRIALS):
    # One coin flip per row: -1 (short) or 1 (long).
    choices = 2 * np.random.choice(2, l) - 1
    total_return += compute_investment_return(prices, choices)

print(total_return / float(N_TRIALS))

### Simple model

In [None]:
# Investment return of the naive model.
# NOTE(review): `naive_predict` and `test2` are not defined in the cells shown;
# presumably they come from an earlier version of this notebook — confirm which
# predictor and which split are meant before running this cell.
#true_labels = pd.DataFrame(create_labels(df_24h).Label)
predicted_labels = naive_predict( df_raw.iloc[test2.index].copy())
predicted_labels.columns = ['Timestamp', 'Price_Prediction']

prices = df_raw.iloc[test2.index].Close

# Put prediction and close price side by side, then keep the first row of every
# 24-hour window.
join = pd.concat([predicted_labels, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.Close) - 1 # Long: 1 Short: -1
display(join)

decisions_simple = join.copy().Decision

print(len(join))

# NOTE(review): `join` holds one row per day here, while compute_investment_return
# by default compares each price with the one DAY_WINDOW rows later — confirm
# the holding period that is actually being measured.
print(compute_investment_return(join.Close, join.Decision))

### Neural net

In [None]:
# Investment return of the neural network.
# NOTE(review): `test_x2` and `test2` are not defined in the cells shown, and
# np.exp(model.predict(...)) treats the output as a log-price whereas the model
# built above ends in a sigmoid — presumably this cell targets an earlier
# regression model; confirm before running.
prediction = pd.DataFrame(np.exp(model.predict(test_x2)).squeeze())
prediction.columns = ['Price_Prediction']
# Prediction made DAY_WINDOW rows earlier, used as the reference for the decision.
prediction['previous'] = prediction.shift(DAY_WINDOW)

prices = df_raw.iloc[test2.index][['Timestamp', 'Close']].reset_index(drop=True)

# Put prediction and prices side by side, then keep the first row of every
# 24-hour window.
join = pd.concat([prediction, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.previous) - 1 # Long: 1 Short: -1
display(join)

decisions_nn = join.copy().Decision

print(len(join))

# NOTE(review): `join` holds one row per day here, while compute_investment_return
# by default compares each price with the one DAY_WINDOW rows later — confirm
# the holding period that is actually being measured.
print(compute_investment_return(join.Close, join.Decision))

In [None]:
# Number of days on which the naive model and the network take the same position.
# NOTE(review): the first network decision is skipped, presumably to align the
# two series in length — confirm the alignment is by date.
print(np.sum(decisions_simple.values == decisions_nn[1:].values))

In [None]:
from keras.models import load_model
 
    

# Load a previously saved model, evaluate it on the test split on the CPU, and
# release it again.
# NOTE(review): this evaluates on test_x / test_y (all columns, continuous label)
# rather than the reduced test_x_nn / test_y_nn used for the model above —
# confirm which inputs 'my_model.h5' expects.
with tf.device('/cpu:0'):

    model = load_model('my_model.h5')
    scores = model.evaluate(test_x, test_y, verbose=0)
    print(scores)
    
    del model
