In [150]:
import pandas as pd
import keras
from IPython.display import display, HTML
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import datetime

np.random.seed(42)

In [2]:
from math import pi
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file

output_notebook()

In [3]:
a = pd.Series([1,2,3,4,5,6,7,8,9])

print(a.rolling(3).mean())

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
dtype: float64


In [4]:
HOURS_IN_DAY = 24
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60
AGGREGATION_PERIOD = 30 #Model uses 30 minutes candles

DAY_WINDOW = int(HOURS_IN_DAY * MINUTES_IN_HOUR / AGGREGATION_PERIOD)

In [5]:
# Force CPU usage
import tensorflow as tf
from keras import backend as K

num_cores = 8

config = tf.ConfigProto(intra_op_parallelism_threads=num_cores,\
        inter_op_parallelism_threads=num_cores, allow_soft_placement=True,\
        device_count = {'CPU' : 1, 'GPU' : 0})
session = tf.Session(config=config)
K.set_session(session)

In [6]:
# For plot

def prepare_standardplot(title, xlabel):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.suptitle(title)
    ax1.set_ylabel('categorical cross entropy')
    ax1.set_xlabel(xlabel)
    ax1.set_yscale('log')
    ax2.set_ylabel('accuracy [% correct]')
    ax2.set_xlabel(xlabel)
    return fig, ax1, ax2

def finalize_standardplot(fig, ax1, ax2):
    ax1handles, ax1labels = ax1.get_legend_handles_labels()
    if len(ax1labels) > 0:
        ax1.legend(ax1handles, ax1labels)
    ax2handles, ax2labels = ax2.get_legend_handles_labels()
    if len(ax2labels) > 0:
        ax2.legend(ax2handles, ax2labels)
    fig.tight_layout()
    plt.subplots_adjust(top=0.9)

def plot_history(history, title):
    fig, ax1, ax2 = prepare_standardplot(title, 'epoch')
    ax1.plot(history.history['loss'], label = "training")
    ax2.plot(history.history['binary_accuracy'], label = "training")
    finalize_standardplot(fig, ax1, ax2)
    return fig

In [7]:
def aggregate_market_values(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
    
    data = data.groupby(data.Timestamp // aggregation_factor).agg({
        'Open' : 'first',
        'High' : np.max,
        'Low' : np.min,
        'Close' : 'last',
        'Volume_(BTC)' : np.sum ,
        'Volume_(Currency)' : np.sum,
        'Weighted_Price' : np.mean,
    }).reset_index()
    
    data.Timestamp *= aggregation_factor
    
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data

def first_in_window(dataframe, aggregation_period, unix_time=False):
    """
    timestamp / open / high / low / close / volume btc / volume currency / weighted price
    """    
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE
    
    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
            
    data = data.groupby(data.Timestamp // aggregation_factor).first().reset_index(drop=True)
            
    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    
    return data
    

In [8]:
df_raw_part1 = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2018-01-08.csv')
df_raw_part2 = pd.read_csv('Data/bitstampUSD_30-min_data_january.csv', date_parser=True)

# Aggregate first part of data into chunks of 30 mins, second part already aggregated
df_p1 = aggregate_market_values(df_raw_part1, 30, unix_time=True)
df_p1.Timestamp = pd.to_datetime(df_p1.Timestamp, unit='s')

df_p2 = df_raw_part2
df_p2.Timestamp = pd.to_datetime(df_p2.Timestamp)

df_raw = pd.concat([df_p1, df_p2]).reset_index(drop=True)

display(df_raw[105370:105372])
display(df_raw.tail())
print(df_raw.dtypes)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
105370,16166.78,16176.96,16102.05,16173.98,2018-01-08 00:00:00,25.633791,413278.4,16122.40664
105371,15956.66,16300.0,15954.16,16293.99,2018-01-08 00:30:00,221.65,3577715.0,16140.98


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
108615,8453.2,8505.0,8435.94,8503.41,2018-03-16 14:30:00,250.99,2126237.5,8471.47
108616,8511.47,8527.24,8450.1,8452.99,2018-03-16 15:00:00,321.19,2727121.08,8490.73
108617,8554.04,8613.06,8482.47,8510.0,2018-03-16 15:30:00,563.11,4813543.76,8548.16
108618,8541.17,8572.28,8472.24,8554.42,2018-03-16 16:00:00,305.72,2605758.19,8523.24
108619,8541.19,8553.83,8522.02,8547.41,2018-03-16 16:30:00,30.78,262893.41,8539.96


Close                       float64
High                        float64
Low                         float64
Open                        float64
Timestamp            datetime64[ns]
Volume_(BTC)                float64
Volume_(Currency)           float64
Weighted_Price              float64
dtype: object


In [9]:
df_plot = df_raw.copy()

aggregation_factor = 24 * 60 #24h candles

df_plot = aggregate_market_values(df_plot, 12 * 60)

inc = df_plot.Close >= df_plot.Open
dec = df_plot.Open > df_plot.Close
barWidth = 0.66 * aggregation_factor * 60 * 1000 # 30 minutes in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.segment(df_plot.Timestamp, df_plot.High, df_plot.Timestamp, df_plot.Low, color="black")
p.vbar(df_plot.Timestamp[inc], barWidth, df_plot.Open[inc], df_plot.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_plot.Timestamp[dec], barWidth, df_plot.Open[dec], df_plot.Close[dec], fill_color="#FF2828", line_color="black")

show(p)

In [189]:
def add_MA_n_days_age(num_days):
    num_days_str = str(num_days)
    
    df[['Open_W_MA_'+num_days_str,'High_W_MA_'+num_days_str,'Low_W_MA_'+num_days_str,'Close_W_MA_'+num_days_str]] = df[['Open_W','High_W','Low_W','Close_W']].rolling(window=day_window * num_days).mean()
    df[['Open_MA_'+num_days_str,'High_MA_'+num_days_str,'Low_MA_'+num_days_str,'Close_MA_'+num_days_str]] = df[['Open','High','Low','Close']].rolling(window=day_window * num_days).mean()
    df[['Open_EMA_'+num_days_str,'High_EMA_'+num_days_str,'Low_EMA_'+num_days_str,'Close_EMA_'+num_days_str]] = df[['Open','High','Low','Close']].ewm(span=day_window * num_days).mean()

    df[['Open_MAX_'+num_days_str,'High_MAX_'+num_days_str,'Low_MAX_'+num_days_str,'Close_MAX_'+num_days_str]] = df[['Open','High','Low','Close']].rolling(window=day_window * num_days).max()
    df[['Open_MIN_'+num_days_str,'High_MIN_'+num_days_str,'Low_MIN_'+num_days_str,'Close_MIN_'+num_days_str]] = df[['Open','High','Low','Close']].rolling(window=day_window * num_days).min()

    df['Open_TENKAN_'+num_days_str] = 0.5 * (df['Open_MIN_'+num_days_str] + df['Open_MAX_'+num_days_str])
    df['High_TENKAN_'+num_days_str] = 0.5 * (df['High_MIN_'+num_days_str] + df['High_MAX_'+num_days_str])
    df['Low_TENKAN_'+num_days_str] = 0.5 * (df['Low_MIN_'+num_days_str] + df['Low_MAX_'+num_days_str])
    df['Close_TENKAN_'+num_days_str] = 0.5 * (df['Close_MIN_'+num_days_str] + df['Close_MAX_'+num_days_str])

def add_prices_n_periods_ago(data, periods, step=DAY_WINDOW):
    return data['Smoothed_Price'].copy().shift(step * periods)

def dummy_increased_since(data, periods, step=DAY_WINDOW):
    return 1 * (data['Smoothed_Price'] > data['Smoothed_Price'].shift(int(step * periods)))

def mean_square_loss(predicted_labels, true_labels):
    assert len(predicted_labels) == len(true_labels)
    return np.mean((predicted_labels - true_labels)**2)

def accuracy(x,y):
    return np.mean(x==y)

In [326]:
SMOOTHING_PERIOD = int(0.25 * 2 * 24) # 3h

df_processed = df_raw.copy()
df_processed['Smoothed_Price'] = (0.5 * (df_processed.Close + df_processed.Open)).shift(-int(SMOOTHING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean()
df_processed['Inc_1_day'] = dummy_increased_since(df_processed, 1)
df_processed['Inc_12_hours'] = dummy_increased_since(df_processed, 0.5)
df_processed['Inc_6_hour'] = dummy_increased_since(df_processed, 0.25)
df_processed['Inc_3_hour'] = dummy_increased_since(df_processed, 0.125)
df_processed = df_processed.dropna(how='any')

display(df_processed.head(1))

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,4.39,4.39,4.39,4.39,2011-12-31 13:00:00,13.667426,60.000001,4.39,4.39,0,0,0,0


In [327]:
last_n_points = 10000

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_processed.Timestamp[-last_n_points:], df_processed.Open[-last_n_points:], line_alpha=0.6, line_color="black")
p.line(df_processed.Timestamp[-last_n_points:], df_processed.Smoothed_Price[-last_n_points:], line_alpha=1, line_color="red", legend=("Rolling avg. "+str(SMOOTHING_PERIOD)))

display(df_processed.head(1))
show(p)

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_processed.Timestamp[-last_n_points:], df_processed.Inc_1_day[-last_n_points:], line_alpha=1, line_color="#0000FF", legend="increase since 24h")
p.line(df_processed.Timestamp[-last_n_points:], df_processed.Inc_12_hours[-last_n_points:], line_alpha=1, line_color="#FF0000", legend="increase since 12h")

display(df_processed.head(1))
show(p)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,4.39,4.39,4.39,4.39,2011-12-31 13:00:00,13.667426,60.000001,4.39,4.39,0,0,0,0


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,4.39,4.39,4.39,4.39,2011-12-31 13:00:00,13.667426,60.000001,4.39,4.39,0,0,0,0


# Objective 

We aim to predict price changes across intervals of 24 hours. More specifically, at the end of each day, the model should predict the price of Bitcoin in the following 24 hours.

In [328]:
display(df_processed.head())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,4.39,4.39,4.39,4.39,2011-12-31 13:00:00,13.667426,60.000001,4.39,4.39,0,0,0,0
12,4.39,4.39,4.39,4.39,2011-12-31 13:30:00,13.667426,60.000001,4.39,4.3975,0,0,0,0
13,4.39,4.39,4.39,4.39,2011-12-31 14:00:00,13.667426,60.000001,4.39,4.413333,0,0,0,0
14,4.39,4.39,4.39,4.39,2011-12-31 14:30:00,13.667426,60.000001,4.39,4.429167,0,0,0,0
15,4.39,4.39,4.39,4.39,2011-12-31 15:00:00,13.667426,60.000001,4.39,4.445,0,0,0,0


## Simple model

To assess the performance of our model, we devise a simple naïve model as a benchmark. Our simple model looks at the price change in the last 24H and assume that this change will repeat in the next 24H. More specifically, let $P_p, P_c, P_f$ be respectively the previous, current and future price of Bitcoin (in intervals of 24 hours). We have:

$$ P_f = P_c + (P_c - P_p) $$

In [329]:
def naive_predictor(previous, current):
    return current.Smoothed_Price + ((current.Smoothed_Price - previous.Smoothed_Price) / previous.Smoothed_Price) * current.Smoothed_Price

def naive_predict_absolute_price(data):  
    P_c = data.Smoothed_Price
    P_p = data.Smoothed_Price.shift(DAY_WINDOW)
    
    return pd.concat([data.Timestamp, 2 * P_c +  - P_p], axis=1)

def naive_predict_price_diff(data):  
    ret = data.copy()
    P_c = ret.Smoothed_Price
    P_p = ret.Smoothed_Price.shift(DAY_WINDOW)
    
    ret['Simple_Predicted_Price'] = P_c - P_p
    return ret

def create_labels(data, step_size=1):
    return data.Smoothed_Price.shift(-step_size) - data.Smoothed_Price

In [330]:
df.describe()

Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour,Label
count,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0,108564.0
mean,5.36672e-15,1.309875e-14,-6.743182e-15,2.940158e-15,2.550005e-15,-7.994866e-16,-1.188157e-15,-2.492657e-15,1.908688e-14,-2.199695e-14,9.18844e-15,-3.526781e-14,3.629034
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,203.040397
min,-0.4480159,-0.4473844,-0.4496722,-0.4480012,-0.5607027,-0.2975694,-0.4480337,-0.4480316,-1.122307,-1.091871,-1.07289,-1.053388,-3580.818333
25%,-0.4071828,-0.4066538,-0.4078257,-0.4071662,-0.4278157,-0.2881451,-0.4071836,-0.4071835,-1.122307,-1.091871,-1.07289,-1.053388,-2.964375
50%,-0.3077693,-0.3075345,-0.3081612,-0.3077224,-0.2709886,-0.2602778,-0.3077523,-0.30779,0.8910136,0.9158506,0.9320538,0.9493091,0.116667
75%,-0.1935552,-0.193934,-0.1936062,-0.1935968,0.052188,-0.1614237,-0.1937398,-0.193705,0.8910136,0.9158506,0.9320538,0.9493091,5.782708
max,6.619447,6.592501,6.644871,6.62911,78.77071,37.67849,6.62915,6.588208,0.8910136,0.9158506,0.9320538,0.9493091,3007.045833


In [331]:
def split_dataset(data, fraction):
    l = len(data)
    
    train = data[0 : int(l * fraction)]
    test = data[ int(l*fraction) :]
    
    return train, test

NORMALIZE = True
ADD_PRICE_N_AGO = False
ADD_MA_N_AGO = False
LOG = False

df = df_processed.copy()
df['Label'] = create_labels(df, step_size = DAY_WINDOW)
df = df.dropna()

if LOG:
    df.Smoothed_Price = df.Smoothed_Price.apply(lambda x: np.log(x))
if ADD_PRICE_N_AGO:
    for i in range(1,31):
        add_prices_n_periods_ago(i, step=4)
if ADD_MA_N_AGO:
    add_MA_n_days_age(5)
    add_MA_n_days_age(10)
    add_MA_n_days_age(15)

if NORMALIZE:
    timestamps = df.Timestamp
    labels = df.Label
    df = df.drop('Timestamp',1)
    df = (df-df.mean())/df.std()
    df.Label = labels
    df['Timestamp'] = timestamps
    print("Standardized input data")
    display(df.describe())


train, test = split_dataset(df, 0.9)
train_x = train.drop('Label',1)
train_y = train['Label']
train_bin = train_y > 0
test_x = test.drop('Label',1)
test_y = test['Label']
test_bin = test_y > 0


display(train.head())
display(test.head())

Standardized input data


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour,Label
count,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0,108555.0
mean,3.530909e-16,1.1818650000000001e-17,-3.968588e-15,2.321893e-15,1.801488e-15,1.600488e-15,3.279903e-15,3.844134e-15,3.120703e-14,1.242607e-15,-2.801624e-14,-1.977519e-14,3.623895
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,198.015037
min,-0.4479942,-0.4473628,-0.4496507,-0.4479797,-0.5607091,-0.2975446,-0.4480121,-0.4480273,-1.123188,-1.104688,-1.087581,-1.072469,-3130.014583
25%,-0.4071527,-0.4066291,-0.4077929,-0.4071298,-0.4278113,-0.2881176,-0.4071571,-0.407186,-1.123188,-1.104688,-1.087581,-1.072469,-2.885417
50%,-0.3077368,-0.3075018,-0.3081269,-0.3076897,-0.2709928,-0.2602464,-0.3077153,-0.3077293,0.8903148,0.9052245,0.9194631,0.9324194,0.113333
75%,-0.1935295,-0.1939015,-0.1935722,-0.1935749,0.05217809,-0.1613866,-0.1937086,-0.1939213,0.8903148,0.9052245,0.9194631,0.9324194,5.695833
max,6.619833,6.5929,6.645265,6.629518,78.76845,37.68289,6.629543,6.568652,0.8903148,0.9052245,0.9194631,0.9324194,2888.075833


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour,Label,Timestamp
11,-0.447904,-0.447273,-0.4486,-0.447889,-0.537882,-0.2975,-0.447922,-0.447963,-1.123188,-1.104688,-1.087581,-1.072469,0.244167,2011-12-31 13:00:00
12,-0.447904,-0.447273,-0.4486,-0.447889,-0.537882,-0.2975,-0.447922,-0.44796,-1.123188,-1.104688,-1.087581,-1.072469,0.258333,2011-12-31 13:30:00
13,-0.447904,-0.447273,-0.4486,-0.447889,-0.537882,-0.2975,-0.447922,-0.447955,-1.123188,-1.104688,-1.087581,-1.072469,0.264167,2011-12-31 14:00:00
14,-0.447904,-0.447273,-0.4486,-0.447889,-0.537882,-0.2975,-0.447922,-0.447949,-1.123188,-1.104688,-1.087581,-1.072469,0.27,2011-12-31 14:30:00
15,-0.447904,-0.447273,-0.4486,-0.447889,-0.537882,-0.2975,-0.447922,-0.447943,-1.123188,-1.104688,-1.087581,-1.072469,0.275833,2011-12-31 15:00:00


Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour,Label,Timestamp
97710,0.545991,0.539423,0.534749,0.527702,-0.179167,0.169657,0.535476,0.541392,0.890315,-1.104688,-1.087581,-1.072469,-20.545417,2017-08-01 10:00:00
97711,0.545396,0.540494,0.539827,0.546115,-0.318699,0.000233,0.539978,0.538308,-1.123188,-1.104688,-1.087581,-1.072469,-14.284583,2017-08-01 10:30:00
97712,0.541973,0.548371,0.544555,0.544186,-0.180804,0.173228,0.546958,0.535697,-1.123188,-1.104688,-1.087581,-1.072469,-9.68625,2017-08-01 11:00:00
97713,0.544148,0.540376,0.538372,0.541913,-0.16953,0.183767,0.540942,0.534621,-1.123188,-1.104688,-1.087581,-1.072469,-9.284167,2017-08-01 11:30:00
97714,0.539207,0.537841,0.525748,0.544189,0.149945,0.568372,0.534812,0.534569,-1.123188,-1.104688,-1.087581,-1.072469,-11.357083,2017-08-01 12:00:00


## Simple model performance

In [332]:
predicted_labels = naive_predict_price_diff(df).loc[test_x.index]
predicted_labels['Binary'] = 1 * (predicted_labels.Simple_Predicted_Price > 0)
#display(predicted_labels[::2*24])

print(mean_square_loss(predicted_labels.Simple_Predicted_Price, test_y))
print(accuracy(predicted_labels.Binary, test_y > 0))

383101.34897899564
0.524042004422


In [333]:
last_n_points = -5000

points = predicted_labels

buy_points = points[predicted_labels.Binary == 1]
sell_points = points[predicted_labels.Binary == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SMOOTHING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::2*24], buy_points.Smoothed_Price[::2*24], color="#00ff00", legend="Buy periods")
p.scatter(sell_points.Timestamp[::2*24], sell_points.Smoothed_Price[::2*24], color="#ff0000", legend="Sell periods")
show(p)

## Neural network

In [334]:
display(df[1000:1010:])

Unnamed: 0,Close,High,Low,Open,Volume_(BTC),Volume_(Currency),Weighted_Price,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour,Label,Timestamp
1011,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.567917,2012-01-21 09:00:00
1012,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.60875,2012-01-21 09:30:00
1013,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.649583,2012-01-21 10:00:00
1014,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.67,2012-01-21 10:30:00
1015,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.67,2012-01-21 11:00:00
1016,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.67,2012-01-21 11:30:00
1017,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.680417,2012-01-21 12:00:00
1018,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.667917,2012-01-21 12:30:00
1019,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.622083,2012-01-21 13:00:00
1020,-0.446981,-0.446357,-0.447668,-0.446966,-0.485367,-0.29731,-0.446998,-0.44704,0.890315,-1.104688,-1.087581,-1.072469,-0.57625,2012-01-21 13:30:00


plusieurs y gains selon différents temps + vote

In [335]:
to_drop = ['Timestamp', 'Open', 'Close', 'High', 'Low', 'Volume_(BTC)', 'Weighted_Price', 'Volume_(Currency)']
train_x_nn = train_x.copy().drop(to_drop,1)
test_x_nn = test_x.copy().drop(to_drop,1)

train_y_nn = (train_y > 0).astype(int)
test_y_nn = (test_y > 0).astype(int)


print("% of 1 in train: " + str(np.sum(train_y_nn == 1) / len(train_y)))
print("% of 1 in test: " + str(np.sum(test_y_nn == 1) / len(test_y)))
print(test_y_nn.dtypes)
display(train_x_nn.head())
display(train_y_nn.head())

% of 1 in train: 0.558214516013
% of 1 in test: 0.557479734709
int32


Unnamed: 0,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,-0.447963,-1.123188,-1.104688,-1.087581,-1.072469
12,-0.44796,-1.123188,-1.104688,-1.087581,-1.072469
13,-0.447955,-1.123188,-1.104688,-1.087581,-1.072469
14,-0.447949,-1.123188,-1.104688,-1.087581,-1.072469
15,-0.447943,-1.123188,-1.104688,-1.087581,-1.072469


11    1
12    1
13    1
14    1
15    1
Name: Label, dtype: int32

In [336]:
#train_x_nn['Label'] = train_y_nn
#test_x_nn['Label'] = test_y_nn

display(train_x_nn.head())
display(train_y_nn.head())

Unnamed: 0,Smoothed_Price,Inc_1_day,Inc_12_hours,Inc_6_hour,Inc_3_hour
11,-0.447963,-1.123188,-1.104688,-1.087581,-1.072469
12,-0.44796,-1.123188,-1.104688,-1.087581,-1.072469
13,-0.447955,-1.123188,-1.104688,-1.087581,-1.072469
14,-0.447949,-1.123188,-1.104688,-1.087581,-1.072469
15,-0.447943,-1.123188,-1.104688,-1.087581,-1.072469


11    1
12    1
13    1
14    1
15    1
Name: Label, dtype: int32

In [337]:
model = Sequential()
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, kernel_initializer='normal', input_dim=len(train_x_nn.columns), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

opt = keras.optimizers.SGD(lr=0.01, momentum=0.01, decay=0.0, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
#model.add(Dense(1, kernel_initializer='normal'))
#model.compile(loss='mse', optimizer='adam', metrics=['mae'])


history = model.fit(train_x_nn, train_y_nn, epochs = 5, verbose=1, validation_data = (test_x_nn, test_y_nn))
scores = model.evaluate(test_x_nn, test_y_nn, verbose=0)

print(scores)

Train on 97699 samples, validate on 10856 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[0.67759829447738551, 0.57332350773765661]


In [338]:
x = (model.predict_classes(test_x_nn)).squeeze()
print(x)
print(accuracy(x, test_bin))
print(np.sum(x==1))
print(np.sum(x==0))
print(np.sum(x==1)/np.sum(x==0))

[0 0 0 ..., 1 1 1]
0.573323507738
6658
4198
1.58599333016


In [339]:
last_n_points = -5000

points = test_x

buy_points = points[x == 1]
sell_points = points[x == 0]

p = figure(x_axis_type="datetime",  plot_width=990, title = "MSFT Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:], line_color='black', line_width=2, line_alpha=0.6)
p.line(points.Timestamp[last_n_points * 2 :], points.Close[last_n_points * 2:].shift(-int(SMOOTHING_PERIOD/2)).rolling(SMOOTHING_PERIOD).mean(), line_color='black', line_width=2, line_alpha=1)
p.scatter(buy_points.Timestamp[::2*24], buy_points.Smoothed_Price[::2*24], color="#00ff00")
p.scatter(sell_points.Timestamp[::2*24], sell_points.Smoothed_Price[::2*24], color="#ff0000")
show(p)

## Investing using the model
We use the model to apply the following simple investment strategy. At the end of each day, we take either a long or short position for $100. No matter what happens, the position is liquidated after 24H. Of course, the choice of the position is dependent on the price increase or decrease of the model

In [285]:
daily_amount = 100 #dollars

def compute_investment_return(prices, decisions):
    prices_in_24h = prices.shift(-DAY_WINDOW)
    return np.sum( ((prices_in_24h - prices)/prices) * daily_amount * decisions)

### The anarchist
The anarchist decide whether to invest or not based on a (bit)coin flip:

In [286]:
sum = 0
prices = df_raw.iloc[test2.index].Close
l = len(prices)
for i in range(10000):
    choices = 2 * np.random.choice(2, l) - 1
    sum += compute_investment_return(prices, choices)

print(sum / 10000.0)

NameError: name 'test2' is not defined

### Simple model

In [None]:
#true_labels = pd.DataFrame(create_labels(df_24h).Label)
predicted_labels = naive_predict( df_raw.iloc[test2.index].copy())
predicted_labels.columns = ['Timestamp', 'Price_Prediction']

prices = df_raw.iloc[test2.index].Close

join = pd.concat([predicted_labels, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.Close) - 1 # Long: 1 Short: -1
display(join)

decisions_simple = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

### Neural net

In [None]:
prediction = pd.DataFrame(np.exp(model.predict(test_x2)).squeeze())
prediction.columns = ['Price_Prediction']
prediction['previous'] = prediction.shift(DAY_WINDOW)

prices = df_raw.iloc[test2.index][['Timestamp', 'Close']].reset_index(drop=True)

join = pd.concat([prediction, prices], axis=1).dropna()
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.previous) - 1 # Long: 1 Short: -1
display(join)

decisions_nn = join.copy().Decision

print(len(join))

print(compute_investment_return(join.Close, join.Decision))

In [None]:
print(np.sum(decisions_simple.values == decisions_nn[1:].values))

In [None]:
from keras.models import load_model
 
    

with tf.device('/cpu:0'):

    model = load_model('my_model.h5')
    scores = model.evaluate(test_x, test_y, verbose=0)
    print(scores)
    
    del model
