In [1]:
import pandas as pd
from IPython.display import display, HTML
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
import matplotlib.pyplot as plt
import numpy as np
import datetime

np.random.seed(42)

Using TensorFlow backend.


In [2]:
from math import pi
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file

output_notebook()

In [3]:
# Calendar constants and the candle size shared by the rest of the notebook.
HOURS_IN_DAY = 24
MINUTES_IN_HOUR = 60
SECONDS_IN_MINUTE = 60
AGGREGATION_PERIOD = 30  # candle length in minutes (the model uses 30-minute candles)

# Number of candles that make up one day.
DAY_WINDOW = (HOURS_IN_DAY * MINUTES_IN_HOUR) // AGGREGATION_PERIOD

In [4]:
# Force CPU usage
import tensorflow as tf
from keras import backend as K

num_cores = 8

# Session configuration: both thread pools are sized to `num_cores` and the
# device counts are set to one CPU and zero GPUs.
config = tf.ConfigProto(
    intra_op_parallelism_threads=num_cores,
    inter_op_parallelism_threads=num_cores,
    allow_soft_placement=True,
    device_count={'CPU': 1, 'GPU': 0},
)

# Make Keras use the session built from that configuration.
session = tf.Session(config=config)
K.set_session(session)

In [5]:
# For plot

def prepare_standardplot(title, xlabel):
    """
    Create a figure with two side-by-side axes for training curves.

    The left axis is labelled as the loss and uses a logarithmic y scale,
    the right axis is labelled as the accuracy. Both share `xlabel`.

    Returns the tuple (fig, ax1, ax2).
    """
    fig, axes = plt.subplots(1, 2)
    ax1, ax2 = axes
    fig.suptitle(title)
    ax1.set_yscale('log')
    for axis, ylabel in ((ax1, 'categorical cross entropy'),
                         (ax2, 'accuracy [% correct]')):
        axis.set_ylabel(ylabel)
        axis.set_xlabel(xlabel)
    return fig, ax1, ax2

def finalize_standardplot(fig, ax1, ax2):
    """
    Add a legend to each axis that has labelled artists and tidy the layout.

    An axis without any labelled artist gets no legend.
    """
    for axis in (ax1, ax2):
        handles, labels = axis.get_legend_handles_labels()
        if labels:
            axis.legend(handles, labels)
    fig.tight_layout()
    # Presumably leaves room at the top for the figure-level title.
    plt.subplots_adjust(top=0.9)

def plot_history(history, title):
    """
    Plot the training curves recorded in `history.history`.

    Parameters
    ----------
    history : object with a `history` dict (as returned by `model.fit`)
        Must contain the key 'loss'. The key 'binary_accuracy' is optional.
    title : str
        Figure title.

    Returns
    -------
    The matplotlib figure.

    Notes
    -----
    'binary_accuracy' is only recorded when the model was compiled with that
    metric. When it is absent the accuracy panel is left empty instead of
    raising KeyError.
    """
    fig, ax1, ax2 = prepare_standardplot(title, 'epoch')
    ax1.plot(history.history['loss'], label = "training")

    accuracy = history.history.get('binary_accuracy')
    if accuracy is not None:
        ax2.plot(accuracy, label = "training")

    finalize_standardplot(fig, ax1, ax2)
    return fig

In [6]:
def aggregate_market_values(dataframe, aggregation_period, unix_time=False):
    """
    Aggregate market rows into candles of `aggregation_period` minutes.

    Expected columns:
    Timestamp / Open / High / Low / Close / Volume_(BTC) / Volume_(Currency) / Weighted_Price

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input rows; it is copied, never modified.
    aggregation_period : int
        Candle length in minutes.
    unix_time : bool
        True if `Timestamp` already holds seconds since the epoch. Otherwise
        `Timestamp` is cast to int64 and divided by 10**9 (assumes
        nanosecond datetimes -- TODO confirm) and converted back to datetime
        on return.

    Returns
    -------
    pandas.DataFrame with one row per candle. `Timestamp` is the start of the
    candle: first Open, max High, min Low, last Close, summed volumes and the
    mean Weighted_Price.
    """
    data = dataframe.copy()
    aggregation_factor = aggregation_period * SECONDS_IN_MINUTE

    if not unix_time:
        data.Timestamp = data.Timestamp.astype(np.int64) // 10**9

    # String aggregator names are used instead of np.max / np.sum / ...:
    # pandas maps those callables to the same built-in reductions but warns
    # that this aliasing is deprecated.
    data = data.groupby(data.Timestamp // aggregation_factor).agg({
        'Open' : 'first',
        'High' : 'max',
        'Low' : 'min',
        'Close' : 'last',
        'Volume_(BTC)' : 'sum',
        'Volume_(Currency)' : 'sum',
        'Weighted_Price' : 'mean',
    }).reset_index()

    # The group key is the candle number; scale it back to seconds.
    data.Timestamp *= aggregation_factor

    if not unix_time:
        data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')

    return data

def first_in_window(dataframe, aggregation_period, unix_time=False):
    """
    Keep one row per window of `aggregation_period` minutes.

    Rows are grouped by `Timestamp // (aggregation_period * 60)` and reduced
    with `groupby(...).first()`. The input frame is copied, not modified.
    When `unix_time` is False, `Timestamp` is converted to integer seconds
    for the grouping and back to datetime on return.
    """
    window_seconds = aggregation_period * SECONDS_IN_MINUTE
    data = dataframe.copy()

    if unix_time:
        # Timestamps are already in seconds: group and return directly.
        return data.groupby(data.Timestamp // window_seconds).first().reset_index(drop=True)

    data.Timestamp = data.Timestamp.astype(np.int64) // 10**9
    data = data.groupby(data.Timestamp // window_seconds).first().reset_index(drop=True)
    data.Timestamp = pd.to_datetime(data.Timestamp, unit='s')
    return data
    

In [7]:
df_raw_part1 = pd.read_csv('Data/bitstampUSD_1-min_data_2012-01-01_to_2018-01-08.csv')
# `date_parser` expects a callable, so `date_parser=True` was not a valid
# argument; the Timestamp column is converted explicitly below instead.
df_raw_part2 = pd.read_csv('Data/bitstampUSD_30-min_data_january.csv')

# Aggregate first part of data into chunks of 30 mins, second part already aggregated
df_p1 = aggregate_market_values(df_raw_part1, AGGREGATION_PERIOD, unix_time=True)
df_p1.Timestamp = pd.to_datetime(df_p1.Timestamp, unit='s')

df_p2 = df_raw_part2
df_p2.Timestamp = pd.to_datetime(df_p2.Timestamp)

df_raw = pd.concat([df_p1, df_p2]).reset_index(drop=True)

# Rows around the seam between the two files, then the most recent rows.
display(df_raw[105370:105372])
display(df_raw.tail())
print(df_raw.dtypes)

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
105370,16166.78,16176.96,16102.05,16173.98,2018-01-08 00:00:00,25.633791,413278.4,16122.40664
105371,15956.66,16300.0,15954.16,16293.99,2018-01-08 00:30:00,221.65,3577715.0,16140.98


Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price
108615,8453.2,8505.0,8435.94,8503.41,2018-03-16 14:30:00,250.99,2126237.5,8471.47
108616,8511.47,8527.24,8450.1,8452.99,2018-03-16 15:00:00,321.19,2727121.08,8490.73
108617,8554.04,8613.06,8482.47,8510.0,2018-03-16 15:30:00,563.11,4813543.76,8548.16
108618,8541.17,8572.28,8472.24,8554.42,2018-03-16 16:00:00,305.72,2605758.19,8523.24
108619,8541.19,8553.83,8522.02,8547.41,2018-03-16 16:30:00,30.78,262893.41,8539.96


Close                       float64
High                        float64
Low                         float64
Open                        float64
Timestamp            datetime64[ns]
Volume_(BTC)                float64
Volume_(Currency)           float64
Weighted_Price              float64
dtype: object


In [8]:
df_plot = df_raw.copy()

aggregation_factor = 24 * 60 # candle length in minutes (24h candles)

# Aggregate with the same period that sizes the bars below. The data used to
# be aggregated to 12h candles while the bar width assumed 24h candles.
df_plot = aggregate_market_values(df_plot, aggregation_factor)

inc = df_plot.Close >= df_plot.Open
dec = df_plot.Open > df_plot.Close
barWidth = 0.66 * aggregation_factor * 60 * 1000 # two thirds of one candle, in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "BTC/USD Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Wicks (high-low) first, then green bodies for rising candles and red
# bodies for falling ones.
p.segment(df_plot.Timestamp, df_plot.High, df_plot.Timestamp, df_plot.Low, color="black")
p.vbar(df_plot.Timestamp[inc], barWidth, df_plot.Open[inc], df_plot.Close[inc], fill_color="#48D922", line_color="black")
p.vbar(df_plot.Timestamp[dec], barWidth, df_plot.Open[dec], df_plot.Close[dec], fill_color="#FF2828", line_color="black")

show(p)

# Objective 

We aim to predict price changes across intervals of 24 hours. More specifically, at the end of each day, the model should predict the price of Bitcoin 24 hours later.

In [42]:
def add_MA_n_days_age(num_days):
    """
    Add rolling statistics over the last `num_days` days to the global `df`.

    The window is `DAY_WINDOW * num_days` rows. For each of Open/High/Low/Close
    the following columns are added, suffixed with `num_days`:

    * `<P>_W_MA_<n>`   rolling mean of the `<P>_W` column
    * `<P>_MA_<n>`     rolling mean
    * `<P>_EMA_<n>`    exponentially weighted mean (span = window)
    * `<P>_MAX_<n>`, `<P>_MIN_<n>`  rolling maximum / minimum
    * `<P>_TENKAN_<n>` midpoint of the rolling minimum and maximum

    Modifies the module-level `df` in place and returns None. `df` must
    already contain the `<P>_W` columns.
    """
    num_days_str = str(num_days)
    # The window used to be computed from an undefined lowercase `day_window`;
    # the notebook-level constant is DAY_WINDOW.
    window = DAY_WINDOW * num_days
    prices = ['Open', 'High', 'Low', 'Close']
    weighted_prices = [price + '_W' for price in prices]

    def columns(tag):
        # e.g. columns('MA') -> ['Open_MA_5', 'High_MA_5', 'Low_MA_5', 'Close_MA_5']
        return [price + '_' + tag + '_' + num_days_str for price in prices]

    df[columns('W_MA')] = df[weighted_prices].rolling(window=window).mean()
    df[columns('MA')] = df[prices].rolling(window=window).mean()
    df[columns('EMA')] = df[prices].ewm(span=window).mean()

    df[columns('MAX')] = df[prices].rolling(window=window).max()
    df[columns('MIN')] = df[prices].rolling(window=window).min()

    for price in prices:
        low = df[price + '_MIN_' + num_days_str]
        high = df[price + '_MAX_' + num_days_str]
        df[price + '_TENKAN_' + num_days_str] = 0.5 * (low + high)

def add_prices_n_days_ago(num_days):
    """
    Add the Open/High/Low/Close values from `num_days` days earlier to the global `df`.

    New columns are named `<P>_S_<num_days>` and hold the price shifted by
    `DAY_WINDOW * num_days` rows, so the first rows are NaN. Modifies the
    module-level `df` in place and returns None.
    """
    num_days_str = str(num_days)
    prices = ['Open', 'High', 'Low', 'Close']
    shifted_names = [price + '_S_' + num_days_str for price in prices]
    # The shift used to reference an undefined lowercase `day_window`;
    # the notebook-level constant is DAY_WINDOW.
    df[shifted_names] = df[prices].shift(DAY_WINDOW * num_days)

def create_weak_labels(data, threshold):
    """
    Return a copy of `data` with an integer `Weak_Label` column.

    The label is 1 when the Close price DAY_WINDOW rows later is strictly
    higher than the current Close, and 0 otherwise. Rows without a future
    Close (the last DAY_WINDOW rows) also get 0, because a comparison with
    NaN is False. `threshold` is accepted but not used.
    """
    labelled = data.copy()
    close_now = labelled['Close']
    close_later = close_now.shift(-DAY_WINDOW)

    labelled['Weak_Label'] = (close_later > close_now) * 1

    return labelled
                      
def create_strong_sell_labels(data, threshold):
    """
    Return a copy of `data` with an integer `Strong_Sell_Label` column.

    The label is 1 when the relative change of Close over the next DAY_WINDOW
    rows is below `-threshold`, and 0 otherwise. `threshold` is a fraction
    (0.1 means 10%). Rows without a future Close get 0.
    """
    labelled = data.copy()
    close_now = labelled['Close']
    close_later = close_now.shift(-DAY_WINDOW)
    relative_change = (close_later - close_now) / close_now
    labelled['Strong_Sell_Label'] = (relative_change < -threshold) * 1

    return labelled

def create_strong_buy_labels(data, threshold):
    """
    Return a copy of `data` with an integer `Strong_Buy_Label` column.

    The label is 1 when the relative change of Close over the next DAY_WINDOW
    rows is above `threshold`, and 0 otherwise. `threshold` is a fraction
    (0.1 means 10%). Rows without a future Close get 0.
    """
    labelled = data.copy()
    close_now = labelled['Close']
    close_later = close_now.shift(-DAY_WINDOW)
    relative_change = (close_later - close_now) / close_now
    labelled['Strong_Buy_Label'] = (relative_change > threshold) * 1

    return labelled

def mean_square_loss(predicted_labels, true_labels):
    """
    Return the mean of the squared element-wise differences.

    Both inputs must have the same length and support element-wise
    subtraction (e.g. NumPy arrays or pandas Series).
    """
    assert len(predicted_labels) == len(true_labels)
    residuals = predicted_labels - true_labels
    return np.mean(residuals ** 2)
    

In [50]:
t = 0.1  # relative price-move threshold handed to the labelling helpers

# Apply the three labelling helpers in turn; each one returns a copy of its
# input with one extra label column.
tmp = (
    df_raw
    .pipe(create_weak_labels, t)
    .pipe(create_strong_buy_labels, t)
    .pipe(create_strong_sell_labels, t)
)
# Every 48th row of the first 240 rows, i.e. one row per day.
display(tmp.iloc[0:240:2 * 24])

print(tmp.describe())

Unnamed: 0,Close,High,Low,Open,Timestamp,Volume_(BTC),Volume_(Currency),Weighted_Price,Weak_Label,Strong_Buy_Label,Strong_Sell_Label
0,4.39,4.39,4.39,4.39,2011-12-31 07:30:00,3.644647,16.0,4.39,1,0,0
48,4.58,4.58,4.58,4.58,2012-01-01 07:30:00,45.06,206.3748,4.58,1,0,0
96,5.0,5.0,5.0,5.0,2012-01-02 07:30:00,303.0,1515.0,5.0,0,0,0
144,5.0,5.0,5.0,5.0,2012-01-03 07:30:00,571.44,2857.2,5.0,0,0,0
192,4.93,4.93,4.93,4.93,2012-01-04 07:30:00,290.4,1431.672,4.93,1,1,0


               Close           High            Low           Open  \
count  108620.000000  108620.000000  108620.000000  108620.000000   
mean     1249.570211    1256.892187    1241.106251    1249.495719   
std      2776.113561    2796.364277    2752.981890    2776.037148   
min         4.140000       4.140000       1.500000       4.140000   
25%       117.397500     117.900000     116.627500     117.400000   
50%       393.355000     394.905000     391.010000     393.385000   
75%       710.092500     712.990000     706.762500     710.395000   
max     19600.010000   19666.000000   19508.480000   19626.300000   

        Volume_(BTC)  Volume_(Currency)  Weighted_Price     Weak_Label  \
count  108620.000000       1.086200e+05   108620.000000  108620.000000   
mean      335.024034       3.975405e+05     1249.414593       0.551989   
std       597.351920       1.333389e+06     2775.656316       0.497292   
min         0.027968       2.999990e-01        4.140000       0.000000   
25%     

## Simple model

To assess the performance of our model, we devise a simple naïve model as a benchmark. It looks at the price change over the last 24 hours and assumes that the same change will repeat over the next 24 hours. More specifically, let $P_p, P_c, P_f$ be respectively the previous, current and future price of Bitcoin (at intervals of 24 hours). We have:

$$ P_f = P_c + (P_c - P_p) $$

In [None]:
def naive_predictor(previous, current):
    """
    Extrapolate the next Close by repeating the last relative change.

    Both arguments only need a `Close` attribute. The result is
    `current.Close * (1 + r)` with `r` the relative change from
    `previous.Close` to `current.Close`.
    """
    relative_change = (current.Close - previous.Close) / previous.Close
    return current.Close + relative_change * current.Close

def naive_predict(data):
    """
    Naive benchmark: repeat the last DAY_WINDOW-row price change.

    Returns a two-column frame: `Timestamp` and the extrapolated price
    `2 * Close - Close.shift(DAY_WINDOW)`, which keeps the column name
    `Close`. The first DAY_WINDOW rows of the prediction are NaN.
    """
    current_close = data.Close
    previous_close = current_close.shift(DAY_WINDOW)
    extrapolated = 2 * current_close - previous_close

    return pd.concat([data.Timestamp, extrapolated], axis=1)
    

## Simple model performance

In [None]:
def create_labels(data):
    """
    Return a copy of `data` with a `Label` column: the Close price
    DAY_WINDOW rows later (NaN for the last DAY_WINDOW rows).

    NOTE(review): `create_labels` was called in this notebook but never
    defined. This definition is reconstructed from how `Label` is used (its
    log is compared with the log of a predicted price) -- confirm that it
    matches the original intent.
    """
    labelled = data.copy()
    labelled['Label'] = labelled['Close'].shift(-DAY_WINDOW)
    return labelled

true_labels = pd.DataFrame(create_labels(df_raw).Label)
predicted_labels = naive_predict(df_raw)

# Rows without a future price or without a prediction are dropped.
join = pd.concat([true_labels, predicted_labels], axis=1).dropna()

# Mean squared error between log future price and log predicted price.
print(mean_square_loss(np.log(join.Label), np.log(join.Close)))

In [None]:
last_n_points = -6000  # plot only the most recent 6000 candles

# The title used to read "MSFT Candlestick", but this is a line chart of
# Bitcoin prices.
p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=990, title = "BTC/USD log close (blue) vs naive prediction (red)")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(df_raw.Timestamp[last_n_points:], np.log(df_raw.Close[last_n_points:]), line_color='blue')
p.line(df_raw.Timestamp[last_n_points:], np.log(naive_predict(df_raw).Close[last_n_points:]), line_color='red')

show(p)

## Neural network

In [None]:
window_aggregation = 30  # candle length in minutes

df = aggregate_market_values(df_raw.copy(), window_aggregation)

# `axis` is passed by keyword: recent pandas no longer accepts it positionally.
df = df.drop('Timestamp', axis=1)
#df = df.drop('Weighted_Price', axis=1)


# Regression target: log of the `Label` column.
# NOTE(review): `create_labels` must already be defined when this cell runs.
labels = np.log(create_labels(df).Label)

In [None]:
# (rows, columns) of the aggregated frame.
df.shape

In [None]:
# Summary statistics of every numeric column.
df.describe()

In [None]:
# Volume-scaled prices: each OHLC value divided by the BTC volume of its candle.
df[['Open_W','High_W','Low_W','Close_W']] = df[['Open','High','Low','Close']].divide(df['Volume_(BTC)'], axis='index')

#df[['Open','High','Low','Close']] = df[['Open','High','Low','Close']].apply(lambda x: np.log(x))

# Lagged prices from 1, 2 and 3 days earlier.
for num_days in (1, 2, 3):
    add_prices_n_days_ago(num_days)

# Rolling statistics over 5, 10 and 15 days.
for num_days in (5, 10, 15):
    add_MA_n_days_age(num_days)

Idea: use several targets `y` (gains over different time horizons) and combine the predictions with a vote.

In [None]:
# Earlier experiment, kept for reference: a binary target built from the
# change of the mid price.
#mean = (df['Open'] + df['Close']) * 0.5
#res = mean.shift(day_window)
#df['y'] = ((mean - res) > 0).astype(int)

# Append the target as an extra column next to the features.
df = pd.concat((df, labels), axis='columns')

In [None]:
# Spot-check ten randomly chosen rows of the feature frame.
display(df.sample(10))

In [None]:
# Drop every row with a missing value.
df = df.dropna(how='any')

# Standardise each column to zero mean and unit variance, but keep the
# target on its original scale.
labels = df['Label']
df = (df - df.mean()) / df.std()
df['Label'] = labels

# Renumber the rows 0..n-1. `reset_index(drop=True)` replaces the former
# `reset_index()` + `drop('index', 1)` pair, whose positional `axis`
# argument recent pandas no longer accepts.
df = df.reset_index(drop=True)
display(df.describe())

In [None]:
# The first 80% of the rows are split block-wise into train / test1 below.
length_test2 = int(0.8 * len(df.index))
# Block length: 60 days of candles. This used to reference an undefined
# lowercase `day_window`; the notebook-level constant is DAY_WINDOW.
window = DAY_WINDOW * 30 * 2

index = df.index
index = index[0:length_test2]

# Consecutive blocks of `window` rows (the last block may be shorter).
indexList = [index[i:i + window] for i in range(0, len(index), window)]
trainIndex = []
testIndex = []

# Every ninth block (0, 9, 18, ...) goes to the test set, the rest to training.
for i, block in enumerate(indexList):
    if i % 9 != 0:
        trainIndex += block.tolist()
    else:
        testIndex += block.tolist()

# Sanity check: every row of the first 80% ended up in exactly one list.
print(len(trainIndex+testIndex) == len(index))
print(len(indexList))

In [None]:
train = df.iloc[trainIndex, :]
test1 = df.iloc[testIndex, :]
# Everything that is in neither `train` nor `test1`.
test2 = df.drop(train.index).drop(test1.index)

# `axis` is passed by keyword: recent pandas no longer accepts it positionally.
train_x = train.drop('Label', axis=1)
train_y = train['Label']
test_x1 = test1.drop('Label', axis=1)
test_y1 = test1['Label']
test_x2 = test2.drop('Label', axis=1)
test_y2 = test2['Label']

In [None]:
# Three hidden layers of 128 SELU units, each followed by 50% dropout, and
# a single linear output trained with mean squared error.
model = Sequential()
# The input width is read from the training matrix instead of the former
# hard-coded 95, so adding or removing a feature column does not break the
# model definition.
model.add(Dense(128, input_dim=train_x.shape[1], activation='selu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='selu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='selu'))
model.add(Dropout(0.5))
model.add(Dense(1))

model.compile(loss='mean_squared_error',
              optimizer='adam')

history = model.fit(train_x, train_y, epochs = 30, verbose=1)
scores1 = model.evaluate(test_x1, test_y1, verbose=0)
scores2 = model.evaluate(test_x2, test_y2, verbose=0)

model.save('my_model2.h5')

In [None]:
#plot_history(history,"test")

In [None]:
# Loss returned by `model.evaluate` on test1 and test2 respectively.
print(scores1)
print(scores2)

In [None]:
prediction = model.predict(test_x2)


last_n_points = -len(prediction)

# The x values are plain sample positions, so the default linear x axis is
# used; on a datetime axis they would be rendered as dates. The title used
# to read "MSFT Candlestick".
p = figure(tools=TOOLS, plot_width=990, title = "Model output (blue) vs log close of the last candles (red)")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

p.line(np.arange(len(prediction)), np.squeeze(prediction), line_color='blue')
p.line(np.arange(len(prediction)), np.log(df_raw.Close[last_n_points:]), line_color='red')

show(p)

In [None]:
correct = []
not_correct = []

prediction = model.predict(test_x2)

# A sample counts as correct when the rounded model output equals its label.
# Assumes one output value per sample, as the row-by-row version did.
hits = prediction.reshape(-1).round() == test_y2.values
open_values = test_x2['Open'].values
for i, hit in enumerate(hits):
    (correct if hit else not_correct).append([i, open_values[i]])

fig, ax = plt.subplots(figsize=(10, 10),dpi = 80)
# `a, b = zip(*[])` raises ValueError, so an empty group is skipped.
if correct:
    a,b = zip(*correct)
    ax.plot(a,b, '.r',markersize=0.2)
if not_correct:
    a,b = zip(*not_correct)
    ax.plot(a,b, '.b',markersize=0.2)

plt.show()

## Investing using the model
We use the model to apply the following simple investment strategy. At the end of each day, we take either a long or a short position for $100. No matter what happens, the position is liquidated after 24 hours. The choice of position depends on whether the model predicts a price increase or a decrease.

In [None]:
daily_amount = 100  # dollars staked on each position

def compute_investment_return(prices, decisions):
    """
    Total profit of taking one position of `daily_amount` dollars per row.

    Each position is closed DAY_WINDOW rows later, so the holding period is
    24 hours only when `prices` holds one row per 30-minute candle.
    `decisions` holds +1 (long) or -1 (short) per row.
    """
    exit_prices = prices.shift(-DAY_WINDOW)
    relative_move = (exit_prices - prices) / prices
    return np.sum(relative_move * daily_amount * decisions)

### The anarchist
The anarchist decides whether to go long or short based on a (bit)coin flip:

In [None]:
N_SIMULATIONS = 10000

# The accumulator used to be called `sum`, which shadowed the builtin for
# every later cell.
total_return = 0
# NOTE(review): `test2.index` holds row numbers of the cleaned, re-indexed
# feature frame; confirm that they address the intended rows of `df_raw`.
prices = df_raw.iloc[test2.index].Close
l = len(prices)
for _ in range(N_SIMULATIONS):
    # One random long (+1) / short (-1) decision per price row.
    choices = 2 * np.random.choice(2, l) - 1
    total_return += compute_investment_return(prices, choices)

# Average return over all simulated runs.
print(total_return / N_SIMULATIONS)

### Simple model

In [None]:
#true_labels = pd.DataFrame(create_labels(df_24h).Label)
# NOTE(review): `test2.index` holds row numbers of the cleaned, re-indexed
# feature frame; confirm that they address the intended rows of `df_raw`.
predicted_labels = naive_predict( df_raw.iloc[test2.index].copy())
predicted_labels.columns = ['Timestamp', 'Price_Prediction']

prices = df_raw.iloc[test2.index].Close

join = pd.concat([predicted_labels, prices], axis=1).dropna()
# Keep the first candle of every day: one decision per day.
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.Close) - 1 # Long: 1 Short: -1
display(join)

decisions_simple = join.copy().Decision

print(len(join))

# `join` now holds one row per day, so the price 24 hours later is the NEXT
# row. Shifting by DAY_WINDOW rows (as `compute_investment_return` does)
# would close each position 48 days later instead.
next_day_close = join.Close.shift(-1)
print(np.sum((next_day_close - join.Close) / join.Close * daily_amount * join.Decision))

### Neural net

In [None]:
# The model outputs a log price; convert it back to a price.
prediction = pd.DataFrame(np.exp(model.predict(test_x2)).squeeze())
prediction.columns = ['Price_Prediction']
# Prediction made DAY_WINDOW rows earlier.
prediction['previous'] = prediction['Price_Prediction'].shift(DAY_WINDOW)

# NOTE(review): `test2.index` holds row numbers of the cleaned, re-indexed
# feature frame; confirm that they address the intended rows of `df_raw`.
prices = df_raw.iloc[test2.index][['Timestamp', 'Close']].reset_index(drop=True)

join = pd.concat([prediction, prices], axis=1).dropna()
# Keep the first candle of every day: one decision per day.
join = first_in_window(join.copy(), 24*60)
join['Decision'] = 2 * (join.Price_Prediction >= join.previous) - 1 # Long: 1 Short: -1
display(join)

decisions_nn = join.copy().Decision

print(len(join))

# `join` now holds one row per day, so the price 24 hours later is the NEXT
# row. Shifting by DAY_WINDOW rows (as `compute_investment_return` does)
# would close each position 48 days later instead.
next_day_close = join.Close.shift(-1)
print(np.sum((next_day_close - join.Close) / join.Close * daily_amount * join.Decision))

In [None]:
# Number of positions on which the naive model and the neural net agree.
# NOTE(review): `decisions_nn[1:]` skips the first neural-net decision,
# presumably to line the two series up -- verify that both arrays have the
# same length.
print(np.sum(decisions_simple.values == decisions_nn[1:].values))

In [None]:
from keras.models import load_model



with tf.device('/cpu:0'):

    # NOTE(review): the training cell above saves 'my_model2.h5'; confirm
    # that 'my_model.h5' is the file meant here.
    model = load_model('my_model.h5')
    # `test_x` / `test_y` were never defined in this notebook; evaluate on
    # the two hold-out sets that exist (test1, then test2) instead.
    scores = [
        model.evaluate(features, targets, verbose=0)
        for features, targets in ((test_x1, test_y1), (test_x2, test_y2))
    ]
    print(scores)

    del model
