## Import Data and Libraries

In [1]:
from Functions import normalize_data, time_series_CV_split, wrapper_feature_selector, train_and_predict, warn
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler 
from time import time
from math import sqrt
from sklearn.linear_model import BayesianRidge
from matplotlib import pyplot
import plotly.offline as py
import plotly.graph_objs as go
import warnings
# Monkey-patch the stdlib: any later lookup of `warnings.warn` resolves to the
# `warn` callable imported from Functions. Presumably that callable is a no-op
# used to silence library warnings -- NOTE(review): confirm in Functions.py.
warnings.warn = warn
# Initialise plotly's offline notebook mode so the later `py.iplot` call can
# render inside the notebook. Per plotly's documentation, connected=True loads
# plotly.js from a CDN instead of embedding it -- TODO confirm for the
# installed plotly version.
py.init_notebook_mode(connected=True)
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Train Set
# Load the training CSV and discard the date / low / open / high / marketcap
# columns; every remaining column (including `close`) is kept.
train_data = pd.read_csv('bitcoin_train.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)
# `shape` reports (rows, columns) directly. Measuring the first row instead
# would build a throwaway Series and raise IndexError on an empty file.
print('n_features:', train_data.shape[1])
print('n_samples:', train_data.shape[0])
train_data.head()

n_features: 36
n_samples: 2735


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
2730,6093.67,0.031736,3279760000.0,6.0,1260.3,16.225,864.0,939.0,75.23,1.1672,...,1274.987261,675604827.0,61.446716,213583.5164,501623219.9,39627403.83,0.920948,5080000000000.0,415602,12092550.0
2731,6157.13,0.020789,3296220000.0,6.0,1254.6,16.21,860.0,946.0,77.41,1.1616,...,1338.383562,609799528.3,59.349571,201953.0378,363193880.4,36850961.53,1.067592,5080000000000.0,410397,11395190.0
2732,5903.44,0.049391,3467800000.0,6.0,1251.55,16.11,852.0,945.0,73.45,1.1583,...,1557.933884,795615808.3,50.289212,197368.2529,345979167.6,30540865.37,1.088043,5080000000000.0,397865,9288788.0
2733,6218.3,0.070443,3966230000.0,8.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1195.057325,703945479.4,65.127555,166798.5735,262900494.4,39627403.83,0.68943,5080000000000.0,396405,12052690.0
2734,6404.0,0.039642,4543860000.0,7.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1310.470199,656285943.9,57.609894,163181.8222,414797814.1,38112980.76,0.746513,5080000000000.0,453050,11236720.0


In [2]:
# Test Set
# Load the test CSV and discard the date / low / open / high / marketcap
# columns; every remaining column (including `close`) is kept.
test_data = pd.read_csv('bitcoin_test.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)
# `shape` reports (rows, columns) directly. Measuring the first row instead
# would build a throwaway Series and raise IndexError on an empty file.
print('n_features:', test_data.shape[1])
print('n_samples:', test_data.shape[0])
test_data.head()

n_features: 36
n_samples: 92


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
0,6385.82,0.022569,4788259840,62,1250.45,16.03,851.0,953,74.13,1.1658,...,1284.148936,493208885.1,58.735614,140545.4062,489427084.1,35588942.3,0.98734,5077500000000.0,368307,10494418.5
1,6614.18,0.058242,4396930048,72,1247.8,15.98,839.0,941,73.89,1.1639,...,1062.904762,991949405.1,75.393733,106682.7087,224327884.6,37103365.37,0.705291,5077500000000.0,341861,11673361.88
2,6529.59,0.034094,4672309760,69,1251.75,15.93,838.0,954,74.19,1.1665,...,1413.432624,720058629.1,57.309569,150160.0866,281178824.1,37594816.26,0.98755,5172890000000.0,433257,11271293.25
3,6597.55,0.048633,4176689920,65,1255.65,16.045,834.0,948,74.19,1.1665,...,1868.508621,872392592.0,45.224241,185064.5374,356469308.6,30929068.69,1.113817,5363680000000.0,456149,9617154.0
4,6639.14,0.030521,4999240192,69,1255.5,15.95,845.5,947,73.05,1.1709,...,1543.19403,665699858.5,54.318173,142640.2951,345023639.3,35728406.94,0.935353,5363680000000.0,435401,11089706.0


In [3]:
# Combined train and test sets
# Positional row offset into train_data where the modelling window starts:
# use data from 03/2018 onwards only - best stationarity.
TRAIN_START_ROW = 2613
combined_data = pd.concat(
    [train_data.iloc[TRAIN_START_ROW:], test_data], ignore_index=True
)

# Dependent variable Y: the `close` value of the following row. shift(-1)
# leaves NaN in the final row, which is removed by the trim below.
combined_data['Price'] = combined_data['close'].shift(-1)

# These counts are reported BEFORE the trim below, so they still include the
# leading column and the final (target-less) row.
print('n_features:', combined_data.shape[1])
print('n_samples:', combined_data.shape[0])

# Drop the last row and the first column by position.
# NOTE(review): the first column is presumably `close` (the displayed tail no
# longer lists it) -- confirm against the CSV column order.
combined_data = combined_data.iloc[:-1, 1:]
combined_data.tail()

n_features: 37
n_samples: 214


Unnamed: 0,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,usd_jpy,...,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue,Price
0,0.065391,7317280000.0,11.0,1307.75,16.315,969.0,1011.0,60.98,1.2171,0.009374,...,2094369000.0,108.994016,531980.0107,871257588.0,23172168.75,1.054216,3010000000000.0,469400,20144045.51,11086.4
1,0.030757,7620590000.0,11.0,1322.3,16.445,968.0,993.0,61.19,1.2312,0.009412,...,2018302000.0,99.595931,573101.5759,790675903.7,24667147.38,1.068241,3010000000000.0,514904,22728868.45,11489.7
2,0.046683,6690570000.0,11.0,1322.3,16.445,968.0,993.0,61.19,1.2312,0.009412,...,1317222000.0,112.897919,464496.4666,673860238.9,23321666.61,0.861064,3010000000000.0,524246,21569895.07,11512.6
3,0.03325,6084150000.0,10.0,1322.3,16.445,968.0,993.0,61.19,1.2312,0.009412,...,1034631000.0,146.396239,430013.9903,663520514.0,26909615.32,0.976834,3010000000000.0,416966,25485633.75,11573.3
4,0.022482,6468540000.0,11.0,1320.4,16.51,957.0,983.0,62.49,1.2307,0.009474,...,1054511000.0,134.358251,382520.8819,542742758.5,22125683.71,1.002854,3010000000000.0,383191,21145836.08,10779.9


## Feature Selection Method

In [8]:
# Feature Meta Subset handed to the selector.
# NOTE(review): presumably column indices into X -- confirm in Functions.py.
subset = [34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0, 17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14]

# Split train data into X (features) and Y (dependent variable).
# The trailing 92 rows of the combined matrix are excluded from training;
# the last column is the target, everything before it is a feature.
data = combined_data.values
train_block = data[:-92]
X_train = train_block[:, :-1]
Y_train = train_block[:, -1].reshape(-1, 1)

# Training Validation samples size (1/4/18 - 30/6/18)
# NOTE(review): this value is not passed to any call in this cell.
n_validation = 91

# Feature Selection: keep only the first element of what the selector returns.
selector_output = wrapper_feature_selector(X_train, Y_train, BayesianRidge(), subset)
selected_features = selector_output[0]
print('Selected Features:', str(selected_features))


### Training

In [168]:
%%time
rmse,Y_train_test,Y_train_pred= train_and_predict(X_train[:,selected_features],Y_train,BayesianRidge(),predict=False)
print('Train RMSE: {:0.2f}'.format(rmse))
        

RMSE: 367.2615660240794
[34, 30, 1, 27, 2, 12, 0, 6, 9, 10, 33, 23, 14]


### Prediction

In [174]:
%%time
# Test samples size (01/07/18 - 30/09/18)
n_validation = 90

# Split test data into X (features) and Y (dependent variable)
Y_test = data[:,-1].reshape(-1,1) # including train data for fitting the model
X_test = data[:,:-1]

rmse,Y_test,Y_pred= train_and_predict(X_test,Y_test,BayesianRidge(),n_validation,subset,predict=True)
print('Test RMSE: {:0.2f}'.format(rmse))

1
RMSE: 381.46404894267386
[[1.16733619e+07 5.07750000e+12 4.39693005e+09 7.53937329e+01
  1.06290476e+03 5.82416787e-02 9.41000000e+02 1.56247000e+05
  7.05291442e-01 1.30310000e+00 2.77555688e+03 3.41861000e+05
  2.43071797e+04]]
2
RMSE: 365.5198534079396
[[1.12712932e+07 3.75948163e+07 4.67230976e+09 5.73095695e+01
  1.41343262e+03 3.40940034e-02 1.99294000e+05 2.78688794e+03
  4.33257000e+05 1.10289001e+03]]
3
RMSE: 368.6876447514309
[[9.61715400e+06 3.09290687e+07 4.17668992e+09 4.52242409e+01
  1.86850862e+03 4.86332048e-02 9.48000000e+02 2.16747000e+05
  8.82000000e-03 2.75912598e+03 4.56149000e+05 1.10289001e+03
  2.41748203e+04]]
4
RMSE: 352.8631568492775
[[1.10897060e+07 3.57284069e+07 4.99924019e+09 5.43181727e+01
  1.54319403e+03 3.05208860e-02 2.06788000e+05 8.87000000e-03
  4.35401000e+05 1.12427002e+03]]
5
RMSE: 357.06329016525325
[[1.14677049e+07 3.65282966e+07 4.31395994e+09 5.81165698e+01
  1.45693431e+03 2.52973748e-02 2.17881406e+04 1.99600000e+05
  7.83748219e-01 2

### Visualization

#### True vs. predicted prices

In [135]:
def _line_trace(values, label, rgb):
    """Build a 2px-wide line trace of `values` plotted against their position."""
    return go.Scatter(
        x=np.arange(len(values)),
        y=values.reshape(-1),
        mode='lines',
        name=label,
        line={'color': rgb, 'width': 2},
    )


# One trace per series; both share the same positional x axis.
trace1 = _line_trace(Y_pred, 'Predicted labels', 'rgb(244, 146, 65)')
trace2 = _line_trace(Y_test, 'True labels', 'rgb(66, 244, 155)')

layout = {
    'title': 'Comparison of true prices (on the test dataset) with prices our model predicted',
    'xaxis': {'title': 'Day number'},
    'yaxis': {'title': 'Price, USD'},
}
fig = {'data': [trace1, trace2], 'layout': layout}
py.iplot(fig, filename='results_demonstrating0')

# Root of the mean squared error between the two flattened series.
print(np.sqrt(mean_squared_error(Y_test.reshape(-1), Y_pred.reshape(-1))))


345.007199532464
