In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn import tree, svm, linear_model, neural_network, gaussian_process
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, ParameterGrid, cross_validate
from sklearn import preprocessing
from sklearn import metrics
import pandas as pd
import numpy as np
import datetime
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

'en_US.UTF-8'

In [3]:
OPEN = 'Open*'
HIGH = 'High'
LOW = 'Low'
CLOSE = 'Close**'
VOLUME = 'Volume'

In [4]:
def unscale(value):
    return value * np.std(df_num[CLOSE]) + np.mean(df_num[OPEN])

In [5]:
# Open the dataset
df = pd.read_csv("bitcoin-price-all-2.csv", sep=',')

In [6]:
# Convert to numeric
def raw_data_to_numeric(raw_data):
    data = raw_data.replace(' ', '')
    data = data.replace(',', '.')
    if data == '0':
        return np.nan
    return float(data)
df_num = df[['Open*', 'High', 'Low', 'Close**', 'Volume', 'Market Cap']]
df_num = df_num.applymap(raw_data_to_numeric)

In [7]:
# Replace NaN by first known data (for `Volume`)
df_num = df_num.fillna(46862700)

In [8]:
# Read dates
dates = df['Date'].apply(lambda x: datetime.datetime.strptime(x, '%b %d, %Y'))

In [9]:
dates_num = dates.map(datetime.datetime.toordinal)

In [10]:
df_dated = df_num.copy()
df_dated.insert(0, 'Date', dates)

In [11]:
df_dated.sort_values(by=['Date'], ascending=True)
df_num = df_dated.copy()[['Open*', 'High', 'Low', 'Close**', 'Volume', 'Market Cap']]

In [12]:
# Scale data
scaler = preprocessing.StandardScaler()
df_norm = pd.DataFrame(scaler.fit_transform(df_num.values), columns=df_num.columns, index=df_num.index)
print(df_norm)

         Open*      High       Low   Close**    Volume  Market Cap
0     1.434426  1.389760  1.472135  1.413960  2.260594    1.535942
1     1.402501  1.379486  1.462052  1.432540  2.422154    1.555256
2     1.512686  1.464117  1.480165  1.402411  2.538837    1.523675
3     1.505648  1.467654  1.568848  1.510694  2.084588    1.636705
4     1.478571  1.450206  1.539506  1.503790  2.282077    1.629392
...        ...       ...       ...       ...       ...         ...
2580 -0.841637 -0.838682 -0.853942 -0.844468 -0.539968   -0.834211
2581 -0.839018 -0.834300 -0.850409 -0.842539 -0.539968   -0.832999
2582 -0.833169 -0.830715 -0.846271 -0.839495 -0.539968   -0.831083
2583 -0.831877 -0.828950 -0.839214 -0.833805 -0.539968   -0.827501
2584 -0.834348 -0.828809 -0.839227 -0.832373 -0.539968   -0.826606

[2585 rows x 6 columns]


## DataFrames:
df_num: numeric variables

df_norm: normalized numeric variables

dates: dates as datetime object

dates_num: dates as numeric

In [13]:
def train(df_, delta, model):
    scores = list()
    X = df_[['Open*', 'High', 'Low', 'Close**', 'Volume', 'Market Cap']].iloc[:-delta,:]
    y = df_[['Close**']].iloc[delta:]
    
    tscv = TimeSeriesSplit()
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train.values.ravel())
        scores.append(model.score(X_test, y_test))
        
    X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel())
    cross = cross_val_score(model, X_test, y_test, cv=10, scoring='neg_mean_absolute_error')
    
    return -np.median(cross), np.mean(scores)

In [14]:
delta = 7

In [15]:
print(train(df_num, delta, linear_model.LinearRegression(fit_intercept=False, copy_X=True)))

(287.7231887636583, 0.8448841771088416)


In [16]:
print(train(df_norm, delta, neural_network.MLPRegressor(hidden_layer_sizes=(125),
                                                        activation='identity',
                                                        solver="lbfgs",
                                                        alpha=0.005
                                                       )))

(0.06916778129557798, 0.6251079920240306)


In [17]:
print(train(df_num, delta, svm.SVR()))
print(train(df_num, delta, linear_model.SGDRegressor()))
print(train(df_num, delta, gaussian_process.GaussianProcessRegressor()))
print(train(df_num, delta, tree.DecisionTreeRegressor()))

(2727.1597655594014, -509.8021970698036)
(1.0689150241688029e+30, -1.52523781467793e+55)
(3391.2030877403845, -6.054051503201183)
(390.6813365384615, 0.013005122990977114)


# Neural Network

In [18]:
# Define variables
delta = 1

X = df_norm[['Open*', 'High', 'Low', 'Close**', 'Volume']].iloc[:-delta,:]
y = df_norm['Close**'].iloc[delta:]

X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel())

In [19]:
# Evaluate the model with the found parameters:

model = neural_network.MLPRegressor(hidden_layer_sizes=(100,),
                                    activation='relu',
                                    solver='lbfgs',
                                    alpha=1
                                   )

In [20]:
# Train model
X_train, X_test, y_train, y_test = train_test_split(X, y)
model.fit(X_train, y_train)

MLPRegressor(alpha=1, solver='lbfgs')

In [21]:
def absolute_error_score(y_true, y_pred):
    diff = np.abs(unscale(y_true) - unscale(y_pred)) * -1
    return np.average(diff, axis=0)

def absolute_error(estimator, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    return absolute_error_score(y_test, y_pred)
    
cross = cross_val_score(model, X, y, scoring=absolute_error)
print(np.mean(cross))
cross

-53.79065557093578


array([-57.21180575, -54.10828577, -26.86363654, -67.44475763,
       -63.32479217])

In [22]:
def percentage_error_score(y_true, y_pred):
    percentage = (np.abs(y_pred - y_true)) / np.abs(y_true) * 100
    return -np.average(percentage, axis=0)

def percentage_error(estimator, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    return percentage_error_score(y_test, y_pred)
cross = cross_val_score(model, X, y, scoring=percentage_error)
print(np.mean(cross))
cross

-2.4499903672345726


array([-2.18303691, -4.4426484 , -1.52245844, -2.06164956, -2.04015852])

In [23]:
def aon(x_close, y_pred, y_true):
    # all-or-none principle
    return not ((y_pred <= x_close) != (y_true < x_close))
        
def aon_scorer(estimator, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test).reshape(X_test.shape[0], 1)
    X_test_close = pd.DataFrame(X_test[CLOSE])
    X_test_close.insert(1, 'y_test', y_test)
    X_test_close.insert(1, 'y_pred', y_pred)
    aon_series = X_test_close.apply(lambda val: aon(val[CLOSE], val['y_pred'], val['y_test']), axis=1)
    return np.mean(aon_series)
"""
print(aon(50, 52, 51))
print(aon(50, 52, 50))
print(aon(50, 50, 50))
print(aon(50, 52, 49))
print(aon(50, 48, 48))
print(aon(50, 48, 51))
print(aon(50, 48, 50))
"""
cross = cross_val_score(model, X, y, scoring=aon_scorer)
print(np.mean(cross))
cross

0.48982707215265364


array([0.43846154, 0.42307692, 0.67692308, 0.52307692, 0.3875969 ])

In [28]:
scores = cross_validate(neural_network.MLPRegressor(hidden_layer_sizes=(100,),
                                    activation='tanh',
                                    solver='adam',
                                    alpha=10
                                   ), X, y,
                        scoring={
                            'percentage_error': percentage_error,
                            'absolute_error': absolute_error,
                            'aon_scorer': aon_scorer
                        })

print(np.mean(scores['test_absolute_error']), np.std(scores['test_absolute_error']))
print(np.mean(scores['test_percentage_error']), np.std(scores['test_percentage_error']))
print(np.mean(scores['test_aon_scorer']), np.std(scores['test_aon_scorer']))
scores

-175.72552558084007 79.4429590545589
-9.738719682989693 7.608497045500248
0.6362075134168157 0.06026372448133699


{'fit_time': array([2.34793997, 1.33097339, 2.05436063, 1.51144767, 2.26935697]),
 'score_time': array([2.83061409, 2.95782328, 2.46833372, 2.33432412, 2.56323004]),
 'test_percentage_error': array([ -5.61611668, -12.5980119 , -23.22306198,  -1.65710827,
         -5.59929959]),
 'test_absolute_error': array([-163.87517829, -228.05930137, -287.42913598,  -51.57724613,
        -147.68676613]),
 'test_aon_scorer': array([0.63846154, 0.61538462, 0.68461538, 0.70769231, 0.53488372])}

In [29]:
scores = cross_validate(neural_network.MLPRegressor(hidden_layer_sizes=(100,),
                                    activation='logistic',
                                    solver='adam',
                                    alpha=1
                                   ), X, y,
                        scoring={
                            'percentage_error': percentage_error,
                            'absolute_error': absolute_error,
                            'aon_scorer': aon_scorer
                        })

print(np.mean(scores['test_absolute_error']), np.std(scores['test_absolute_error']))
print(np.mean(scores['test_percentage_error']), np.std(scores['test_percentage_error']))
print(np.mean(scores['test_aon_scorer']), np.std(scores['test_aon_scorer']))
scores

-254.41512980919043 109.96155443086954
-14.551338413066498 8.62416807762261
0.4942277877161598 0.13067299780435163


{'fit_time': array([1.34835124, 0.92362332, 1.04641485, 1.16161299, 1.11511803]),
 'score_time': array([1.57846379, 1.71861053, 0.56192613, 0.48001337, 0.32627606]),
 'test_percentage_error': array([-21.75833859, -15.7835259 , -25.35147205,  -2.04172889,
         -7.82162663]),
 'test_absolute_error': array([-286.74074405, -231.13699358, -392.24682489,  -60.37998284,
        -301.57110369]),
 'test_aon_scorer': array([0.55384615, 0.62307692, 0.48461538, 0.56153846, 0.24806202])}

In [26]:
#############################################"" Search the best parameters:
model = neural_network.MLPRegressor()

gs = GridSearchCV(
    model,
    {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'alpha': 10.0**np.arange(-3,1),
        #'solver': ['lbfgs', 'sgd', 'adam'],
        #'hidden_layer_sizes': [(100,), (10, 10, 10), (125, 125)]
    },
    scoring=aon_scorer
)

gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.score(X_test, y_test))

{'activation': 'logistic', 'alpha': 1.0}
0.6728395061728395


In [27]:
#############################################"" Search the best parameters:
model = neural_network.MLPRegressor(activation='logistic',alpha=1)

gs = GridSearchCV(
    model,
    {
        'solver': ['lbfgs', 'sgd', 'adam'],
        'hidden_layer_sizes': [(100,), (10, 10, 10), (125, 125)]
    },
    scoring=aon_scorer
)

gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.score(X_test, y_test))

{'hidden_layer_sizes': (100,), 'solver': 'adam'}
0.8580246913580247


In [30]:
#############################################"" Search the best parameters:
model = neural_network.MLPRegressor()

gs = GridSearchCV(
    model,
    {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'alpha': 10.0**np.arange(-3,2),
        'solver': ['lbfgs', 'sgd', 'adam'],
        'hidden_layer_sizes': [(100,), (10, 10, 10), (125, 125)]
    },
    scoring=aon_scorer
)

gs.fit(X_train, y_train)
print(gs.best_params_)
print(gs.score(X_test, y_test))

{'activation': 'tanh', 'alpha': 10.0, 'hidden_layer_sizes': (100,), 'solver': 'adam'}
0.7407407407407407


In [None]:
# Use TimeSeriesSplit, with sgd solver

delta = 2

X = df_norm[['Open*', 'High', 'Low', 'Close**', 'Volume']].iloc[:-delta,:]
y = df_norm[['Close**']].iloc[delta:]

X_train0, X_test0, y_train0, y_test0 = train_test_split(X, y.values.ravel(),
                                                        shuffle=False,
                                                        test_size=0.15
                                                       )

model = neural_network.MLPRegressor(hidden_layer_sizes=(100,),
                                    activation='relu',
                                    solver='sgd',
                                    alpha=1
                                   )
scores = list()
tscv = TimeSeriesSplit(n_splits=100)
for train_index, test_index in tscv.split(X_train0):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model.partial_fit(X_train, y_train)
    
    y_pred = model.predict(X_test).reshape(X_test.shape[0], 1)
    
    diff = np.abs(unscale(y_test) - unscale(y_pred)) * -1
    percentage = (np.abs(y_pred - y_test)) / np.abs(y_test) * 100
    
    scores.append((-np.average(percentage, axis=0), np.average(diff, axis=0)))
    
    if np.average(percentage, axis=0) < 3:
        break

y_pred = model.predict(X_test0)
print(percentage_error_score(y_pred, y_test0))
print(absolute_error_score(y_pred, y_test0))
print(scores[-1])
scores