## Import Data and Libraries

In [1]:
from pyspark import SparkContext
from spark_sklearn import GridSearchCV

from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler 
from time import time
from math import sqrt
from sklearn.linear_model import ElasticNet, SGDRegressor, LinearRegression, ElasticNetCV, LassoCV, Lasso, Ridge, RidgeCV, BayesianRidge, ARDRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor

from matplotlib import pyplot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
%matplotlib inline

data = pd.read_csv('final.csv')
data = data.drop(columns=['date','low','open','high','marketcap'])
#data = data[data.columns[sel]]
print('n_features:', len(data.iloc[0]))
print('n_samples:', len(data))
data.tail()

n_features: 36
n_samples: 2735


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
2730,6093.67,0.031736,3279760000.0,6.0,1260.3,16.225,864.0,939.0,75.23,1.1672,...,1274.987261,675604827.0,61.446716,213583.5164,501623219.9,39627403.83,0.920948,5080000000000.0,415602,12092550.0
2731,6157.13,0.020789,3296220000.0,6.0,1254.6,16.21,860.0,946.0,77.41,1.1616,...,1338.383562,609799528.3,59.349571,201953.0378,363193880.4,36850961.53,1.067592,5080000000000.0,410397,11395190.0
2732,5903.44,0.049391,3467800000.0,6.0,1251.55,16.11,852.0,945.0,73.45,1.1583,...,1557.933884,795615808.3,50.289212,197368.2529,345979167.6,30540865.37,1.088043,5080000000000.0,397865,9288788.0
2733,6218.3,0.070443,3966230000.0,8.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1195.057325,703945479.4,65.127555,166798.5735,262900494.4,39627403.83,0.68943,5080000000000.0,396405,12052690.0
2734,6404.0,0.039642,4543860000.0,7.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1310.470199,656285943.9,57.609894,163181.8222,414797814.1,38112980.76,0.746513,5080000000000.0,453050,11236720.0


In [2]:
test_data = pd.read_csv('Test_Data.csv')
test_data = test_data.drop(columns=['date','low','open','high','marketcap'])
print('n_features:', len(test_data.iloc[0]))
print('n_samples:', len(test_data))
test_data.head()

n_features: 36
n_samples: 92


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
0,6385.82,0.022569,4788259840,62,1250.45,16.03,851.0,953,74.13,1.1658,...,1284.148936,493208885.1,58.735614,140545.4062,489427084.1,35588942.3,0.98734,5077500000000.0,368307,10494418.5
1,6614.18,0.058242,4396930048,72,1247.8,15.98,839.0,941,73.89,1.1639,...,1062.904762,991949405.1,75.393733,106682.7087,224327884.6,37103365.37,0.705291,5077500000000.0,341861,11673361.88
2,6529.59,0.034094,4672309760,69,1251.75,15.93,838.0,954,74.19,1.1665,...,1413.432624,720058629.1,57.309569,150160.0866,281178824.1,37594816.26,0.98755,5172890000000.0,433257,11271293.25
3,6597.55,0.048633,4176689920,65,1255.65,16.045,834.0,948,74.19,1.1665,...,1868.508621,872392592.0,45.224241,185064.5374,356469308.6,30929068.69,1.113817,5363680000000.0,456149,9617154.0
4,6639.14,0.030521,4999240192,69,1255.5,15.95,845.5,947,73.05,1.1709,...,1543.19403,665699858.5,54.318173,142640.2951,345023639.3,35728406.94,0.935353,5363680000000.0,435401,11089706.0


In [3]:
# Combined train and test sets
combined_data = pd.concat([data[2613:], test_data], ignore_index=True)
combined_data['Price'] = combined_data['close'].shift(-1)
print('n_features:', len(combined_data.iloc[0]))
print('n_samples:', len(combined_data))
combined_data = combined_data.iloc[:-1,1:]
combined_data.tail()

n_features: 37
n_samples: 214


Unnamed: 0,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,usd_jpy,...,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue,Price
208,0.034161,4726180000.0,61.0,1201.9,14.29,827.0,1065.0,73.4,1.1777,0.00905,...,503494967.4,52.112921,102965.2698,273203917.3,52622781.21,0.807114,7152630000000.0,469028,12385677.98,6495.0
209,0.028964,4437300000.0,59.0,1194.25,14.475,824.0,1059.0,72.22,1.1737,0.00904,...,842041019.0,54.247914,130284.3136,390313648.8,60089527.19,0.768241,7152630000000.0,490588,13676604.3,6676.75
210,0.037517,4606810000.0,58.0,1185.4,14.42,812.0,1067.0,72.18,1.1707,0.00905,...,748444892.4,46.610199,130238.5658,312748554.4,49422747.22,0.898005,7152630000000.0,474079,11369486.09,6644.13
211,0.027904,5014430000.0,58.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00906,...,824319235.8,47.597177,154586.939,283381555.4,50844984.55,0.877031,7152630000000.0,481907,11836751.48,6601.96
212,0.019986,4363690000.0,48.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00902,...,913410388.9,51.317781,193097.0923,457654461.3,54756137.2,0.855169,7152630000000.0,527057,13046981.4,6625.56


In [75]:
# Data normalization
def normalize_data(X,Y=None):
    scaler = MinMaxScaler()
    return scaler.fit_transform(X,Y)

# Define time-series cross validation split
def TimeSeriesCVSplit(n_samples, test_size, step_size):
    data_split = []
    train_end = n_samples - test_size - step_size 
    test_start = train_end + step_size
    for i in range(0,test_size):
        train_index = list(range(0, train_end))
        test_index = [test_start]
        data_split.append([train_index,test_index])
        train_end+=1
        test_start+=1
    return data_split

def wfss(X,Y,model,subset):
    sel = subset.copy() # # Selected features
    overall_error = train_model(X[:,sel],Y,model,predict=False) 
    while len(sel) != 0:
        # Select candidate
        cand_error = 1e10 # Assign a big number
        for cand in sel:
            features = sel.copy()
            features.remove(cand)
            if len(features) > 1:
                new_error = train_model(X[:,features],Y,model,predict=False)
            else:
                new_error = train_model(X[:,features[0]].reshape(-1,1),Y,model,predict=False)
            if new_error < cand_error:
                selected_candidate = cand
                cand_error = new_error
        if overall_error < cand_error:
            # Stop if the new candidate doesn’t
            # improve the assessment of the
            # previously selected candidates
            break
        else:
            overall_error = cand_error
            sel.remove(selected_candidate)
    print('RMSE:', str(train_model(X[:,sel],Y,model,predict=False)))
    return sel

def train_model(X,Y,model,predict):
    data_split = TimeSeriesCVSplit(len(X),n_validation,0)
    Y_pred = []
    Y_test = []
    scores = []
    n=1
    for fold in data_split:
        X_train = X[fold[0]]
        Y_train = Y[fold[0]]
        X_test = X[fold[1]]
        y_pred = Y[fold[1]]
        
        import timeit
        start = timeit.default_timer()
        
        if predict:
            #print(n)
            #Fine-tune alpha
            #k, p, weights = fine_tune_KNN(X_train,Y_train)
                # Feature Selection
            selected_features = wfss(X_train,Y_train,model,np.arange(0,35).tolist())         
            print(n, selected_features)
            
            scaler = MinMaxScaler()
            scaler.fit(X_train[:,selected_features],Y_train)
            X_train = scaler.transform(X_train[:,selected_features])
            X_test = scaler.transform(X_test[:,selected_features])  
            
        if not predict:
            scaler = MinMaxScaler()
            scaler.fit(X_train,Y_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test) 
        
        
        model.fit(X_train,Y_train.reshape(-1,))
        prediction = model.predict(X_test).reshape(-1,1)
        Y_pred.append(prediction)
        Y_test.append(y_pred)
        rmse = np.sqrt(mean_squared_error(y_pred, prediction))
        scores.append(rmse)
        
        if predict:
            stop = timeit.default_timer()
            print('Time: ', stop - start)
        n+=1
    Y_pred = np.asarray(Y_pred)
    Y_test = np.asarray(Y_test)
    if predict:
        return np.sqrt(mean_squared_error(Y_pred.reshape(-1,1), Y_test.reshape(-1,1))), Y_test, Y_pred
    if not predict:
        return np.sqrt(mean_squared_error(Y_pred.reshape(-1,1), Y_test.reshape(-1,1)))



def get_feature_ranking(feature_scores):
    indices = np.nonzero(feature_scores)[0].tolist()
    scores = np.abs(feature_scores[indices]).tolist() 
    sorted_scores, sorted_features = zip(*sorted(zip(scores, indices),reverse=True))
    return list(sorted_features)

def RMSE(y_true, y_hat):
    error = np.sqrt(mean_squared_error(y_true, y_hat))
    return error

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


## Data Preparation

In [17]:
# Feature Subsets
lasso_subset = [34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0, 17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14] # lasso [0:20] for KNN

### TO BE UPDATED ###
etrees_subset = [34, 32, 14, 18, 27, 22, 1, 6, 23, 30, 16, 15, 19, 29, 28, 13, 26, 7, 8, 11, 2, 33, 3] # extra trees
#subset4 = [0, 1, 24, 2, 31, 3, 29, 27, 34, 28, 7, 25, 26, 33, 4, 8, 6, 11, 17, 21, 30, 23, 22] # xgboost can remove 9
xgb_subset = [0, 1, 29, 31, 27, 6, 28, 24, 25, 34, 7, 2, 8, 3, 26, 22, 4, 11, 21, 33, 20, 30, 17] # xgboost can remove 9
subsets = [lasso_subset,etrees_subset,xgb_subset]

# Validation samples size (1/4/18 - 30/6/18)
n_validation = 91

data = combined_data.values # use best period

n_samples = len(data)
Y = data[:-92,-1].reshape(-1,1)
X = data[:-92,:-1]



### Parameters Tuning

In [6]:
from sklearn.pipeline import Pipeline
pipe = pipe = Pipeline([
    ('normalize', MinMaxScaler()),
    ('xgb', XGBRegressor())
])
pipe

Pipeline(memory=None,
     steps=[('normalize', MinMaxScaler(copy=True, feature_range=(0, 1))), ('xgb', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [7]:
%%time
sc = SparkContext()

param_grid = [{
            "xgb__silent":[True],
            "xgb__learning_rate": [0.25, 0.15, 0.1, 0.2, 0.3],
            "xgb__gamma": [1,2,3],
            "xgb__max_depth": [3, 5, 10],
            "xgb__n_estimators": [105,110,115],
            "xgb__reg_alpha": [0,1],
            "xgb__n_jobs": [-1],
            "xgb__booster": ["gbtree"]
} ]
split = TimeSeriesCVSplit(len(X),n_validation,0)
gs = GridSearchCV(sc, pipe, param_grid, cv=split, scoring=make_scorer(RMSE,greater_is_better=False), verbose=1) 
gs.fit(X,Y.reshape(-1,))
print('Best Parameters: ',gs.cv_results_['params'][gs.best_index_])
sc.stop()

Fitting 91 folds for each of 270 candidates, totalling 24570 fits
Best Parameters:  {'xgb__booster': 'gbtree', 'xgb__gamma': 3, 'xgb__learning_rate': 0.15, 'xgb__max_depth': 3, 'xgb__n_estimators': 115, 'xgb__n_jobs': -1, 'xgb__reg_alpha': 0, 'xgb__silent': True}
CPU times: user 2.27 s, sys: 746 ms, total: 3.01 s
Wall time: 21min 55s


In [19]:
print("Best RMSE:", str(gs.cv_results_['mean_test_score'][gs.best_index_]))

Best RMSE: -342.56379915006875


### Feature Subset Selection

Replace the best parameters from above into the XGBRegressor() below

In [11]:
%%time
# Feature Selection on Whole set
selected_features = wfss(X,Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),np.arange(0,35).tolist())
print(selected_features)

[0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34]
CPU times: user 9min 40s, sys: 3.08 s, total: 9min 43s
Wall time: 10min 20s


In [59]:
# Feature Selection on Whole Set
sel = [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34]
rmse = train_model(X,Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=False)
print('Train RMSE:',str(rmse))

Train RMSE: 319.7105023159526


In [55]:
# Feature Selection on Whole Set
sel = [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34]
selected_features = wfss(X,Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),np.arange(0,35).tolist())
print(selected_features)

KeyboardInterrupt: 

In [51]:
sel = [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34]
new_sel = [0, 1, 2, 3, 4, 6, 8, 9, 11, 15, 16, 17, 18, 21, 23, 26, 27, 30, 32, 33, 34]

rmse = train_model(X[:,sel],Y,XGBRegressor(),predict=False)
print('Train RMSE:',str(rmse))

# Train Set
n_validation = 91
Y = data[:-92,-1].reshape(-1,1)
X = data[:-92,:-1]
selected_features=[0, 1, 2, 3, 4, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26, 27, 30, 31, 32, 33, 34]
rmse,Y_test,Y_train_pred= train_model(X[:,selected_features],Y,XGBRegressor(),predict=True)
print('Train RMSE:',str(rmse))


Train RMSE: 429.64698468406425
Train RMSE: 402.30792719193806


In [38]:
%%time
selected_features = wfss(X,Y,XGBRegressor(),sel)
print(selected_features)

RMSE: 391.4651047830174
[0, 1, 2, 3, 4, 6, 8, 9, 11, 15, 16, 17, 18, 21, 23, 26, 27, 30, 32, 33, 34]
CPU times: user 2min 40s, sys: 716 ms, total: 2min 40s
Wall time: 2min 47s


In [20]:
# Train RMSE on whole set
rmse = train_model(X,Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=False)
print('Train RMSE:',str(rmse))
xgb = XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True)
Y = data[:-92,-1].reshape(-1,1)
X = data[:-92,:-1]
xgb.fit(X,Y.reshape(-1,))
xgb_features = get_feature_ranking(xgb.feature_importances_)
xgb_features


Train RMSE: 448.941169363459


[0,
 1,
 29,
 27,
 30,
 3,
 6,
 26,
 34,
 28,
 31,
 33,
 15,
 14,
 11,
 4,
 2,
 23,
 22,
 8,
 24,
 18,
 7,
 13,
 5,
 25,
 32,
 17,
 16,
 10,
 9,
 21,
 12,
 20]

In [19]:
rmse = train_model(X,Y,XGBRegressor(),predict=False)
print('Train RMSE:',str(rmse))

Train RMSE: 462.9408087710165


In [12]:
for subset in subsets:
    rmse = train_model(X[:,subset],Y,XGBRegressor(),predict=False)
    print('Train RMSE:',str(rmse))

Train RMSE: 477.93429600898173
Train RMSE: 437.65100376125724
Train RMSE: 483.40974920998406


### Prediction

Replace the best parameters from above into the XGBRegressor() below

In [None]:
%%time
# Test Set
n_validation = 90
Y = data[:,-1].reshape(-1,1)
X = data[:,:-1]
rmse,Y_test,Y_pred= train_model(X[:,:],Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=True)
print('Test RMSE:',str(rmse))


RMSE: 406.5415612855171
1 [0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34]
Time:  436.71328531099425
RMSE: 396.1515131528861
2 [0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34]
Time:  514.0519061109953
RMSE: 395.03482271572926
3 [0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34]
Time:  503.134327820997
RMSE: 385.6247639961619
4 [0, 1, 2, 3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 25, 26, 27, 28, 30, 31, 32, 33, 34]
Time:  569.273807058009
RMSE: 394.87089264943137
5 [0, 1, 3, 6, 8, 9, 11, 12, 13, 14, 15, 17, 18, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34]
Time:  586.62177431499
RMSE: 395.82684960300963
6 [0, 1, 3, 6, 8, 9, 11, 12, 13, 14, 15, 17, 18, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34]
Time:  593.2980180829909
RMSE: 395.1525974302118
7 [0, 1, 3, 6, 8, 9, 11, 12, 13, 14, 15, 17, 18, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34]
Time:  

In [71]:
%%time
selected_features = [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33, 34]

# Train Set
n_validation = 91
Y = data[:-92,-1].reshape(-1,1)
X = data[:-92,:-1]
rmse,Y_test,Y_train_pred= train_model(X[:,:],Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=True)
print('Train RMSE:',str(rmse))

# Test Set
n_validation = 90
Y = data[:,-1].reshape(-1,1)
X = data[:,:-1]
rmse,Y_test,Y_pred= train_model(X[:,:],Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=True)
print('Test RMSE:',str(rmse))


Train RMSE: 448.941169363459
Test RMSE: 319.7105023159526
CPU times: user 5.67 s, sys: 31.5 ms, total: 5.7 s
Wall time: 5.86 s


In [23]:
%%time
# Train Set
n_validation = 91
Y = data[:-92,-1].reshape(-1,1)
X = data[:-92,:-1]
rmse,Y_test,Y_train_pred= train_model(X[:,selected_features],Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=True)
print('Train RMSE:',str(rmse))

# Test Set
n_validation = 90
Y = data[:,-1].reshape(-1,1)
X = data[:,:-1]
rmse,Y_test,Y_pred= train_model(X[:,selected_features],Y,XGBRegressor(booster='gbtree',gamma=3,learning_rate=0.15,max_depth=3,n_estimators=115,n_jobs=-1,reg_alpha=0,silent=True),predict=True)
print('Test RMSE:',str(rmse))


Train RMSE: 423.03191736525815
Test RMSE: 334.4979421670692
CPU times: user 4.02 s, sys: 34.8 ms, total: 4.06 s
Wall time: 4.11 s


### VISUAlIZATION

#### True plot

In [62]:
trace1 = go.Scatter(
    x = np.arange(0, len(Y_pred), 1),
    y = Y_pred.reshape(-1,),
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test), 1),
    y = Y_test.reshape(-1,),
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='results_demonstrating0')

print(np.sqrt(mean_squared_error(Y_test.reshape(-1,), Y_pred.reshape(-1,))))


319.7105023159526


In [17]:
trace1 = go.Scatter(
    x = np.arange(0, len(Y_pred), 1),
    y = Y_pred.reshape(-1,),
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test), 1),
    y = Y_test.reshape(-1,),
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='results_demonstrating0')

print(np.sqrt(mean_squared_error(Y_test.reshape(-1,), Y_pred.reshape(-1,))))


334.4979421670692


#### Plot shifted -1

In [18]:

y1 = Y_pred.reshape(-1,)[2:]
y2 = Y_test.reshape(-1,)[0:-2]
trace1 = go.Scatter(
    x = np.arange(0, len(Y_pred), 1),
    y = y1,
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test), 1),
    y = y2,
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='results_demonstrating0')


In [26]:
np.savetxt('XGB.csv', Y_pred.reshape(-1,1), delimiter=',',  fmt='%1.10e')
