In [187]:
import pandas as pd
import numpy as np
import multiprocessing
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [188]:
df = pd.read_csv('../data/tweets&sentiment&absoluteprices&topics.csv')

In [189]:
df.shape

(26231, 30)

## Feature preprocessing

### Drop features

In this step, features not useful for modelling are discarded. Such features are:
* source
* text
* created_at
* id_str
* hashtag
* mention
* neg_sentiment
* neu_sentiment
* pos_sentiment
* created_date
* created_time
* passed_closing
* market_date

In [190]:
# drop review_no, product_id and review_date
df = df.drop(columns=['source', 'text', 'created_at', 'id_str', 'hashtag', 'mention', 
                      'neg_sentiment', 'neu_sentiment', 'pos_sentiment', 'created_date',
                      'created_time', 'passed_closing'
                     ], errors='ignore')

In [191]:
df.columns

Index(['Unnamed: 0', 'cleaned_text', 'retweet_count', 'favorite_count',
       'is_retweet', 'hashtag_count', 'mention_count', 'word_count',
       'char_count', 'word_density', 'punctuation_freq', 'upper_case_freq',
       'compound_sentiment', 'market_date', '^GSPC', '^IXIC', 'VGT', 'topic'],
      dtype='object')

### Type casting

In [192]:
df.dtypes

Unnamed: 0              int64
cleaned_text           object
retweet_count           int64
favorite_count          int64
is_retweet               bool
hashtag_count           int64
mention_count           int64
word_count              int64
char_count              int64
word_density          float64
punctuation_freq      float64
upper_case_freq       float64
compound_sentiment    float64
market_date            object
^GSPC                 float64
^IXIC                 float64
VGT                   float64
topic                   int64
dtype: object

In [193]:
# change is_tweet from boolean to binary
df['is_retweet'] = df['is_retweet'].replace({True:1, False:0})

### Drop NA values

In [194]:
df.isna().sum()

Unnamed: 0            0
cleaned_text          0
retweet_count         0
favorite_count        0
is_retweet            0
hashtag_count         0
mention_count         0
word_count            0
char_count            0
word_density          0
punctuation_freq      0
upper_case_freq       0
compound_sentiment    0
market_date           0
^GSPC                 0
^IXIC                 0
VGT                   0
topic                 0
dtype: int64

In [195]:
df = df.dropna(how='any')
df = df.reset_index().drop(columns=['index'])

In [196]:
df.head()

Unnamed: 0.1,Unnamed: 0,cleaned_text,retweet_count,favorite_count,is_retweet,hashtag_count,mention_count,word_count,char_count,word_density,punctuation_freq,upper_case_freq,compound_sentiment,market_date,^GSPC,^IXIC,VGT,topic
0,0,thank,19616,65721,0,0,1,5,28,5.6,0.4,0.0,0.4199,2020-10-01,3380.800049,11326.509766,315.329987,6
1,1,big news maine court side rnc uphold ban ballo...,29393,0,1,0,0,24,138,5.75,0.291667,0.166667,0.0,2020-10-01,3380.800049,11326.509766,315.329987,2
2,2,thank paul,15992,63294,0,0,0,4,16,4.0,0.25,0.0,0.4199,2020-10-01,3380.800049,11326.509766,315.329987,6
3,3,defective ballot new york want replace happen ...,51445,190750,0,0,0,42,199,4.738095,0.119048,0.857143,-0.7988,2020-10-01,3380.800049,11326.509766,315.329987,6
4,4,half years secure americas border rebuild awes...,18885,70838,0,1,0,43,239,5.55814,0.093023,0.116279,-0.126,2020-10-01,3380.800049,11326.509766,315.329987,7


### Correlation plot
From the correlation plot it can be seen that the meta text features have little correlation with the target variables

In [197]:
# from string import ascii_letters

In [198]:
# #sns.set_theme(style="white")

# # Generate a large random dataset
# rs = np.random.RandomState(33)
# d = pd.DataFrame(data=rs.normal(size=(100, 26)),
#                  columns=list(ascii_letters[26:]))

# # Compute the correlation matrix
# corr = df.corr()

# # Generate a mask for the upper triangle
# mask = np.triu(np.ones_like(corr, dtype=bool))

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(11, 9))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot = True)

# plt.show()

### Normalise features
Normalisation is essential for deep-learning/NN-related tasks. MinMax Scaler is chosen in this case to rescale the numeric inputs.

In [199]:
# from sklearn.preprocessing import MinMaxScaler

In [200]:
# # obtain a list of numeric columns
# numeric_cols = list(df.columns)

# # remove text and output columns
# numeric_cols.remove('^GSPC')
# numeric_cols.remove('^IXIC')
# numeric_cols.remove('VGT')

In [201]:
# # scale the features
# scaler = MinMaxScaler() 
# df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

### Word embedding
Word2Vec is the chosen text embedding method for several reasons:
* The mapping between the target word to its context word implicitly embeds the sub-linear relationship into the vector space of words, so that relationships like “king:man as queen:woman” can be infered by word vectors.
* It is less computationally expensive than deep language models such as GloVe, BERT, ElMo. BERT + transfer learning with BiLSTM was initially chosen for this problem but due to the relatively large dataset and limited computational power, training was extremely slow.

#### Further process text for Word2Vec

In [202]:
# import nltk

In [203]:
# def process_text_w2v(paragraph):    
#     result = list()
#     for line in nltk.sent_tokenize(paragraph):
#         tokenizer = nltk.RegexpTokenizer(r"\w+")
#         tokens = [token for token in tokenizer.tokenize(line)]
#         result.append(tokens) 
#     return result

In [204]:
# df['cleaned_text_w2v'] = df['cleaned_text'].apply(lambda x: process_text_w2v(x))

In [205]:
# # compile all sentences together to compose the corpus for later usage.
# sentences = [sent for x in df['cleaned_text_w2v'].values.tolist() for sent in x]

#### Set parameters for word2vec model
`min_count` is set to 1 since we would like to obtain the embeddings of all words in our vocabulary for subsequent modelling to work. Normally, `min_count` is set to a larger value.

In [206]:
# import gensim
# from gensim.models.phrases import Phrases, Phraser
# from gensim.models import Word2Vec

In [207]:
# w2v_model = Word2Vec(min_count=1, 
#                      window=3,
#                      size=64,
#                      sample=6e-5, 
#                      alpha=0.03, 
#                      min_alpha=0.0007, 
#                      negative=20,
#                      workers=multiprocessing.cpu_count())

#### Building the Vocabulary Table
Digest all the words and filter out the unique words, and doing some basic counts on them.

In [208]:
# t = time()
# w2v_model.build_vocab(sentences, progress_per=10000)
# print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

### Train Word2Vec

In [209]:
# start_time = time()
# w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
# print('Time to train the model: {} mins'.format(round((time() - start_time) / 60, 2)))

In [210]:
# w2v_model.save('../model/word2vec/word2vec.model')

In [211]:
# w2v_model.init_sims(replace=True)

#### Generate Word2Vec vectors
generate and write the word embedding vectors to directory

In [212]:
# # load the model from the model file
# w2v_model = Word2Vec.load('../model/word2vec.model')

In [213]:
# Store the vectors in a csv file
# with open('../model/word2vec/word2vec.csv', 'w+') as word2vec_file:
    
#     for index, row in df.iterrows():
#         model_vector = (np.mean([w2v_model[token] for token in row['cleaned_text_w2v'][0]], axis=0)).tolist()
        
#         if index == 0:
#             header = ",".join(str(ele) for ele in range(64))
#             word2vec_file.write(header)
#             word2vec_file.write("\n")
        
#         # Check if the line exists else it is vector of zeros
#         if type(model_vector) is list:  
#             line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
#         else:
#             line1 = ",".join([str(0) for i in range(64)])
        
#         word2vec_file.write(line1)
#         word2vec_file.write('\n')

#### Concatenate the word embeddings with other features in the datasets

In [214]:
w2v_embeddings = pd.read_csv('../model/word2vec/word2vec.csv')

In [215]:
df = pd.concat([df, w2v_embeddings], axis=1)

In [216]:
df.head()

Unnamed: 0.1,Unnamed: 0,cleaned_text,retweet_count,favorite_count,is_retweet,hashtag_count,mention_count,word_count,char_count,word_density,...,54,55,56,57,58,59,60,61,62,63
0,0,thank,19616,65721,0,0,1,5,28,5.6,...,0.065594,0.094567,0.022625,0.031416,-0.211155,0.256162,-0.076439,0.046196,-0.178672,0.282274
1,1,big news maine court side rnc uphold ban ballo...,29393,0,1,0,0,24,138,5.75,...,0.016894,0.028469,-0.03222,0.027841,-0.131,-0.026378,-0.019021,0.125694,-0.159512,0.051435
2,2,thank paul,15992,63294,0,0,0,4,16,4.0,...,-0.025543,0.134922,0.037013,-0.022119,-0.081507,0.26815,-0.066681,0.082602,-0.176282,0.178487
3,3,defective ballot new york want replace happen ...,51445,190750,0,0,0,42,199,4.738095,...,-0.006196,0.014436,-0.001041,-0.010833,-0.074154,0.013568,-0.091835,0.090702,-0.144243,0.002622
4,4,half years secure americas border rebuild awes...,18885,70838,0,1,0,43,239,5.55814,...,-0.001725,0.052266,-0.002023,0.012134,-0.030705,0.108762,-0.015056,-0.030167,-0.16916,-0.006276


#### Drop `cleaned_text` and `cleaned_text_w2v`

In [217]:
df = df.drop(columns=['cleaned_text', 'cleaned_text_w2v'], errors='ignore')

In [218]:
df.shape

(26231, 81)

#### Group by day

In [219]:
df.columns

Index(['Unnamed: 0', 'retweet_count', 'favorite_count', 'is_retweet',
       'hashtag_count', 'mention_count', 'word_count', 'char_count',
       'word_density', 'punctuation_freq', 'upper_case_freq',
       'compound_sentiment', 'market_date', '^GSPC', '^IXIC', 'VGT', 'topic',
       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60',
       '61', '62', '63'],
      dtype='object')

In [220]:
df = df.groupby(['market_date','topic']).mean()

### Train test split

In [221]:
from sklearn.model_selection import train_test_split

In [222]:
y_GSPC = df['^GSPC']
y_IXIC = df['^IXIC']
y_VGT = df['VGT']
X = df.drop(columns=['^GSPC', '^IXIC', 'VGT'], errors='ignore')

In [223]:
# train is 75% of the entire data set
train_ratio = 0.75

#### GSPC

In [224]:
X_train_GSPC, X_test_GSPC, y_train_GSPC, y_test_GSPC = train_test_split(X, y_GSPC, test_size=1 - train_ratio, random_state=1)

#### IXIC

In [225]:
X_train_IXIC, X_test_IXIC, y_train_IXIC, y_test_IXIC = train_test_split(X, y_IXIC, test_size=1 - train_ratio, random_state=1)

#### VGT

In [226]:
X_train_VGT, X_test_VGT, y_train_VGT, y_test_VGT = train_test_split(X, y_VGT, test_size=1 - train_ratio, random_state=1)

## Modelling - XGBoost 
XGBoost is chosen to be the classification model for a few reasons:
* XGBoost is an ensemble method which helps improve machine learning results by combining and improving on the results from multiple single models.
* As compared to gradient boosting algorithms, XGBoost uses a more regularized model formalization to control over-fitting, which gives it better performance.
* XGBoost is built with the engineering goal to push the limit of computation resources for boosted tree algorithms. It utilises computational power more efficiently for faster, and production-ready training and testing, which is a good choice given the limited computation resources.

In [227]:
#!pip install scikit-learn==0.23.2

In [228]:
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import mean_squared_error

### Define baseline parameters

In [229]:
params = {'objective':'reg:linear',
          'learning_rate':0.05,
          'n_estimators':200,
          'max_depth':10,
          'min_child_weight':5,
          'gamma':0.1,
          'subsample':0.7,
          'colsample_bytree':0.7,
          'n_jobs':4,
          'random_state':1,
          'seed':1
         }

### Define error metrics

Write a custom function to calculate MAPE, mean average percentage error. MAPE expresses expresses accuracy as a percentage of the error. Because the MAPE is a percentage, it can be easier to understand than the other accuracy measure statistics e.g. RMSE. Refer to the picture below for a guide on the interpretation of MAPE values

<img src="https://www.researchgate.net/profile/Albert_Sese/publication/257812432/figure/tbl1/AS:601657310203931@1520457689632/nterpretation-of-typical-MAPE-values.png" width=300 align="left"/>

In [230]:
# define custom functions to calculate MAPE

def percentage_error(actual, predicted):
    res = np.empty(actual.shape)
    for j in range(actual.shape[0]):
        if actual[j] != 0:
            res[j] = (actual[j] - predicted[j]) / actual[j]
        else:
            res[j] = predicted[j] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs(percentage_error(np.asarray(y_true), np.asarray(y_pred)))) * 100

### GSPC

In [231]:
# initialise an xgboost object
xgb_GSPC = xgboost.XGBModel(**params)

In [232]:
xgb_GSPC.fit(X_train_GSPC, y_train_GSPC,
             eval_set=[(X_train_GSPC, y_train_GSPC), (X_test_GSPC, y_test_GSPC)],
             eval_metric='rmse',
             verbose=50)

[0]	validation_0-rmse:2594.75708	validation_1-rmse:2581.96289
[50]	validation_0-rmse:213.04425	validation_1-rmse:215.23517
[100]	validation_0-rmse:27.83068	validation_1-rmse:48.93320
[150]	validation_0-rmse:9.40329	validation_1-rmse:40.78949
[199]	validation_0-rmse:4.87121	validation_1-rmse:39.57674


XGBModel(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bynode=1, colsample_bytree=0.7, gamma=0.1, gpu_id=-1,
     importance_type='gain', interaction_constraints='',
     learning_rate=0.05, max_delta_step=0, max_depth=10,
     min_child_weight=5, missing=nan, monotone_constraints='()',
     n_estimators=200, n_jobs=4, num_parallel_tree=1,
     objective='reg:linear', random_state=1, reg_alpha=0, reg_lambda=1,
     scale_pos_weight=1, seed=1, subsample=0.7, tree_method='exact',
     validate_parameters=1, verbosity=None)

In [233]:
evals_result_GSPC = xgb_GSPC.evals_result()
evals_result_GSPC

{'validation_0': {'rmse': [2594.75708,
   2465.719482,
   2344.078613,
   2227.643555,
   2116.9021,
   2011.650879,
   1911.685913,
   1816.683838,
   1726.516968,
   1640.758057,
   1560.633545,
   1484.27002,
   1410.541016,
   1340.535522,
   1274.005249,
   1212.083618,
   1152.065063,
   1094.96521,
   1041.656494,
   991.345154,
   942.297485,
   895.640625,
   852.534668,
   810.357178,
   770.253906,
   733.156677,
   696.991455,
   662.573303,
   629.801514,
   599.609131,
   570.088623,
   542.05011,
   515.423035,
   490.14502,
   466.921265,
   443.912903,
   423.042084,
   403.26825,
   383.53183,
   364.788452,
   347.917206,
   330.960815,
   315.65448,
   300.279114,
   285.721741,
   272.591461,
   259.407654,
   246.8591,
   235.000015,
   223.704941,
   213.04425,
   203.52034,
   194.448196,
   185.171555,
   176.413605,
   168.15303,
   160.732254,
   153.277924,
   146.687088,
   139.846802,
   133.373337,
   127.256035,
   121.839684,
   116.310982,
   110.99995

In [234]:
xgb_GSPC.feature_importances_

array([5.6328297e-01, 1.5048002e-01, 7.1242161e-02, 2.3248375e-03,
       2.7582267e-02, 4.1839378e-03, 1.4056786e-02, 2.2101250e-02,
       7.9566555e-04, 7.4377423e-04, 1.8885819e-03, 1.3762016e-03,
       9.7340107e-04, 2.0711117e-03, 1.2006206e-03, 1.1725465e-03,
       7.2375499e-03, 8.1883520e-03, 8.5810648e-04, 9.9305937e-04,
       1.1676942e-03, 3.0797576e-03, 1.7650787e-03, 1.4672289e-03,
       1.0386930e-03, 1.7663818e-03, 2.1297678e-03, 1.4775742e-03,
       8.6662616e-04, 1.2675005e-03, 1.1544138e-03, 5.6013727e-04,
       1.1597485e-03, 2.4313410e-03, 1.0446296e-03, 1.8610805e-03,
       1.0320069e-03, 6.3229952e-04, 8.5588952e-04, 9.3936117e-04,
       1.1422548e-03, 1.2775699e-03, 1.2739068e-03, 1.2241661e-02,
       2.8445623e-03, 1.1837129e-03, 8.4961293e-04, 6.1925566e-03,
       2.7422274e-03, 1.0857132e-03, 1.0468216e-03, 3.3657888e-03,
       8.8609481e-04, 1.5058150e-03, 1.2480073e-03, 1.2332756e-03,
       1.2261642e-03, 1.8027043e-03, 2.6778406e-03, 1.2398190e

In [235]:
xgb_GSPC.predict(X_test_GSPC)

array([3251.2544, 2891.2588, 2363.0037, 3253.876 , 2050.2654, 2974.6746,
       2285.8574, 2881.3645, 2988.035 , 2538.1724, 3088.9104, 2098.088 ,
       2886.3933, 2435.8433, 2389.6084, 2829.194 , 2180.6304, 2996.4148,
       2148.6746, 2381.4993, 3308.24  , 2466.1936, 3404.5017, 2453.2039,
       2731.8528, 2797.55  , 2349.4226, 2846.1213, 2910.1973, 3319.6829,
       2468.3394, 2715.4255, 2902.2876, 3268.6497, 2886.878 , 2159.0688,
       3135.9373, 3119.4849, 2427.9546, 2441.3416, 1990.7104, 3396.8403,
       2044.4285, 3054.768 , 2143.3103, 2747.007 , 2764.6567, 2643.158 ,
       2981.0125, 3186.356 , 3364.5422, 2041.8695, 2985.515 , 2135.2583,
       2324.1184, 2908.5662, 1948.2603, 2468.4333, 2792.8027, 2003.6235,
       2746.791 , 2724.0085, 2455.073 , 2593.992 , 2970.6917, 2720.1157,
       2902.496 , 2865.4446, 2141.5913, 2467.205 , 2216.128 , 3053.3142,
       2823.7437, 2852.0645, 2841.7346, 2673.5337, 2497.8455, 3281.1155,
       2972.2065, 2173.9504, 2515.5918, 2429.6487, 

In [236]:
y_test_GSPC

market_date  topic
2020-01-24   2        3295.469971
2020-05-12   6        2870.120117
2017-04-24   7        2374.149902
2020-01-27   7        3243.629883
2016-03-17   1        2040.589966
2020-03-06   6        2972.370117
2017-01-25   6        2298.370117
2020-06-11   3        3002.100098
2019-07-16   7        3004.040039
2020-03-30   9        2626.649902
2020-06-23   7        3131.290039
2016-06-15   6        2071.500000
2018-10-08   2        2884.429932
2017-07-07   6        2425.179932
2017-05-25   6        2415.070068
2019-03-26   6        2818.459961
2016-08-12   2        2184.050049
2020-05-28   3        3029.729980
2016-08-15   7        2190.149902
2017-09-26   2        2496.840088
2020-02-20   6        3373.229980
2017-08-21   3        2428.370117
2020-08-24   6        3431.280029
2017-07-31   6        2470.300049
2018-03-12   10       2783.020020
2019-02-22   6        2792.669922
2017-03-29   2        2361.129883
2019-05-24   6        2826.060059
2019-08-13   2        2926.32

In [237]:
# retrieve the MSE on the test set
from math import sqrt
sqrt(mean_squared_error(xgb_GSPC.predict(X_test_GSPC), y_test_GSPC)) #RMSE

39.57673732494457

In [238]:
# # retrieve the MAPE on the test set
mean_absolute_percentage_error(y_test_GSPC, xgb_GSPC.predict(X_test_GSPC)) #MAPE

1.0447166279482216

#### IXIC

In [239]:
dtrain_IXIC = xgboost.DMatrix(data=X_train_IXIC, label=y_train_IXIC)
dtest_IXIC = xgboost.DMatrix(data=X_test_IXIC, label=y_test_IXIC)

In [240]:
cv_results_IXIC = xgboost.cv(dtrain=dtrain_IXIC, 
                             params=params, 
                             nfold=10, 
                             num_boost_round=200,
                             early_stopping_rounds=10,
                             metrics="rmse", 
                             as_pandas=True,
                             verbose_eval=10,
                             seed=1)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings bu

In [241]:
cv_results_IXIC.tail()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
195,18.751558,1.318926,178.225546,30.929186
196,18.522792,1.29764,178.170335,30.925507
197,18.302395,1.319005,178.135288,30.913276
198,18.087858,1.315673,178.09537,30.882007
199,17.885424,1.306879,178.065248,30.876786


In [242]:
cv_results_IXIC['test-rmse-mean'].min()

178.06524819999998

In [243]:
xgb_IXIC = xgboost.train(
                        params,
                        dtrain_IXIC,
                        num_boost_round=200,
                        evals=[(dtest_IXIC, "Test")],
                        early_stopping_rounds=10,
                        verbose_eval=10
                        )

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	Test-rmse:7176.33936
Will train until Test-rmse hasn't improved in 10 rounds.
[10]	Test-rmse:4328.48486
[20]	Test-rmse:2632.75757
[30]	Test-rmse:1610.72620
[40]	Test-rmse:1009.41003
[50]	Test-rmse:643.72186
[60]	Test-rmse:440.10031
[70]	Test-rmse:319.40051
[80]	Test-rmse:254.56674
[90]	Test-rmse:221.27261
[100]	Test-rmse:205.33116
[110]	Test-rmse:195.75413
[120]	Test-rmse:190.94350
[130]	Test-rmse:186.89609
[140]	Test-rmse:183.35712
[150]	Test-rmse:180.90083
[160]	Test-rmse:179.39459
[170]	Test-rmse:178.38911
[180]	Test-rmse:177.35422
[190]	Test-rmse:176.58614
[199]	Test-rmse:176.44760


In [244]:
# retrieve the RMSE on the test set
from math import sqrt
sqrt(mean_squared_error(xgb_IXIC.predict(dtest_IXIC), y_test_IXIC)) #RMSE

176.44760097913496

In [245]:
# retrieve the MAPE on the test set
mean_absolute_percentage_error(y_test_IXIC, xgb_IXIC.predict(dtest_IXIC)) #MAPE

1.7776125837895371

#### VGT

In [246]:
dtrain_VGT = xgboost.DMatrix(data=X_train_VGT, label=y_train_VGT)
dtest_VGT = xgboost.DMatrix(data=X_test_VGT, label=y_test_VGT)

In [247]:
cv_results_VGT = xgboost.cv(dtrain=dtrain_VGT, 
                             params=params, 
                             nfold=10, 
                             num_boost_round=200,
                             early_stopping_rounds=10,
                             metrics="rmse", 
                             as_pandas=True,
                             verbose_eval=10,
                             seed=1)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings bu

In [248]:
cv_results_VGT.tail()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
195,0.557868,0.047671,6.477263,1.203371
196,0.552062,0.047985,6.475687,1.203258
197,0.546232,0.047458,6.474544,1.203463
198,0.540705,0.047832,6.472815,1.202865
199,0.535132,0.047751,6.471763,1.202511


In [249]:
cv_results_VGT['test-rmse-mean'].min()

6.4717629

In [250]:
xgb_VGT = xgboost.train(
                        params,
                        dtrain_VGT,
                        num_boost_round=200,
                        evals=[(dtest_VGT, "Test")],
                        early_stopping_rounds=10,
                        verbose_eval=10
                        )

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	Test-rmse:181.98383
Will train until Test-rmse hasn't improved in 10 rounds.
[10]	Test-rmse:110.28732
[20]	Test-rmse:67.81508
[30]	Test-rmse:42.05543
[40]	Test-rmse:26.96930
[50]	Test-rmse:17.77968
[60]	Test-rmse:12.86458
[70]	Test-rmse:9.93371
[80]	Test-rmse:8.41603
[90]	Test-rmse:7.60644
[100]	Test-rmse:7.21122
[110]	Test-rmse:6.94426
[120]	Test-rmse:6.79846
[130]	Test-rmse:6.68289
[140]	Test-rmse:6.58359
[150]	Test-rmse:6.50129
[160]	Test-rmse:6.45468
[170]	Test-rmse:6.41602
[180]	Test-rmse:6.38274
[190]	Test-rmse:6.35726
[199]	Test-rmse:6.35113


In [251]:
# retrieve the RMSE on the test set
from math import sqrt
sqrt(mean_squared_error(xgb_VGT.predict(dtest_VGT), y_test_VGT)) #RMSE

6.351127899983594

In [252]:
# retrieve the MAPE on the test set
mean_absolute_percentage_error(y_test_VGT, xgb_VGT.predict(dtest_VGT)) #MAPE

2.7035305890363253