## Importing packages

In [1]:
import os
os.environ['PYTHONHASHSEED']=str(2)

In [2]:
import pickle
import pandas as pd
import numpy as np
import math
import random
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

In [3]:
def reset_random_seeds(seed=2):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed to Python's ``random`` module, NumPy's
            global generator and TensorFlow's global generator. Defaults to 2,
            the value previously hard-coded here.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment cannot change hashing in the already running kernel; it is
    # only inherited by processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
reset_random_seeds()

## Loading the data

In [4]:
# Load the pickled object for oil companies, 2011-2015 (file name suggests weekly
# sentiment scores — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Oil companies/2011-2015/week_sentiment_senti.txt", "rb") as fp:   # Unpickling
    oil_senti_small = pickle.load(fp)

In [5]:
# Same kind of file for oil companies, 2016-2020 ("_ext" variant of the file name).
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Oil companies/2016-2020/week_sentiment_senti_ext.txt", "rb") as fp:   # Unpickling
    oil_senti_large = pickle.load(fp)

In [6]:
oil_senti = oil_senti_small + oil_senti_large

In [7]:
#Checking the length of the combined list
print(len(oil_senti))

522


In [8]:
# Load the pickled object for oil companies, 2011-2015 (file name suggests weekly
# CoreNLP sentiment scores — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Oil companies/2011-2015/week_sentiment_corenlp_new.txt", "rb") as fp:   # Unpickling
    oil_nlp_small = pickle.load(fp)

In [9]:
# Same kind of file for oil companies, 2016-2020 ("_ext_new" variant of the file name).
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Oil companies/2016-2020/week_sentiment_corenlp_ext_new.txt", "rb") as fp:   # Unpickling
    oil_nlp_large = pickle.load(fp)

In [10]:
oil_nlp = oil_nlp_small + oil_nlp_large

In [11]:
print(len(oil_nlp))

522


In [12]:
# Load the pickled object for think tanks, 2011-2015 (file name suggests weekly
# CoreNLP sentiment scores — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_corenlp_new.txt", "rb") as fp:   # Unpickling
    eufp_nlp_small = pickle.load(fp)

In [13]:
# Same kind of file for think tanks, 2016-2020 ("_ext_new" variant of the file name).
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2016-2020/week_sentiment_corenlp_ext_new.txt", "rb") as fp:   # Unpickling
    eufp_nlp_large = pickle.load(fp)

In [14]:
eufp_nlp = eufp_nlp_small + eufp_nlp_large

In [15]:
print(len(eufp_nlp))

522


In [16]:
# Load the pickled object for think tanks, 2011-2015 (file name suggests weekly
# sentiment scores — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_senti.txt", "rb") as fp:   # Unpickling
    eufp_senti_small = pickle.load(fp)

In [17]:
# Same kind of file for think tanks, 2016-2020 ("_ext" variant of the file name).
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2016-2020/week_sentiment_senti_ext.txt", "rb") as fp:   # Unpickling
    eufp_senti_large = pickle.load(fp)

In [18]:
eufp_senti = eufp_senti_small + eufp_senti_large

In [19]:
print(len(eufp_senti))

522


In [20]:
# Load the pickled object for think tanks, 2011-2015 (file name suggests the share of
# references to oil — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_oil_new.txt", "rb") as fp:   # Unpickling
    eufp_oilfreq_small = pickle.load(fp)

In [21]:
# Same kind of file for think tanks, 2016-2020.
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2016-2020/percentage_references_oil_new.txt", "rb") as fp:   # Unpickling
    eufp_oilfreq_large = pickle.load(fp)

In [22]:
eufp_oilfreq = eufp_oilfreq_small + eufp_oilfreq_large

In [23]:
print(len(eufp_oilfreq))

522


In [24]:
# Load the pickled object for think tanks, 2011-2015 (file name suggests the share of
# references to OPEC — TODO confirm). pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_opec_new.txt", "rb") as fp:   # Unpickling
    eufp_opecfreq_small = pickle.load(fp)

In [25]:
# Same kind of file for think tanks, 2016-2020.
# pickle.load can run arbitrary code: trusted files only.
with open("Data Thesis/Think Tanks/2016-2020/percentage_references_opec_new.txt", "rb") as fp:   # Unpickling
    eufp_opecfreq_large = pickle.load(fp)

In [26]:
eufp_opecfreq = eufp_opecfreq_small + eufp_opecfreq_large

In [27]:
print(len(eufp_opecfreq))

522


In [28]:
df = pd.read_excel("Data Thesis/Brent prices/Brent_2011_2020.xlsx")

In [29]:
print(df)

          Date  Weekly Price
0   2011-01-07         94.72
1   2011-01-14         97.09
2   2011-01-21         97.34
3   2011-01-28         96.62
4   2011-02-04        100.36
..         ...           ...
524 2021-01-22         55.23
525 2021-01-29         55.18
526 2021-02-05         58.22
527 2021-02-12         61.13
528 2021-02-19         63.90

[529 rows x 2 columns]


In [30]:
price = df['Weekly Price']

In [31]:
price_list = price.tolist()

In [32]:
# Label every row with the direction of the price seven rows later:
# 1 when the later price is strictly higher, 0 otherwise. The last seven
# rows have no price seven rows ahead and therefore get no label.
inc_dec = [
    1 if price_list[week] < price_list[week + 7] else 0
    for week in range(len(price_list) - 7)
]

In [121]:
#TESTING: MUST BE REMOVED IF NOT WORKING
# Lagged direction signal: 1 when a row's price is strictly above the previous
# row's price, 0 otherwise. Row 0 has no predecessor, so a 0 is put in its place
# (keeps the list aligned with the rows and avoids a NaN to discard later).
# The final price row is left without an entry.
inc_dec_lag1 = [0] + [
    1 if price_list[week] > price_list[week - 1] else 0
    for week in range(1, len(price_list) - 1)
]

In [122]:
# Attach the lagged direction signal; pd.Series aligns on the row index, so rows
# beyond the length of the list receive NaN.
# NOTE(review): this cell carries execution count In [122], i.e. it was run after the
# modelling cells. Run top to bottom, it appends 'inc_dec_lag1' as the third column,
# ahead of 'inc_dec01' and the feature columns, which shifts any later positional
# (iloc) column selection — confirm the intended cell order.
df['inc_dec_lag1'] = pd.Series(inc_dec_lag1)

In [33]:
print(inc_dec)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [34]:
df['inc_dec01'] = pd.Series(inc_dec) #use pd.Series since last values will be filled with NaN because these are not present

In [35]:
print(df)

          Date  Weekly Price  inc_dec01
0   2011-01-07         94.72        1.0
1   2011-01-14         97.09        1.0
2   2011-01-21         97.34        1.0
3   2011-01-28         96.62        1.0
4   2011-02-04        100.36        1.0
..         ...           ...        ...
524 2021-01-22         55.23        NaN
525 2021-01-29         55.18        NaN
526 2021-02-05         58.22        NaN
527 2021-02-12         61.13        NaN
528 2021-02-19         63.90        NaN

[529 rows x 3 columns]


In [36]:
df = df.dropna()

In [37]:
print(df)

          Date  Weekly Price  inc_dec01
0   2011-01-07         94.72        1.0
1   2011-01-14         97.09        1.0
2   2011-01-21         97.34        1.0
3   2011-01-28         96.62        1.0
4   2011-02-04        100.36        1.0
..         ...           ...        ...
517 2020-12-04         47.83        1.0
518 2020-12-11         49.32        1.0
519 2020-12-18         51.05        1.0
520 2020-12-25         50.61        1.0
521 2021-01-01         50.82        1.0

[522 rows x 3 columns]


In [38]:
df['oil_senti'] = oil_senti
df['oil_nlp'] = oil_nlp
df['eufp_senti'] = eufp_senti
df['eufp_nlp'] = eufp_nlp
df['eufp_oilfreq'] = eufp_oilfreq
df['eufp_opecfreq'] = eufp_opecfreq

In [39]:
print(df)

          Date  Weekly Price  inc_dec01  oil_senti   oil_nlp  eufp_senti  \
0   2011-01-07         94.72        1.0  -0.112676  2.028169   -0.215517   
1   2011-01-14         97.09        1.0  -0.040000  2.083333   -0.161290   
2   2011-01-21         97.34        1.0   0.022727  2.164706   -0.096970   
3   2011-01-28         96.62        1.0   0.201550  2.158730   -0.263889   
4   2011-02-04        100.36        1.0   0.228814  2.137931   -0.198157   
..         ...           ...        ...        ...       ...         ...   
517 2020-12-04         47.83        1.0   0.456954  2.314570   -0.193999   
518 2020-12-11         49.32        1.0   0.349481  2.145329   -0.281621   
519 2020-12-18         51.05        1.0   0.349515  2.271845   -0.217033   
520 2020-12-25         50.61        1.0   0.479263  2.333333   -0.267769   
521 2021-01-01         50.82        1.0   0.383784  2.232432   -0.188976   

     eufp_nlp  eufp_oilfreq  eufp_opecfreq  
0    1.965517      0.000000       0.024590

## SVM with BlockingTimeSeriesSplit

In [80]:
# Target: the 0/1 price-direction label.
y = df['inc_dec01'].to_numpy()
# Select the six predictors by name instead of by position (previously iloc[:,3:9]).
# A positional slice silently picks up other columns — including the target
# 'inc_dec01' — as soon as another column (e.g. 'inc_dec_lag1') sits before them.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [81]:
# Chronological hold-out: the first 417 rows train the model, the rest test it.
# Labels are cast to int (they arrive as floats from the DataFrame column).
X_train, X_test = X[:417], X[417:]
y_train = y[:417].astype(int)
y_test = y[417:].astype(int)

In [82]:
# Recode the "0" class as -1 so the labels are -1/+1; every entry that is not 0
# is kept unchanged.
y_train = np.where(y_train==0, -1, y_train)
y_test = np.where(y_test==0, -1, y_test)
print(y_train)

[ 1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1 -1 -1
 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1 -1  1  1 -1
 -1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1  1
  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1  1
  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1  1  1  1 -1 -1 -1 -1 -1 -1  1  1  1
  1  1  1 -1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1 -1 -1
 -1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1  1  1
  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1  1  1  1  1  1  1  1  1  1  1
 -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1

In [113]:
print(sum(y_train))
print(sum(y_train)/len(y_train))

204
0.4892086330935252


In [83]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts an ordered sample into disjoint blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal size.
    Within each block the leading part forms the training fold and the trailing
    part the validation fold, so no index is shared between blocks and every
    validation fold follows its own training fold.

    Args:
        n_splits: Number of consecutive blocks (folds) to generate.
        train_fraction: Share of each block used for training; the training
            length is ``int(train_fraction * block_size)``. Defaults to 0.8.
        margin: Number of samples skipped between the end of a training fold
            and the start of its validation fold. Defaults to 0 (no gap).
    """

    def __init__(self, n_splits, train_fraction=0.8, margin=0):
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, validation_indices)`` pair per block.

        Args:
            X: Sized container of samples; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            Tuples of two integer index arrays (training, validation).
        """
        n_samples = len(X)
        # Integer division: the trailing ``n_samples % n_splits`` samples fall
        # outside every block and are never yielded.
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [84]:
#Checking the balance of the validation sets
btscv = BlockingTimeSeriesSplit(n_splits=3)

for tr_idx, val_idx in btscv.split(X_train, y_train):
    print((tr_idx))
    print((val_idx))

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110]
[111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
 129 130 131 132 133 134 135 136 137 138]
[139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

In [85]:
print(len(y_train[111:139]))

28


In [86]:
print(y_train[111:139])
print(y_train[250:278])
print(y_train[389:417])

[-1 -1 -1 -1 -1 -1 -1 -1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1
  1 -1 -1 -1]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1]
[-1 -1 -1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
  1  1  1  1]


In [87]:
print(np.count_nonzero(y_train[111:139] == -1))
print(np.count_nonzero(y_train[250:278] == -1))
print(np.count_nonzero(y_train[389:417] == -1))

12
12
14


In [88]:
# Standardise the training features with statistics learned from X_train.
# NOTE(review): the scaler is fitted on all of X_train before the grid search splits
# it, so every validation fold is scaled with statistics that include its own rows.
# The cell also overwrites X_train, so running it twice scales the data twice.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [89]:
# Tune C and gamma of an RBF-kernel SVM on the blocked folds, scored by F1.
btscv = BlockingTimeSeriesSplit(n_splits=3)
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=btscv,
                    scoring='f1', refit=True, verbose=3)

# Run the search over every parameter combination
grid.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.667, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.667, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .......... C=100, gamma=1, kernel=rbf, score=0.500, total=   0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=100, gamma=1, kernel=rbf ......................................
[CV] .......... C=100, gamma=1, kernel=rbf, score=0.727, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.621, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.462, total=   0.0s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=100, gamma=0.1, kernel=rbf, score=0.606, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=100, gamma=0.01, kernel=rbf, score=0.667, total=   0.0s
[CV] C=100, gamma=0.01, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.4s finished


GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fc3b5ec7bb0>,
             estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             scoring='f1', verbose=3)

In [90]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, gamma=0.1)


## Performance of model on test set

In [91]:
X_test = scaler.transform(X_test)

In [92]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
y_pred_svm = grid.predict(X_test)

In [93]:
print(classification_report(y_test, y_pred_svm, digits = 4))

              precision    recall  f1-score   support

          -1     0.5000    0.4167    0.4545        48
           1     0.5692    0.6491    0.6066        57

    accuracy                         0.5429       105
   macro avg     0.5346    0.5329    0.5306       105
weighted avg     0.5376    0.5429    0.5371       105



In [94]:
# Row-normalise the confusion matrix: each row (true class) is divided by its
# total, so the diagonal holds the share of each class predicted correctly.
cm = confusion_matrix(y_test, y_pred_svm)
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[0.41666667 0.64912281]
[[0.41666667 0.58333333]
 [0.35087719 0.64912281]]


## NB with BlockingTimeSeriesSplit

In [95]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts an ordered sample into disjoint blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal size.
    Within each block the leading part forms the training fold and the trailing
    part the validation fold, so no index is shared between blocks and every
    validation fold follows its own training fold.

    Args:
        n_splits: Number of consecutive blocks (folds) to generate.
        train_fraction: Share of each block used for training; the training
            length is ``int(train_fraction * block_size)``. Defaults to 0.8.
        margin: Number of samples skipped between the end of a training fold
            and the start of its validation fold. Defaults to 0 (no gap).
    """

    def __init__(self, n_splits, train_fraction=0.8, margin=0):
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, validation_indices)`` pair per block.

        Args:
            X: Sized container of samples; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            Tuples of two integer index arrays (training, validation).
        """
        n_samples = len(X)
        # Integer division: the trailing ``n_samples % n_splits`` samples fall
        # outside every block and are never yielded.
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [96]:
# Target: the 0/1 price-direction label.
y = df['inc_dec01'].to_numpy()
# Select the six predictors by name instead of by position (previously iloc[:,3:9]).
# A positional slice silently picks up other columns — including the target
# 'inc_dec01' — as soon as another column (e.g. 'inc_dec_lag1') sits before them.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [97]:
# Chronological hold-out: the first 417 rows train the model, the rest test it.
# Labels are cast to int (they arrive as floats from the DataFrame column).
X_train, X_test = X[:417], X[417:]
y_train = y[:417].astype(int)
y_test = y[417:].astype(int)

In [98]:
# NOTE(review): the transformer is fitted on all of X_train before the grid search
# splits it, so every validation fold is transformed with parameters estimated from
# its own rows as well. The cell also overwrites X_train, so running it twice
# transforms the data twice.
scaler = PowerTransformer() #transforms the features to a more or less Gaussian distribution
X_train = scaler.fit_transform(X_train)

In [99]:
# Tune the variance smoothing of Gaussian naive Bayes over ten log-spaced values
# (1e0 down to 1e-9) on the blocked folds, scored by F1.
btscv = BlockingTimeSeriesSplit(n_splits=3)
param_grid = dict(var_smoothing=np.logspace(0, -9, num=10))

grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=btscv,
                    scoring='f1', refit=True, verbose=3)

# Run the search over every candidate value
grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] var_smoothing=1.0 ...............................................
[CV] ................... var_smoothing=1.0, score=0.111, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] ................... var_smoothing=1.0, score=0.000, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] ................... var_smoothing=1.0, score=0.667, total=   0.0s
[CV] var_smoothing=0.1 ...............................................
[CV] ................... var_smoothing=0.1, score=0.522, total=   0.0s
[CV] var_smoothing=0.1 ...............................................
[CV] ................... var_smoothing=0.1, score=0.000, total=   0.0s
[CV] var_smoothing=0.1 ...............................................
[CV] ................... var_smoothing=0.1, score=0.667, total=   0.0s
[CV] var_smoothing=0.01 ..............................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fc3b5efec40>,
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])},
             scoring='f1', verbose=3)

In [100]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'var_smoothing': 0.01}
GaussianNB(var_smoothing=0.01)


## Performance of model on test set

In [101]:
X_test = scaler.transform(X_test)

In [102]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
y_pred_nb = grid.predict(X_test)

In [103]:
print(classification_report(y_test, y_pred_nb, digits = 4))

              precision    recall  f1-score   support

           0     0.5102    0.5208    0.5155        48
           1     0.5893    0.5789    0.5841        57

    accuracy                         0.5524       105
   macro avg     0.5497    0.5499    0.5498       105
weighted avg     0.5531    0.5524    0.5527       105



In [104]:
# Row-normalise the confusion matrix: each row (true class) is divided by its
# total, so the diagonal holds the share of each class predicted correctly.
cm = confusion_matrix(y_test, y_pred_nb)
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[0.52083333 0.57894737]
[[0.52083333 0.47916667]
 [0.42105263 0.57894737]]


## MLP with BlockingTimeSeriesSplit

In [65]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts an ordered sample into disjoint blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal size.
    Within each block the leading part forms the training fold and the trailing
    part the validation fold, so no index is shared between blocks and every
    validation fold follows its own training fold.

    Args:
        n_splits: Number of consecutive blocks (folds) to generate.
        train_fraction: Share of each block used for training; the training
            length is ``int(train_fraction * block_size)``. Defaults to 0.8.
        margin: Number of samples skipped between the end of a training fold
            and the start of its validation fold. Defaults to 0 (no gap).
    """

    def __init__(self, n_splits, train_fraction=0.8, margin=0):
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; the arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, validation_indices)`` pair per block.

        Args:
            X: Sized container of samples; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            Tuples of two integer index arrays (training, validation).
        """
        n_samples = len(X)
        # Integer division: the trailing ``n_samples % n_splits`` samples fall
        # outside every block and are never yielded.
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(self.train_fraction * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [66]:
# Target: the 0/1 price-direction label.
y = df['inc_dec01'].to_numpy()
# Select the six predictors by name instead of by position (previously iloc[:,3:9]).
# A positional slice silently picks up other columns — including the target
# 'inc_dec01' — as soon as another column (e.g. 'inc_dec_lag1') sits before them.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [67]:
# Chronological hold-out: the first 417 rows train the model, the rest test it.
# Labels are cast to int (they arrive as floats from the DataFrame column).
X_train, X_test = X[:417], X[417:]
y_train = y[:417].astype(int)
y_test = y[417:].astype(int)

In [68]:
print(y_train.shape)
print(X_train.shape)

(417,)
(417, 6)


In [69]:
input_dim = X_train.shape[1]
print(input_dim)

6


In [70]:
# Standardise the training features with statistics learned from X_train.
# NOTE(review): the scaler is fitted on all of X_train before the grid search splits
# it, so every validation fold is scaled with statistics that include its own rows.
# The cell also overwrites X_train, so running it twice scales the data twice.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [71]:
def FindLayerNodesLinear_one(n_layers, first_layer_nodes):
    """Return the node counts for a network with a single hidden layer.

    ``n_layers`` is accepted but not used: the result is always a one-element
    list containing ``first_layer_nodes``.
    """
    return [first_layer_nodes]

In [73]:
def createmodel_one(n_layers, first_layer_nodes, dropout_rate, lr):
    """Build and compile a binary classifier with one hidden layer.

    Architecture: Input -> Dense(first_layer_nodes) -> LeakyReLU -> Dropout
    -> Dense(1, sigmoid), compiled with Adam and binary cross-entropy.

    Args:
        n_layers: Not used by this single-hidden-layer builder; kept so the
            function still accepts the ``n_layers`` entry of the parameter grid.
        first_layer_nodes: Number of units in the hidden Dense layer.
        dropout_rate: Dropout rate applied after the hidden activation.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Sequential model.
    """
    model = tf.keras.models.Sequential()

    #Input layer: the shape is given as a tuple, since a bare int is not accepted
    #as a shape by every Keras version. input_dim is the notebook-level feature count.
    model.add(Input(shape=(input_dim,)))

    #Hidden layer (its input size is already fixed by the Input layer above)
    model.add(Dense(first_layer_nodes))
    model.add(LeakyReLU())
    model.add(Dropout(dropout_rate, seed=2))

    #Finally, the output layer should have a single node in binary classification
    model.add(Dense(1, activation='sigmoid'))

    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='binary_crossentropy')

    return model

In [74]:
# Re-seed all generators so the search starts from the same random state.
reset_random_seeds()

btscv = BlockingTimeSeriesSplit(n_splits=3)

# Hidden-layer width and dropout rate are searched; the other entries are
# single-valued and therefore fixed.
param_grid = {
    'n_layers': [1],
    'first_layer_nodes': [5, 6, 7, 8, 9, 10, 11],
    'dropout_rate': [0.2, 0.3, 0.4],
    'epochs': [50],
    'lr': [0.001],
}

Kmodel = KerasClassifier(build_fn=createmodel_one, verbose=3)
# Mark the wrapper as a classifier for scikit-learn.
Kmodel._estimator_type = "classifier"
grid = GridSearchCV(
    estimator=Kmodel,
    param_grid=param_grid,
    cv=btscv,
    refit=True,
    scoring='f1',
    return_train_score=True,
)

grid.fit(X_train, y_train)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fc3d0f57e20>,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fc3d0f57a60>,
             param_grid={'dropout_rate': [0.2, 0.3, 0.4], 'epochs': [50],
                         'first_layer_nodes': [5, 6, 7, 8, 9, 10, 11],
                         'lr': [0.001], 'n_layers': [1]},
             return_train_score=True, scoring='f1')

In [75]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'dropout_rate': 0.4, 'epochs': 50, 'first_layer_nodes': 9, 'lr': 0.001, 'n_layers': 1}
<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fc3d2a04c70>


## Performance of model on test set

In [76]:
X_test = scaler.transform(X_test)

In [77]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
y_pred_mlp = grid.predict(X_test)

In [78]:
print(classification_report(y_test, y_pred_mlp, digits = 4))

              precision    recall  f1-score   support

           0     0.6400    0.3333    0.4384        48
           1     0.6000    0.8421    0.7007        57

    accuracy                         0.6095       105
   macro avg     0.6200    0.5877    0.5695       105
weighted avg     0.6183    0.6095    0.5808       105



In [79]:
# Row-normalise the confusion matrix: each row (true class) is divided by its
# total, so the diagonal holds the share of each class predicted correctly.
cm = confusion_matrix(y_test, y_pred_mlp)
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[0.33333333 0.84210526]
[[0.33333333 0.66666667]
 [0.15789474 0.84210526]]


## Majority vote

In [105]:
#Convert predictions of -1 to 0 since the other two models predict 0 for decrease
y_pred_svm = np.where(y_pred_svm==-1, 0, y_pred_svm)

In [106]:
#Flatten y_pred_mlp so it has the same 1-D shape as the other predictions
#(assumes the MLP returns a single output column — TODO confirm).
#ravel() leaves an already flat array unchanged, so this cell can be re-run;
#indexing with [:,0] fails with an IndexError on the second run.
y_pred_mlp = np.asarray(y_pred_mlp).ravel()

In [107]:
# Majority vote over the three classifiers: predict 1 when the 0/1 votes sum to
# more than 1 (i.e. at least two models predict 1), otherwise predict 0.
maj_vote = [
    1 if sum(votes) > 1 else 0
    for votes in zip(y_pred_svm, y_pred_nb, y_pred_mlp)
]

In [108]:
print(classification_report(y_test, maj_vote, digits = 4))

              precision    recall  f1-score   support

           0     0.6176    0.4375    0.5122        48
           1     0.6197    0.7719    0.6875        57

    accuracy                         0.6190       105
   macro avg     0.6187    0.6047    0.5998       105
weighted avg     0.6188    0.6190    0.6074       105



In [110]:
# Show the raw counts first, then the row-normalised matrix (each true-class row
# divided by its total) together with its diagonal.
cm = confusion_matrix(y_test, maj_vote)
print(cm)
cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[[21 27]
 [13 44]]
[0.4375     0.77192982]
[[0.4375     0.5625    ]
 [0.22807018 0.77192982]]


In [128]:
# Baseline predictions: the lagged direction signal restricted to the test rows
# (everything from row 417 onwards).
y_base = df['inc_dec_lag1'].to_numpy()[417:]

In [129]:
print(len(y_base))

105


In [130]:
print(classification_report(y_test, y_base, digits = 4))

              precision    recall  f1-score   support

           0     0.6098    0.5208    0.5618        48
           1     0.6406    0.7193    0.6777        57

    accuracy                         0.6286       105
   macro avg     0.6252    0.6201    0.6197       105
weighted avg     0.6265    0.6286    0.6247       105

