## Importing packages

In [1]:
import os
# Export PYTHONHASHSEED=2 into this process's environment.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# presumably only influences child processes, not hashing inside the kernel
# that is already running -- confirm, and export it before launching Jupyter
# if hash-order determinism matters.
os.environ['PYTHONHASHSEED']=str(2)

In [2]:
import pickle
import pandas as pd
import numpy as np
import math
import random
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

In [3]:
def reset_random_seeds(seed=2):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    seed : int, default 2
        Value written to ``PYTHONHASHSEED`` and passed to TensorFlow, NumPy
        and the standard-library ``random`` module. The default keeps the
        notebook's historical seed.

    Notes
    -----
    NOTE(review): ``PYTHONHASHSEED`` is read at interpreter start-up, so the
    environment assignment presumably affects only child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)


reset_random_seeds()

## Loading the data

In [4]:
# Load `oil_senti` (oil-company weekly sentiment, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Oil companies/2011-2015/week_sentiment_senti.txt", "rb") as fp:   # Unpickling
    oil_senti = pickle.load(fp)

In [5]:
# Load `oil_nlp` (oil-company weekly CoreNLP sentiment, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Oil companies/2011-2015/week_sentiment_corenlp_new.txt", "rb") as fp:   # Unpickling
    oil_nlp = pickle.load(fp)

In [6]:
# Load `eufp_nlp` (think-tank weekly CoreNLP sentiment, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_corenlp_new.txt", "rb") as fp:   # Unpickling
    eufp_nlp = pickle.load(fp)

In [7]:
# Load `eufp_senti` (think-tank weekly sentiment, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Think Tanks/2011-2015/week_sentiment_senti.txt", "rb") as fp:   # Unpickling
    eufp_senti = pickle.load(fp)

In [8]:
# Load `eufp_oilfreq` (share of think-tank references to oil, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_oil_new.txt", "rb") as fp:   # Unpickling
    eufp_oilfreq = pickle.load(fp)

In [9]:
# Load `eufp_opecfreq` (share of think-tank references to OPEC, judging by the file name); the file is a pickle despite its .txt suffix.
# NOTE(review): pickle.load can execute arbitrary code -- only open files produced by a trusted source.
with open("Data Thesis/Think Tanks/2011-2015/percentage_references_opec_new.txt", "rb") as fp:   # Unpickling
    eufp_opecfreq = pickle.load(fp)

In [10]:
df = pd.read_excel("Data Thesis/Brent prices/Brent_2011_2020.xlsx")

In [11]:
print(df.head())

        Date  Weekly Price
0 2011-01-07         94.72
1 2011-01-14         97.09
2 2011-01-21         97.34
3 2011-01-28         96.62
4 2011-02-04        100.36


In [12]:
print(df.shape)

(529, 2)


In [13]:
price = df['Weekly Price']

In [14]:
price_list = price.tolist()

In [15]:
# Forward-looking label: 1 when the price seven rows ahead is strictly higher
# than the current price, otherwise 0. The last seven rows have no look-ahead
# value and therefore get no label.
inc_dec = [
    1 if price_list[pos] < price_list[pos + 7] else 0
    for pos in range(len(price_list) - 7)
]

In [91]:
#Creating column for the baseline
# 1 when a row's price is strictly higher than the previous row's price,
# otherwise 0. Row 0 has no predecessor, so it is seeded with 0 to keep the
# list aligned with the DataFrame index and avoid a NaN in the first row.
# Every later row, including the last one, has a predecessor and is labelled.
inc_dec_lag1 = [0] + [
    1 if current > previous else 0
    for previous, current in zip(price_list, price_list[1:])
]

In [92]:
print(inc_dec_lag1)

[0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 

In [93]:
df['inc_dec_lag1'] = pd.Series(inc_dec_lag1)

In [16]:
print(inc_dec)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [17]:
df['inc_dec01'] = pd.Series(inc_dec) #use pd.Series since last values will be filled with NaN because these are not present

In [80]:
print(df)

          Date  Weekly Price  inc_dec01  oil_senti   oil_nlp  eufp_senti  \
0   2011-01-07         94.72        1.0  -0.112676  2.028169   -0.215517   
1   2011-01-14         97.09        1.0  -0.040000  2.083333   -0.161290   
2   2011-01-21         97.34        1.0   0.022727  2.164706   -0.096970   
3   2011-01-28         96.62        1.0   0.201550  2.158730   -0.263889   
4   2011-02-04        100.36        1.0   0.228814  2.137931   -0.198157   
..         ...           ...        ...        ...       ...         ...   
213 2015-02-06         54.62        1.0   0.290909  2.223242   -0.244083   
214 2015-02-13         56.57        0.0   0.237996  2.172269   -0.338101   
215 2015-02-20         60.57        0.0   0.257634  2.156673   -0.301205   
216 2015-02-27         60.63        0.0   0.260097  2.129296   -0.163758   
217 2015-03-06         60.12        1.0   0.254386  2.165187   -0.190889   

     eufp_nlp  eufp_oilfreq  eufp_opecfreq  inc_dec_lag1  
0    1.965517      0.000000 

In [19]:
# Keep the first 218 rows (presumably the weeks covered by the 2011-2015
# text features -- confirm). .copy() makes an independent frame, so the column
# assignments that follow do not write into a slice of the original frame.
df = df.iloc[:218].copy()

In [20]:
print(df.shape)

(218, 3)


In [21]:
print(df)

          Date  Weekly Price  inc_dec01
0   2011-01-07         94.72        1.0
1   2011-01-14         97.09        1.0
2   2011-01-21         97.34        1.0
3   2011-01-28         96.62        1.0
4   2011-02-04        100.36        1.0
..         ...           ...        ...
213 2015-02-06         54.62        1.0
214 2015-02-13         56.57        0.0
215 2015-02-20         60.57        0.0
216 2015-02-27         60.63        0.0
217 2015-03-06         60.12        1.0

[218 rows x 3 columns]


In [22]:
# Attach the six text-derived feature series as new columns, in this order.
feature_columns = {
    'oil_senti': oil_senti,
    'oil_nlp': oil_nlp,
    'eufp_senti': eufp_senti,
    'eufp_nlp': eufp_nlp,
    'eufp_oilfreq': eufp_oilfreq,
    'eufp_opecfreq': eufp_opecfreq,
}
for column_name, column_values in feature_columns.items():
    df[column_name] = column_values

In [23]:
print(df)

          Date  Weekly Price  inc_dec01  oil_senti   oil_nlp  eufp_senti  \
0   2011-01-07         94.72        1.0  -0.112676  2.028169   -0.215517   
1   2011-01-14         97.09        1.0  -0.040000  2.083333   -0.161290   
2   2011-01-21         97.34        1.0   0.022727  2.164706   -0.096970   
3   2011-01-28         96.62        1.0   0.201550  2.158730   -0.263889   
4   2011-02-04        100.36        1.0   0.228814  2.137931   -0.198157   
..         ...           ...        ...        ...       ...         ...   
213 2015-02-06         54.62        1.0   0.290909  2.223242   -0.244083   
214 2015-02-13         56.57        0.0   0.237996  2.172269   -0.338101   
215 2015-02-20         60.57        0.0   0.257634  2.156673   -0.301205   
216 2015-02-27         60.63        0.0   0.260097  2.129296   -0.163758   
217 2015-03-06         60.12        1.0   0.254386  2.165187   -0.190889   

     eufp_nlp  eufp_oilfreq  eufp_opecfreq  
0    1.965517      0.000000       0.024590

## SVM using BlockingTimeSeriesSplit

In [24]:
# Target: the forward-looking direction label.
y = df['inc_dec01'].to_numpy()
# Select the six features by name rather than by position: when the cells run
# top to bottom, 'inc_dec_lag1' is inserted before 'inc_dec01', which shifts a
# positional 3:9 slice onto the target column and drops 'eufp_opecfreq'.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [27]:
# Chronological hold-out: the first 150 rows train the model, the remaining
# rows are kept for testing. Labels are cast to int.
X_train, X_test = X[:150], X[150:]
y_train = y[:150].astype(int)
y_test = y[150:].astype(int)

In [69]:
print(len(y_test))

68


In [71]:
print(sum(y_train))

78


In [72]:
print(sum(y_train)/len(y_train))

0.52


In [29]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts a series into disjoint, ordered blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal
    size. Inside each block the leading part is used for training and the
    trailing part for validation, so blocks never overlap. Samples left over
    after the integer division are not used by any split.

    This class is defined in several sections of the notebook; a later
    definition shadows an earlier one, so keep them identical.

    Parameters
    ----------
    n_splits : int
        Number of blocks, i.e. of (train, validation) pairs.
    train_fraction : float, default 0.7
        Share of each block assigned to training (rounded down to whole rows).
    margin : int, default 0
        Rows skipped between the end of the training part and the start of
        the validation part.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """

    def __init__(self, n_splits, train_fraction=0.7, margin=0):
        if n_splits < 1:
            raise ValueError("n_splits must be at least 1, got %r" % (n_splits,))
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` for each block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # First row of the validation part before the margin is applied.
            mid = int(self.train_fraction * k_fold_size) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [30]:
btscv = BlockingTimeSeriesSplit(n_splits=2)

# For each block print the training positions first, then the validation positions.
for fold in btscv.split(X_train, y_train):
    for positions in fold:
        print((positions))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51]
[52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74]
[ 75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126]
[127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149]


In [31]:
print(sum(y_train[52:75]))
print(sum(y_train[127:150]))

9
16


In [32]:
print(len(y_train[52:75]))
print(len(y_train[127:150]))

23
23


In [33]:
# Recode the 0 class as -1 for the SVM section; entries equal to 1 are unchanged.
y_train, y_test = (
    np.where(labels == 0, -1, labels) for labels in (y_train, y_test)
)
print(y_train)

[ 1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1  1  1  1 -1 -1
 -1  1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1 -1  1  1 -1
 -1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1  1  1
  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1
  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1  1
  1  1  1  1  1  1]


In [34]:
# Fit the standardiser on the training rows only, then replace X_train by its
# standardised values. Re-running this cell would refit on the scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [35]:
# Two blocked folds over the training rows.
btscv = BlockingTimeSeriesSplit(n_splits=2)

# RBF-kernel SVM: every combination of five C values and five gamma values.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Candidates are scored with F1; refit=True retrains the best one on all of X_train.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring='f1',
    cv=btscv,
    refit=True,
    verbose=3,
)

# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 2 folds for each of 25 candidates, totalling 50 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.821, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.821, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.000, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] ....... C=0.1, gamma=0.01, kernel=rbf, score=0.821, total=   0.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.435, total=   0.0s
[CV] C=1000, gamma=0.01, kernel=rbf ..................................
[CV] ...... C=1000, gamma=0.01, kernel=rbf, score=0.519, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.462, total=   0.0s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.600, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.333, total=   0.0s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.462, total=   0.0s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.2s finished


GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fe2fef03c10>,
             estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             scoring='f1', verbose=3)

In [98]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, gamma=0.1)


## Performance of model on test set

In [36]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
# Scale the held-out rows with the scaler fitted on the training rows, without
# overwriting X_test, so re-running this cell cannot scale the data twice.
y_pred_svm = grid.predict(scaler.transform(X_test))

In [37]:
print(classification_report(y_test, y_pred_svm, digits = 4))

              precision    recall  f1-score   support

          -1     0.7381    0.6458    0.6889        48
           1     0.3462    0.4500    0.3913        20

    accuracy                         0.5882        68
   macro avg     0.5421    0.5479    0.5401        68
weighted avg     0.6228    0.5882    0.6014        68



In [38]:
# Row-normalise the confusion matrix so each true-class row sums to 1; the
# diagonal then holds the share of each class that was predicted correctly.
cm = confusion_matrix(y_test, y_pred_svm).astype('float')
cm = cm / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[0.64583333 0.45      ]
[[0.64583333 0.35416667]
 [0.55       0.45      ]]


## Naive Bayes with BlockingTimeSeriesSplit

In [39]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts a series into disjoint, ordered blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal
    size. Inside each block the leading part is used for training and the
    trailing part for validation, so blocks never overlap. Samples left over
    after the integer division are not used by any split.

    This class is defined in several sections of the notebook; a later
    definition shadows an earlier one, so keep them identical.

    Parameters
    ----------
    n_splits : int
        Number of blocks, i.e. of (train, validation) pairs.
    train_fraction : float, default 0.7
        Share of each block assigned to training (rounded down to whole rows).
    margin : int, default 0
        Rows skipped between the end of the training part and the start of
        the validation part.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """

    def __init__(self, n_splits, train_fraction=0.7, margin=0):
        if n_splits < 1:
            raise ValueError("n_splits must be at least 1, got %r" % (n_splits,))
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` for each block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # First row of the validation part before the margin is applied.
            mid = int(self.train_fraction * k_fold_size) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [40]:
# Target: the forward-looking direction label.
y = df['inc_dec01'].to_numpy()
# Select the six features by name rather than by position: when the cells run
# top to bottom, 'inc_dec_lag1' is inserted before 'inc_dec01', which shifts a
# positional 3:9 slice onto the target column and drops 'eufp_opecfreq'.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [41]:
# Chronological hold-out at row 150; labels are cast to int (classes stay 0/1 here).
split_at = 150
X_train, y_train = X[:split_at], y[:split_at].astype(int)
X_test, y_test = X[split_at:], y[split_at:].astype(int)

In [42]:
# Fit the power transform on the training rows only; X_train is then replaced
# by its transformed values, so re-running this cell refits on transformed data.
scaler = PowerTransformer() #transforms the features to a more or less Gaussian distribution
X_train = scaler.fit_transform(X_train)

In [43]:
# Two blocked folds over the training rows.
btscv = BlockingTimeSeriesSplit(n_splits=2)

# Ten var_smoothing candidates, log-spaced from 1e0 down to 1e-9.
param_grid = dict(var_smoothing=np.logspace(0, -9, num=10))

# Candidates are scored with F1; refit=True retrains the best one on all of X_train.
grid = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid,
    scoring='f1',
    cv=btscv,
    refit=True,
    verbose=3,
)

#fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] var_smoothing=1.0 ...............................................
[CV] ................... var_smoothing=1.0, score=0.200, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] ................... var_smoothing=1.0, score=0.348, total=   0.0s
[CV] var_smoothing=0.1 ...............................................
[CV] ................... var_smoothing=0.1, score=0.462, total=   0.0s
[CV] var_smoothing=0.1 ...............................................
[CV] ................... var_smoothing=0.1, score=0.444, total=   0.0s
[CV] var_smoothing=0.01 ..............................................
[CV] .................. var_smoothing=0.01, score=0.429, total=   0.0s
[CV] var_smoothing=0.01 ..............................................
[CV] .................. var_smoothing=0.01, score=0.552, total=   0.0s
[CV] var_smoothing=0.001 .............................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished


GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fe2fef18a60>,
             estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])},
             scoring='f1', verbose=3)

In [44]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'var_smoothing': 0.01}
GaussianNB(var_smoothing=0.01)


## Performance of model on test set

In [45]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
# Transform the held-out rows with the transformer fitted on the training rows,
# without overwriting X_test, so re-running this cell cannot transform twice.
y_pred_nb = grid.predict(scaler.transform(X_test))

In [46]:
print(classification_report(y_test, y_pred_nb, digits = 4))

              precision    recall  f1-score   support

           0     0.7391    0.7083    0.7234        48
           1     0.3636    0.4000    0.3810        20

    accuracy                         0.6176        68
   macro avg     0.5514    0.5542    0.5522        68
weighted avg     0.6287    0.6176    0.6227        68



In [47]:
# Row-normalised confusion matrix: each true-class row is divided by its total.
raw_counts = confusion_matrix(y_test, y_pred_nb)
row_totals = raw_counts.sum(axis=1).reshape(-1, 1)
cm = raw_counts.astype('float') / row_totals
print(cm.diagonal())
print(cm)

[0.70833333 0.4       ]
[[0.70833333 0.29166667]
 [0.6        0.4       ]]


## MLP with BlockingTimeSeriesSplit

In [48]:
# Target: the forward-looking direction label.
y = df['inc_dec01'].to_numpy()
# Select the six features by name rather than by position: when the cells run
# top to bottom, 'inc_dec_lag1' is inserted before 'inc_dec01', which shifts a
# positional 3:9 slice onto the target column and drops 'eufp_opecfreq'.
X = df[['oil_senti', 'oil_nlp', 'eufp_senti', 'eufp_nlp',
        'eufp_oilfreq', 'eufp_opecfreq']].to_numpy()

In [49]:
# Chronological hold-out: rows before position 150 train, the rest test.
X_train, X_test = X[:150], X[150:]
# Labels are cast to int (classes stay 0/1 here).
y_train, y_test = (part.astype(int) for part in (y[:150], y[150:]))

In [50]:
print(y_train.shape)
print(X_train.shape)

(150,)
(150, 6)


In [51]:
input_dim = X_train.shape[1]
print(input_dim)

6


In [52]:
# Fit the standardiser on the training rows only, then replace X_train by its
# standardised values. Re-running this cell would refit on the scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [53]:
class BlockingTimeSeriesSplit():
    """Cross-validation splitter that cuts a series into disjoint, ordered blocks.

    The samples are divided into ``n_splits`` consecutive blocks of equal
    size. Inside each block the leading part is used for training and the
    trailing part for validation, so blocks never overlap. Samples left over
    after the integer division are not used by any split.

    This class is defined in several sections of the notebook; a later
    definition shadows an earlier one, so keep them identical.

    Parameters
    ----------
    n_splits : int
        Number of blocks, i.e. of (train, validation) pairs.
    train_fraction : float, default 0.7
        Share of each block assigned to training (rounded down to whole rows).
    margin : int, default 0
        Rows skipped between the end of the training part and the start of
        the validation part.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """

    def __init__(self, n_splits, train_fraction=0.7, margin=0):
        if n_splits < 1:
            raise ValueError("n_splits must be at least 1, got %r" % (n_splits,))
        self.n_splits = n_splits
        self.train_fraction = train_fraction
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splits; the arguments are accepted and ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` for each block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # First row of the validation part before the margin is applied.
            mid = int(self.train_fraction * k_fold_size) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

In [54]:
def FindLayerNodesLinear_one(n_layers, first_layer_nodes):
    """Return the hidden-layer widths for a network with one hidden layer.

    ``n_layers`` is accepted but not used: the result is always a one-element
    list containing ``first_layer_nodes``.
    """
    return [first_layer_nodes]

In [56]:
def createmodel_one(n_layers, first_layer_nodes, dropout_rate, lr):
    """Build and compile a binary classifier with a single hidden layer.

    Parameters
    ----------
    n_layers : int
        Accepted so it can appear in the hyper-parameter grid; it is not
        used, the network always has exactly one hidden layer.
    first_layer_nodes : int
        Number of units in the hidden layer.
    dropout_rate : float
        Dropout rate applied after the hidden activation.
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    The compiled Keras ``Sequential`` model (binary cross-entropy loss).

    Notes
    -----
    Reads the notebook-level variable ``input_dim`` for the number of input
    features.
    """
    model = tf.keras.models.Sequential()

    #Input layer -- the shape is given as a tuple, the form Keras documents
    model.add(Input(shape=(input_dim,)))

    #Hidden layer -- its input size comes from the Input layer above
    model.add(Dense(first_layer_nodes))
    model.add(LeakyReLU())
    model.add(Dropout(dropout_rate, seed=2))

    #Finally, the output layer should have a single node in binary classification
    model.add(Dense(1, activation='sigmoid'))

    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss='binary_crossentropy')

    return model

In [57]:
# Re-seed the random number generators immediately before the search.
reset_random_seeds()

btscv = BlockingTimeSeriesSplit(n_splits=2)

# Only the hidden width and the dropout rate vary; depth, epochs and learning
# rate each have a single candidate value.
param_grid = {
    'n_layers': [1],
    'first_layer_nodes': [5, 6, 7, 8, 9, 10, 11],
    'dropout_rate': [0.2, 0.3, 0.4],
    'epochs': [50],
    'lr': [0.001],
}

Kmodel = KerasClassifier(build_fn=createmodel_one, verbose=3)
# Tag the wrapper as a classifier (presumably so scikit-learn's scoring treats it as one).
Kmodel._estimator_type = "classifier"

grid = GridSearchCV(
    estimator=Kmodel,
    param_grid=param_grid,
    cv=btscv,
    refit=True,
    scoring='f1',
    return_train_score=True,
)

grid.fit(X_train, y_train)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


GridSearchCV(cv=<__main__.BlockingTimeSeriesSplit object at 0x7fe2fef41bb0>,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fe2fef41970>,
             param_grid={'dropout_rate': [0.2, 0.3, 0.4], 'epochs': [50],
                         'first_layer_nodes': [5, 6, 7, 8, 9, 10, 11],
                         'lr': [0.001], 'n_layers': [1]},
             return_train_score=True, scoring='f1')

In [58]:
#print best parameter after tuning
print(grid.best_params_)
  
#print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

{'dropout_rate': 0.4, 'epochs': 50, 'first_layer_nodes': 9, 'lr': 0.001, 'n_layers': 1}
<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fe300f801c0>


## Performance of model on test set

In [59]:
#Since we have set refit = True, GridSearch refits the model on the whole training set using the best parameters
# Scale the held-out rows with the scaler fitted on the training rows, then predict.
# NOTE(review): X_test is overwritten with its scaled values, so running this
# cell a second time would scale the data twice.
X_test = scaler.transform(X_test)
y_pred_mlp = grid.predict(X_test)



In [60]:
print(classification_report(y_test, y_pred_mlp, digits = 4))

              precision    recall  f1-score   support

           0     0.6667    0.6250    0.6452        48
           1     0.2174    0.2500    0.2326        20

    accuracy                         0.5147        68
   macro avg     0.4420    0.4375    0.4389        68
weighted avg     0.5345    0.5147    0.5238        68



In [61]:
# Row-normalise the confusion matrix so each true-class row sums to 1.
cm = confusion_matrix(y_test, y_pred_mlp).astype('float')
cm = cm / cm.sum(axis=1, keepdims=True)
print(cm.diagonal())
print(cm)

[0.625 0.25 ]
[[0.625 0.375]
 [0.75  0.25 ]]


## Majority vote

In [62]:
#Convert predictions of -1 to 0 since the other two models predict 0 for decrease
y_pred_svm = np.where(y_pred_svm==-1, 0, y_pred_svm)

In [63]:
#Reshape y_pred_mlp to have the same shape as the other predictions
# Flatten the single-column prediction array to 1-D (assumes exactly one
# column, as produced by the one-unit output layer -- confirm). Unlike
# indexing with [:, 0], this is safe to re-run on an already flat array.
y_pred_mlp = np.asarray(y_pred_mlp).reshape(-1)

In [64]:
# A sample is labelled 1 only when more than one of the three models predicts 1.
maj_vote = [
    1 if sum(votes) > 1 else 0
    for votes in zip(y_pred_svm, y_pred_nb, y_pred_mlp)
]

In [70]:
print(maj_vote)

[0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [66]:
print(classification_report(y_test, maj_vote, digits = 4))

              precision    recall  f1-score   support

           0     0.7556    0.7083    0.7312        48
           1     0.3913    0.4500    0.4186        20

    accuracy                         0.6324        68
   macro avg     0.5734    0.5792    0.5749        68
weighted avg     0.6484    0.6324    0.6392        68



In [67]:
# Print the raw counts first, then the row-normalised matrix and its diagonal.
vote_counts = confusion_matrix(y_test, maj_vote)
print(vote_counts)
cm = vote_counts.astype('float') / vote_counts.sum(axis=1)[:, np.newaxis]
print(cm.diagonal())
print(cm)

[[34 14]
 [11  9]]
[0.70833333 0.45      ]
[[0.70833333 0.29166667]
 [0.55       0.45      ]]


In [103]:
# Baseline predictions for the test window: the lag-1 direction from row 150 onward.
y_base = df['inc_dec_lag1'].to_numpy()[150:]

In [104]:
print(classification_report(y_test, y_base, digits = 4))

              precision    recall  f1-score   support

           0     0.6977    0.6250    0.6593        48
           1     0.2800    0.3500    0.3111        20

    accuracy                         0.5441        68
   macro avg     0.4888    0.4875    0.4852        68
weighted avg     0.5748    0.5441    0.5569        68

