In [6]:
%%capture
!pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc4
!pip install autokeras

In [56]:
import os

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import EarlyStopping, CSVLogger
from keras.models import load_model
import autokeras as ak

from sklearn.model_selection import train_test_split

In [8]:
# Colab-only step: mount Google Drive at /content/gdrive so the project
# folder (data, code, saved models) is reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
cd "gdrive/My Drive/BT4222/Final Project/code"

/content/gdrive/My Drive/BT4222/Final Project/code


## <font color='navy'>0. Text preparation</font>

### Drop features

In [10]:
# Load the combined tweets / sentiment / log-returns table; the path is
# relative to the code/ folder selected as working directory above.
df = pd.read_csv('../data/tweets&sentiment&logreturns.csv')

In [11]:
# Keep only the tweet text and the three series that are later used as
# regression targets; every other column is discarded.
df = df.loc[:, ['cleaned_text', '^GSPC', '^IXIC', 'VGT']]

### Drop NA values

In [12]:
# Count the missing values in each column to see which rows must be dropped.
df.isna().sum()

cleaned_text    1192
^GSPC              0
^IXIC              0
VGT                0
dtype: int64

In [13]:
# Drop every row that contains a missing value, then renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and dropping that column again.
df = df.dropna(how='any').reset_index(drop=True)

In [14]:
# Preview the first rows of the cleaned frame (text column plus the three targets).
df.head()

Unnamed: 0,cleaned_text,^GSPC,^IXIC,VGT
0,thank,0.005279,0.014137,0.012381
1,big news maine court side rnc uphold ban ballo...,0.005279,0.014137,0.012381
2,thank paul,0.005279,0.014137,0.012381
3,defective ballot new york want replace happen ...,0.005279,0.014137,0.012381
4,half years secure americas border rebuild awes...,0.005279,0.014137,0.012381


### Train test split

In [15]:
# Model input: the cleaned tweet text as a 1-D NumPy array.
X = df['cleaned_text'].to_numpy()

# Targets: selecting with a one-element column list keeps each target 2-D,
# i.e. a single-column (n_samples, 1) array per series.
y_GSPC = df[['^GSPC']].to_numpy()
y_IXIC = df[['^IXIC']].to_numpy()
y_VGT = df[['VGT']].to_numpy()

In [16]:
# Fraction of the whole data set used for training (80%); the splits below
# hold out the remaining 1 - train_ratio as the test set.
train_ratio = 0.8

#### GSPC

In [17]:
# Train/test split of the tweets against the ^GSPC targets. X, test_size and
# random_state are the same as in the other two splits, so all three targets
# share one row partition.
(
    X_train_GSPC,
    X_test_GSPC,
    y_train_GSPC,
    y_test_GSPC,
) = train_test_split(
    X,
    y_GSPC,
    test_size=1 - train_ratio,
    random_state=1,
)

#### IXIC

In [18]:
# Train/test split of the tweets against the ^IXIC targets. X, test_size and
# random_state are the same as in the other two splits, so all three targets
# share one row partition.
(
    X_train_IXIC,
    X_test_IXIC,
    y_train_IXIC,
    y_test_IXIC,
) = train_test_split(
    X,
    y_IXIC,
    test_size=1 - train_ratio,
    random_state=1,
)

#### VGT

In [19]:
# Train/test split of the tweets against the VGT targets. X, test_size and
# random_state are the same as in the other two splits, so all three targets
# share one row partition.
(
    X_train_VGT,
    X_test_VGT,
    y_train_VGT,
    y_test_VGT,
) = train_test_split(
    X,
    y_VGT,
    test_size=1 - train_ratio,
    random_state=1,
)

## <font color='navy'>1. Build Models</font>

### Define error metrics

Write a custom function to calculate MAPE, the mean absolute percentage error. MAPE expresses accuracy as a percentage of the error. Because MAPE is a percentage, it can be easier to interpret than other accuracy measures such as RMSE. Refer to the picture below for a guide to interpreting MAPE values.

<img src="https://www.researchgate.net/profile/Albert_Sese/publication/257812432/figure/tbl1/AS:601657310203931@1520457689632/nterpretation-of-typical-MAPE-values.png" width=300 align="left"/>

In [32]:
# Custom helpers to calculate MAPE (mean absolute percentage error).

def percentage_error(actual, predicted):
    """Return the element-wise relative error of ``predicted`` w.r.t. ``actual``.

    Where the target is non-zero the error is ``(actual - predicted) / actual``.
    Where the target is exactly zero that ratio is undefined, so the prediction
    is scaled by the mean of all targets instead: ``predicted / mean(actual)``.
    If that mean is itself zero, NumPy division semantics apply (inf/nan plus
    a RuntimeWarning).

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    predicted : array-like
        Predictions with the same number of elements as ``actual``. They are
        reshaped to ``actual``'s shape so that e.g. ``(n,)`` predictions pair
        up element-wise with ``(n, 1)`` targets instead of broadcasting.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``actual``.

    Raises
    ------
    ValueError
        If ``predicted`` does not hold as many elements as ``actual``.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float).reshape(actual.shape)

    res = np.empty(actual.shape)
    nonzero = actual != 0
    res[nonzero] = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    if not nonzero.all():
        # Fallback denominator for zero targets; evaluated once, not per row.
        res[~nonzero] = predicted[~nonzero] / np.mean(actual)
    return res

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE of ``y_pred`` against ``y_true``, in percent.

    Computed as the mean of the absolute values returned by
    ``percentage_error`` multiplied by 100. Both arguments may be any
    array-like with matching element counts.
    """
    return np.mean(np.abs(percentage_error(y_true, y_pred))) * 100

#### GSPC

In [54]:
# Output locations for the ^GSPC AutoKeras run (relative to the working directory).

# Where the best exported model is saved
GSPC_BEST_MODEL_PATH = '../model/AutoKeras_GSPC/best_model'

# Training-history log file
GSPC_LOG_PATH = '../model/AutoKeras_GSPC/log/GSPC_history.log'

# Weight checkpoints written while the models train
GSPC_CHECKPOINT_PATH = '../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt'

In [38]:
# Create the output folders up front so a fresh checkout can run this notebook:
# the CSV logger opens its file when training starts and is not expected to
# create missing parent directories.
os.makedirs(os.path.dirname(GSPC_CHECKPOINT_PATH), exist_ok=True)
os.makedirs(os.path.dirname(GSPC_LOG_PATH), exist_ok=True)

# Callback that saves the model's weights (weights only) to the checkpoint path.
# NOTE(review): every save targets the same file, so it presumably ends up
# holding the most recently trained weights rather than the best ones -- confirm.
GSPC_cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=GSPC_CHECKPOINT_PATH,
    save_weights_only=True,
    verbose=1,
)

# Callback that writes the training history to a CSV log (overwritten, not
# appended). Taken from tf.keras like ModelCheckpoint above, so both callbacks
# come from the same Keras implementation.
GSPC_csv_logger = tf.keras.callbacks.CSVLogger(GSPC_LOG_PATH, separator=',', append=False)

In [39]:
# Set up the AutoKeras text regressor used to search for a ^GSPC model.
GSPC_reg = ak.TextRegressor(
    max_trials=10,  # the search tries 10 different models
    objective="val_loss",
    loss="mean_squared_error",
    seed=1,
    # NOTE(review): presumably makes the search reuse an existing project
    # directory from an earlier run instead of starting over -- confirm.
    overwrite=False,
)

In [40]:
# Run the model search on the ^GSPC training split: 10 epochs, with 20% of the
# training data held out for validation and both callbacks attached.
GSPC_reg.fit(
    X_train_GSPC,
    y_train_GSPC,
    validation_split=0.2,
    epochs=10,
    callbacks=[GSPC_cp_callback, GSPC_csv_logger],
)

Trial 10 Complete [00h 02m 15s]
val_loss: 0.00024642361677251756

Best val_loss So Far: 0.0002422880061203614
Total elapsed time: 00h 54m 22s
INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 00001: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt

Epoch 00001: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt
Epoch 2/10
Epoch 00002: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt

Epoch 00002: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt
Epoch 3/10
Epoch 00003: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt

Epoch 00003: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt
Epoch 4/10
Epoch 00004: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt

Epoch 00004: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt
Epoch 5/10
Epoch 00005: saving model to ../model/AutoKeras_GSPC/checkpoints/GSPC_cp.ckpt

Epoch 00005: saving model to ../model/AutoKeras_GSPC/check

In [41]:
# Predict the ^GSPC targets of the held-out test tweets with the best model
y_pred_GSPC = GSPC_reg.predict(X_test_GSPC)

In [42]:
# Evaluate the best model on the held-out test split. The regressor was
# configured with loss="mean_squared_error", so the reported loss is the MSE.
evaluate_GSPC = GSPC_reg.evaluate(X_test_GSPC, y_test_GSPC)
# Echo the result: the assignment alone leaves the cell without a visible value.
evaluate_GSPC



In [43]:
# Evaluate the best model with testing data based on MAPE (result is in percent).
# NOTE(review): the targets previewed by df.head() are close to zero and a
# percentage error divides by the target, which is likely why this value is so
# large -- read it together with the MSE rather than on its own.
mean_absolute_percentage_error(y_test_GSPC, y_pred_GSPC)

387.7718566112492

In [50]:
# Export the best Keras model from the pool of candidate models
# and print its layer-by-layer summary.
best_model_GSPC = GSPC_reg.export_model()
best_model_GSPC.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
tf_op_layer_ExpandDims (Tens (None, 1)                 0         
_________________________________________________________________
text_vectorization (TextVect (None, 64)                0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           640128    
_________________________________________________________________
dropout (Dropout)            (None, 64, 128)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 62, 32)            12320     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 31, 32)           

In [55]:
# Save the exported best Keras model to disk at GSPC_BEST_MODEL_PATH,
# using the TensorFlow format (save_format="tf").
best_model_GSPC.save(GSPC_BEST_MODEL_PATH, save_format="tf")

INFO:tensorflow:Assets written to: ../model/AutoKeras_GSPC/best_model/assets


In [57]:
# Optional: reload the saved best model in a later session instead of re-running the search.
# best_model_GSPC = load_model(GSPC_BEST_MODEL_PATH, custom_objects=ak.CUSTOM_OBJECTS)



#### IXIC

#### VGT