In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import os
import spacy
import en_core_web_sm
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import time


ModuleNotFoundError: No module named 'keras'

In [None]:
# Path to the input CSV, resolved against the current working directory.
# The file is later read by Dataset, which expects "Headline" and
# "Price movement" columns.
data = os.path.join(os.getcwd(), 'data.csv')

# Loaded spaCy English pipeline; Vocabulary.tokenizer_eng uses its tokenizer.
spacy_eng = en_core_web_sm.load()

In [None]:
# Map each word to an index, converting strings to numerical values
class Vocabulary:
    """Two-way mapping between lower-cased word tokens and integer indices.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Regular words are added by
    :meth:`build_vocabulary` once they have been seen ``freq_threshold``
    times; any word that never reaches the threshold is encoded as ``<UNK>``
    by :meth:`numericalize`.
    """

    def __init__(self, freq_threshold):
        """Create a vocabulary containing only the four special tokens.

        Args:
            freq_threshold: number of occurrences a word needs before it gets
                its own index. Values below 1 are treated as 1 (every word
                seen is kept).
        """
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of entries (special tokens included)."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into lower-cased tokens with the module-level
        ``spacy_eng`` tokenizer, e.g. "I love coffee" -> ["i", "love", "coffee"].
        """
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """Add every word that occurs at least ``freq_threshold`` times.

        Words receive consecutive indices in the order in which they reach
        the threshold. The method may be called more than once: indices
        already assigned are kept and new words continue after the highest
        existing index. Occurrence counts are not carried over between calls.

        Args:
            sentence_list: iterable of raw sentences (strings).
        """
        frequencies = {}
        # A count starts at 1, so a threshold below 1 could never be matched
        # by the equality test below; clamp it so "0" means "keep everything".
        threshold = max(1, self.freq_threshold)
        # Continue after the highest index in use instead of a hard-coded 4,
        # otherwise a second call would overwrite existing itos entries.
        idx = max(self.itos, default=-1) + 1

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count
                # Register the word exactly once: at the moment it reaches
                # the threshold, and only if it has no index yet.
                if count == threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of vocabulary indices.

        Tokens missing from the vocabulary map to the ``<UNK>`` index.
        """
        unk = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk) for token in self.tokenizer_eng(text)]

In [None]:
class Dataset:
    """Load headlines and price-movement labels from a CSV and build
    train/test arrays for a sequence classifier.

    Attributes:
        df: the raw DataFrame read from ``captions_file``.
        captions: the "Headline" column.
        results: the "Price movement" column ("Down" / "Up").
        vocab: ``Vocabulary`` built from all headlines.
        X: int32 array of shape (n_samples, max_len); each row is
            ``<SOS> tokens... <EOS>``, left-padded with the ``<PAD>`` index.
        Y: float32 one-hot array of shape (n_samples, 2).
        X_train, X_test, y_train, y_test: the split produced by
            ``train_test_split``.
    """

    # Maps the raw label strings to class ids (column positions in ``Y``).
    LABEL_TO_ID = {"Down": 0, "Up": 1}

    def __init__(self, captions_file, freq_threshold=5, test_size=0.2, random_state=None):
        """Read the CSV, build the vocabulary, encode the data and split it.

        Args:
            captions_file: path to a CSV with "Headline" and
                "Price movement" columns.
            freq_threshold: forwarded to ``Vocabulary``.
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``; pass an int for
                a reproducible split.

        Raises:
            KeyError: if a required column is missing or a label is not one
                of the keys of ``LABEL_TO_ID``.
        """
        self.df = pd.read_csv(captions_file)

        # Text inputs and their labels.
        self.captions = self.df["Headline"]
        self.results = self.df["Price movement"]

        # Build the vocabulary from every headline.
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

        sos = self.vocab.stoi["<SOS>"]
        eos = self.vocab.stoi["<EOS>"]
        sequences = [
            [sos, *self.vocab.numericalize(caption), eos]
            for caption in self.captions
        ]

        # Headlines have different lengths; pad on the left with <PAD> so the
        # result is a rectangular matrix. (DataFrame.append, used previously,
        # is not in-place and no longer exists in pandas 2.x, so X stayed empty.)
        max_len = max((len(seq) for seq in sequences), default=0)
        self.X = np.full(
            (len(sequences), max_len), self.vocab.stoi["<PAD>"], dtype="int32"
        )
        for row, seq in enumerate(sequences):
            self.X[row, max_len - len(seq):] = seq

        # One-hot encode the labels: row i has a 1 in the column of its class id.
        label_ids = np.asarray(
            [self.LABEL_TO_ID[result] for result in self.results], dtype=int
        )
        self.Y = np.eye(len(self.LABEL_TO_ID), dtype="float32")[label_ids]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.Y, test_size=test_size, random_state=random_state
        )


In [None]:
# Layer sizes: embedding vector width and number of LSTM units.
embed_dim = 128
lstm_out = 196
dataset = Dataset(data)

# Embedding -> spatial dropout -> LSTM -> 2-way softmax classifier.
# The embedding table has one row per vocabulary entry, and the input length
# is taken from the second dimension of the training matrix.
model = Sequential()
model.add(Embedding(len(dataset.vocab.stoi), embed_dim, input_length=dataset.X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
# NOTE(review): categorical_crossentropy with a 2-unit softmax expects
# one-hot targets of width 2 - confirm dataset.y_train has that shape.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# emitted an extra "None" line.
model.summary()

In [None]:
# Train the LSTM classifier built above on the training split.
batch_size = 32
model.fit(
    x=dataset.X_train,
    y=dataset.y_train,
    batch_size=batch_size,
    epochs=7,
    verbose=2,
)

In [None]:
# Evaluate on the held-out split. The model was compiled with a single
# 'accuracy' metric, so evaluate() yields two values: the loss (kept under
# the name `score`) followed by the accuracy.
score, acc = model.evaluate(
    x=dataset.X_test,
    y=dataset.y_test,
    batch_size=batch_size,
    verbose=2,
)
print("score: %.2f" % score)
print("acc: %.2f" % acc)