In [60]:
import os
import spacy
import en_core_web_sm
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time


In [31]:
# Absolute path of the input CSV, expected in the current working directory.
data = os.path.join(os.getcwd(), "data_final.csv")

# English spaCy pipeline; Vocabulary.tokenizer_eng uses its tokenizer.
spacy_eng = en_core_web_sm.load()

In [32]:
# Maps each word to an integer index so that text can be converted to a sequence of numbers
class Vocabulary:
    """Word-to-index vocabulary built from tokenized sentences.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Learned words are numbered from 4 upwards, in the
    order in which they first reach the frequency threshold.

    Attributes:
        itos: dict mapping index -> token string.
        stoi: dict mapping token string -> index.
        freq_threshold: occurrences required before a word gets its own index.
    """

    def __init__(self, freq_threshold):
        """Create a vocabulary that contains only the four special tokens.

        Args:
            freq_threshold: Number of times a word must occur within one
                ``build_vocabulary`` call before it receives its own index.
                Words that stay below it are encoded as ``<UNK>`` by
                ``numericalize``.
        """
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into lower-cased tokens using the spaCy English tokenizer.

        Example: "I love coffee" -> ["i", "love", "coffee"]
        """
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """Add every sufficiently frequent word of ``sentence_list`` to the vocabulary.

        Word counts are local to this call. A word is registered the moment
        its count reaches ``freq_threshold``; words already present keep their
        existing index, so the method can be called more than once without
        corrupting the ``stoi`` / ``itos`` mappings.

        Args:
            sentence_list: Iterable of strings to tokenize and count.
        """
        frequencies = {}
        # Continue after the indices already in use (4 on a fresh instance,
        # because of the special tokens) instead of always restarting at 4,
        # which would overwrite earlier entries on a repeated call.
        idx = len(self.itos)

        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                # ">=" together with the membership guard registers each word
                # exactly once, and also behaves sensibly for thresholds that
                # are <= 0 or not whole numbers.
                if frequencies[word] >= self.freq_threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1  # next new word goes to the next index

    def numericalize(self, text):
        """Convert ``text`` into a list of vocabulary indices.

        Tokens that are not in the vocabulary are mapped to the ``<UNK>`` index.

        Args:
            text: String to tokenize and encode.

        Returns:
            List of integer indices, one per token.
        """
        unk_index = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_index) for token in self.tokenizer_eng(text)]

array(['Down', 'Up', nan], dtype=object)

In [61]:
class Dataset:
    """Headline / price-movement data prepared for a binary text classifier.

    Reads a CSV with ``Headline`` and ``Price movement`` columns, keeps the
    rows labelled "Up" or "Down" that have a headline, builds a word
    vocabulary from the headlines and produces a TF-IDF train/test split.

    Attributes:
        df: Filtered DataFrame.
        captions: ``Headline`` column of ``df``.
        results: ``Price movement`` column of ``df``.
        vocab: ``Vocabulary`` built from all headlines.
        Y: List of integer labels (0 = "Down", 1 = "Up"), aligned with ``captions``.
        vectorizer: ``TfidfVectorizer`` fitted on the training headlines.
        captions_train, captions_test: Raw headline splits.
        X_train, X_test: TF-IDF matrices for the two splits.
        y_train, y_test: Integer labels for the two splits.
    """

    def __init__(self, captions_file, freq_threshold=5, test_size=0.2, random_state=42):
        """Load the CSV and build vocabulary, labels and TF-IDF features.

        Args:
            captions_file: Path of the CSV file to read.
            freq_threshold: Frequency threshold passed to ``Vocabulary``.
            test_size: Fraction of the rows held out for testing.
            random_state: Seed for the train/test shuffle, so that repeated
                runs produce the same split. Pass ``None`` for a different
                random split on every run.
        """
        self.df = pd.read_csv(captions_file)
        # Keep only rows with a usable label; anything else (e.g. NaN) is dropped.
        self.df = self.df.loc[self.df['Price movement'].isin(["Up", "Down"])]
        # A missing headline is read as NaN (a float), which neither the
        # tokenizer nor TfidfVectorizer can process, so drop those rows too.
        self.df = self.df.loc[self.df["Headline"].notna()]

        # Text input and target columns
        self.captions = self.df["Headline"]
        self.results = self.df["Price movement"]

        # Initialize the vocabulary and build it from all headlines
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.captions.tolist())

        # Kept on the instance so that new headlines can be transformed later.
        self.vectorizer = TfidfVectorizer(min_df=5,
                                          max_df=0.8,
                                          sublinear_tf=True,
                                          use_idf=True)

        # Encode the target as integers; every remaining row is "Up" or "Down".
        label_ids = {"Down": 0, "Up": 1}
        self.Y = self.results.map(label_ids).tolist()

        (self.captions_train, self.captions_test,
         self.y_train, self.y_test) = train_test_split(
            self.captions, self.Y, test_size=test_size, random_state=random_state)

        # Fit on the training headlines only; the test headlines are merely
        # transformed with the fitted vocabulary and IDF weights.
        self.X_train = self.vectorizer.fit_transform(self.captions_train)
        self.X_test = self.vectorizer.transform(self.captions_test)


In [67]:
# Read the CSV and build the vocabulary, labels and TF-IDF train/test features.
dataset = Dataset(captions_file=data)

  (0, 8)	0.3437724150971783
  (0, 202)	0.3560424120383173
  (0, 52)	0.4525160743373893
  (0, 80)	0.40976245696498775
  (0, 75)	0.42843848566862414
  (0, 132)	0.23064933959392808
  (0, 194)	0.3816013691266663
  (1, 99)	0.17953753105774062
  (1, 139)	0.3431451938337875
  (1, 76)	0.31845504847688344
  (1, 15)	0.19877095874241008
  (1, 112)	0.4191902027652396
  (1, 16)	0.31500864364001885
  (1, 91)	0.31845504847688344
  (1, 106)	0.31845504847688344
  (1, 79)	0.14099705485292277
  (1, 116)	0.287018794233038
  (1, 30)	0.3592437525794593
  (2, 19)	0.47076089769887963
  (2, 190)	0.37213959204021657
  (2, 23)	0.4532299081908088
  (2, 3)	0.43840766775893425
  (2, 150)	0.4922170947919313
  (3, 190)	0.2603388153927634
  (3, 163)	0.6157862496027687
  :	:
  (486, 132)	0.22890187766290873
  (486, 15)	0.21294770270438973
  (486, 31)	0.3305499300906309
  (486, 102)	0.3987414234337518
  (486, 155)	0.37871025369339556
  (486, 42)	0.2835943807180891
  (486, 2)	0.2835943807180891
  (486, 84)	0.357835817484

In [74]:
# Train and evaluate an SVM with a linear kernel on the TF-IDF features.
classifier_linear = svm.SVC(kernel='linear')

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; wall-clock time.time() can jump and may be too
# coarse for millisecond-scale timings.
t0 = time.perf_counter()
classifier_linear.fit(dataset.X_train, dataset.y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(dataset.X_test)
t2 = time.perf_counter()
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

# With output_dict=True the report is a nested dict; the per-class entries are
# keyed by the string form of the integer labels ('0' = Down, '1' = Up).
report = classification_report(dataset.y_test, prediction_linear, output_dict=True)

print('Accuracy: ', report['accuracy'])
print('Up: ', report['1'])
print('Down: ', report['0'])

Training time: 0.012994s; Prediction time: 0.002003s
Accuracy:  0.4715447154471545
Up:  {'precision': 0.4727272727272727, 'recall': 0.41935483870967744, 'f1-score': 0.4444444444444444, 'support': 62}
Down:  {'precision': 0.47058823529411764, 'recall': 0.5245901639344263, 'f1-score': 0.49612403100775193, 'support': 61}
