In [1]:
import os
# Step up one directory level. Presumably this makes the project root the
# working directory so that `data_importer` and the relative `pickles/` paths
# used below resolve -- TODO confirm against the repository layout.
# NOTE: the change is relative, so re-running this cell moves up one more level.
os.chdir(os.pardir)

In [2]:
import data_importer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
import pickle

# SVM Trainer
This notebook trains a support vector machine (SVM) on text that has been converted to numeric features by a token vectorizer.

## Step 1. Importing a training set
An SVM is a supervised learning algorithm, so it needs an annotated training set. By default, the sentiment-annotated news sentences provided by Levenberg, Pulman, Moilanen, Simpson, and Roberts (2014) are used, and only the rows whose `Confidence` value exceeds 0.90 are kept. However, this can be changed to any other annotated training set.

In [3]:
# Load the annotated training data, then keep only the rows whose
# 'Confidence' value is strictly greater than 0.90.
# NOTE(review): assumes import_nonfarm_data() returns a pandas DataFrame -- confirm.
annotated = data_importer.import_nonfarm_data()
is_confident = annotated['Confidence'] > 0.90
data = annotated[is_confident]

## Step 2. Vectorizing the training set
Next, we vectorize our data using the TF-IDF vectorizer from sklearn.

In [4]:
# TF-IDF settings, collected in one place so they are easy to tune.
# min_df is left at the library default (an earlier `min_df = 5` setting is disabled).
tfidf_settings = dict(
    max_df=0.8,          # ignore terms found in more than 80% of the sentences
    sublinear_tf=True,   # scale term frequency as 1 + log(tf)
    use_idf=True,        # apply inverse-document-frequency weighting
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and IDF weights and vectorize the training sentences in one pass.
train_vectors = vectorizer.fit_transform(data['Sentence'])

## Step 3. Training the SVM
Now that we have our vectorized training set, we can use it to train the SVM. The kernel and other parameters of the SVM can be modified to obtain different results.

In [5]:
# Fit a linear-kernel support vector classifier on the TF-IDF vectors,
# using the 'Label' column as the target.
classifier_linear = svm.SVC(kernel='linear')

# Measure the fit duration with perf_counter(): it is a monotonic,
# high-resolution clock meant for elapsed-time measurement, whereas
# time.time() can jump if the system clock is adjusted mid-run.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, data['Label'])
t1 = time.perf_counter()
training_time = t1 - t0  # elapsed seconds
print("SVM trained in", training_time, "seconds.")

SVM trained in 11.559664964675903 seconds.


## Step 4. Storing the vectorizer and SVM
Finally, we store both the SVM and the vectorizer as pickles. This way, they can be used later by sentiment analysis models.

In [6]:
# Persist the trained classifier and the fitted vectorizer as separate pickles.
# The classifier was fit on vectors produced by this vectorizer, so the two
# files should be loaded and used together.
# The `with` blocks guarantee each file is flushed and closed even if
# pickle.dump raises part-way through.
# NOTE: open(..., 'wb') does not create directories; 'pickles/' must already
# exist relative to the current working directory.
# NOTE: pickles can execute arbitrary code on load -- only unpickle these files
# from a trusted location.
with open('pickles/svm_classifier', 'wb') as outfile:
    pickle.dump(classifier_linear, outfile)
with open('pickles/vectorizer', 'wb') as outfile:
    pickle.dump(vectorizer, outfile)