# Sentiment Analysis with Word2Vec and SVM

### Loading data

In [1]:
import pickle

# NOTE: pickle can execute arbitrary code while loading; only load files
# from a source you trust.
# Context managers make sure each file handle is closed once it has been read
# (the previous bare open() calls left the handles open).
with open('./data/train_samples.pkl', 'rb') as samples_file:
    train_samples = pickle.load(samples_file)
with open('./data/train_labels.pkl', 'rb') as labels_file:
    train_labels = pickle.load(labels_file)

# Hold out the tail of the training set for validation: the first 85% of the
# examples are kept for training, the remaining ones are used for validation.
# The split is positional (no shuffling is performed here).
split_percentage = .85
split_threshold = int(split_percentage * len(train_labels))
val_samples = train_samples[split_threshold:]
train_samples = train_samples[:split_threshold]
val_labels = train_labels[split_threshold:]
train_labels = train_labels[:split_threshold]

# Show the first training review as a sanity check.
print(train_samples[0])

if you're the type of person who goes on the submarine ride every time you visit disneyland , you're going to love the hunt for red october . 
you'll also love the film if you enjoy cat and mouse military tactics , or if you're a sean connery or alec baldwin fan , or if you admired director john mctiernan's earlier films , die hard and predator . 
in fact , the only people likely to be disappointed with the hunt for red october are those who have read the book , since films almost never live up to the novels which inspired them . 
the hunt for red october is an epic thriller , adapted from tom clancy's best selling novel . 
set in an era before glasnost , the movie revolves around a top-secret soviet submarine , called the red october . 
the nuclear sub has a revolutionary propulsion system , which makes the vessel silent and allows it to escape sonar detection . 
the red october embarks on its maiden voyage under the command of captain marko ramius , played by sean connery . 
ramius h

### Loading the Word2Vec model

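Download the pretrained Google News word2vec model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing and unzip it into the current directory, so that the file `./GoogleNews-vectors-negative300.bin` is available to the next cell.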

In [2]:
import gensim
# Read the word vectors from the binary word2vec file in the current directory.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin',
    binary=True,
)



### Preprocessing our input

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
import numpy as np

def preprocess(review):
    """Represent a review as the average of its tokens' word vectors.

    The review is split into sentences and then into word tokens. Every token
    found in the module-level word2vec ``model`` contributes its vector to the
    average; tokens missing from the model's vocabulary are skipped.

    Args:
        review: The raw review text (a string).

    Returns:
        A 1-D float array with ``model.vector_size`` entries. If none of the
        tokens is in the vocabulary (e.g. an empty review), the all-zero
        vector is returned instead of dividing by zero and producing NaNs.
    """
    representation = np.zeros(model.vector_size)
    n_words = 0
    for sentence in sent_tokenize(review):
        for word in word_tokenize(sentence):
            try:
                # Indexing the keyed vectors raises KeyError for a token that
                # is not in the vocabulary; only that case is ignored, so any
                # other error (or a keyboard interrupt) still surfaces.
                vector = model[word]
            except KeyError:
                continue
            representation += vector
            n_words += 1

    if n_words == 0:
        # Nothing to average: keep the zero vector rather than returning NaNs.
        return representation
    return representation / n_words

# Turn every review into its averaged word-vector representation, giving one
# row per review for the training and validation splits.
train_data = np.array(list(map(preprocess, train_samples)))
val_data = np.array(list(map(preprocess, val_samples)))

[nltk_data] Downloading package punkt to /home/tonio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  if sys.path[0] == '':


### Training our model

In [4]:
from sklearn.svm import SVC
# Fit a support vector classifier with an RBF kernel on the training split.
classifier = SVC(kernel='rbf')
classifier.fit(train_data, train_labels)

# Accuracy is the fraction of predictions equal to the true labels; it is
# reported on the training split first, then on the validation split.
train_accuracy = np.mean(classifier.predict(train_data) == train_labels)
val_accuracy = np.mean(classifier.predict(val_data) == val_labels)
print(train_accuracy)
print(val_accuracy)

0.8207612456747405
0.8


### Training a second model

In [5]:
# Same experiment with a linear kernel, for comparison with the RBF model.
classifier = SVC(kernel='linear')
classifier.fit(train_data, train_labels)

# Print the accuracy (share of correct predictions) on the training split,
# then on the validation split.
for features, labels in ((train_data, train_labels), (val_data, val_labels)):
    print(np.mean(classifier.predict(features) == labels))

0.7709342560553634
0.7843137254901961


### Submitting the best model

In [6]:
# NOTE: pickle can execute arbitrary code while loading; only load files
# from a source you trust.
# Context managers make sure each file handle is closed once it has been read
# (the previous bare open() calls left the handles open).
with open('./data/test_samples.pkl', 'rb') as samples_file:
    test_samples = pickle.load(samples_file)
test_data = np.array([preprocess(review) for review in test_samples])

with open('./data/test_labels.pkl', 'rb') as labels_file:
    test_labels = pickle.load(labels_file)

# Retrain the RBF-kernel model on the training and validation splits combined,
# so the final classifier is fitted on all the labelled non-test data.
classifier = SVC(kernel = 'rbf')
classifier.fit(
    np.concatenate([train_data, val_data], axis = 0),
    np.concatenate([train_labels, val_labels], axis = 0)
)

# Test accuracy: fraction of test predictions equal to the true labels.
print(np.mean(classifier.predict(test_data) == test_labels))

  if sys.path[0] == '':


0.81
