# Sentiment Analysis with Word2Vec and SVM

### Loading data

In [1]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# The context managers make sure both file handles are closed after reading.
with open('./data/train_samples.pkl', 'rb') as samples_file:
    train_samples = pickle.load(samples_file)
with open('./data/train_labels.pkl', 'rb') as labels_file:
    train_labels = pickle.load(labels_file)

# Hold out the tail of the data for validation: the first 85% of the samples
# stay in the training split, the remainder becomes the validation split.
split_percentage = .85
split_threshold = int(split_percentage * len(train_labels))
val_samples = train_samples[split_threshold:]
train_samples = train_samples[:split_threshold]
val_labels = train_labels[split_threshold:]
train_labels = train_labels[:split_threshold]

### Loading the Word2Vec model

Download the pretrained word2vec model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing and unzip it into the current directory, so that `./GoogleNews-vectors-negative300.bin` exists.

In [2]:
import gensim
# Load the pretrained vectors, which are stored in the binary word2vec format.
word2vec_path = './GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary = True,
)

### Preprocessing our input

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')
import numpy as np

def preprocess(review):
    """Turn a raw review into the average of its word2vec word vectors.

    The review is split into sentences and then into tokens with NLTK. Every
    token known to the global word2vec ``model`` adds its vector to a running
    sum, which is finally divided by the number of contributing tokens.

    NOTE: this reads the global ``model``; later cells of this notebook rebind
    that name to a torch module, so run this cell before them.

    Args:
        review: Text of the review.

    Returns:
        A numpy array of shape (300,). It is all zeros when no token of the
        review is in the word2vec vocabulary (e.g. for an empty review).
    """
    representation = np.zeros(300)
    n_words = 0
    for sentence in sent_tokenize(review):
        for word in word_tokenize(sentence):
            try:
                # Indexing the KeyedVectors replaces the `word_vec` accessor,
                # which recent gensim releases deprecate; unknown tokens raise
                # KeyError.
                vector = model[word]
            except KeyError:
                # Out-of-vocabulary token: skip it, but let any other error surface.
                continue
            representation += vector
            n_words += 1

    if n_words == 0:
        # Avoid the 0/0 division, which would produce a vector of NaNs.
        return representation
    return representation / n_words

# Embed every review of both splits and store the result as float32 matrices.
train_data = np.asarray(list(map(preprocess, train_samples)), dtype = np.float32)
val_data = np.asarray(list(map(preprocess, val_samples)), dtype = np.float32)

[nltk_data] Downloading package punkt to /home/tonio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  representation += model.word_vec(word)


In [4]:
# Sanity check: one row per training review, one column per embedding dimension.
print(train_data.shape)

(1445, 300)


## PyTorch implementation

### Basic setup

In [5]:
import torch

# A hand-rolled linear classifier: 300 input features, 2 output scores.
w = torch.nn.Parameter(torch.randn(300, 2))
b = torch.nn.Parameter(torch.randn(2,))

# Forward pass on the first ten training samples only.
first_ten = slice(0, 10)
input = torch.Tensor(train_data[first_ten])
target = torch.Tensor(train_labels[first_ten]).long()
output = input @ w + b
print(output)

loss_function = torch.nn.CrossEntropyLoss()
loss = loss_function(output, target)
print(loss)

# The gradient of `b` is printed right before and right after the backward pass.
print(b.grad)
loss.backward()
print(b.grad)

tensor([[-1.2717, -1.6983],
        [-1.5204, -1.6406],
        [-1.6709, -1.9722],
        [-1.8916, -1.7740],
        [-1.5462, -2.1345],
        [-1.9890, -2.0022],
        [-1.8241, -2.2094],
        [-1.9009, -1.7700],
        [-1.5751, -1.8321],
        [-1.6853, -1.9322]], grad_fn=<AddBackward0>)
tensor(0.7706, grad_fn=<NllLossBackward>)
None
tensor([ 0.1515, -0.1515])


### Optimization

In [6]:
# Fresh, randomly initialised parameters of the linear classifier.
w = torch.nn.Parameter(torch.randn(300, 2))
b = torch.nn.Parameter(torch.randn(2,))
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([w, b], lr = 1e-2)

batch_size = 64
n_train = len(train_labels)
for epoch_n in range(100):
    # Walk over the training set in consecutive, unshuffled mini-batches.
    for sample_n in range(0, n_train, batch_size):
        batch_slice = slice(sample_n, sample_n + batch_size)
        input = torch.Tensor(train_data[batch_slice])
        target = torch.Tensor(train_labels[batch_slice]).long()

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        output = input @ w + b
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

In [7]:
# Score the validation set with the trained parameters; no gradients are needed.
with torch.no_grad():
    val_scores = (torch.Tensor(val_data) @ w + b).numpy()
# The predicted class is the index of the highest score of each row.
val_predictions = val_scores.argmax(axis = 1)
print(np.mean(val_predictions == val_labels))

0.8235294117647058


### Introducing modules

In [8]:
class Model(torch.nn.Module):
    """Linear classifier mapping 300 input features to 2 class scores."""

    def __init__(self):
        super().__init__()

        # Stored as Parameters so that `parameters()` hands them to the optimizer.
        self.w = torch.nn.Parameter(torch.randn(300, 2))
        self.b = torch.nn.Parameter(torch.randn(2,))

    def forward(self, input):
        """Return the class scores `input @ w + b`."""
        projected = input @ self.w
        return projected + self.b
    
model = Model()
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)

batch_size = 64
n_train = len(train_labels)
for epoch_n in range(100):
    # Consecutive, unshuffled mini-batches over the whole training set.
    for sample_n in range(0, n_train, batch_size):
        batch_end = sample_n + batch_size
        input = torch.Tensor(train_data[sample_n:batch_end])
        target = torch.Tensor(train_labels[sample_n:batch_end]).long()

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

# Validation accuracy: fraction of reviews whose highest-scoring class is the label.
with torch.no_grad():
    val_scores = model(torch.Tensor(val_data)).numpy()
val_predictions = val_scores.argmax(axis = 1)
print(np.mean(val_predictions == val_labels))

0.8313725490196079


### Introducing layers

In [9]:
class Model(torch.nn.Module):
    """Linear classifier built from a single `torch.nn.Linear` layer."""

    def __init__(self):
        super().__init__()

        # The layer owns the weight matrix and the bias of the classifier.
        self.layer = torch.nn.Linear(in_features = 300, out_features = 2)

    def forward(self, input):
        """Return the two class scores of every row of `input`."""
        scores = self.layer(input)
        return scores
    
model = Model()
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)

# Convert the training set to tensors once; the batches below are slices of them.
train_inputs = torch.Tensor(train_data)
train_targets = torch.Tensor(train_labels).long()

batch_size = 64
for epoch_n in range(100):
    for sample_n in range(0, len(train_labels), batch_size):
        input = train_inputs[sample_n: sample_n + batch_size]
        target = train_targets[sample_n: sample_n + batch_size]

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

# Evaluate on the held-out split without tracking gradients.
with torch.no_grad():
    val_scores = model(torch.Tensor(val_data)).numpy()
val_predictions = val_scores.argmax(axis = 1)
print(np.mean(val_predictions == val_labels))

0.8392156862745098


### Sequential model

In [10]:
class Model(torch.nn.Module):
    """Two-layer perceptron: 300 features -> 256 hidden units (ReLU) -> 2 scores."""

    def __init__(self):
        super().__init__()

        # Collect the layers first, then chain them in this order.
        layers = [
            torch.nn.Linear(300, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 2),
        ]
        self.layer = torch.nn.Sequential(*layers)

    def forward(self, input):
        """Return the two class scores of every row of `input`."""
        return self.layer(input)
    
model = Model()
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

batch_size = 64
# Start offsets of the consecutive mini-batches, identical for every epoch.
batch_starts = range(0, len(train_labels), batch_size)
for epoch_n in range(100):
    for sample_n in batch_starts:
        batch_slice = slice(sample_n, sample_n + batch_size)
        input = torch.Tensor(train_data[batch_slice])
        target = torch.Tensor(train_labels[batch_slice]).long()

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

# Accuracy of the trained network on the validation split.
with torch.no_grad():
    val_scores = model(torch.Tensor(val_data)).numpy()
val_predictions = val_scores.argmax(axis = 1)
print(np.mean(val_predictions == val_labels))

0.8627450980392157


### Introducing datasets and dataloaders

In [11]:
class Model(torch.nn.Module):
    """Two-layer perceptron: 300 features -> 256 hidden units (ReLU) -> 2 scores."""

    def __init__(self):
        super().__init__()

        # Hidden layer first, output layer second, joined by a ReLU.
        hidden = torch.nn.Linear(300, 256)
        activation = torch.nn.ReLU()
        output = torch.nn.Linear(256, 2)
        self.layer = torch.nn.Sequential(hidden, activation, output)

    def forward(self, input):
        """Return the two class scores of every row of `input`."""
        scores = self.layer(input)
        return scores
    
    
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over the precomputed review embeddings and their labels."""

    def __init__(self, split):
        super().__init__()

        # Any value other than 'train' selects the validation split.
        use_train = split == 'train'
        self.data = train_data if use_train else val_data
        self.labels = train_labels if use_train else val_labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, sample_n):
        features = self.data[sample_n]
        label = self.labels[sample_n]
        return features, label
    
    
model = Model()
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

batch_size = 64

# The training loader reshuffles the samples, the validation loader keeps their order.
train_dataset = Dataset(split = 'train')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = True,
    num_workers = 10,
)
val_dataset = Dataset(split = 'val')
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = batch_size,
    num_workers = 10,
)

for epoch_n in range(100):
    for batch in train_dataloader:
        # Each batch is an (inputs, targets) pair assembled by the loader.
        input, target = batch

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

# The final accuracy is computed on the whole validation matrix in one pass.
with torch.no_grad():
    val_scores = model(torch.Tensor(val_data)).numpy()
val_predictions = val_scores.argmax(axis = 1)
print(np.mean(val_predictions == val_labels))

0.8470588235294118


### Training on GPU

In [12]:
class Model(torch.nn.Module):
    """Two-layer perceptron: 300 features -> 256 hidden units (ReLU) -> 2 scores."""

    def __init__(self):
        super().__init__()

        n_features, n_hidden, n_classes = 300, 256, 2
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(n_features, n_hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(n_hidden, n_classes),
        )

    def forward(self, input):
        """Return the two class scores of every row of `input`."""
        return self.layer(input)
    
    
class Dataset(torch.utils.data.Dataset):
    """Expose one split of the embedded reviews as (features, label) pairs."""

    def __init__(self, split):
        super().__init__()

        # 'train' picks the training arrays; every other value picks validation.
        self.data, self.labels = (
            (train_data, train_labels) if split == 'train'
            else (val_data, val_labels)
        )

    def __getitem__(self, sample_n):
        return (self.data[sample_n], self.labels[sample_n])

    def __len__(self):
        n_samples = len(self.labels)
        return n_samples
    
# Train on the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model must live on the same device as the batches it is fed.
model = Model().to(device)
loss_function = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(lr = 1e-3, params = model.parameters())

batch_size = 256
train_dataset = Dataset(split = 'train')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle = True,
    batch_size = batch_size,
    num_workers = 10,
)
val_dataset = Dataset(split = 'val')
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = batch_size,
    num_workers = 10,
)

for epoch_n in range(100):
    for batch in train_dataloader:
        input, target = batch
        # Move the batch to the device that holds the model.
        input = input.to(device)
        target = target.to(device)

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

# The predictions are moved back to the CPU before the conversion to numpy.
val_predictions = model(torch.Tensor(val_data).to(device))
val_predictions = np.argmax(val_predictions.detach().cpu().numpy(), axis = 1)
print(np.mean(val_predictions == val_labels))

0.8431372549019608


### Evaluation procedure

In [13]:
class Model(torch.nn.Module):
    """Two-layer perceptron: 300 features -> 256 hidden units (ReLU) -> 2 scores."""

    def __init__(self):
        super().__init__()

        nn = torch.nn
        self.layer = nn.Sequential(
            nn.Linear(in_features = 300, out_features = 256),
            nn.ReLU(),
            nn.Linear(in_features = 256, out_features = 2),
        )

    def forward(self, input):
        """Return the two class scores of every row of `input`."""
        logits = self.layer(input)
        return logits
    
    
class Dataset(torch.utils.data.Dataset):
    """Index-based access to the embedded reviews of one split."""

    def __init__(self, split):
        super().__init__()

        # Default to the validation split; only 'train' switches to training data.
        is_train = split == 'train'
        if not is_train:
            self.data = val_data
            self.labels = val_labels
        else:
            self.data = train_data
            self.labels = train_labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, sample_n):
        sample = self.data[sample_n]
        return sample, self.labels[sample_n]
    
# Train on the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model must live on the same device as the batches it is fed.
model = Model().to(device)
loss_function = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

batch_size = 256
train_dataset = Dataset(split = 'train')
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle = True,
    batch_size = batch_size,
    num_workers = 10,
)
val_dataset = Dataset(split = 'val')
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = batch_size,
    num_workers = 10,
)

for epoch_n in range(100):
    # Training phase: one pass over the shuffled training set.
    model.train()
    for batch in train_dataloader:
        input, target = batch
        input = input.to(device)
        target = target.to(device)

        optimizer.zero_grad()

        output = model(input)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

    # Evaluation phase: collect one correct/incorrect flag per validation sample
    # and print the resulting accuracy for this epoch.
    model.eval()
    val_acc = []
    for batch in val_dataloader:
        input, target = batch
        input = input.to(device)
        target = target.to(device)

        # No gradients are needed to score the validation batches.
        with torch.no_grad():
            output = model(input)

        val_acc += (output.argmax(1) == target).tolist()
    print(np.mean(val_acc))

0.49411764705882355
0.6941176470588235
0.7294117647058823
0.7098039215686275
0.6705882352941176
0.7372549019607844
0.7294117647058823
0.7490196078431373
0.7294117647058823
0.7686274509803922
0.7764705882352941
0.7254901960784313
0.7607843137254902
0.788235294117647
0.788235294117647
0.7803921568627451
0.7764705882352941
0.7803921568627451
0.7843137254901961
0.7803921568627451
0.7843137254901961
0.7803921568627451
0.7607843137254902
0.7725490196078432
0.7764705882352941
0.7764705882352941
0.792156862745098
0.792156862745098
0.792156862745098
0.7843137254901961
0.807843137254902
0.788235294117647
0.8
0.792156862745098
0.8
0.8
0.803921568627451
0.803921568627451
0.807843137254902
0.807843137254902
0.807843137254902
0.8235294117647058
0.8235294117647058
0.8196078431372549
0.8313725490196079
0.8274509803921568
0.8352941176470589
0.8352941176470589
0.8235294117647058
0.8196078431372549
0.8274509803921568
0.8274509803921568
0.8352941176470589
0.8470588235294118
0.8235294117647058
0.8274509803