In [1]:
import pandas as pd
from string import punctuation
import numpy as np
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch import optim
import json

In [65]:
with open("sentiment labelled sentences/sentiment.txt") as f:
    reviews = f.read()
    
data = pd.DataFrame([review.split('\t') for review in reviews.split('\n')])

data.columns = ['Review','Sentiment']

data = data.sample(frac=1)

In [66]:
data.head()

Unnamed: 0,Review,Sentiment
1771,Great time - family dinner on a Sunday night.,1
394,(My mother and brother had to do this)When I s...,1
2448,"The screen size is big, key pad lit well enoug...",1
1263,"Good service, very clean, and inexpensive, to ...",1
2959,"However, the keypads are so tinny that I somet...",0


In [85]:
def split_words_reviews(data):
    text = list(data['Review'].values)
    clean_text = []
    for t in text:
        clean_text.append(t.translate(str.maketrans('', '', punctuation)).lower().rstrip())
    tokenized = [word_tokenize(x) for x in clean_text]
    all_text = []
    for tokens in tokenized:
        for t in tokens:
            all_text.append(t)
    return tokenized, set(all_text)

reviews, vocab = split_words_reviews(data)

reviews[0]

['great', 'time', 'family', 'dinner', 'on', 'a', 'sunday', 'night']

In [86]:
def create_dictionaries(words):
    word_to_int_dict = {w:i+1 for i, w in enumerate(words)}
    int_to_word_dict = {i:w for w, i in word_to_int_dict.items()}
    return word_to_int_dict, int_to_word_dict

word_to_int_dict, int_to_word_dict = create_dictionaries(vocab)

int_to_word_dict

{1: 'tonight',
 2: 'attractive',
 3: 'magic',
 4: 'acknowledged',
 5: 'inexpensive',
 6: 'receiving',
 7: 'expert',
 8: 'nervous',
 9: 'muffled',
 10: 'appalling',
 11: 'snow',
 12: 'shined',
 13: 'waste',
 14: 'treo',
 15: '20th',
 16: 'loop',
 17: 'address',
 18: 'explains',
 19: 'story',
 20: 'sequels',
 21: 'garfield',
 22: 'classywarm',
 23: 'loneliness',
 24: 'take',
 25: 'styling',
 26: 'plan',
 27: 'charismatic',
 28: 'scot',
 29: 'thrillers',
 30: 'groundbreaking',
 31: 'shirley',
 32: 'noticed',
 33: 'hay',
 34: 'started',
 35: 'wit',
 36: 'artless',
 37: 'prefer',
 38: 'hollywood',
 39: 'cheaply',
 40: 'carlys',
 41: 'starts',
 42: '510',
 43: 'handset',
 44: 'dogs',
 45: 'conflict',
 46: 'comparablypriced',
 47: 'transcendant',
 48: 'woo',
 49: 'im',
 50: 'slow',
 51: 'wornout',
 52: 'wanted',
 53: 'me',
 54: 'brunch',
 55: 'contstruct',
 56: 'coziness',
 57: 'die',
 58: 'mainly',
 59: 'instruction',
 60: 'rightthe',
 61: 'lucio',
 62: 'idealogical',
 63: 'move',
 64: 'wait

In [None]:
with open('word_to_int_dict.json', 'w') as fp:
    json.dump(word_to_int_dict, fp)

In [87]:
print(np.max([len(x) for x in reviews]))
print(np.mean([len(x) for x in reviews]))

70
11.783666666666667


In [88]:
def pad_text(tokenized_reviews, seq_length):
    
    reviews = []
    
    for review in tokenized_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append(['']*(seq_length-len(review)) + review)
        
    return np.array(reviews)

padded_sentences = pad_text(reviews, seq_length = 50)

padded_sentences[0]

array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', 'great', 'time', 'family',
       'dinner', 'on', 'a', 'sunday', 'night'], dtype='<U33')

In [90]:
int_to_word_dict[0] = ''
word_to_int_dict[''] = 0

In [91]:
encoded_sentences = np.array([[word_to_int_dict[word] for word in review] for review in padded_sentences])

encoded_sentences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0, 3869,  643,
       4472, 3286, 3868, 3218, 1261,  505])

In [33]:
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.8):
        super().__init__()
        
        self.n_vocab = n_vocab  
        self.n_layers = n_layers 
        self.n_hidden = n_hidden 
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                          
        embedded_words = self.embedding(input_words)
        lstm_out, h = self.lstm(embedded_words) 
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden)
        fc_out = self.fc(lstm_out)                  
        sigmoid_out = self.sigmoid(fc_out)              
        sigmoid_out = sigmoid_out.view(batch_size, -1)  
        
        sigmoid_last = sigmoid_out[:, -1]
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):
        
        device = "cpu"
        weights = next(self.parameters()).data
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
        
        return h


In [34]:
n_vocab = len(word_to_int_dict)
n_embed = 50
n_hidden = 100
n_output = 1
n_layers = 2

net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)

In [13]:
labels = np.array([int(x) for x in data['Sentiment'].values])

train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2

total = len(encoded_sentences)
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = torch.Tensor(encoded_sentences[:train_cutoff]).long(), torch.Tensor(labels[:train_cutoff]).long()
valid_x, valid_y = torch.Tensor(encoded_sentences[train_cutoff : valid_cutoff]).long(), torch.Tensor(labels[train_cutoff : valid_cutoff]).long()
test_x, test_y = torch.Tensor(encoded_sentences[valid_cutoff:]).long(), torch.Tensor(labels[valid_cutoff:])

train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)

batch_size = 1

train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

In [35]:
print_every = 2400
step = 0
n_epochs = 3
clip = 5  
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

In [36]:
for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1  
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            net.eval()
            valid_losses = []

            for v_inputs, v_labels in valid_loader:
                       
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

  app.launch_new_instance()


Epoch: 1/3 Step: 2400 Training Loss: 0.2800 Validation Loss: 0.7575
Epoch: 2/3 Step: 4800 Training Loss: 0.0129 Validation Loss: 0.6676
Epoch: 3/3 Step: 7200 Training Loss: 0.0673 Validation Loss: 0.6420


In [99]:
# torch.save(net.state_dict(), 'model.pkl')

In [113]:
net = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net.load_state_dict(torch.load('model.pkl'))

<All keys matched successfully>

In [125]:
net.eval()
test_losses = []
num_correct = 0

for inputs, labels in test_loader:

    test_output, test_h = net(inputs)
    loss = criterion(test_output, labels)
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))    

Test Loss: 0.6598
Test Accuracy: 0.76


In [40]:
def preprocess_review(review):
    review = review.translate(str.maketrans('', '', punctuation)).lower().rstrip()
    tokenized = word_tokenize(review)
    if len(tokenized) >= 50:
        review = tokenized[:50]
    else:
        review= ['0']*(50-len(tokenized)) + tokenized
    
    final = []
    
    for token in review:
        try:
            final.append(word_to_int_dict[token])
            
        except:
            final.append(word_to_int_dict[''])
        
    return final

In [114]:
def predict(review):
    net.eval()
    words = np.array([preprocess_review(review)])
    padded_words = torch.from_numpy(words)
    pred_loader = DataLoader(padded_words, batch_size = 1, shuffle = True)
    for x in pred_loader:
        output = net(x)[0].item()
    
    msg = "This is a positive review." if output >= 0.5 else "This is a negative review."
    print(msg)
    print('Prediction = ' + str(output))

In [123]:
predict("The film was good")

This is a positive review.
Prediction = 0.917565107345581


In [124]:
predict("It was not good")

This is a negative review.
Prediction = 0.2955784499645233
