In [1]:
import re
from collections import Counter
import pickle
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [2]:
# Read the raw reviews and their sentiment labels (one entry per line).
# The pattern is a raw string: '\s' inside a plain string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
whitespace_run = re.compile(r'\s+')


def read_normalized_lines(path):
    """Return every line of `path` with whitespace runs collapsed to one space and both ends stripped."""
    with open(path, "r") as lines:
        return [whitespace_run.sub(' ', line).strip() for line in lines]


reviews = read_normalized_lines("../input/reviews.txt")
labels = read_normalized_lines("../input/labels.txt")

# sanity check: show the last review together with its label
print(reviews[-1])
print(labels[-1])

this is one of the dumbest films i ve ever seen . it rips off nearly ever type of thriller and manages to make a mess of them all . br br there s not a single good line or character in the whole mess . if there was a plot it was an afterthought and as far as acting goes there s nothing good to say so ill say nothing . i honestly cant understand how this type of nonsense gets produced and actually released does somebody somewhere not at some stage think oh my god this really is a load of shite and call it a day . its crap like this that has people downloading illegally the trailer looks like a completely different film at least if you have download it you haven t wasted your time or money don t waste your time this is painful .
negative


In [3]:
# build the character and label vocabularies, then persist them for later reuse
char_counter = Counter()
for text in reviews:
    # a str is iterated character by character, so no list() conversion is needed
    char_counter.update(text)

# ids start at 1 and follow decreasing character frequency (0 stays unused here)
character_keys = {}
for rank, (char, _count) in enumerate(char_counter.most_common(), start=1):
    character_keys[char] = rank
print(character_keys)

label_keys = dict(positive=0, negative=1)
print(label_keys)

# one pickle file per mapping
for path, mapping in (('char_dict.pkl', character_keys), ('label_dict.pkl', label_keys)):
    with open(path, 'wb') as handle:
        pickle.dump(mapping, handle)

{' ': 1, 'e': 2, 't': 3, 'a': 4, 'i': 5, 'o': 6, 's': 7, 'n': 8, 'r': 9, 'h': 10, 'l': 11, 'd': 12, 'c': 13, 'm': 14, 'u': 15, 'f': 16, 'g': 17, 'y': 18, 'b': 19, 'w': 20, 'p': 21, '.': 22, 'v': 23, 'k': 24, 'j': 25, 'x': 26, 'z': 27, 'q': 28}
{'positive': 0, 'negative': 1}


In [4]:
# the saved dictionaries can be restored from their pickle files at any time
# (pickle.load executes arbitrary code: only use it on files this notebook wrote itself)

with open('char_dict.pkl', 'rb') as char_file, open('label_dict.pkl', 'rb') as label_file:
    character_keys = pickle.load(char_file)
    label_keys = pickle.load(label_file)

for restored in (character_keys, label_keys):
    print(restored)

{' ': 1, 'e': 2, 't': 3, 'a': 4, 'i': 5, 'o': 6, 's': 7, 'n': 8, 'r': 9, 'h': 10, 'l': 11, 'd': 12, 'c': 13, 'm': 14, 'u': 15, 'f': 16, 'g': 17, 'y': 18, 'b': 19, 'w': 20, 'p': 21, '.': 22, 'v': 23, 'k': 24, 'j': 25, 'x': 26, 'z': 27, 'q': 28}
{'positive': 0, 'negative': 1}


In [5]:
# encode every review as a list of character ids and every label as its class id
reviews_tk = [[character_keys[char] for char in text] for text in reviews]
labels_tk = [label_keys[name] for name in labels]

# show the last encoded review and its encoded label
print(reviews_tk[-1])
print(labels_tk[-1])

[3, 10, 5, 7, 1, 5, 7, 1, 6, 8, 2, 1, 6, 16, 1, 3, 10, 2, 1, 12, 15, 14, 19, 2, 7, 3, 1, 16, 5, 11, 14, 7, 1, 5, 1, 23, 2, 1, 2, 23, 2, 9, 1, 7, 2, 2, 8, 1, 22, 1, 5, 3, 1, 9, 5, 21, 7, 1, 6, 16, 16, 1, 8, 2, 4, 9, 11, 18, 1, 2, 23, 2, 9, 1, 3, 18, 21, 2, 1, 6, 16, 1, 3, 10, 9, 5, 11, 11, 2, 9, 1, 4, 8, 12, 1, 14, 4, 8, 4, 17, 2, 7, 1, 3, 6, 1, 14, 4, 24, 2, 1, 4, 1, 14, 2, 7, 7, 1, 6, 16, 1, 3, 10, 2, 14, 1, 4, 11, 11, 1, 22, 1, 19, 9, 1, 19, 9, 1, 3, 10, 2, 9, 2, 1, 7, 1, 8, 6, 3, 1, 4, 1, 7, 5, 8, 17, 11, 2, 1, 17, 6, 6, 12, 1, 11, 5, 8, 2, 1, 6, 9, 1, 13, 10, 4, 9, 4, 13, 3, 2, 9, 1, 5, 8, 1, 3, 10, 2, 1, 20, 10, 6, 11, 2, 1, 14, 2, 7, 7, 1, 22, 1, 5, 16, 1, 3, 10, 2, 9, 2, 1, 20, 4, 7, 1, 4, 1, 21, 11, 6, 3, 1, 5, 3, 1, 20, 4, 7, 1, 4, 8, 1, 4, 16, 3, 2, 9, 3, 10, 6, 15, 17, 10, 3, 1, 4, 8, 12, 1, 4, 7, 1, 16, 4, 9, 1, 4, 7, 1, 4, 13, 3, 5, 8, 17, 1, 17, 6, 2, 7, 1, 3, 10, 2, 9, 2, 1, 7, 1, 8, 6, 3, 10, 5, 8, 17, 1, 17, 6, 6, 12, 1, 3, 6, 1, 7, 4, 18, 1, 7, 6, 1, 5, 11, 11, 1, 7, 

In [6]:
# sort reviews by sequence length (no longer needed)
# labels_tk = [l for r, l in sorted(zip(reviews_tk, labels_tk), key=lambda pair: -len(pair[0]))]
# reviews_tk = sorted(reviews_tk, key=lambda r: -len(r))

# for review, label in zip(reviews_tk[:15], labels_tk[:15]):
#     print(len(review), label)

In [7]:
# now we pad the reviews: each one is truncated to max_length ids and then
# left-padded with 0 so that every sequence ends up exactly max_length long
max_length = 2000
reviews_lengths = []

for position in range(len(reviews_tk)):
    clipped = reviews_tk[position][:max_length]
    kept = len(clipped)
    # remember how many real (non-padding) ids the review contributes
    reviews_lengths.append(kept)
    reviews_tk[position] = [0] * (max_length - kept) + clipped

# spot-check the last 15 entries: padded length, real length, label
for padded, label, kept in zip(reviews_tk[-15:], labels_tk[-15:], reviews_lengths[-15:]):
    print(len(padded), kept, label)

2000 272 1
2000 783 0
2000 1168 1
2000 978 0
2000 1432 1
2000 636 0
2000 1556 1
2000 333 0
2000 832 1
2000 2000 0
2000 1663 1
2000 966 0
2000 1242 1
2000 929 0
2000 736 1


In [8]:
# make training, validation and test splits
# validation is taken right after the training slice, test from the end of the list;
# the two only stay disjoint when there are at least train_count + 2 * val_test_count reviews
train_count = 20000
val_test_count = 2500

train_part = slice(None, train_count)
val_part = slice(train_count, train_count + val_test_count)
test_part = slice(-val_test_count, None)

train_x, train_y = np.array(reviews_tk[train_part]), np.array(labels_tk[train_part])
val_x, val_y = np.array(reviews_tk[val_part]), np.array(labels_tk[val_part])
test_x, test_y = np.array(reviews_tk[test_part]), np.array(labels_tk[test_part])

# shapes are reported in the order train, test, validation
for features, targets in ((train_x, train_y), (test_x, test_y), (val_x, val_y)):
    print(features.shape, targets.shape)

(20000, 2000) (20000,)
(2500, 2000) (2500,)
(2500, 2000) (2500,)


In [9]:
# wrap each numpy split in a TensorDataset (torch.from_numpy shares memory with the array)
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

batch_size = 128
# only the training batches are shuffled; validation and test keep their order
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [10]:
# let's now convert the tokenized lists to torch Tensors (no longer necessary)

# reviews_tk = torch.from_numpy(np.array(reviews_tk))
# labels_tk = torch.from_numpy(np.array(labels_tk))
    
# print(reviews_tk[-1][-10:]) # only prints the last 10 chars, but the tensor is very big, as big as the one shown above
# print(labels_tk[-1])

In [11]:
# create model
vocab_size = len(character_keys) + 1 # the + 1 is to account for the 0 that is used for padding

class SentimentModel(nn.Module):
    """Character-level sentiment classifier.

    Pipeline: embedding (32-d, index 0 is the padding index) -> strided Conv1d
    (32 -> 256 channels, kernel 3, stride 2) with ReLU -> 2-layer GRU (hidden
    size 512) -> dropout -> linear layer producing 2 class scores.

    The GRU's final hidden state is stored on ``self.hidden`` and fed back in as
    the initial state of the next ``forward`` call. Set ``model.hidden = None``
    before a call to start from a zero state (the training and evaluation loops
    in this notebook do so for every batch).
    """

    def __init__(self):
        super().__init__()

        self.embedding_layer = nn.Embedding(vocab_size, 32, padding_idx=0)
        self.cnn1 = nn.Conv1d(32, 256, 3, padding=1, stride=2)
        self.rnn = nn.GRU(256, 512, batch_first=True, num_layers=2, dropout=0.5)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(512, 2)

        # hidden state carried between forward calls; None means "start from zeros"
        self.hidden = None

    def forward(self, x):
        """Return log-probabilities over the 2 classes for a batch of id sequences.

        Args:
            x: integer tensor of character ids, laid out as (batch, sequence).

        Returns:
            Tensor of shape (batch, 2) holding log-softmax scores.
        """
        embeddings = self.embedding_layer(x)
        # Conv1d wants (batch, channels, sequence); transpose in, then back out for the batch_first GRU
        convolved = F.relu(self.cnn1(embeddings.transpose(1, 2)).transpose(2, 1))
        gru_out, self.hidden = self.rnn(convolved, self.hidden)
        # Only the last time step feeds the classifier, so select it before dropout
        # and the linear layer instead of computing both for every step and discarding
        # all but one. The notebook left-pads its inputs, so the last step is real text.
        last_step = gru_out[:, -1, :]
        out = self.fc(self.dropout(last_step))
        out = torch.log_softmax(out, dim=1)

        return out

In [12]:
# instantiate the network; the bare `model` expression on the last line makes
# the notebook display the module's layer summary as the cell output
model = SentimentModel()
model

SentimentModel(
  (embedding_layer): Embedding(29, 32, padding_idx=0)
  (cnn1): Conv1d(32, 256, kernel_size=(3,), stride=(2,), padding=(1,))
  (rnn): GRU(256, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [13]:
# let's train our model for `epochs` epochs, validating after each one

epochs = 10

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

# use the GPU when there is one, otherwise fall back to the CPU instead of crashing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

n_train_batches = len(train_loader)
for e in range(1, epochs+1):
    model.train()
    # Metrics are accumulated per sample (not per batch) so that a smaller
    # final batch does not get the same weight as a full one.
    train_loss_sum = 0.0
    train_correct = 0
    train_seen = 0
    # The loop variables are deliberately NOT called `reviews`/`labels`: those
    # names hold the raw text lists loaded at the top of the notebook.
    for batch, (batch_reviews, batch_labels) in enumerate(train_loader, start=1):
        batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        # start every batch from a fresh GRU state (the model keeps its last state)
        model.hidden = None

        pred = model(batch_reviews)
        loss = criterion(pred, batch_labels)

        loss.backward()
        # clip the global gradient norm to 4 before the update
        nn.utils.clip_grad_norm_(model.parameters(), 4)
        optimizer.step()

        n_samples = batch_labels.size(0)
        # criterion returns the batch mean, so scale it back to a per-sample sum
        train_loss_sum += loss.item() * n_samples
        train_correct += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
        train_seen += n_samples

        print(f"EPOCH {e} ({batch}/{n_train_batches}) - loss {train_loss_sum/train_seen:.4f} - acc {train_correct/train_seen:.4f}", end='\r')

    valid_loss_sum = 0.0
    valid_correct = 0
    valid_seen = 0
    model.eval()
    with torch.no_grad():
        for batch_reviews, batch_labels in valid_loader:
            batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)
            model.hidden = None

            pred = model(batch_reviews)
            loss = criterion(pred, batch_labels)

            n_samples = batch_labels.size(0)
            valid_loss_sum += loss.item() * n_samples
            valid_correct += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
            valid_seen += n_samples

    # max(..., 1) keeps the report from dividing by zero if a loader is empty
    train_loss = train_loss_sum / max(train_seen, 1)
    train_accuracy = train_correct / max(train_seen, 1)
    valid_loss = valid_loss_sum / max(valid_seen, 1)
    valid_accuracy = valid_correct / max(valid_seen, 1)

    print(f"EPOCH {e} - loss {train_loss:.4f} - acc {train_accuracy:.4f} - val_loss {valid_loss:.4f} - val_acc {valid_accuracy:.4f}")
    

EPOCH 1 - loss 0.6942 - acc 0.5217 - val_loss 0.6795 - val_acc 0.5646
EPOCH 2 - loss 0.6812 - acc 0.5609 - val_loss 0.6635 - val_acc 0.5774
EPOCH 3 - loss 0.6710 - acc 0.5920 - val_loss 0.7016 - val_acc 0.5660
EPOCH 4 - loss 0.5949 - acc 0.6811 - val_loss 0.5445 - val_acc 0.7243
EPOCH 5 - loss 0.4712 - acc 0.7720 - val_loss 0.4320 - val_acc 0.7980
EPOCH 6 - loss 0.4058 - acc 0.8128 - val_loss 0.4401 - val_acc 0.7950
EPOCH 7 - loss 0.3625 - acc 0.8377 - val_loss 0.3889 - val_acc 0.8217
EPOCH 8 - loss 0.3211 - acc 0.8629 - val_loss 0.4209 - val_acc 0.8069
EPOCH 9 - loss 0.2719 - acc 0.8859 - val_loss 0.4194 - val_acc 0.8287
EPOCH 10 - loss 0.2144 - acc 0.9137 - val_loss 0.4232 - val_acc 0.8153


In [14]:
# evaluate the trained model on the held-out test split
# use the GPU when there is one, otherwise fall back to the CPU instead of crashing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Metrics are accumulated per sample (not per batch) so that a smaller final
# batch does not get the same weight as a full one.
test_loss_sum = 0.0
test_correct = 0
test_seen = 0
with torch.no_grad():
    # The loop variables are deliberately NOT called `reviews`/`labels`: those
    # names hold the raw text lists loaded at the top of the notebook.
    for batch_reviews, batch_labels in test_loader:
        batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)
        # start every batch from a fresh GRU state (the model keeps its last state)
        model.hidden = None

        pred = model(batch_reviews)
        loss = criterion(pred, batch_labels)

        n_samples = batch_labels.size(0)
        # criterion returns the batch mean, so scale it back to a per-sample sum
        test_loss_sum += loss.item() * n_samples
        test_correct += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
        test_seen += n_samples

# max(..., 1) keeps the report from dividing by zero if the loader is empty
test_loss = test_loss_sum / max(test_seen, 1)
test_accuracy = test_correct / max(test_seen, 1)

print(f"Final model -> loss: {test_loss:.4f} - accuracy: {test_accuracy:.4f}")
    

Final model -> loss: 0.4226 - accuracy: 0.8255


In [15]:
# persist only the learned parameters (the state_dict), not the module object itself
torch.save(model.state_dict(), 'model.pt')