In [None]:
import re
from collections import Counter
import pickle
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

In [None]:
# Read the raw corpus. Each line of reviews.txt becomes one entry of `reviews`
# and each line of labels.txt one entry of `labels`; whitespace runs are
# collapsed to a single space and the ends are stripped.
# The pattern is a raw string: '\s' inside a plain string literal is an
# invalid escape sequence (a warning today, an error in future Python).
reviews = []
labels = []

with open("../input/reviews.txt", "r") as lines:
    for line in lines:
        reviews.append(re.sub(r'\s+', ' ', line).strip())

with open("../input/labels.txt", "r") as lines:
    for line in lines:
        labels.append(re.sub(r'\s+', ' ', line).strip())

# Later cells slice both lists with the same indices, so a length mismatch
# would silently pair reviews with the wrong labels.
assert len(reviews) == len(labels), (
    f"reviews/labels length mismatch: {len(reviews)} vs {len(labels)}"
)

print(reviews[-1])
print(labels[-1])

In [None]:
# Build the character vocabulary and the label mapping, then pickle both so
# they can be reloaded later.
char_counter = Counter()
for text in reviews:
    # Iterating a string yields its characters, so each one is counted.
    char_counter.update(text)

# Ids are handed out from 1 upwards in order of decreasing frequency;
# id 0 is never assigned to a character.
character_keys = {}
for rank, (symbol, _occurrences) in enumerate(char_counter.most_common(), start=1):
    character_keys[symbol] = rank
print(character_keys)

label_keys = {'positive': 0, 'negative': 1}
print(label_keys)

with open('char_dict.pkl', 'wb') as char_file:
    pickle.dump(character_keys, char_file)

with open('label_dict.pkl', 'wb') as label_file:
    pickle.dump(label_keys, label_file)

In [None]:
# Reload the two lookup tables from disk.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that this notebook wrote itself.

with open('char_dict.pkl', 'rb') as char_file:
    character_keys = pickle.load(char_file)

with open('label_dict.pkl', 'rb') as label_file:
    label_keys = pickle.load(label_file)

print(character_keys, label_keys, sep='\n')

In [None]:
# Encode every review as a list of character ids and every label as its
# class id, using the lookup tables built above.
reviews_tk = [[character_keys[symbol] for symbol in text] for text in reviews]
labels_tk = [label_keys[name] for name in labels]

print(reviews_tk[-1])
print(labels_tk[-1])

In [None]:
# sort reviews by sequence length (no longer needed)
# labels_tk = [l for r, l in sorted(zip(reviews_tk, labels_tk), key=lambda pair: -len(pair[0]))]
# reviews_tk = sorted(reviews_tk, key=lambda r: -len(r))

# for review, label in zip(reviews_tk[:15], labels_tk[:15]):
#     print(len(review), label)

In [None]:
# Bring every review to exactly max_length tokens: keep at most the first
# max_length tokens, then left-pad with 0 so the text sits at the end of the
# row. The length after truncation and before padding goes to reviews_lengths.
max_length = 2000
reviews_lengths = []

for position in range(len(reviews_tk)):
    clipped = reviews_tk[position][:max_length]
    reviews_lengths.append(len(clipped))
    filler = [0] * (max_length - len(clipped))
    reviews_tk[position] = filler + clipped

# Sanity check on the last 15 rows: padded length, original length, label.
for row, tag, kept in zip(reviews_tk[-15:], labels_tk[-15:], reviews_lengths[-15:]):
    print(len(row), kept, tag)

In [None]:
# Make training, validation and test splits.
# The first train_count rows are for training, the next val_test_count rows
# for validation, and the last val_test_count rows for testing.
train_count = 20000
val_test_count = 2500
val_end = train_count + val_test_count

# With fewer rows than this, the "last val_test_count rows" would overlap the
# validation (or even training) rows and leak data into the test metrics.
assert len(reviews_tk) >= val_end + val_test_count, (
    f"need at least {val_end + val_test_count} samples for disjoint splits, "
    f"got {len(reviews_tk)}"
)
assert len(reviews_tk) == len(labels_tk), "reviews and labels are misaligned"

# dtype is pinned to int64: NumPy's default integer is platform dependent
# (int32 on Windows before NumPy 2), and nn.NLLLoss only accepts int64
# (Long) class-index targets.
train_x = np.array(reviews_tk[:train_count], dtype=np.int64)
train_y = np.array(labels_tk[:train_count], dtype=np.int64)

val_x = np.array(reviews_tk[train_count:val_end], dtype=np.int64)
val_y = np.array(labels_tk[train_count:val_end], dtype=np.int64)

test_x = np.array(reviews_tk[-val_test_count:], dtype=np.int64)
test_y = np.array(labels_tk[-val_test_count:], dtype=np.int64)

print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)
print(val_x.shape, val_y.shape)

In [None]:
# Wrap each split's (inputs, targets) arrays in a TensorDataset and expose it
# through a DataLoader. Only the training loader shuffles.
batch_size = 128

train_data = TensorDataset(*map(torch.from_numpy, (train_x, train_y)))
valid_data = TensorDataset(*map(torch.from_numpy, (val_x, val_y)))
test_data = TensorDataset(*map(torch.from_numpy, (test_x, test_y)))

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# let's now convert the tokenized lists to torch Tensors (no longer necessary)

# reviews_tk = torch.from_numpy(np.array(reviews_tk))
# labels_tk = torch.from_numpy(np.array(labels_tk))
    
# print(reviews_tk[-1][-10:]) # only prints the last 10 chars, but the tensor is very big, as big as the one shown above
# print(labels_tk[-1])

In [None]:
# create model
# Character ids start at 1 and id 0 is the padding token, so the embedding
# table needs one more row than there are distinct characters.
vocab_size = 1 + len(character_keys)

class SentimentModel(nn.Module):
    """Character-level sentiment classifier.

    Pipeline: embedding -> strided Conv1d + ReLU (halves the sequence length)
    -> 2-layer GRU -> dropout -> linear layer on the last timestep ->
    log-softmax over the two classes.

    Args:
        num_embeddings: number of rows in the embedding table (vocabulary
            size including the padding id 0). When omitted, the notebook-level
            ``vocab_size`` global is used, so ``SentimentModel()`` keeps
            working exactly as before.

    Attributes:
        hidden: GRU hidden state returned by the most recent forward pass and
            fed into the next one. Callers must set it back to ``None``
            before every independent batch; otherwise state (and its autograd
            graph) from the previous batch is carried over.
    """

    def __init__(self, num_embeddings=None):
        super().__init__()

        if num_embeddings is None:
            # Backward-compatible default: read the notebook global.
            num_embeddings = vocab_size

        self.embedding_layer = nn.Embedding(num_embeddings, 256, padding_idx=0)
        self.cnn1 = nn.Conv1d(256, 256, 3, padding=1, stride=2)
        self.rnn = nn.GRU(256, 512, batch_first=True, num_layers=2, dropout=0.5)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(512, 2)

        self.hidden = None

    def forward(self, x):
        """Return log-probabilities of shape (batch, 2).

        Args:
            x: integer tensor of token ids, shape (batch, seq_len).
        """
        embeddings = self.embedding_layer(x)
        # Conv1d wants (batch, channels, length); the GRU wants
        # (batch, length, channels) because batch_first=True.
        convolved = F.relu(self.cnn1(embeddings.transpose(1, 2)).transpose(2, 1))
        rnn_out, self.hidden = self.rnn(convolved, self.hidden)
        # Only the last timestep feeds the classifier, so select it before
        # dropout and the linear layer rather than running both over every
        # timestep and discarding all but one row.
        last_step = rnn_out[:, -1, :]
        out = self.fc(self.dropout(last_step))
        return torch.log_softmax(out, dim=1)

In [None]:
# Instantiate the network. The bare `model` expression on the last line makes
# the notebook display the module's repr as the cell output.
model = SentimentModel()
model

In [None]:
# Train the model for `epochs` epochs. A running loss/accuracy line is
# rewritten after every batch, and a summary line with validation metrics is
# printed at the end of each epoch.

epochs = 10

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of crashing on machines without CUDA. The model is moved before the
# optimizer is built so the optimizer sees the parameters on their final device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

for e in range(1, epochs+1):
    # Metrics are accumulated per sample, not per batch: the last batch is
    # usually smaller, so averaging batch means would over-weight it.
    total_loss = 0.0
    total_correct = 0
    seen = 0
    model.train()
    # Loop variables are deliberately NOT called `reviews`/`labels`: those
    # names hold the raw corpus lists used by the earlier cells.
    for batch, (batch_reviews, batch_labels) in enumerate(train_loader, start=1):
        batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        # Every batch is independent: drop the GRU state left by the last one.
        model.hidden = None

        pred = model(batch_reviews)
        loss = criterion(pred, batch_labels)

        loss.backward()
        # Clip the global gradient norm to 4 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), 4)
        optimizer.step()

        n_samples = batch_labels.size(0)
        # criterion returns the batch mean; scale back to a per-sample sum.
        total_loss += loss.item() * n_samples
        total_correct += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
        seen += n_samples

        print(f"EPOCH {e} ({batch}/{len(train_loader)}) - loss {total_loss/seen:.4f} - acc {total_correct/seen:.4f}", end='\r') 

    valid_loss = 0.0
    valid_correct = 0
    valid_seen = 0
    model.eval()
    with torch.no_grad():
        for batch_reviews, batch_labels in valid_loader:
            batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)
            model.hidden = None

            pred = model(batch_reviews)
            loss = criterion(pred, batch_labels)

            n_samples = batch_labels.size(0)
            valid_loss += loss.item() * n_samples
            valid_correct += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
            valid_seen += n_samples

    # max(..., 1) keeps the summary printable even if a loader is empty.
    train_n = max(seen, 1)
    valid_n = max(valid_seen, 1)
    print(f"EPOCH {e} - loss {total_loss/train_n:.4f} - acc {total_correct/train_n:.4f} - val_loss {valid_loss/valid_n:.4f} - val_acc {valid_correct/valid_n:.4f}")
    
    

In [None]:
# Evaluate the trained model on the held-out test split.
# test_loss accumulates the per-sample loss sum and test_accuracy the number
# of correct predictions; both are divided by the sample count when printed,
# so a smaller final batch does not skew the averages.
test_loss = 0.0
test_accuracy = 0
test_seen = 0

# Use the GPU when available, otherwise run on the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
with torch.no_grad():
    # Loop variables avoid the names `reviews`/`labels`, which hold the raw
    # corpus lists used by the earlier cells.
    for batch_reviews, batch_labels in test_loader:
        batch_reviews, batch_labels = batch_reviews.to(device), batch_labels.to(device)
        # Every batch is independent: drop the GRU state left by the last one.
        model.hidden = None

        pred = model(batch_reviews)
        loss = criterion(pred, batch_labels)

        n_samples = batch_labels.size(0)
        # criterion returns the batch mean; scale back to a per-sample sum.
        test_loss += loss.item() * n_samples
        test_accuracy += (torch.argmax(pred, dim=1).view(-1) == batch_labels.view(-1)).sum().item()
        test_seen += n_samples

# max(..., 1) keeps the summary printable even if the loader is empty.
test_n = max(test_seen, 1)
print(f"Final model -> loss: {test_loss / test_n:.4f} - accuracy: {test_accuracy / test_n:.4f}")
    

In [None]:
# Persist the model's state_dict (not the module object itself) to model.pt.
torch.save(model.state_dict(), 'model.pt')