In [None]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter, OrderedDict
import itertools

import torch 
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

import nltk
from nltk.corpus import stopwords

In [None]:
# Read the whole reviews file and the whole labels file into memory as two
# plain strings; they are split into individual entries later by clean_text.
# NOTE(review): no encoding is passed to open(), so the platform default
# applies -- confirm the files decode correctly on every target machine.
with open('deep-learning-v2-pytorch/sentiment-analysis-network/reviews.txt', 'r') as f:
    reviews = f.read()
with open('deep-learning-v2-pytorch/sentiment-analysis-network/labels.txt', 'r') as f:
    labels = f.read()

In [None]:
# clean_text receives the raw text of a file and returns a list of cleaned lines (one per review).
def clean_text(text):
    '''Normalise raw text and split it into one entry per line.

    Every character listed in ``string.punctuation`` is removed and the
    remaining text is lower-cased. The result is then split on newline
    characters. Stop words are NOT removed.

    Args:
        text: the full content of a file as a single string, with entries
            separated by newline characters.

    Returns:
        A list of cleaned strings, one per input line. Empty lines are kept
        as empty strings (a trailing newline therefore yields a final ''),
        so the positions still match the original line numbers.
    '''
    # One C-level pass to strip punctuation instead of a per-character
    # Python generator.
    without_punctuation = text.translate(str.maketrans('', '', punctuation))

    # split() already returns the list of lines; no extra copy loop needed.
    return without_punctuation.lower().split('\n')

In [None]:
# Replace the raw reviews string with the list of cleaned reviews.
# NOTE(review): not idempotent -- running this cell twice passes the list
# (not the raw string) back into clean_text.
reviews = clean_text(reviews)

In [5]:
# Labels go through the same cleaning/splitting as the reviews, giving one
# list entry per line of the labels file.
labels = clean_text(labels)

In [6]:
class ReviewEncoder:
    """Incrementally maps whitespace-separated words to integer ids.

    Ids are handed out in order of first appearance, starting at 1, and the
    vocabulary keeps growing across successive ``encode`` calls.
    """

    def __init__(self):
        self.__words_dict = {}
        self.__indexer = 1

    def word_dict(self):
        """Return the word -> id dictionary (the live object, not a copy)."""
        return self.__words_dict

    def encode(self, text):
        """Return the list of ids for the words of ``text``.

        Words not seen before are added to the vocabulary with the next
        free id before being emitted.
        """
        vocab = self.__words_dict
        ids = []
        for token in text.split():
            if token not in vocab:
                vocab[token] = self.__indexer
                self.__indexer += 1
            ids.append(vocab[token])
        return ids

    def len_dict(self):
        """Return the number of distinct words seen so far."""
        return len(self.__words_dict)

In [7]:
# Single encoder instance; it is used for the reviews and then for the
# labels, so both share one vocabulary and one id space.
encoder = ReviewEncoder()

In [8]:
# Turn every cleaned review into its list of vocabulary ids.
encoded_reviews = [encoder.encode(review) for review in reviews]

In [9]:
# Encode each label line with the same encoder as the reviews, so every
# label becomes a list of ids taken from the shared vocabulary.
encoded_labels = [encoder.encode(label) for label in labels]

In [10]:
def drop_empty_reviews(text):
    """Return a new list holding only the entries of ``text`` whose length is non-zero.

    The relative order of the remaining entries is preserved and the input
    is not modified.
    """
    return [review for review in text if len(review) != 0]

In [11]:
# Positions of the reviews that encoded to an empty list
index_to_remove = [
    position
    for position, tokens in enumerate(encoded_reviews)
    if len(tokens) == 0
]
index_to_remove

[25000]

In [12]:
# Drop the label of every review that was found to be empty, so labels and
# reviews stay aligned once the empty reviews are removed. The positions come
# from index_to_remove instead of a hard-coded 25000, so this keeps working if
# the data (and therefore the empty positions) change.
empty_positions = set(index_to_remove)
encoded_labels = [
    label
    for position, label in enumerate(encoded_labels)
    if position not in empty_positions
]

In [13]:
# Remove the empty encodings themselves; the matching label entry is deleted
# separately from encoded_labels.
encoded_reviews = drop_empty_reviews(encoded_reviews)

In [14]:
def padding_truncation(encoded_review_list, max_review_size=200):
    """Force every encoded review to exactly ``max_review_size`` ids.

    Shorter reviews are left-padded with 0 and longer reviews keep only
    their first ``max_review_size`` ids.

    Args:
        encoded_review_list: iterable of lists of integer ids.
        max_review_size: target length of every returned review
            (defaults to 200, the value previously hard-coded here).

    Returns:
        A new list of lists, each of length ``max_review_size``.

    Raises:
        ValueError: if ``max_review_size`` is smaller than 1.
    """
    if max_review_size < 1:
        raise ValueError("max_review_size must be a positive integer")

    padded_review = []
    for review in encoded_review_list:
        kept = review[:max_review_size]
        # Zero when the review already fills the window.
        padding = max_review_size - len(kept)
        padded_review.append([0] * padding + kept)

    return padded_review

In [15]:
# Left-pad with zeros / truncate so that every review has the fixed length
# set inside padding_truncation.
padded_reviews = padding_truncation(encoded_reviews)

def get_key(d, v):
    """Return the first key of ``d`` whose value equals ``v``.

    Falls back to the string '-1' when no key maps to ``v``.
    """
    for key, value in d.items():
        if value == v:
            return key
    return str(-1)


# Sanity check: decode the first padded review back into words.
rev = padded_reviews[0]
# Strip the zero padding so only real word ids remain.
rev = list(filter(lambda x: x != 0, rev))
print(rev)
d = encoder.word_dict()

print([get_key(d, c) for c in rev])

decoded_rev = ' '.join([get_key(d, c) for c in rev])
print(decoded_rev)

# NOTE(review): scratch inspection. `training` and `validation` are only
# assigned further down (in the train/validation/test split), so evaluating
# them here on a fresh kernel raises NameError -- consider removing.
training

validation

type(padded_reviews)

In [16]:
# Convert the list of padded reviews to an integer NumPy array (needed for
# torch.from_numpy later). Every review has the same length after padding,
# so this should produce a 2-D array.
padded_reviews = np.asarray(padded_reviews, dtype=int)

In [17]:
# Sanity check: padded_reviews should now be a NumPy array.
type(padded_reviews)

numpy.ndarray

In [18]:
# Sanity check: the labels are still a plain Python list at this point.
type(encoded_labels)

list

In [19]:
# Peek at the first labels: each one is a list of vocabulary ids produced by
# the shared encoder, not yet a 0/1 class.
encoded_labels[:4]

[[1482], [6782], [1482], [6782]]

In [20]:
# Flatten the per-label id lists into one flat list of ids.
encoded_labels_ = [item for sublist in encoded_labels for item in sublist]

In [21]:
# Flat label ids as an int32 array; rebinds encoded_labels (previously a
# list of lists).
encoded_labels = np.asarray(encoded_labels_, dtype='int32')

In [22]:
# Map vocabulary ids to binary classes: 1 for the positive label, 0 otherwise.
# The id is looked up from the encoder instead of hard-coding 1482, because
# the id depends on the order in which words were first seen and changes
# whenever the data or the cleaning changes.
# NOTE(review): assumes the labels file uses the word 'positive' and that
# 1482 was its id in the recorded run -- confirm against labels.txt.
positive_id = encoder.word_dict()['positive']
enc_labels = [1 if label == positive_id else 0 for label in encoded_labels]

In [23]:
# Binary 0/1 labels as an int32 array; this overwrites the vocabulary-id
# array that was stored under the same name.
encoded_labels = np.asarray(enc_labels, dtype='int32')

In [24]:
# Sequential 80% / 10% / 10% split into training, validation and test sets.
# `training` and `validation` are the two cut positions along the first axis.
training = int(0.8 * len(padded_reviews))
validation = int(training + 0.1 * len(padded_reviews))

train_x, train_y = padded_reviews[:training], encoded_labels[:training]
val_x, val_y = padded_reviews[training:validation], encoded_labels[training:validation]
test_x, test_y = padded_reviews[validation:], encoded_labels[validation:]

In [25]:
# Wrap each (features, labels) pair of NumPy arrays in a TensorDataset.
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

In [26]:
# One shuffling DataLoader per split, all with batches of 50 samples.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset=split, batch_size=50, shuffle=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [27]:
# True when a CUDA device is usable; RNN.init_hidden reads this flag to
# decide where to place the hidden state.
train_on_gpu = torch.cuda.is_available()

In [37]:
class RNN(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        num_embeddings: size of the embedding table (vocabulary size incl. padding id).
        embedding_dim: size of each embedding vector.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        drop_prob: dropout probability applied between LSTM layers.
        out_features: output size of the final linear layer.
    """

    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, drop_prob, out_features):

        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.out_features = out_features
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=drop_prob)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, out_features)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Run a batch of id sequences through the network.

        Args:
            x: integer tensor of token ids, batch dimension first.
            hidden: optional ``(h_0, c_0)`` tuple as returned by
                ``init_hidden``; ``None`` lets the LSTM start from zeros.

        Returns:
            ``(sig_out, hidden)`` where ``sig_out`` holds, per sample, the
            last sigmoid output of the last time step, and ``hidden`` is the
            LSTM's final ``(h_n, c_n)`` state.
        """
        batch_size = x.size(0)

        embedding_out = self.embedding(x)
        # Pass the supplied state through; it used to be silently ignored.
        lstm_out, hidden = self.lstm(embedding_out, hidden)

        # Stack all time steps; use the configured hidden size rather than a
        # literal 256 so other hidden sizes work too.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_size)

        lstm_out_dropout = self.dropout(lstm_out)

        fc_out = self.fc(lstm_out_dropout)

        sig_out = self.sigmoid(fc_out)

        # Regroup per sample and keep only the final output value.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state

        Returns a ``(h_0, c_0)`` tuple of zero tensors of shape
        ``(num_layers, batch_size, hidden_size)``, created with the dtype and
        device of the model's own parameters (so no global GPU flag is needed).
        '''
        weight = next(self.parameters())
        state_shape = (self.num_layers, batch_size, self.hidden_size)

        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
        


In [38]:
# Model hyperparameters.
# The encoder hands out ids 1..len_dict(), and id 0 is used for padding, so
# the embedding table needs len_dict() + 1 rows. Deriving the size from the
# encoder keeps it correct if the vocabulary changes (it was hard-coded as
# 74072 + 1).
num_embeddings = encoder.len_dict() + 1
batch_size = 50       # NOTE(review): the DataLoaders hard-code the same 50
embedding_dim = 400   # size of each word embedding vector
hidden_size = 256     # LSTM hidden state size
num_layers = 2        # stacked LSTM layers
drop_prob = 0.5       # dropout between LSTM layers
out_features = 1      # one sigmoid output per review

In [39]:
# Build the network from the hyperparameters defined above.
model = RNN(
    num_embeddings=num_embeddings,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    drop_prob=drop_prob,
    out_features=out_features,
)

In [40]:
# Display the module structure.
model

RNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [41]:
# Optimisation setup.
lr = 0.001
# Binary cross-entropy on probabilities; the model ends in a sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

epochs = 4
# Global step counter incremented once per training batch.
# NOTE(review): it is only reset here, so re-running the training cell
# without re-running this one keeps counting from the previous run.
counter = 0
# Validation loss is computed and printed every `print_every` steps.
print_every = 100
# Maximum gradient norm passed to clip_grad_norm_ in the training loop.
clip=5

In [42]:
# Move the model to the GPU only when one is available; an unconditional
# .cuda() raises on CPU-only machines even though train_on_gpu was computed
# for exactly this purpose.
if train_on_gpu:
    model.cuda()
model

RNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Train the classifier and report the validation loss every `print_every` steps.

# Follow whatever device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

model.train()

for e in range(epochs):

    for x, y in train_loader:
        counter += 1

        x, y = x.to(device).long(), y.to(device)

        # The reviews of a shuffled batch are unrelated to the previous batch,
        # so every batch starts from a zero state sized to the actual batch
        # (this also copes with a smaller final batch).
        h = model.init_hidden(x.size(0))
        model.zero_grad()

        output, _ = model(x, h)

        # view(-1) keeps a 1-D shape even for a batch of one sample.
        loss = criterion(output.view(-1), y.float())
        loss.backward()

        # Clip gradients to avoid exploding gradients in the LSTM.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:

            val_losses = []
            model.eval()
            # No gradients are needed for validation; skipping them saves
            # memory and time.
            with torch.no_grad():
                # Separate names so the training batch is not overwritten.
                for val_x, val_y in valid_loader:
                    val_x, val_y = val_x.to(device).long(), val_y.to(device)

                    val_h = model.init_hidden(val_x.size(0))

                    val_output, _ = model(val_x, val_h)
                    val_loss = criterion(val_output.view(-1), val_y.float())

                    val_losses.append(val_loss.item())
            model.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.645193... Val Loss: 0.620983
Epoch: 1/4... Step: 200... Loss: 0.682910... Val Loss: 0.618320
Epoch: 1/4... Step: 300... Loss: 0.456034... Val Loss: 0.756873
Epoch: 1/4... Step: 400... Loss: 0.704611... Val Loss: 0.686533
