In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

## Visualising the data

In [2]:
with open('reviews.txt') as f:
    reviews = f.read()

with open('labels.txt') as f:
    labels = f.read()

In [3]:
print(reviews[:2000])
print()
print(labels[:20])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

We see that the reviews are saperated by new lines so are labels

## Data Preprocessing
We need data to be in proper format to feed into the neural network. So encode the words into a unique number and we will also clean a bit
1. First we remove the punctuations
2. Next we remove new line characters and combine everything into one
3. Next we will create a list of words

In [4]:
#removing punctuations
from string import punctuation
print('Punctuations: ', punctuation)

reviews = reviews.lower()
all_text = ''.join([ch for ch in reviews if ch not in punctuation])

Punctuations:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
#joining by new lines, combining and splitting by words
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)
all_text = all_text.split()

In [6]:
all_text[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

## Encode the data
Here we convert each unique word into an integer so that we can feed into the neural network. We use `Counter` so that the most frequent word will be given the lowest integer.

In [7]:
from collections import Counter

vocab2int = {}
i = 1
for x, y in Counter(all_text).most_common():
    vocab2int[x] = i
    i+=1

In [8]:
reviews_ints = []
for sent in reviews_split:
    a = []
    for word in sent.split():
        a.append(vocab2int[word])
        
    reviews_ints.append(a)

### Test code for above implementation

In [9]:
print('Unique words: ', len(vocab2int))
print()

print('Tokenized view:\n', reviews_ints[:1])

Unique words:  74072

Tokenized view:
 [[21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]]


## Encoding the labels
We have only two labels. Those must be encoded. We will convert positive label to 1 and negative label to 0

In [10]:
labels_encoded = []
labels_split = labels.split('\n')

for x in labels_split:
    if x == 'positive':
        labels_encoded.append(1)
    else:
        labels_encoded.append(0)

labels_encoded[:3]

[1, 0, 1]

You can verify these outputs with the starting outputted values

## Removing the outliers
We want to make sure that our model will not get some outliers i.e., values that are extremely big or too small.

For other values, we want to make sure that all the reviews should be of same lenth.

In [11]:
review_lengths = Counter([len(x) for x in reviews_ints])
print('Reviews with zero length: ', review_lengths[0])
print('Reviews with max length: ', max(review_lengths))

Reviews with zero length:  1
Reviews with max length:  2514


As we see that we have one review that has zero length and also we see that the maximum length is very large. So we want to remove the reviews with shorter length and truncate the reviews with longer length so that our model will learn more effectively

In [12]:
print('Number of reviews before removing the outliers: ', len(reviews_ints))

#removing the reviews with zero length
non_zero_idx = [ii for ii, rev in enumerate(reviews_ints) if len(rev)!=0]

reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels_encoded = np.array([labels_encoded[ii] for ii in non_zero_idx])

print('Number of reviews after removing the outliers: ', len(reviews_ints))

Number of reviews before removing the outliers:  25001
Number of reviews after removing the outliers:  25000


## Padding and truncating reviews
Now we want to truncate the reviews that are too long to some specified length and the reviews that are shorter than specified length then we will pad them with 0s. The good sequence length will be 20

In [13]:
def pad_features(reviews_ints, seq_length):
    '''
    This function will return an array of reviews that are either truncated to seq_length
    or padded with 0s
    '''
    features = []
    
    for rev in reviews_ints:
        if len(rev) >= seq_length:
            features.append(rev[:seq_length])
        else:
            features.append([0]*(seq_length-len(rev)) + rev)
    
    return np.array(features)

### Test code to test the above function

In [14]:
seq_length = 200

features = pad_features(reviews_ints, seq_length)

#features should have same rows as reviews_ints
print('Rows in features: {} -- Rows in reviews_ints: {}'.format(len(features), len(reviews_ints)))

#features should have same number of columns as sequence length
print('Columns in features: {} -- Sequence length: {}'.format(len(features[0]), seq_length))

Rows in features: 25000 -- Rows in reviews_ints: 25000
Columns in features: 200 -- Sequence length: 200


In [15]:
print(features[:30, :10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    1   330   578    34     3   162   748  2731     9   325]
 [    9    11 10171  5305  1946   689   444    22   280   673]
 [    0     0     0     0     0     0     0     0     0

## Splitting into training, validation and test sets
We will split the data into three parts. We will declare a `split_frac` that will tell how much of the data should be in training data and the remaining data is split into two halves, one for validation and other for testing

First I have generated random indexes of size 80% of length of features. Then I have used these indexes to extract traaining samples and out of remaining samples, I have taken 50% to validation and other 50% test set.

In [16]:
split_frac = 0.8

train_idx = np.random.choice(np.arange(len(features)), int(len(features)*split_frac), replace=False)

train_x = np.array([features[ii] for ii in train_idx])
train_y = np.array([labels_encoded[ii] for ii in train_idx])

valid_reviews = np.array([features[ii] for ii in range(len(features)) if ii not in train_idx])
valid_labels  = np.array([labels_encoded[ii] for ii in range(len(features)) if ii not in train_idx])

valid_x = valid_reviews[:int(valid_reviews.shape[0]/2)]
test_x = valid_reviews[int(valid_reviews.shape[0]/2):]

valid_y = valid_labels[:int(valid_reviews.shape[0]/2)]
test_y = valid_labels[int(valid_reviews.shape[0]/2):]

In [17]:
#printing the shape of splitted data
print('Feature Shapes\n',
      'Train set:', train_x.shape,
      '\nValid set:', valid_x.shape,
      '\nTest set:', test_x.shape)

Feature Shapes
 Train set: (20000, 200) 
Valid set: (2500, 200) 
Test set: (2500, 200)


## DataLoaders and Batching
Now we have x,y that is inputs and targets, we convert them to dataset `TensorDataset` class, next we can use this dataset to pass to `DataLoader` class and get batches of data

In [18]:
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

#defining the batch size
batch_size = 50

trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
validloader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
testloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [19]:
#obtaining one batch of training data
sample_x, sample_y = next(iter(trainloader))

print('Shape of sample_x: ', sample_x.shape)
print('sample_x: \n', sample_x)
print()
print('Shape of sample_y: ', sample_y.shape)
print('sample_y: \n', sample_y)

Shape of sample_x:  torch.Size([50, 200])
sample_x: 
 tensor([[    0,     0,     0,  ...,   288,  3278,    45],
        [    0,     0,     0,  ...,  1169,  3526,  5876],
        [    0,     0,     0,  ...,    55,    55,   799],
        ...,
        [   11,   392,    20,  ...,   276,   752,    17],
        [    0,     0,     0,  ...,    58,   541,    48],
        [    0,     0,     0,  ...,   194, 10895,   130]], dtype=torch.int32)

Shape of sample_y:  torch.Size([50])
sample_y: 
 tensor([1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
        0, 0], dtype=torch.int32)


## Defining the sentiment network
We have to define the network. Here we will use the `embedding layer` as input to the `lstm` because we have a lot of words and using one-hot encoding to these would be inefficient.
1. We defined the embedding layer to convert the words tokens(integers) to embeddings of some specific size.
2. Defined a lstm layers that takes inputs from the embedding layer
3. Define a fully connected layer that maps lstm outputs to our desired output size
4. Use sigmoid activation function, which turns outputs in the value of range 0-1

In [20]:
#checking availability of gpu
cuda_available = torch.cuda.is_available()
if cuda_available:
    print('Training on GPU')
else:
    print('Training on CPU')

Training on CPU


In [33]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_p=0.2):
        super().__init__()
        
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_p)
        self.fc = nn.Linear(hidden_dim, output_size)
        
        self.dropout = nn.Dropout(p=drop_p)
        
    
    def forward(self, x, hidden):
        
        batch_size = x.shape[0]
        
        x = self.embedding(x)
        output, hidden = self.lstm(x,hidden)
        
        #stack up lstm outputs
        output = output.contiguous().view(-1, self.hidden_dim)
        
        output = self.dropout(output)
        
        
        output = self.fc(output)
        output = F.sigmoid(output)
        
        #reshape to be batch first
        output = output.view(batch_size, -1)
        output = output[:, -1] #get the last batch
        
        
        return output, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        if cuda_available:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

## Instantiate the model
Here, we'll instantiate the network. First up, defining the hyperparameters.
1. vocab_size: Size of our vocabulary or the range of values for our input, word tokens
2. output_size: Size of our desired output; the number of class scores we want to output (pos/neg)
3. embedding_dim: Number of columns in the embedding lookup table; size of our embeddings.
4. hidden_dim: Number of units in the hidden layers of our LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
5. n_layers: Number of LSTM layers in the network. Typically between 1-3

In [34]:
#instantiate the model with hyper parameters
vocab_size = len(vocab2int)+1 #+1 for the padding 0
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

model = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(model)

SentimentRNN(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2)
)


## Training
Now it's time to train our model. 
This time we use different loss function for only one output called `Binary Cross Entropy Loss`.

The hyper parameters for training are:
1. lr: learning rate
2. epochs: the number of epochs
3. clip: the maximum gradient value to clip at

In [44]:
def train(model,trainloader, validloader, batch_size,clip=5, print_every=50, epochs=1, lr=0.01):
    
    if cuda_available:
        model = model.cuda()
    
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    counter = 0 
    for e in range(epochs):
        
        hidden = model.init_hidden(batch_size)
        for x, y in trainloader:
            counter += 1
            
            if cuda_available:
                x, y = x.cuda(), y.cuda()
            
            hidden = tuple([each.data for each in hidden])
            output, hidden = model(x.type(torch.LongTensor), hidden)
            loss = criterion(output.squeeze(), y.float())
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            
            if counter%print_every==0:
                
                val_h = model.init_hidden(batch_size)
                
                model.eval()
                
                for x, y in validloader:
                    
                    val_h = tuple([each.data for each in val_h])
                    
                    if cuda_available:
                        x, y = x.cuda(), y.cuda()
                        
                    output, hidden = model(x.type(torch.LongTensor), val_h)
                    loss_valid = criterion(output.squeeze(), y.float())
                    
                print('Epoch: {}/{}'.format(e+1, epochs),
                        'Training Loss: {}'.format(loss.item()),
                        'Validation loss: {}'.format(loss_valid.item()))
                model.train()
                break
        

In [45]:
train(model, trainloader, validloader, batch_size)



Epoch: 1/1 Training Loss: 0.6673436164855957 Validation loss: 0.6653851866722107
