In [0]:
import numpy as np

# read data from reviews and labels file.
with open('reviews.txt', 'r') as f:
    reviews_ = f.readlines()
with open('labels.txt', 'r') as f:
    
    labels = f.readlines()

In [2]:
# One of the most important task is to visualize data before starting with any ML task. 
for i in range(5):
    print(labels[i] + "\t: " + reviews_[i][:100] + "...")

positive
	: bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life...
negative
	: story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terr...
positive
	: homelessness  or houselessness as george carlin stated  has been an issue for years but never a plan...
negative
	: airport    starts as a brand new luxury    plane is loaded up with valuable paintings  such belongin...
positive
	: brilliant over  acting by lesley ann warren . best dramatic hobo lady i have ever seen  and love sce...




We can see there are a lot of punctuation marks like fullstop(.), comma(,), new line (\n) and so on and we need to remove it. 

Here is a list of all the punctuation marks that needs to be removed 
```
(!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~)
```


In [0]:
# Make everything lower case to make the whole dataset even. 
reviews = ''.join(reviews_).lower()

In [0]:
# complete the function below to remove punctuations and save it in no_punct_text
from nltk.tokenize import RegexpTokenizer
def text_without_punct_returnwords(reviews):
    tokenizer = RegexpTokenizer(r'\w+')
    words=tokenizer.tokenize(reviews)
    return words

In [0]:
# split the formatted no_punct_text into words
words = text_without_punct_returnwords(reviews)

In [6]:
# once you are done print the ten words that should yield the following output
words[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

In [7]:
# print the total length of the words
len(words)

6020196

In [8]:
# Total number of unique words
len(set(words))

74072


Next step is to create a vocabulary. This way every word is mapped to an integer number.
```
Example: 1: hello, 2: I, 3: am, 4: Robo and so on...
```


In [0]:
# Lets create a vocab out of it

# feel free to use this import 
from collections import Counter

## Let's keep a count of all the words and let's see how many words are there. 
def word_count(words):
    return Counter(words)

counts=word_count(words)

In [10]:
print (counts['wonderful'])

print (counts['bad'])
counts=counts.most_common()

1658
9308


In [11]:
# define a vocabulary for the words
def vocabulary(counts):
  vocab_to_int=dict()
  vocab=[]
  i=1
  for key,number in counts:
    vocab.append(key)
    vocab_to_int[f'{key}']=i
    i+=1
  return vocab_to_int,vocab
vocab_int,vocab = vocabulary(counts)
print(len(vocab_int))
print(vocab[1])

74072
and


In [0]:
# map each vocab word to an integer. Also, start the indexing with 1 as we will use 
# '0' for padding and we dont want to mix the two.
vocab_to_int = vocab_int

In [13]:
# verify if the length is same and if 'and' is mapped to the correct integer value.
print(len(vocab_to_int))
print(vocab_to_int['and'])

74072
2


Let's see what positve words in positive reviews we have and what we have in negative reviews. 

In [0]:
positive_counts = Counter()
negative_counts = Counter()

In [0]:
for i in range(len(reviews_)):
    if(labels[i] == 'positive\n'):
        for word in reviews_[i].split(" "):
            positive_counts[word] += 1
    else:
        for word in reviews_[i].split(" "):
            negative_counts[word] += 1

In [16]:
labels[:10]

['positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n',
 'positive\n',
 'negative\n']

In [17]:
positive_counts.most_common()[:10]

[('', 537968),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235)]

In [18]:
negative_counts.most_common()[:10]

[('', 548962),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327)]

The above is just to show the most common words in the positive and negative sentences. However, there are a lot of unnecessary words like `the`, `a`, `was`, and so on. Can you find a way to show the relevant words and not these words? 

```
Stop Words removal or normalizing each term.
```

In [19]:
words[:30]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such',
 'as',
 'teachers',
 'my',
 'years',
 'in',
 'the',
 'teaching',
 'profession',
 'lead',
 'me']

In [20]:
[vocab_to_int[word] for word in words[:30]]

[21025,
 308,
 6,
 3,
 1050,
 207,
 8,
 2138,
 32,
 1,
 171,
 57,
 15,
 49,
 81,
 5785,
 44,
 382,
 110,
 140,
 15,
 5194,
 60,
 154,
 9,
 1,
 4975,
 5852,
 475,
 71]

In [21]:
print(vocab_to_int['bromwell'])
import re
no_punct_text = re.sub(r'[^\w\s]','',reviews)
reviews_split = no_punct_text.split('\n')

21025


## One hot encoding

We need one hot encoding for the labels. Think of a reason why we need one hot encoded labels for classes?

* Write the one hot encoding logic in the `one_hot` function.
* we use 1 for positive label and 0 for negative label.
* Save all the values in the `encoded_labels` function.

In [0]:
# 1 for positive label and 0 for negative label
def one_hot(labels):
  val=[]
  for i in labels:
    if i=='positive\n':
      val.append(1)
    else:
      val.append(0)
  return val
encoded_labels = one_hot(labels)

In [0]:
#print the length of your label and uncomment next line only if the encoded_labels size is 25001.
# If you dont get the intuition behind this step, print encoded_labels to see it.
#encoded_labels = encoded_labels[:25000]

In [24]:
len(encoded_labels)
print(encoded_labels[:10])

[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]


In [0]:
reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

In [26]:
# This step is to see if any review is empty and we remove it. Otherwise the input will be all zeroes.
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [27]:
print('Number of reviews before removing outliers: ', len(reviews_ints))

## remove any reviews/labels with zero length from the reviews_ints list.

# get indices of any reviews with length 0
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


In [28]:
len(encoded_labels)

25000

## Task 4: Padding the data

>A function that returns an array `features` that contains the padded data, of a standard size, that we'll pass to the network. 
* The data should come from `review_ints`, since we want to feed integers to the network. 
* Each row should be `seq_length` elements long. 
* For reviews shorter than `seq_length` words, **left pad** with 0s. That is, if the review is `['best', 'movie', 'ever']`, `[117, 18, 128]` as integers, the row will look like `[0, 0, 0, ..., 0, 117, 18, 128]`. 
* For reviews longer than `seq_length`, use only the first `seq_length` words as the feature vector.

As a small example, if the `seq_length=10` and an input review is: 
```
[117, 18, 128]
```
The resultant, padded sequence should be: 

```
[0, 0, 0, 0, 0, 0, 0, 117, 18, 128]
```

**Your final `features` array should be a 2D array, with as many rows as there are reviews, and as many columns as the specified `seq_length`.**

In [0]:
#  logic for padding the data
def pad_features(reviews_ints, seq_length):
    new_list=[]
    for i in reviews_ints: 
        if len(i)<=seq_length:
            new_list.append((seq_length-len(i))*[0]+i)
        else:
            new_list.append(i[:seq_length])
    return np.array(new_list)
    

In [30]:
 

seq_length = 200

features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 30 batches 
print(features[:30,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    1   330   578    34     3   162   748  2731     9   325]
 [    9    11 10171  5305  1946   689   444    22   280   673]
 [    0     0     0     0     0     0     0     0     0

Now we have everything ready. It's time to split our dataset into `Train`, `Test` and `Validate`. 


##  Lets create train, test and val split in the ratio of 8:1:1.  


In [0]:
train_frac = 0.8
val_frac = 0.1
test_frac = 0.1


def train_test_val_split(features):
    train_x=features[:int(0.8*len(features))]
    val_x=features[int(0.8*len(features)):int(0.9*len(features))]
    test_x=features[int(0.9*len(features)):]
    return train_x,val_x,test_x
                          
def train_test_val_labels(encoded_labels):
    train_y=encoded_labels[:int(0.8*len(encoded_labels))]
    val_y=encoded_labels[int(0.8*len(encoded_labels)):int(0.9*len(encoded_labels))]
    test_y=encoded_labels[int(0.9*len(encoded_labels)):]
    return np.array(train_y),np.array(val_y),np.array(test_y)
    

train_x, val_x, test_x = train_test_val_split(features)
train_y, val_y, test_y = train_test_val_labels(encoded_labels)

In [32]:
## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


## DataLoaders and Batching

After creating training, test, and validation data, we can create DataLoaders for this data by following two steps:
1. Create a known format for accessing our data, using [TensorDataset](https://pytorch.org/docs/stable/data.html#) which takes in an input set of data and a target set of data with the same first dimension, and creates a dataset.
2. Create DataLoaders and batch our training, validation, and test Tensor datasets.

```
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
train_loader = DataLoader(train_data, batch_size=batch_size)
```

This is an alternative to creating a generator function for batching our data into full batches.

### A generator function for the dataset. 

In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets for train, test and val
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50 

# make sure to SHUFFLE your training data. Keep Shuffle=True.
train_loader = DataLoader(train_data,batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data,batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data,batch_size=batch_size, shuffle=True)

In [34]:
# obtain one batch of training data and label. 
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[   10,    84,   329,  ...,    41,   235,     9],
        [   11,    18,     6,  ...,     1,   870,  1264],
        [ 1281,     1,   112,  ...,  3656, 11307,  1163],
        ...,
        [ 5017,    14,  6662,  ...,     4,     1,   512],
        [   11,    18,   166,  ...,  2115,  2194,     6],
        [    0,     0,     0,  ...,    52,    72,   560]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 1])


In [35]:
# Check if GPU is available.
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.



LSTM  Model

Now create a class named SentimentLSTM with `n_layers=2`, and rest all hyperparameters same as before. Also, create an embedding layer and feed the output of the embedding layer as input to the LSTM model. Dont forget to add a regularizer (dropout) layer after the LSTM layer with p=0.4 to prevent overfitting. 

In [0]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """
    The LSTM model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.4):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentLSTM, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.4)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
          hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

## Instantiate the network

Here, we'll instantiate the network. First up, defining the hyperparameters.

* `vocab_size`: Size of our vocabulary or the range of values for our input, word tokens.
* `output_size`: Size of our desired output; the number of class scores we want to output (pos/neg).
* `embedding_dim`: Number of columns in the embedding lookup table; size of our embeddings.
* `hidden_dim`: Number of units in the hidden layers of our LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* `n_layers`: Number of LSTM layers in the network. Typically between 1-3

In [37]:
# Instantiate the model with these hyperparameters
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)


print(net)

SentimentLSTM(
  (embedding): Embedding(74073, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.4)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [0]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


### Loss Functions
We are using `BCELoss (Binary Cross Entropy Loss)` since we have two output classes. 


In [39]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.605741... Val Loss: 0.657327
Epoch: 1/4... Step: 200... Loss: 0.568148... Val Loss: 0.570506
Epoch: 1/4... Step: 300... Loss: 0.593185... Val Loss: 0.612960
Epoch: 1/4... Step: 400... Loss: 0.463784... Val Loss: 0.624486
Epoch: 2/4... Step: 500... Loss: 0.525115... Val Loss: 0.547588
Epoch: 2/4... Step: 600... Loss: 0.437125... Val Loss: 0.488235
Epoch: 2/4... Step: 700... Loss: 0.495678... Val Loss: 0.471928
Epoch: 2/4... Step: 800... Loss: 0.413771... Val Loss: 0.475361
Epoch: 3/4... Step: 900... Loss: 0.360896... Val Loss: 0.453201
Epoch: 3/4... Step: 1000... Loss: 0.475974... Val Loss: 0.476717
Epoch: 3/4... Step: 1100... Loss: 0.263890... Val Loss: 0.432978
Epoch: 3/4... Step: 1200... Loss: 0.246858... Val Loss: 0.425624
Epoch: 4/4... Step: 1300... Loss: 0.408047... Val Loss: 0.519153
Epoch: 4/4... Step: 1400... Loss: 0.156555... Val Loss: 0.459965
Epoch: 4/4... Step: 1500... Loss: 0.117863... Val Loss: 0.486934
Epoch: 4/4... Step: 1600... Loss: 


Now we write a prediction function to predict the output for the test set created and calculate the accuracy of the test set.

In [40]:
def predict(test_loader):
      
  # Get test data loss and accuracy
  test_losses = [] # track loss
  num_correct = 0

  # init hidden state
  h = net.init_hidden(batch_size)

  net.eval()
  # iterate over test data
  for inputs, labels in test_loader:

      # Creating new variables for the hidden state, otherwise
      # we'd backprop through the entire training history
      h = tuple([each.data for each in h])

      if(train_on_gpu):
          inputs, labels = inputs.cuda(), labels.cuda()
      
      # get predicted outputs
      output, h = net(inputs, h)
      
      # calculate loss
      test_loss = criterion(output.squeeze(), labels.float())
      test_losses.append(test_loss.item())
      
      # convert output probabilities to predicted class (0 or 1)
      pred = torch.round(output.squeeze())  # rounds to the nearest integer
      
      # compare predictions to true label
      correct_tensor = pred.eq(labels.float().view_as(pred))
      correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
      num_correct += np.sum(correct)


  # accuracy over all test data
  test_acc = num_correct/len(test_loader.dataset)
  print("Test accuracy: {:.3f}".format(test_acc*100))
predict(test_loader)

Test accuracy: 80.840
