In [5]:
import torch as T
import numpy as np
import torch.nn as nn
from torch.optim import Adam
import numpy as np
from torch.utils.data import TensorDataset
from torch.utils.data.dataloader import DataLoader

DATA_LOADER


In [21]:
class DATA_LOADER():
    def __init__(self):
        with open(r'/content/drive/MyDrive/dl_datasets/reviews.txt', 'r') as f:
            self.reviews = f.read()

        with open(r'/content/drive/MyDrive/dl_datasets/labels.txt', 'r') as f:
            self.labels = f.read()

        from string import punctuation

        self.reviews = self.reviews.lower()
        self.reviews = ''.join(
            [i for i in self.reviews if i not in punctuation])

        # ITS TO BE USED LATER FOR ENCODING THE REVIEWS TO INTEGERS
        self.reviews_words = self.reviews.split('\n')
        self.reviews = ' '.join(self.reviews_words)
        self.reviews = self.reviews.split()

        self.labels = self.labels.split('\n')
        self.labels = [int(i == 'positive') for i in self.labels]

        # REVIEWS CONTAINS ALL THE INDIVIDUAL WORDS. WE HAVE TO FIND THE NO. OF OCCURANCES OF THE WORDS.
        # SORT THE LIST BASED ON THE MAX OCCURANCES, I.E WORD 'THE' HAS THE HIGHEST OCCURANCES AND HENCE ITS GIVEN INDEX OF 1.
        # NOW TAKE THE REVIEW WORDS AND ENCODE THEM WITH THE INDEX VALUE.

        from collections import Counter

        # COUNT THE NO. OF OCCURANCES OF THE WORDS
        self.reviews = Counter(self.reviews)
        # SORTING THE WORD LIST BASED ON MAX OCCURANCES
        self.words = sorted(self.reviews, key=self.reviews.get, reverse=True)

        self.words_to_int = {word: ii for ii, word in enumerate(
            self.words, 1)}   # INDEXING THE WORDS IN THE SORTED ORDER
        self.encoded_review = []                                             # ENCODED VALUES FOR THE REVIEWS

        for review in self.reviews_words:
            self.encoded_review.append(
                [self.words_to_int[word] for word in review.split()])

        # TO REMOVE REVIEWS OF ZERO LENGTH I.E REVIEW AT LAST IS AN EMPTY ARRAY.

        len_of_rev = [len(i) for i in self.encoded_review]
        max_rev_length = max(len_of_rev)                                # TO FIND THE MAX LENGTH REVIEW

        non_zero_idx = [ii for ii, review in enumerate(
            self.encoded_review) if len(review) != 0]

        self.encoded_review = [self.encoded_review[ii] for ii in non_zero_idx]
        self.encoded_labels = np.array(
            [self.labels[ii] for ii in non_zero_idx], dtype=np.int64)

        # TO FEED TO THE MODEL WE NEED TO TRUNCATE OR EXTEND ALL OUR REVIEWS TO A STD LENGTH LET US TAKE MEAN LENGTH
        self.features = self.std_length(self.encoded_review, int(
            np.ceil(np.mean(len_of_rev))))
        
        self.seq_len = int(np.ceil(np.mean(len_of_rev)))

    def std_length(self, encoded_review, seq_len):
        features = np.zeros((len(encoded_review), seq_len), dtype=np.int64)
        for i, j in enumerate(encoded_review):
            ind = min(len(j), seq_len)
            features[i, -ind:] = j[:ind]
        return features

        
        #print(features.shape)
        #print(features[0], labels[0])

    def sentiment_dataloader(self):
        train_split = int(len(self.features) * 0.8)
        val_split = int(len(self.features) * 0.9)

        train_features, val_features, test_features = self.features[:train_split], \
                                                self.features[train_split:val_split], self.features[val_split:]

        train_labels, val_labels, test_labels = self.encoded_labels[:train_split], \
            self.encoded_labels[train_split:val_split], self.encoded_labels[val_split:]

        train_data = TensorDataset(T.from_numpy(train_features), T.from_numpy(train_labels))
        val_data = TensorDataset(T.from_numpy(val_features), T.from_numpy(val_labels))
        test_data = TensorDataset(T.from_numpy(test_features), T.from_numpy(test_labels))

        train_loader = DataLoader(train_data, batch_size=50, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=50, shuffle=True)
        test_loader = DataLoader(test_data, batch_size=50, shuffle=True)

        return train_loader, val_loader, test_loader

MODEL

In [8]:
class LSTM_SENTIMENT(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hid_dims, n_layers, out_dims):
        super().__init__()
        self.hid_dims = hid_dims
        self.out_dims = out_dims
        self.n_layers = n_layers

        self.drop_prob = 0.5
        self.lr = 0.001
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hid_dims, n_layers,
                            dropout=self.drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(self.drop_prob)

        self.fc = nn.Linear(self.hid_dims, self.out_dims)

        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x, h):
        
        batch_size = x.shape[0]

        embed = self.embedding(x.long()).to(self.device)
        #print(h[0].shape)
        o, h = self.lstm(embed, h)
        
        o = o[:,-1,:]
        
        o = self.dropout(o)
        
        o = self.sigmoid(self.fc(o))
        
        return o, h
    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters()).data

        hidden = (weight.new_zeros(self.n_layers, batch_size, self.hid_dims).to(self.device),
                  weight.new_zeros(self.n_layers, batch_size, self.hid_dims).to(self.device))

        return hidden

TRAINING

In [25]:
dat = DATA_LOADER()
train, val, test = dat.sentiment_dataloader()
vocab_size = len(dat.words_to_int) + 1

out_dim = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
batch_size = 50
lstm = LSTM_SENTIMENT(vocab_size, embedding_dim,
                      hidden_dim, n_layers, out_dim)
lstm.to(lstm.device)
#print(lstm)

loss_fn = nn.BCELoss()
optimizer = Adam(lstm.parameters(), lr = 0.001)

ctr = 0

print_every = 100

In [15]:
for e in range(4):
    
    hidden = lstm.init_hidden(batch_size)

    for inp, tgt in train:
        lstm.train()
        ctr += 1
        hidden = tuple([x.data for x in hidden]) 

        lstm.zero_grad()

        inp, tgt = inp.to(lstm.device), tgt.to(lstm.device)
        
        #print(hidden[0].shape, hidden[1].shape)
        out, hidden = lstm(inp, hidden)

        loss = loss_fn(out.squeeze(), tgt.float())

        loss.backward()

        nn.utils.clip_grad_norm_(lstm.parameters(), 5)

        optimizer.step()

        if ctr % print_every == 0:
            val_loss = []

            val_h = lstm.init_hidden(batch_size)
            
            lstm.eval()

            for val_x, val_y in val:
                
                val_h = tuple([x.data for x in val_h])

                val_x, val_y = val_x.to(lstm.device), val_y.to(lstm.device)

                val_out, val_h = lstm(val_x, val_h)

                val_l = loss_fn(val_out.squeeze(), val_y.float())
                val_loss.append(val_l.item())
            
            print(f'\
                EPOCH : {e:>2}\
                STEP :  {ctr:>2}\
                VAL_LOSS : {np.mean(val_loss):>8.6f}\
                LOSS : {loss.item():>8.6f}')

                EPOCH :  0                STEP :  100                VAL_LOSS : 0.649618                LOSS : 0.647455
                EPOCH :  0                STEP :  200                VAL_LOSS : 0.576907                LOSS : 0.654679
                EPOCH :  0                STEP :  300                VAL_LOSS : 0.595175                LOSS : 0.558573
                EPOCH :  0                STEP :  400                VAL_LOSS : 0.569343                LOSS : 0.562032
                EPOCH :  1                STEP :  500                VAL_LOSS : 0.494353                LOSS : 0.392687
                EPOCH :  1                STEP :  600                VAL_LOSS : 0.482486                LOSS : 0.286285
                EPOCH :  1                STEP :  700                VAL_LOSS : 0.505159                LOSS : 0.367636
                EPOCH :  1                STEP :  800                VAL_LOSS : 0.453432                LOSS : 0.378100
                EPOCH :  2              

TEST

In [16]:
test_losses = []
num_correct = 0

h = lstm.init_hidden(batch_size)

lstm.eval()

for inputs, labels in test:

    h = tuple([each.data for each in h])

    inputs, labels = inputs.to(lstm.device), labels.to(lstm.device)
    
    output, h = lstm(inputs, h)

    test_loss = loss_fn(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    pred = T.round(output.squeeze())

    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy()) 
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))

test_acc = num_correct/len(test.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.458
Test accuracy: 0.821


In [17]:
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'

In [18]:
from string import punctuation
def tokenize_review(test_review):
    test_review = test_review.lower()
    test_text = ''.join([c for c in test_review if c not in punctuation])
    test_words = test_text.split()
    test_ints = []
    test_ints.append([dat.words_to_int.get(word, 0) for word in test_words])
    return test_ints

In [22]:
def predict(model, test_review, sequence_length=241):

    model.eval()
    test_ints = tokenize_review(test_review)

    features = dat.std_length(test_ints, sequence_length)
    feature_tensor = T.from_numpy(features)
    
    batch_size = feature_tensor.shape[0]

    h = model.init_hidden(batch_size)
  
    feature_tensor = feature_tensor.to(model.device)

    output, h = model(feature_tensor, h)

    pred = T.round(output.squeeze()) 
    
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    if(pred.item()==1):
        print("Positive review!")
    else:
        print("Negative review")

In [26]:
predict(lstm, test_review_neg, dat.seq_len)

Prediction value, pre-rounding: 0.487199
Negative review
