# Sentiment Classification of Stanford Sentiment TreeBank (SST)

## Data Handling and cleaning 

In [1]:
! wget -q http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

In [2]:
!unzip -q /content/stanfordSentimentTreebank.zip

In [3]:
import os
import pandas as pd

In [4]:
sst_dir = 'stanfordSentimentTreebank'

In [5]:
sentiment_labels = pd.read_csv(
    os.path.join(sst_dir, "sentiment_labels.txt"),
     names=['phrase_ids', 'sentiment_values'], 
     sep="|",
      header=0)

sentiment_labels.head()

Unnamed: 0,phrase_ids,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [6]:
def discretize_label(label):
    if label <= 0.2: 
      return 0
    elif label <= 0.4: 
      return 1
    elif label <= 0.6: 
      return 2
    elif label <= 0.8: 
      return 3

    return 4

In [7]:
sentiment_labels['sentiment_values'] = sentiment_labels['sentiment_values'].apply(discretize_label)
sentiment_labels.head()

Unnamed: 0,phrase_ids,sentiment_values
0,0,2
1,1,2
2,2,2
3,3,2
4,4,2


In [8]:
sentence_ids = pd.read_csv(
    os.path.join(sst_dir, "datasetSentences.txt"),
     sep="\t")

sentence_ids.head()

Unnamed: 0,sentence_index,sentence
0,1,The Rock is destined to be the 21st Century 's...
1,2,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha..."


In [9]:
mapping = pd.read_csv(
    os.path.join(sst_dir, "dictionary.txt"),
     sep="|", 
     names=['phrase', 'phrase_ids'])

mapping.head()

Unnamed: 0,phrase,phrase_ids
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [10]:
train_test_split = pd.read_csv(
    os.path.join(sst_dir, "datasetSplit.txt"))

train_test_split.head()

Unnamed: 0,sentence_index,splitset_label
0,1,1
1,2,1
2,3,2
3,4,2
4,5,2


In [11]:
sentence_phrase_merge = pd.merge(sentence_ids, mapping, left_on='sentence', right_on='phrase')
sentence_phrase_split = pd.merge(sentence_phrase_merge, train_test_split, on='sentence_index')
dataset = pd.merge(sentence_phrase_split, sentiment_labels, on='phrase_ids')

In [12]:
dataset['phrase_cleaned'] = dataset['sentence'].str.replace(
    r"\s('s|'d|'re|'ll|'m|'ve|n't)\b",
     lambda m: m.group(1))

dataset.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase_ids,splitset_label,sentiment_values,phrase_cleaned
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166,1,3,The Rock is destined to be the 21st Century's ...
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300,1,4,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,2,2,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123,2,3,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999,2,4,"Emerges as something rare , an issue movie tha..."


In [13]:
dataset = dataset[['sentence','sentiment_values','splitset_label']].rename(
    columns={"sentence":"text","sentiment_values":"label"})
dataset

Unnamed: 0,text,label,splitset_label
0,The Rock is destined to be the 21st Century 's...,3,1
1,The gorgeously elaborate continuation of `` Th...,4,1
2,Effective but too-tepid biopic,2,2
3,If you sometimes like to go to the movies to h...,3,2
4,"Emerges as something rare , an issue movie tha...",4,2
...,...,...,...
11281,A real snooze .,0,1
11282,No surprises .,1,1
11283,We 've seen the hippie-turned-yuppie plot befo...,3,1
11284,Her fans walked out muttering words like `` ho...,0,1


In [14]:
train = dataset[dataset.splitset_label==1].drop('splitset_label',axis='columns').reset_index(drop=True)
train

Unnamed: 0,text,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer\/composer Bryan Adams contributes a sle...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3
...,...,...
8112,A real snooze .,0
8113,No surprises .,1
8114,We 've seen the hippie-turned-yuppie plot befo...,3
8115,Her fans walked out muttering words like `` ho...,0


In [15]:
valid = dataset[dataset.splitset_label==2].drop('splitset_label',axis='columns').reset_index(drop=True)
test = dataset[dataset.splitset_label==3].drop('splitset_label',axis='columns').reset_index(drop=True)
len(valid),len(test)

(2125, 1044)

In [16]:
data_dict = dict()
data_dict['train'] = train
data_dict['dev'] = valid
data_dict['test'] = test

In [17]:
data_dict['train']

Unnamed: 0,text,label
0,The Rock is destined to be the 21st Century 's...,3
1,The gorgeously elaborate continuation of `` Th...,4
2,Singer\/composer Bryan Adams contributes a sle...,3
3,You 'd think by now America would have had eno...,2
4,Yet the act is still charming here .,3
...,...,...
8112,A real snooze .,0
8113,No surprises .,1
8114,We 've seen the hippie-turned-yuppie plot befo...,3
8115,Her fans walked out muttering words like `` ho...,0


## Creating Data Pipelines for modeling 

### Defining Fields

Now we shall be defining LABEL as a LabelField, which is a subclass of Field that sets sequen tial to False (as it’s our numerical category class). TWEET is a standard Field object, where we have decided to use the spaCy tokenizer and convert all the text to lower‐ case.

In [18]:
# Import Library
import random
import torch, torchtext
from torchtext import legacy
from torchtext.legacy import data

# Manual Seed
SEED = 43
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f6ea95fc830>

In [19]:
Text = data.Field(sequential = True, tokenize = 'spacy', batch_first =True, include_lengths=True)
Label = data.LabelField(tokenize ='spacy', is_target=True, batch_first =True, sequential =False)

In [20]:
fields=[('text',Text),('label',Label)]

In [21]:
fields

[('text', <torchtext.legacy.data.field.Field at 0x7f6ea82b7090>),
 ('label', <torchtext.legacy.data.field.LabelField at 0x7f6e62b47fd0>)]

Armed with our declared fields, lets convert from pandas to list to torchtext. We could also use TabularDataset to apply that definition to the CSV directly but showing an alternative approach too.

In [33]:
example_train=[data.Example.fromlist([data_dict['train'].text[i],data_dict['train'].label[i]],fields) for i in range (data_dict['train'].shape[0])]

In [34]:
# Creating train dataset
train = data.Dataset(example_train, fields)

In [35]:
example_train[0].text[:20]

['The',
 'Rock',
 'is',
 'destined',
 'to',
 'be',
 'the',
 '21st',
 'Century',
 "'s",
 'new',
 '`',
 '`',
 'Conan',
 "''",
 'and',
 'that',
 'he',
 "'s",
 'going']

In [36]:
example_dev=[data.Example.fromlist([data_dict['dev'].text[i],data_dict['dev'].label[i]],fields) for i in range (data_dict['dev'].shape[0])]

In [37]:
# Creating dev dataset
dev = data.Dataset(example_dev, fields)

In [38]:
(len(train),len(dev))

(17517, 2125)

### Building Vocabulary

At this point we would have built a one-hot encoding of each word that is present in the dataset—a rather tedious process. Thankfully, torchtext will do this for us, and will also allow a max_size parameter to be passed in to limit the vocabulary to the most common words. This is normally done to prevent the construction of a huge, memory-hungry model. We don’t want our GPUs too overwhelmed, after all. 

In [39]:
Text.build_vocab(train,vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
Label.build_vocab(train)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.33MB/s]                          
100%|█████████▉| 399645/400000 [00:14<00:00, 27340.47it/s]

By default, torchtext will add two more special tokens, <unk> for unknown words and <pad>, a padding token that will be used to pad all our text to roughly the same size to help with efficient batching on the GPU.

In [40]:
print('Size of input vocab : ', len(Text.vocab))
print('Size of label vocab : ', len(Label.vocab))
print('Top 10 words appreared repeatedly :', list(Text.vocab.freqs.most_common(10)))
print('Labels : ', Label.vocab.stoi)

Size of input vocab :  17491
Size of label vocab :  5
Top 10 words appreared repeatedly : [('.', 15888), (',', 13992), ('the', 11976), ('a', 9147), ('of', 8766), ('and', 8701), ('to', 5978), ('-', 5401), ('is', 5003), ("'s", 4796)]
Labels :  defaultdict(None, {3: 0, 1: 1, 2: 2, 4: 3, 0: 4})


In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
train_iterator, valid_iterator = data.BucketIterator.splits((train, dev), batch_size = 32, 
                                                            sort_key = lambda x: len(x.text),
                                                            sort_within_batch=True, device = device)

Save the vocabulary for later use

In [None]:
import os, pickle
with open('tokenizer.pkl', 'wb') as tokens: 
    pickle.dump(Text.vocab.stoi, tokens)

## Defining Our Model

We use the Embedding and LSTM modules in PyTorch to build a simple model for classifying tweets.

In this model we create three layers. 
1. First, the words in our tweets are pushed into an Embedding layer, which we have established as a 300-dimensional vector embedding. 
2. That’s then fed into a 2 stacked-LSTMs with 100 hidden features (again, we’re compressing down from the 300-dimensional input like we did with images). We are using 2 LSTMs for using the dropout.
3. Finally, the output of the LSTM (the final hidden state after processing the incoming tweet) is pushed through a standard fully connected layer with three outputs to correspond to our three possible classes (negative, positive, or neutral).

In [43]:
import torch.nn as nn
import torch.nn.functional as F

class classifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout,pad_idx):
        
        super().__init__()          
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx = pad_idx)
        
        # LSTM layer
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           dropout=dropout,
                           batch_first=True,
                           bidirectional=True)
  
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        # text = [batch size, sent_length]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
      
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True)
        
        packed_output, (hidden, cell) = self.encoder(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
    
        # Hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)   
        
        # Final activation function softmax
        output = dense_outputs[0]
            
        return output

In [60]:
# Define hyperparameters
size_of_vocab = len(Text.vocab)
embedding_dim = 100
num_hidden_nodes = 256
num_output_nodes = 5
num_layers = 2
dropout = 0.5
PAD_IDX = Text.vocab.stoi[Text.pad_token]
# Instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers, dropout, PAD_IDX)

In [61]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

classifier(
  (embedding): Embedding(17491, 100, padding_idx=1)
  (encoder): LSTM(100, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=5, bias=True)
)
The model has 4,060,529 trainable parameters


## Model Training and Evaluation

First define the optimizer and loss functions

In [62]:
import torch.optim as optim

# define optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=2e-3)
criterion = nn.CrossEntropyLoss()

# define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    _, predictions = torch.max(preds, 1)
    
    correct = (predictions == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
# push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

The main thing to be aware of in this new training loop is that we have to reference `batch.tweets` and `batch.labels` to get the particular fields we’re interested in; they don’t fall out quite as nicely from the enumerator as they do in torchvision.

**Training Loop**

In [63]:
def train(model, iterator, optimizer, criterion):
    
    # initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    # set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        # resets the gradients after every batch
        optimizer.zero_grad()   
        
        # retrieve text and no. of words
        text, text_lengths = batch.text
        
        # convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        # compute the loss
        loss = criterion(predictions, batch.label)        
        
        # compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        # backpropage the loss and compute the gradients
        loss.backward()       
        
        # update the weights
        optimizer.step()      
        
        # loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

**Evaluation Loop**

In [64]:
def evaluate(model, iterator, criterion):
    
    # initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    # deactivating dropout layers
    model.eval()
    
    # deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            # retrieve text and no. of words
            text, text_lengths = batch.text
            
            # convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            # compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            # keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

**Let's Train and Evaluate**

In [65]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    # train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    # evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% \n')

	Train Loss: 1.466 | Train Acc: 35.47%
	 Val. Loss: 1.488 |  Val. Acc: 33.18% 

	Train Loss: 0.934 | Train Acc: 64.02%
	 Val. Loss: 1.681 |  Val. Acc: 35.33% 

	Train Loss: 0.403 | Train Acc: 86.08%
	 Val. Loss: 2.084 |  Val. Acc: 35.95% 

	Train Loss: 0.120 | Train Acc: 96.38%
	 Val. Loss: 2.778 |  Val. Acc: 34.67% 

	Train Loss: 0.047 | Train Acc: 98.72%
	 Val. Loss: 3.521 |  Val. Acc: 34.72% 



### 4.  Outcomes for 25 inputs 



We need to make sure we look the ouputs produced by our model makes sense, so that we can make sure the model is working!

In [132]:
model.eval()
num = 0
for batch in valid_iterator:

  if num>=25:
    break
  else:
    text, text_lengths = batch.text
    
    predictions = model(text, text_lengths).squeeze()
    preds = F.softmax(predictions,dim=-1)
    if text_lengths.item()>5:
      print(num+1)
      print("tokens")
      print([Text.vocab.itos[x] for x in text.tolist()[0]])
      print()
      print("Predicted Rating: ",torch.argmax(preds).item(),"      ","Actual Rating: ",batch.label.item())
      print("Correct predcition" if torch.argmax(preds)==batch.label.squeeze() else "Wrong Classification")
      print('---------------------------------------')
      num+=1

1
tokens
['Effective', 'but', 'too', '-', 'tepid', 'biopic']

Predicted Rating:  2        Actual Rating:  2
Correct predcition
---------------------------------------
2
tokens
['<unk>', 'if', 'overly', 'talky', 'documentary', '.']

Predicted Rating:  0        Actual Rating:  2
Wrong Classification
---------------------------------------
3
tokens
['Light', ',', 'cute', 'and', 'forgettable', '.']

Predicted Rating:  2        Actual Rating:  2
Correct predcition
---------------------------------------
4
tokens
['Between', 'the', 'drama', 'of', 'Cube', '?']

Predicted Rating:  2        Actual Rating:  2
Correct predcition
---------------------------------------
5
tokens
['It', 'is', 'nature', 'against', 'progress', '.']

Predicted Rating:  0        Actual Rating:  2
Wrong Classification
---------------------------------------
6
tokens
['A', 'fascinating', 'and', 'fun', 'film', '.']

Predicted Rating:  3        Actual Rating:  3
Correct predcition
---------------------------------------
7
t

### 5. 10 False Positives

These are specifically texts where the model predicts the reviews to have higher scores like 3 or 4 but the actual label is 0,1 or 2.

In [142]:
model.eval()
num = 0
for batch in valid_iterator:

  if num>=10:
    break
  else:
    text, text_lengths = batch.text
    
    predictions = model(text, text_lengths).squeeze()
    preds = F.softmax(predictions,dim=-1)
    if text_lengths.item()>5 and (torch.argmax(preds)!=batch.label.squeeze()) and (batch.label.squeeze() <= 2) and (torch.argmax(preds).item()>2):
      print(num+1)
      print("tokens")
      print([Text.vocab.itos[x] for x in text.tolist()[0]])
      print()
      print("Predicted Rating: ",torch.argmax(preds).item(),"      ","Actual Rating: ",batch.label.item())
      print('---------------------------------------')
      num+=1

1
tokens
['This', 'is', 'pretty', 'dicey', 'material', '.']

Predicted Rating:  3        Actual Rating:  1
---------------------------------------
2
tokens
['It', "'s", 'like', 'a', 'poem', '.']

Predicted Rating:  3        Actual Rating:  0
---------------------------------------
3
tokens
['D.J.', '<unk>', 'as', '<unk>', 'Jones', '?']

Predicted Rating:  4        Actual Rating:  2
---------------------------------------
4
tokens
['An', 'awkward', 'and', '<unk>', 'movie', '.']

Predicted Rating:  4        Actual Rating:  1
---------------------------------------
5
tokens
['The', 'central', 'story', 'lacks', 'punch', '.']

Predicted Rating:  4        Actual Rating:  1
---------------------------------------
6
tokens
['...', 'too', 'dull', 'to', 'enjoy', '.']

Predicted Rating:  4        Actual Rating:  1
---------------------------------------
7
tokens
['Showtime', 'is', 'closer', 'to', '<unk>', '.']

Predicted Rating:  4        Actual Rating:  1
---------------------------------------
