## SENTIMENT ANALYSIS WITH RNN & CNN ARCHITECTURES

In this  notebook the aim is to develop models based on different Neural Network architectures for the task of binary sentiment classification. Two CNN models and two LSTM models were trained and evaluated on IMDB dataset (http://ai.stanford.edu/~amaas/data/sentiment/) which contains negative and positive movie reviews (50.000 in total). In two of the four models, pre-trained GloVe embeddings were used (https://nlp.stanford.edu/projects/glove/) 

In [36]:
#!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!tar -xvf aclImdb_v1.tar.gz -C /content/drive/MyDrive/Deep_Learning_Ass_2_3

In [37]:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip '/content/drive/MyDrive/Deep_Learning_Ass_2_3/glove.6B.zip' -d "/content/drive/MyDrive/Deep_Learning_Ass_2_3"

In [5]:
import glob
import string
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import torch.nn.functional as F
from torch import optim
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
# Set Device

is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


## Load & Clean data

In [9]:
os.chdir('/content/drive/MyDrive/Deep_Learning_Ass_2_3/aclImdb')
!pwd

/content/drive/MyDrive/Deep_Learning_Ass_2_3/aclImdb


In [10]:
pos_files = glob.glob('./**/pos/*.txt',recursive=True)
neg_files = glob.glob('./**/neg/*.txt',recursive=True)

def read_review (file):
  with open(file,'r',encoding='utf-8') as reader:
    text = reader.read().lower().rstrip()
    return text

def process_review (text): 
  stop = set(stopwords.words('english'))
  text = text.replace('<br /><br />','') 
  text = ' '.join(token for token in text.split() if token not in stop)
  text = re.sub(r"[^\w\s]", '', text)
  text = re.sub(r"\d", '',text)
  return text 
    
def get_all_reviews (files):
  return [process_review(read_review(file)) for file in files]

neg_reviews = get_all_reviews(pos_files)
pos_reviews = get_all_reviews(neg_files)

neg_df = pd.DataFrame(list(zip(neg_reviews, [0]*len(neg_reviews))),
               columns =['Reviews', 'Sentiment'])

pos_df = pd.DataFrame(list(zip(pos_reviews, [1]*len(pos_reviews))),
               columns =['Reviews', 'Sentiment'])

data = pd.concat([neg_df,pos_df]).sample(frac=1)
data.reset_index(drop=True,inplace=True)

labels = np.array([int(value) for value in data['Sentiment'].values]).astype('float64')
inputs = np.array([value for value in data['Reviews'].values])

print(data)

                                                 Reviews  Sentiment
0      know john singletons smart guy coz made boyz n...          1
1      disappointed movie plotwise weak bordering sil...          1
2      ive seen film sky cinema long ago must admit r...          0
3      yes copy vhs uncut great condition transfered ...          0
4      relatively small budget animated film  million...          0
...                                                  ...        ...
49995  movie powerful watched movie  suppose work  am...          0
49996  reason give movie even single star much ending...          1
49997  film made senegal based guess loosely carmen b...          1
49998  comment movie gave  rating opinion thats prett...          1
49999  movies grabbed attention like one has see want...          0

[50000 rows x 2 columns]


## Split Data into Train, Validation and Test

In [12]:
split_ratio = 0.8

labels = list(data['Sentiment'])
inputs = list(data['Reviews'])
X_train, X_remain, y_train, y_remain = train_test_split(inputs, labels, train_size=split_ratio)
X_valid, X_test, y_valid, y_test = train_test_split(X_remain, y_remain, train_size=0.5)

## Build Vocabulary & Create Mappings

In [None]:
def tokenize(sentences):
  tokenized_sentences = []
  for sent in sentences:
    tokenized_sentences.append(word_tokenize(sent))
  return tokenized_sentences

def build_vocab (reviews):
  tokens = []
  for tok_review in tokenize(reviews):
    for token in tok_review:
      tokens.append(token)
  vocab = set(tokens)
  return vocab

def create_dicts(unique_words):
  word2idx = {}
  word2idx['<pad>'] = 0
  word2idx['<ukn>'] = 1
  idx = 2
  for word in unique_words:
    word2idx[word] = idx
    idx += 1
  idx2word = {idx : word for word, idx in word2idx.items()}
  return word2idx, idx2word

In [None]:
vocab = build_vocab(neg_reviews + pos_reviews)
word2idx, idx2word = create_dicts(vocab) 

print ('unique words:\t',len(word2idx))

unique words:	 215098


In [None]:
MAX_LEN = np.max([len(review)for review in tokenize(inputs)])
print ('Max review sequence length before encoding:\t',MAX_LEN)

Max review sequence length before encoding:	 1443


## Encode Input Sentences Into Vectors

In [None]:
seq_length =  300

def encoding (reviews, max_len):
  """ 
  1) Truncate very long input sentences
  2) Add padding idx (0) short sentences
  2) Encode input sentences in arrays of shape N x seq_length
  """
  padded_reviews = []
  tokenized_rev = tokenize(reviews)
  for review in tokenized_rev:
    if len(review) >= max_len:
      padded_reviews.append(review[:max_len])
    else:
      padded_reviews.append(['<pad>']*(max_len-len(review)) + review)
  encoded_reviews = np.array([[word2idx[word]for word in review]for review in padded_reviews]).astype('float64')
  return encoded_reviews

In [None]:
X_train_encoded = encoding (X_train,seq_length)
X_valid_encoded = encoding (X_valid, seq_length)
X_test_encoded = encoding (X_test, seq_length)

print("\t\t\tFeatures Shapes:")
print("Train set: \t\t{}".format(X_train_encoded.shape),
      "\nValidation set: \t{}".format(X_valid_encoded.shape),
      "\nTest set: \t\t{}".format(X_test_encoded.shape))

			Features Shapes:
Train set: 		(40000, 300) 
Validation set: 	(5000, 300) 
Test set: 		(5000, 300)


## Load Pretrained Embeddings (Glove)

In [None]:
def load_pretrained_vectors(word2idx, file_name):

  N_vocab = len(word2idx) # vocab dimensions
  d = 50  # embedding dimensions
  
  # Initilize random embeddings
  embeddings = np.random.uniform(-0.25, 0.25, (N_vocab, d)) 
  embeddings[word2idx['<pad>']] = np.zeros((50,))

  print("Loading pretrained vectors...")
    
  with open(file_name, 'rt', encoding='utf-8',newline='\n',errors='ignore') as lines:
    count = 0
    for line in lines:
      tokens = line.rstrip().split(' ')
      word = tokens[0]
      if word in word2idx:
        count += 1
        embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)
  print()
  print(f"{count} / {N_vocab} pretrained vectors found.")

  return embeddings

# create glove embedding tensors
glove_embeddings  = load_pretrained_vectors(word2idx,'../glove.6B.50d.txt' )
glove_embeddings = torch.from_numpy(glove_embeddings)
print()
print (glove_embeddings.shape)


Loading pretrained vectors...

76033 / 215098 pretrained vectors found.

torch.Size([215098, 50])


## Create Datasets &  Dataloaders


In [None]:
# Create batches of data tensors 

batch_size = 20 

train_data = TensorDataset(torch.LongTensor(X_train_encoded).long(), torch.LongTensor(y_train).long())
valid_data = TensorDataset(torch.LongTensor(X_valid_encoded).long(), torch.LongTensor(y_valid).long())
test_data = TensorDataset(torch.LongTensor(X_test_encoded).long(), torch.LongTensor(y_test).long())

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [None]:
emb = nn.Embedding.from_pretrained(glove_embeddings)
for input, l in train_loader:
  print('simple encoding:\n')
  print (input)
  print('\nembedding:\n')
  print ((emb(input[0])))
  break

simple encoding:

tensor([[     0,      0,      0,  ..., 144492,  68679, 112848],
        [     0,      0,      0,  ...,  52245, 137848, 171019],
        [     0,      0,      0,  ...,  30122,  59280, 160719],
        ...,
        [ 21039,  34302,  32027,  ..., 183450, 200298, 159130],
        [     0,      0,      0,  ...,   1021, 157224, 121197],
        [     0,      0,      0,  ...,  43904,  14694, 112848]])

embedding:

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.4006,  0.4848, -0.0599,  ...,  0.1685, -0.4125, -0.1568],
        [ 0.4040,  0.3878,  0.5340,  ...,  0.8472, -0.5572,  0.3002],
        [ 0.0265,  0.3374,  0.0657,  ..., -0.3398, -0.2304,  0.1907]],
       dtype=torch.float64)


## Build Models

### Hyperparameters Settings

In [None]:
vocab_size = len(word2idx)
embed_dim = 50
hidden_dim = 64
output_dim = 1
dropout = 0.2
n_layers = 1
lr = 0.01
n_epochs = 5
n_filters = 100
filter_sizes = [3,4,5]
pad_idx = word2idx['<pad>']

### LSTM

In [None]:
class LSTM(nn.Module):

    def __init__(self,
                       output_dim = output_dim,
                       embed_dim = embed_dim,
                       hidden_dim = hidden_dim,
                       n_layers = n_layers,
                       pretrained_emb = None,
                       vocab_size = vocab_size,
                       drop=dropout):

        super(LSTM, self).__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Embedding L
        if pretrained_emb is not None:
            self.vocab_size, self.embed_dim = pretrained_emb.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_emb,
                                                          freeze=False)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim)  
        # LSTM L
        self.lstm = nn.LSTM(self.embed_dim, hidden_dim, n_layers,batch_first=True,
                            dropout=drop)      
        # Fully Connected L
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Dropout L
        self.dropout = nn.Dropout(drop)
        # Output
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):

        batch_size = x.size(0)     
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)    
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)           
        out = self.dropout(lstm_out)
        out = self.fc(out)      
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] 
               
        return sig_out, hidden
      
    def init_hidden(self, batch_size):

        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        
        return hidden

### CNN

In [None]:
class CNN_1(nn.Module):
    def __init__(self, 
                 pretrained_emb=None,
                 vocab_siz=vocab_size,
                 embed_dim=embed_dim,
                 n_filters=n_filters,
                 filter_sizes=filter_sizes,
                 output_dim=output_dim, 
                 dropout=dropout,
                 pad_idx=pad_idx):
        
        super().__init__()

        # Embedding L
        if pretrained_emb is not None:
            self.vocab_size, self.embed_dim = pretrained_emb.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_emb, freeze=False)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=pad_idx, max_norm=5.0)
        
        # Conv Network L
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, self.embed_dim)) 
                                    for fs in filter_sizes
                                    ]) 
        # Fully Connected L
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        # Dropout L
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):    
        
        embedded = self.embedding(text) # [batch size, sent len, emb dim]      
        embedded = embedded.unsqueeze(1) # [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] # [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] # [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim = 1)) # [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

## Create Model Instances & Define Loss Functions and Optimizers

In [None]:
cnn = CNN_1()
cnn.to(device)
cnn_gl = CNN_1(pretrained_emb=glove_embeddings)
cnn_gl.to(device)
lstm =  LSTM()
lstm.to(device)
lstm_gl = LSTM(pretrained_emb=glove_embeddings)
lstm_gl.to(device)

loss_fn_BCE = nn.BCELoss() # for LSTM models
loss_fn_BCEwl = nn.BCEWithLogitsLoss() # for CNN models

opt_cnn = torch.optim.Adam(cnn.parameters(), lr=lr)
opt_cnn_gl = torch.optim.Adam(cnn_gl.parameters(), lr=lr)
opt_lstm = torch.optim.Adam(lstm.parameters(), lr=lr)
opt_lstm_gl = torch.optim.Adam(lstm_gl.parameters(), lr=lr)

  "num_layers={}".format(dropout, num_layers))


## Train and Validate Models

In [None]:
def accuracy(preds, targets, m_type=None):
  """
  Compute accuracy per batch
  """
  if m_type == 'lstm':
    rounded_preds = torch.round(preds)
  else:
    rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (rounded_preds == targets).float()
  acc = correct.sum() / len(correct)
  return acc

In [None]:
def train_LSTM(model, loader, optimizer, loss_fn, m_type='lstm'):
    """
    Train LSTM models
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    model.float()

    h = model.init_hidden(batch_size)

    for inputs, labels in loader:     
        inputs, labels = inputs.to(device), labels.to(device)
        h = tuple([each.data for each in h])
        optimizer.zero_grad()
        output, h = model(inputs.squeeze(), h)
        loss = loss_fn(output, labels.float())
        acc = accuracy(output, labels.float(), m_type=m_type)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(),5)
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    train_loss, train_accuracy = epoch_loss / len(loader), epoch_acc / len(loader) 

    return train_loss, train_accuracy

def evaluate_LSTM(model, loader, loss_fn, m_type = 'lstm'):
    """ 
    Evaluate LSTM models
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    model.float()

    h = model.init_hidden(batch_size)
    
    with torch.no_grad():
    
        for inputs, labels in loader:
          inputs, labels = inputs.to(device), labels.to(device)
          h = tuple([each.data for each in h])
          output, h = model(inputs.squeeze(), h)
          loss = loss_fn(output, labels.float())          
          acc = accuracy(output, labels.float(), m_type=m_type)
          epoch_loss += loss.item()
          epoch_acc += acc.item()

    eval_loss, eval_accuracy = epoch_loss / len(loader), epoch_acc / len(loader)

    return eval_loss, eval_accuracy

In [None]:
def train_CNN(model, loader, optimizer, loss_fn):
    """ 
    Train CNN models
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    model.float()

    for inputs, labels in loader:      
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs).squeeze(1)
        loss = loss_fn(output, labels.float())
        acc = accuracy(output, labels.float())
        loss.backward()
        optimizer.step()       
        acc = accuracy(output, labels.float())
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    train_loss, train_accuracy = epoch_loss / len(loader), epoch_acc / len(loader) 

    return train_loss, train_accuracy

def evaluate_CNN(model, loader, loss_fn):
    """ 
    Evaluate CNN models
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    model.float()

    with torch.no_grad():
    
        for inputs, labels in loader:
          inputs, labels = inputs.to(device), labels.to(device)
          output = model(inputs).squeeze(1)
          loss = loss_fn(output, labels.float())
          acc = accuracy(output, labels.float())
          epoch_loss += loss.item()
          epoch_acc += acc.item()

    eval_loss, eval_accuracy = epoch_loss / len(loader), epoch_acc / len(loader)

    return eval_loss, eval_accuracy

In [None]:
def train_validate_model(model, train_loader, valid_loader, optimizer, loss_fn, n_epochs, model_type=None, name=None):
    """ 
    Train & Evaluate model on Training & Validation Sets
    """
    
    best_valid_loss = float('inf')

    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train Acc':^11} | {'Val Loss':^10} | {'Val Acc':^9}")

    for epoch in range(n_epochs):

      if model_type == 'lstm':
        train_loss, train_acc = train_LSTM(model, train_loader, optimizer, loss_fn)
        valid_loss, valid_acc = evaluate_LSTM(model, valid_loader, loss_fn)   
      else:
        train_loss, train_acc = train_CNN(model, train_loader, optimizer, loss_fn)
        valid_loss, valid_acc = evaluate_CNN(model, valid_loader, loss_fn)  

      if valid_loss < best_valid_loss:
          best_valid_loss = valid_loss
          torch.save(model.state_dict(), f'{name}.pt')

      print(f"{epoch + 1:^7} | {train_loss:^12.6f} | {train_acc*100:^11.6f}% |{valid_loss:^10.6f} | {valid_acc*100:^9.2f}%") 

In [None]:
def load_and_test_model(model, test_loader, loss_fn, m_type=None, name=None):
  """
  Load saved model & Evaluate on Test Set
  """ 
  model.load_state_dict(torch.load(name))
  if m_type == 'lstm':
    test_loss, test_acc = evaluate_LSTM(model, test_loader, loss_fn)
  else:
    test_loss, test_acc = evaluate_CNN(model, test_loader, loss_fn)
  
  return test_loss, test_acc*100

In [None]:
train_validate_model(lstm, train_loader, valid_loader, opt_lstm, loss_fn_BCE, n_epochs=n_epochs, model_type = 'lstm',name='new_lstm')

Start training...

 Epoch  |  Train Loss  |  Train Acc  |  Val Loss  |  Val Acc 
   1    |   0.496397   |  76.265001 % | 0.389510  |   84.68  %
   2    |   0.329478   |  86.802501 % | 0.430764  |   82.74  %
   3    |   0.325519   |  86.842501 % | 0.415927  |   82.38  %
   4    |   0.325787   |  86.770001 % | 0.439804  |   80.90  %
   5    |   0.358766   |  84.730001 % | 0.478681  |   80.58  %


In [None]:
train_validate_model(cnn,train_loader, valid_loader, opt_cnn ,loss_fn_BCEwl ,n_epochs=n_epochs, name='new_cnn')

Start training...

 Epoch  |  Train Loss  |  Train Acc  |  Val Loss  |  Val Acc 
   1    |   0.597998   |  77.977501 % | 0.321513  |   86.22  %
   2    |   0.360030   |  89.527501 % | 1.089861  |   73.70  %
   3    |   0.234858   |  94.305001 % | 0.948121  |   84.00  %
   4    |   0.258795   |  95.545000 % | 1.678691  |   82.82  %
   5    |   0.293086   |  96.212500 % | 2.427796  |   82.84  %


In [None]:
train_validate_model(lstm_gl, train_loader, valid_loader, opt_lstm_gl, loss_fn_BCE, n_epochs=n_epochs, model_type = 'lstm',name='new_lstm_gl')

Start training...

 Epoch  |  Train Loss  |  Train Acc  |  Val Loss  |  Val Acc 
   1    |   0.384332   |  82.787502 % | 0.307830  |   87.56  %
   2    |   0.169246   |  94.045001 % | 0.365190  |   86.60  %
   3    |   0.091679   |  97.020000 % | 0.447440  |   86.20  %
   4    |   0.051924   |  98.320000 % | 0.486047  |   86.54  %
   5    |   0.043674   |  98.600000 % | 0.538481  |   84.54  %


In [None]:
train_validate_model(cnn_gl, train_loader, valid_loader, opt_cnn_gl, loss_fn_BCEwl , n_epochs=n_epochs, name='new_cnn_gl')

Start training...

 Epoch  |  Train Loss  |  Train Acc  |  Val Loss  |  Val Acc 
   1    |   0.494493   |  80.232501 % | 0.406822  |   85.00  %
   2    |   0.336728   |  90.147501 % | 0.535621  |   85.28  %
   3    |   0.266452   |  94.387501 % | 1.343444  |   84.22  %
   4    |   0.253996   |  95.840000 % | 2.045397  |   83.34  %
   5    |   0.304491   |  96.722500 % | 2.876580  |   83.92  %


## Test Models

In [None]:
lstm_loss , lstm_acc = load_and_test_model(lstm, test_loader, loss_fn_BCE, name='new_lstm.pt', m_type='lstm')
lstm_gl_loss, lstm_gl_acc = load_and_test_model(lstm_gl, test_loader, loss_fn_BCE, name='new_lstm_gl.pt' , m_type='lstm')
cnn_loss, cnn_acc = load_and_test_model(cnn, test_loader, loss_fn_BCEwl, name='new_cnn.pt')
cnn_gl_loss, cnn_gl_acc = load_and_test_model(cnn_gl, test_loader, loss_fn_BCEwl, name='new_cnn_gl.pt')

In [None]:
print(f"{'LSTM':^7} | {lstm_loss:^12.6f} | {lstm_acc:^11.6f}%")
print(f"{'CNN':^7} | {cnn_loss:^12.6f} | {cnn_acc:^11.6f}%")
print(f"{'LSTM_gl':^7} | {lstm_gl_loss:^12.6f} | {lstm_gl_acc:^11.6f}%")
print(f"{'CNN_gl':^7} | {cnn_gl_loss:^12.6f} | {cnn_gl_acc:^11.6f}%")

 LSTM   |   0.389397   |  84.400001 %
  CNN   |   0.330557   |  86.880001 %
LSTM_gl |   0.300666   |  88.300001 %
CNN_gl  |   0.398911   |  84.940002 %
