## Download the data

In [1]:
# Fetch the review text and its labels into data/ (-P sets the download directory).
!wget https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/reviews.txt -P data/
!wget https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/labels.txt -P data/
# Fetch the zipped pre-trained word vectors and unpack them next to the text files.
!wget https://github.com/bekou/multihead_joint_entity_relation_extraction/raw/master/data/CoNLL04/vecs.lc.over100freq.zip -P data/
!unzip data/vecs.lc.over100freq.zip -d data/    

--2021-11-16 09:59:18--  https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33678267 (32M) [text/plain]
Saving to: ‘data/reviews.txt’


2021-11-16 09:59:25 (5.74 MB/s) - ‘data/reviews.txt’ saved [33678267/33678267]

--2021-11-16 09:59:26--  https://raw.githubusercontent.com/cezannec/CNN_Text_Classification/master/data/labels.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 225000 (220K) [text/plain]
Saving to: ‘data/labels.txt’


2021-11-16 

## Load the data

In [None]:
import numpy as np
# Load the raw reviews and the raw labels, each as one big string.
with open('data/reviews.txt', 'r') as reviews_file, open('data/labels.txt', 'r') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()

In [None]:
# Peek at the first 1000 characters of the raw review text. At this point it
# is still one long string; it has to be split into individual reviews before
# it can be used for classification.
print(reviews[0:1000])

In [None]:
# Peek at the first 20 characters of the raw label text. Like the reviews, the
# labels are still one long string and have to be split line by line before use.
labels[0:20]

## Data Pre-processing

In [None]:
from string import punctuation

# Normalise the corpus: lowercase it, strip punctuation, then derive both the
# per-review strings and one flat list of words.

# lowercase first so differently-cased spellings collapse to one token
reviews = reviews.lower()
# a translation table with only a deletion set removes every punctuation character
all_text = reviews.translate(str.maketrans('', '', punctuation))

# the source file holds one review per line
reviews_split = all_text.split('\n')
# rejoin with spaces so words from neighbouring reviews do not fuse together
all_text = ' '.join(reviews_split)

# every word occurrence in the corpus, in order (duplicates included)
all_words = all_text.split()

In [None]:
reviews_split[0] # first review after lowercasing and punctuation removal

In [None]:
len(reviews_split) # number of entries produced by splitting the text on newlines

### Encoding the Labels

The review labels are "positive" or "negative". 
Split them with the newline character to have the same size as reviews split and then transform them into 0 or 1 to feed them to the neural network. 1 in the case that the label is positive, 0 in the case that it is negative

In [None]:
# Turn the label text into a numeric target: one label per line,
# 'positive' becomes 1 and anything else becomes 0.
labels_split = labels.split('\n')
encoded_labels = np.array([int(label == 'positive') for label in labels_split])

In [None]:
labels_split[0] # first raw label string, before the 0/1 encoding

In [None]:
len(labels_split) # number of label entries produced by the newline split

### Removing Outliers

In [None]:
from collections import Counter
# Word frequencies over the whole corpus (word -> number of occurrences).
counts = Counter(all_words)

# Histogram of review lengths (length in words -> number of reviews with that
# length); its keys reveal outliers such as empty or very long reviews.
review_lens = Counter(len(review.split()) for review in reviews_split)

print("Zero-length reviews: {}".format(review_lens[0]))
# the largest key of the histogram is the longest review length
print("Maximum review length: {}".format(max(review_lens)))

In [None]:
# Drop the outliers handled here: reviews that contain no words at all.

print('Number of reviews before removing outliers: ', len(reviews_split))

# positions of the reviews that still contain at least one word
non_zero_idx = [pos for pos, text in enumerate(reviews_split) if text.split()]

# filter reviews and labels with the same positions so they stay aligned
reviews_split = [reviews_split[pos] for pos in non_zero_idx]
encoded_labels = np.array([encoded_labels[pos] for pos in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_split))

## Using a Pre-Trained Embedding Layer

Next, we will tokenize the reviews: turn the list of words that make up a given review into a list of integer tokens that represent those words. Typically, this is done by creating a dictionary that maps each unique word in a vocabulary to a specific integer value.

In this example, I'll use a mapping that already exists, in a pre-trained embedding layer. Below, I am loading a pre-trained embeddings matrix.

You can download the matrix and explore it yourself (from the link at the beginning of the exercise). It contains 412128 tokens, each with its corresponding pre-trained vector.

In [None]:
# import Word2Vec loading capabilities
from gensim.models import KeyedVectors
# Load the pre-trained word vectors from the unzipped file.
# binary=False reads the plain-text word2vec format; unicode_errors='ignore'
# skips undecodable bytes in a word instead of raising.
embed_lookup = KeyedVectors.load_word2vec_format('data/vecs.lc.over100freq.txt', 
                                                 binary=False,unicode_errors='ignore')

### Embedding Layer

In [None]:
# You can think of an embedding layer as a lookup table, where the rows are
# indexed by word token and the columns hold the embedding values. For example,
# row 4 is the embedding vector for the word that maps to the integer value 4.
# Here we store the words of the pre-trained vocabulary; the next cells print
# the vocabulary size and one embedding. The notebook's pre-trained model uses
# 50-dimensional embeddings.

# store pretrained vocab
# gensim >= 4 removed `.vocab` and exposes the ordered word list as
# `index_to_key`; older releases only offer `.vocab`, so fall back to it.
pretrained_words = list(getattr(embed_lookup, 'index_to_key', None) or embed_lookup.vocab)

In [None]:
# Inspect one row of the lookup table.
row_idx = 4

# the word stored at that row, then the vector stored for that word
word = pretrained_words[row_idx]
embedding = embed_lookup[word]

# vocabulary size, the chosen word and the length of its vector
for template, value in (("Size of Vocab: {}\n", len(pretrained_words)),
                        ('Word in vocab: {}\n', word),
                        ('Length of embedding: {}\n', len(embedding))):
    print(template.format(value))
print('Associated embedding: \n', embedding)

In [None]:
# show the words stored in rows 3 through 7 of the vocabulary
for row in range(3, 8):
    print(pretrained_words[row])

## Tokenize reviews

The pre-trained embedding layer already has tokens associated with each word in the dictionary. We want to use that same mapping to tokenize all the reviews in the movie review corpus. We will encode any unknown words (words that appear in the reviews but not in the pre-trained vocabulary) as the `<unk>` token, whose index is 9, i.e. row 9 of the embedding matrix.

In [None]:
# convert reviews to tokens
def tokenize_all_reviews(embed_lookup, reviews_split, unk_idx=9):
    """Convert every review string into a list of embedding-row indices.

    Args:
        embed_lookup: pre-trained vectors object. It must expose either
            ``key_to_index`` (word -> int, gensim >= 4) or ``vocab``
            (word -> entry with an ``.index`` attribute, gensim 3.x).
        reviews_split: iterable of review strings; each is split on whitespace.
        unk_idx: index used for words missing from the vocabulary. The default
            of 9 is the index this notebook uses for the unknown-word token.

    Returns:
        A list with one list of ints per review, in input order.

    Raises:
        AttributeError: if ``embed_lookup`` exposes neither mapping. This is
            reported instead of silently mapping every word to ``unk_idx``.
    """
    key_to_index = getattr(embed_lookup, 'key_to_index', None)
    if key_to_index is not None:
        def lookup(word):
            return key_to_index.get(word, unk_idx)
    else:
        vocab = embed_lookup.vocab

        def lookup(word):
            entry = vocab.get(word)
            # only a genuinely missing word falls back to the unknown index
            return unk_idx if entry is None else entry.index

    return [[lookup(word) for word in review.split()] for review in reviews_split]


In [None]:
# map every review to a list of embedding-row indices, one list per review
tokenized_reviews = tokenize_all_reviews(embed_lookup, reviews_split)

In [None]:
# sanity check: print the token indices of the first review; each integer is
# the embedding row of the word at that position
print(tokenized_reviews[0])

---
## Padding sequences

To deal with both short and very long reviews, we should have the same length for all reviews in the neural network
For that, we will pad all reviews shorter than a specific length with a number. For us, it will be 1 because this is 
the PADDING id in the embeddings matrix (validate that in the vocabulary). 
For all reviews that are longer than this specific length, we will cut them.


As a small example, if the `seq_length=10` and an input, tokenized review is: 
```
[117, 18, 128]
```
The resultant, padded sequence should be: 

```
[1, 1, 1, 1, 1, 1, 1, 117, 18, 128]
```

**Your final `features` array should be a 2D array, with as many rows as there are reviews, and as many columns as the specified `seq_length`.**

In [None]:
def pad_features(tokenized_reviews, seq_length, pad_value=1):
    '''Return a 2D integer array of fixed-width, left-padded reviews.

    Each review shorter than ``seq_length`` is padded on the left with
    ``pad_value``. Each longer review is truncated to its first
    ``seq_length`` tokens.

    Args:
        tokenized_reviews: sequence of token-id sequences, one per review.
        seq_length: number of columns in the result.
        pad_value: id written into unused positions. The default of 1 is the
            padding id this notebook uses.

    Returns:
        numpy int array of shape (len(tokenized_reviews), seq_length).
    '''
    # start from an all-padding matrix, then overwrite the right-hand side
    features = np.full((len(tokenized_reviews), seq_length), pad_value, dtype=int)

    for i, row in enumerate(tokenized_reviews):
        kept = row[:seq_length]
        # an empty review must be skipped: a slice starting at -0 would select
        # the whole row and the assignment would fail to broadcast
        if len(kept) > 0:
            features[i, -len(kept):] = kept

    return features

In [None]:
# Test your implementation!

# every review is padded or truncated to this many tokens
seq_length = 200

features = pad_features(tokenized_reviews, seq_length=seq_length)

## test statements - do not change - ##
# one row per review, and every row exactly seq_length wide
assert len(features)==len(tokenized_reviews), "Features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

In [None]:
# (number of reviews, seq_length)
print (features.shape)

---
## Training, Validation, and Test Data

In [None]:
# Hold out data for evaluation: the first 80% of the rows train the model and
# the remainder is divided evenly into a validation set and a test set.
split_frac = 0.8

# boundary between the training rows and everything else
split_idx = int(len(features)*split_frac)
train_x, train_y = features[:split_idx], encoded_labels[:split_idx]
remaining_x, remaining_y = features[split_idx:], encoded_labels[split_idx:]

# halve what is left: the first half validates, the second half tests
test_idx = int(len(remaining_x)*0.5)
val_x, val_y = remaining_x[:test_idx], remaining_y[:test_idx]
test_x, test_y = remaining_x[test_idx:], remaining_y[test_idx:]

# report the shape of each feature split
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

## DataLoaders and Batching

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Wrap the numpy splits in tensor datasets and batch them.

# create Tensor datasets (features paired with their labels)
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# Only the training data is shuffled. Validation and test batches keep a fixed
# order so that batch-averaged losses are reproducible from run to run.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Sentiment Network with PyTorch

The complete model consists of a few layers:

**1. An [embedding layer](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding)**
* This converts our word tokens (integers) into embedded vectors of a specific size.
* In this case, the vectors/weights of this layer will come from the **pretrained** lookup table defined above. 

**2. A few [convolutional layers](https://pytorch.org/docs/stable/nn.html#convolution-layers)**
* These are defined by an input size, number of filters/feature maps to output, and a kernel size.
* The output of these layers will go through a ReLu activation function and pooling layer in the `forward` function.

**3. A fully-connected, output layer**
* This maps the convolutional layer outputs to a desired output_size (1 sentiment class).

**4. A sigmoid activation layer**
* This turns the output logit into a value 0-1; a class score.

There is also a dropout layer, which will prevent overfitting, placed between the convolutional outputs and the final, fully-connected layer.

See the original paper, [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf).

### The Embedding Layer

The embedding layer comes from our pre-trained `embed_lookup` model. By default, the weights of this layer are set to the vectors from the pre-trained model and frozen, so it will just be used as a lookup table. You could train your own embedding layer here, but it will speed up the training process to use a pre-trained model.

### The Convolutional Layer(s)

So why use CNNs on text? In the same way that a 3x3 filter can look over a patch of an image, a 1x2 filter can look over 2 sequential words in a piece of text, i.e. a bi-gram. In this CNN model we will use multiple filters of different sizes which will look at the bi-grams (two words, a 1x2 filter), tri-grams (three words, a 1x3 filter) and/or n-grams (a 1x$n$ filter) within the text.

The intuition here is that the appearance of certain bi-grams, tri-grams and n-grams within the review will be a good indication of the final sentiment.

We can then use a filter that is [n x emb_dim]. This will cover $n$ sequential words entirely, as their width will be emb_dim dimensions.

The kernel sizes would be (3, 50), (4, 50), and (5, 50), to look at sequences of 3, 4, and 5 word embeddings at a time (50 is the size of the word embeddings). Each of these three layers will produce 100 filtered outputs, where 100 is the number of filters. 

The kernels only move in one dimension: down to a sequence of word embeddings. In other words, these kernels move along a sequence of words, in time!

### Maxpooling Layers

In the `forward` function, we apply a ReLu activation to the outputs of all convolutional layers and a maxpooling layer over the input sequence dimension. The maxpooling layer will get us an indication of whether some high-level text feature was found. 

In [None]:
# Detect CUDA once; the training and testing cells read this flag to decide
# whether to move the model and the batches to the GPU.
train_on_gpu = torch.cuda.is_available()

print('Training on GPU.' if train_on_gpu else 'No GPU available, training on CPU.')

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SentimentCNN(nn.Module):
    """
    The embedding layer + CNN model that will be used to perform sentiment analysis.

    Token ids are embedded, passed through one convolution per kernel size,
    ReLU-activated, max-pooled over the sequence dimension, concatenated,
    passed through dropout, and mapped by a linear layer and a sigmoid.
    """

    def __init__(self, embed_model, vocab_size, output_size, embedding_dim,
                 num_filters=100, kernel_sizes=[3, 4, 5], freeze_embeddings=True, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.

        Args:
            embed_model: object whose ``vectors`` attribute is a numpy array of
                pre-trained embeddings; it becomes the embedding weight matrix.
            vocab_size: number of rows used to construct the embedding layer
                (its weights are then replaced by ``embed_model.vectors``).
            output_size: number of values produced per input sequence.
            embedding_dim: width of one embedding vector.
            num_filters: output channels of every convolution.
            kernel_sizes: heights of the convolution kernels; one convolution
                is created per entry. The default list is never mutated.
            freeze_embeddings: if True, the embedding weights get no gradient.
            drop_prob: dropout probability applied before the linear layer.
        """
        super(SentimentCNN, self).__init__()

        # set class vars
        self.num_filters = num_filters
        self.embedding_dim = embedding_dim
        self.kernel_sizes = list(kernel_sizes)

        # 1. embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # set weights to pre-trained
        self.embedding.weight = nn.Parameter(torch.from_numpy(embed_model.vectors)) # all vectors
        # (optional) freeze embedding weights; the flag has to be set on the
        # weight Parameter, setting it on the Module has no effect
        if freeze_embeddings:
            self.embedding.weight.requires_grad = False

        # 2. convolutional layers, one per kernel size. They are registered as
        # conv_0, conv_1, ... so the default configuration keeps the attribute
        # and state_dict names conv_0, conv_1 and conv_2.
        for i, kernel_size in enumerate(self.kernel_sizes):
            setattr(self, 'conv_{}'.format(i),
                    nn.Conv2d(in_channels=1,
                              out_channels=num_filters,
                              kernel_size=(kernel_size, embedding_dim)))

        # 3. final, fully-connected layer over the concatenated pooled features
        self.fc = nn.Linear(len(self.kernel_sizes) * num_filters, output_size)

        # 4. dropout and sigmoid layers
        self.dropout = nn.Dropout(drop_prob)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """
        Defines how a batch of inputs, x, passes through the model layers.
        Returns sigmoid-activated class scores of shape (batch_size, output_size).
        """
        # embedded vectors: (batch_size, seq_length, embedding_dim)
        embeds = self.embedding(x)
        # add the channel dimension that conv layers expect:
        # (batch_size, 1, seq_length, embedding_dim)
        embeds = embeds.unsqueeze(1)

        pooled = []
        for i in range(len(self.kernel_sizes)):
            conv = getattr(self, 'conv_{}'.format(i))
            # the kernel spans the full embedding width, so the last dim is 1:
            # (batch_size, num_filters, seq_length - kernel_size + 1)
            conved = F.relu(conv(embeds).squeeze(3))
            # max over the whole sequence: (batch_size, num_filters)
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))

        logit = self.fc(cat)

        # sigmoid-activated --> a class score
        return self.sig(logit)

## Instantiate the network

Here, I'll instantiate the network. First up, defining the hyperparameters.

* `vocab_size`: Size of our vocabulary or the range of values for our input, word tokens.
* `output_size`: Size of our desired output; the number of class scores we want to output (pos/neg).
* `embedding_dim`: Number of columns in the embedding lookup table; size of our embeddings.
* `num_filters`: Number of filters that each convolutional layer produces as output.
* `kernel_sizes`: A list of kernel sizes; one convolutional layer will be created for each kernel size.

Any parameters I did not list, are left as the default value.

In [None]:
# Instantiate the model w/ hyperparams

# one embedding row per word in the pre-trained vocabulary
vocab_size = len(pretrained_words)
output_size = 1 # binary class (1 or 0)
# read the vector length off an actual embedding instead of hard-coding it
embedding_dim = len(embed_lookup[pretrained_words[0]]) # 50-dim vectors
num_filters = 100
kernel_sizes = [3, 4, 5]

# freeze_embeddings and drop_prob are not passed, so they keep their defaults
net = SentimentCNN(embed_lookup, vocab_size, output_size, embedding_dim,
                   num_filters, kernel_sizes)

print(net)

## Training

In [None]:
# loss and optimization functions
lr=0.001 # learning rate handed to Adam

# binary cross-entropy on probabilities; the model's forward pass ends in a sigmoid
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [None]:
# training loop
def train(net, train_loader, epochs, print_every=100):
    """Train ``net`` and periodically report training and validation loss.

    Relies on the notebook-level names ``criterion``, ``optimizer``,
    ``valid_loader`` and ``train_on_gpu`` defined in other cells.

    Args:
        net: the model to optimize; moved to the GPU when ``train_on_gpu`` is set.
        train_loader: iterable of (inputs, labels) training batches.
        epochs: number of passes over ``train_loader``.
        print_every: run validation and print the losses every this many steps.
    """
    # move model to GPU, if available
    if(train_on_gpu):
        net.cuda()

    counter = 0 # for printing

    # train for some number of epochs
    net.train()
    for e in range(epochs):

        # batch loop
        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output = net(inputs)

            # calculate the loss and perform backprop; reshape(-1) keeps a
            # 1-D tensor even for a single-sample batch, where squeeze() would
            # produce a scalar that no longer matches the labels
            loss = criterion(output.reshape(-1), labels.float())
            loss.backward()
            optimizer.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_losses = []
                net.eval()
                # no gradients are needed for validation, so skip building the graph
                with torch.no_grad():
                    for val_inputs, val_labels in valid_loader:

                        if(train_on_gpu):
                            val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

                        val_output = net(val_inputs)
                        val_loss = criterion(val_output.reshape(-1), val_labels.float())

                        val_losses.append(val_loss.item())

                # back to training mode so dropout is active again
                net.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [None]:
# training params

epochs = 2 # this is approx where I noticed the validation loss stop decreasing
print_every = 100 # report training/validation loss every 100 batches

# trains `net` in place; the losses are printed, nothing is returned
train(net, train_loader, epochs, print_every=print_every)

## Testing

In [None]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0


net.eval()
# evaluation needs no gradients, so skip building the autograd graph
with torch.no_grad():
    # iterate over test data
    for inputs, labels in test_loader:

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # get predicted outputs; reshape(-1) keeps a 1-D tensor even for a
        # single-sample batch, where squeeze() would produce a scalar
        output = net(inputs)
        scores = output.reshape(-1)

        # calculate loss
        test_loss = criterion(scores, labels.float())
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class (0 or 1)
        pred = torch.round(scores)  # rounds to the nearest integer

        # compare predictions to true label and count the matches;
        # .item() works the same for CPU and GPU tensors
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += correct_tensor.sum().item()


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))