## Corpus

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Text file Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

## Preprocess
- tokenize
- Clean punctuation, stopwords
- Assign labels

In [3]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from string import punctuation

ignore = stopwords.words("english")

def remove_stopwords(tokens):
    x = [token for token in tokens if token not in ignore]
    return x

def remove_punctuation(tokens):
    x = [token for token in tokens if token not in punctuation]
    return x

def clean(tokens):
    #x = remove_stopwords(text)
    x = remove_punctuation(x)

    return x

In [4]:
from tqdm import tqdm

# label -> 1 for pos and 0 for neg
def prepare_corpus():
    # dataset is a list of tuples
    # (label, tokens)
    corpus = list()

    # idx -> label
    text_paths = [neg_path, pos_path]
    for idx, path in enumerate(text_paths):
        texts = read_text_files(path)
        for i in tqdm(range(len(texts)), desc="prepare_corpus"):
            text = texts[i]
            # tokenize
            tokens = tokenize(text)
            
            # clean
            tokens = remove_punctuation(tokens)
            tokens = remove_stopwords(tokens)
            
            # append
            corpus.append((idx, tokens))

    return corpus

In [5]:
corpus = prepare_corpus()

prepare_corpus: 100%|██████████| 31783/31783 [00:04<00:00, 6749.07it/s]
prepare_corpus: 100%|██████████| 32937/32937 [00:05<00:00, 6573.49it/s]


In [6]:
corpus[0]

(0,
 ['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive'])

In [33]:
len(corpus)

64720

## Embedding - Word2Vec

Google news embeddings this time!

In [7]:
# https://radimrehurek.com/gensim/downloader.html

import os
import gensim.downloader as dl
from gensim.models import KeyedVectors

pretrained_model_name = "word2vec-google-news-300"
model_dl_path = os.path.join(dl.BASE_DIR, pretrained_model_name, f"{pretrained_model_name}.gz")

if os.path.exists(model_dl_path):
    # load model
    corpus_embeddings = dl.load(pretrained_model_name)
else:
    # download
    print(f"Model will be downloaded at {model_dl_path}")
    corpus_embeddings = dl.load("word2vec-google-news-300")

### Vocabulary

In [8]:
# list of all the words word2vec has processed
vocabulary = corpus_embeddings.index_to_key
vocab_len = len(vocabulary)

In [9]:
vocab_len

3000000

In [10]:
vocabulary[:10]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

## Encode all tokens with indices from embedding

In [11]:
def encode_corpus_tokens_with_embed_idx(corpus):
    encoded_corpus = list()
    for i in tqdm(range(len(corpus)), desc="encode_tokens_with_embed_idx"):
        idxs = []
        label, tokens = corpus[i]

        for token in tokens:
            try:
                idx = corpus_embeddings.key_to_index[token]
            except:
                # if token isn't in the vocab
                idx = 0

            idxs.append(idx)
        
        
        encoded_corpus.append((label, idxs))



    return encoded_corpus

In [12]:
encoded_corpus = encode_corpus_tokens_with_embed_idx(corpus)

encode_tokens_with_embed_idx: 100%|██████████| 64720/64720 [00:00<00:00, 163687.16it/s]


In [13]:
encoded_corpus[0][1]

[4123, 54, 4154, 6717, 152, 1411, 447, 3554, 817]

## Padding

Left pad with 0

However we need a sequence length. 

In [14]:
# get max sequences length
sentences = [s[1] for s in corpus]
max_seq_len = max(len(s) for s in sentences)
max_seq_len

85

In [15]:
import numpy as np

def pad_tokens(encoded_corpus, seq_len= 200):
    padded = np.zeros(
        (len(encoded_corpus), seq_len),
        dtype=np.int32
    )

    for i in tqdm(range(len(corpus)), desc="pad"):
        tokens = encoded_corpus[i][1]

        # nltk's stopwords are a bit agrressive, ignore token lists with 0 size
        if len(tokens) == 0:
            continue

        padded[i, -len(tokens):] = np.array(tokens)

    return padded

In [16]:
padded_tokens = pad_tokens(encoded_corpus)

pad: 100%|██████████| 64720/64720 [00:00<00:00, 369833.21it/s]


## Input and Labels?

In [17]:
X = padded_tokens # input
y = np.array([c[0] for c in encoded_corpus])  #label

In [18]:
print(X.shape)
print(y.shape)

(64720, 200)
(64720,)


## Split data

In [19]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.8
)

In [20]:
# https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=42)

In [21]:
print(f"x_train = {x_train.shape} # y_train = {y_train.shape}")
print(f"x_val = {x_val.shape} # y_val = {y_val.shape}")
print(f"x_test = {x_test.shape} # y_test = {y_test.shape}")

x_train = (41420, 200) # y_train = (41420,)
x_val = (10356, 200) # y_val = (10356,)
x_test = (12944, 200) # y_test = (12944,)


## Convert to TensorData

In [22]:
import torch
from torch.utils.data import TensorDataset

training_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))

## DataLoader for Torch

Let torch handle the shuffling and etc yada stuff for MiniBatch

Why MiniBatch? Dataset is big and feeding everything at once won't generalize well. (Even if the machine can handle it!)

In [23]:
from torch.utils.data import DataLoader

# define a batch size
batch_size = 50

train_loader = DataLoader(training_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)

## CNN Model

This model is based on https://arxiv.org/abs/1408.5882

In [24]:
import torch.nn as nn
import torch.nn.functional as F

class SentimentClassifierCNN(nn.Module):
    def __init__(self, freeze_embeddings=True):
        super(SentimentClassifierCNN, self).__init__()
        
        # properties
        self.kernel_sizes = [3,4,5]
        self.num_filters = 100
        self.embedding_dim = 300
        self.output_size = 1
        self.vocab_size=vocab_len

        # convert embeddings to tensors!
        self.corpus_embedding = torch.from_numpy(corpus_embeddings.vectors)

        # neural network 

        # embedding layer
        # by default we're freezing embeddings
        self.embedding = nn.Embedding.from_pretrained(self.corpus_embedding, freeze=freeze_embeddings)

        # conv layers
        # 3 conv layers, since 3 kernel sizes
        self.conv1d = nn.ModuleList([
            nn.Conv2d(1, self.num_filters, (k, self.embedding_dim), padding=(k - 2, 0))

            for k in self.kernel_sizes
        ])

        # final linear layer
        self.linear = nn.Linear(len(self.kernel_sizes) * self.num_filters, self.output_size)

        # dropout and sigmoid
        # why sigmoid? Well, binary classification task!
        self.dropout = nn.Dropout(0.1)
        self.sigmoid = nn.Sigmoid()

    # helper 
    def conv_and_pool(self, x, conv):
        """
        Convolutional + max pooling layer
        """
        # squeeze last dim to get size: (batch_size, num_filters, conv_seq_length)
        x = F.relu(conv(x)).squeeze(3)
        
        # 1D pool over conv_seq_length
        # squeeze to get size: (batch_size, num_filters)
        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.unsqueeze(1) # reshape for conv (vector to matrix)

        conv_out =  [self.conv_and_pool(embeds, conv) for conv in self.conv1d]

        # concate convolution outputs as a "vector"
        out = torch.cat(conv_out, 1)
        # apply dropout
        out = self.dropout(out)

        # linear 
        out = self.linear(out)

        return self.sigmoid(out)

In [25]:
cnn = SentimentClassifierCNN()
print(cnn)

SentimentClassifierCNN(
  (embedding): Embedding(3000000, 300)
  (conv1d): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1), padding=(2, 0))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1), padding=(3, 0))
  )
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (sigmoid): Sigmoid()
)


## Hyperparams

In [26]:
learning_rate = 0.001

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

## Create Device
Using accelerate from huggingface
https://huggingface.co/docs/accelerate/index.html


In [27]:
from accelerate import Accelerator

accelerator = Accelerator()
device = accelerator.device

print(device)

cuda


## Add model, dataloader, optimizer and dataset to device

In [28]:
cnn, train_loader, val_loader, optimizer = accelerator.prepare(
    cnn, train_loader, val_loader, optimizer
)

## Train

In [29]:
epochs = 2

def train_cnn(model, train_loader, val_loader, epochs, optimizer, loss_fn, accl=accelerator):
    print_counter = 0 # print loss for each 100th count

    for e in tqdm(range(epochs), desc=f"train_cnn_for_{epochs}_epochs"):
        model.train()
        for input, label in train_loader:
            print_counter += 1
            # zero gradients
            model.zero_grad()

            # forward pass
            output = model(input)

            # backprop
            loss = loss_fn(output.squeeze(), label.float())
            accl.backward(loss)
            optimizer.step()

            # log loss 
            if print_counter % 100 == 0:
                validation_losses = []
                
                model.eval() # switch mode
                with torch.no_grad():
                    for val_input, val_label in val_loader:
                        val_output = model(val_input)
                        val_loss = loss_fn(val_output.squeeze(), val_label.float())
                        validation_losses.append(val_loss.item())
                    print(f"Epoch: {e + 1}/{epochs}\tStep: {print_counter}\tTrain Loss: {loss.item()}\tValidation Loss: {np.mean(validation_losses)}")

                model.train()

            


train_cnn(
    model=cnn,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=epochs,
    optimizer=optimizer,
    loss_fn=loss_fn,
)

train_cnn_for_2_epochs:   0%|          | 0/2 [00:00<?, ?it/s]Epoch: 1/2	Step: 100	Train Loss: 0.6616680026054382	Validation Loss: 0.6519948782829138
Epoch: 1/2	Step: 200	Train Loss: 0.656464695930481	Validation Loss: 0.6416473127901554
Epoch: 1/2	Step: 300	Train Loss: 0.6641656160354614	Validation Loss: 0.6364171649687566
Epoch: 1/2	Step: 400	Train Loss: 0.5873491764068604	Validation Loss: 0.6347373434557364
Epoch: 1/2	Step: 500	Train Loss: 0.6204197406768799	Validation Loss: 0.6329718263676534
Epoch: 1/2	Step: 600	Train Loss: 0.7505196332931519	Validation Loss: 0.6394698854822379
Epoch: 1/2	Step: 700	Train Loss: 0.6436971426010132	Validation Loss: 0.6307122068336377
train_cnn_for_2_epochs:  50%|█████     | 1/2 [00:17<00:17, 17.49s/it]Epoch: 1/2	Step: 800	Train Loss: 0.6809712648391724	Validation Loss: 0.6301377673561757
Epoch: 2/2	Step: 900	Train Loss: 0.554826557636261	Validation Loss: 0.6250122315608538
Epoch: 2/2	Step: 1000	Train Loss: 0.63131183385849	Validation Loss: 0.6252176211

## Inference

In [30]:
test_x_tensor = torch.from_numpy(x_test)
test_x_tensor = test_x_tensor.to(device)

def classify_sentiment(model, test_data):
    model.eval()
    with torch.no_grad():

        out = model(test_data)
        out = torch.round(out.squeeze())
    	
        return out.cpu().detach().numpy()

In [31]:
y_pred = classify_sentiment(cnn, test_x_tensor)
y_pred

array([1., 1., 1., ..., 0., 1., 1.], dtype=float32)

## Evaluation

In [32]:
from sklearn.metrics import classification_report

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.66      0.54      0.59      6328
           1       0.62      0.74      0.68      6616

    accuracy                           0.64     12944
   macro avg       0.64      0.64      0.63     12944
weighted avg       0.64      0.64      0.64     12944

