## Dataset

Movie-review polarity dataset (Pang & Lee, ACL 2004).

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Data Directories

In [1]:
# Directories with the positive / negative review text files.
# Both paths are relative to the notebook's working directory.
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    """Read every regular file in ``category_path`` and return all their lines.

    Args:
        category_path: directory containing the text files of one category.

    Returns:
        A flat list of lines (trailing newlines kept). Files are visited in
        sorted filename order and lines keep their in-file order.
    """
    texts = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the corpus
    # order (and therefore the seeded train/test split) reproducible.
    for fname in sorted(os.listdir(category_path)):
        fpath = os.path.join(category_path, fname)

        # Skip sub-directories (e.g. .ipynb_checkpoints) instead of crashing in open().
        if not os.path.isfile(fpath):
            continue

        # Explicit encoding so the result does not depend on the platform locale;
        # undecodable bytes are replaced rather than aborting the whole load.
        with open(fpath, "r", encoding="utf-8", errors="replace") as f:
            texts.extend(f.readlines())

    return texts

In [3]:
# Each list holds one entry per line of every file in the category directory.
pos_texts = read_text_files(pos_path)
neg_texts = read_text_files(neg_path)

## Clean extra characters
Strip leading/trailing whitespace (newlines, tabs, spaces) from every line and lowercase it.

In [4]:
def clean(texts):
    """Return a new list with each text stripped of surrounding whitespace and lowercased.

    Args:
        texts: iterable of strings.

    Returns:
        List of normalised strings, in the same order as ``texts``.
    """
    return [raw.strip().lower() for raw in texts]

In [5]:
# Normalised copies of the raw lines; the raw lists are left untouched.
pos_cleaned = clean(pos_texts)
neg_cleaned = clean(neg_texts)

## Embedding - Word2Vec

In [6]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from tqdm import tqdm


# remove stopwords, use the already cleaned lists
# then label

# NLTK's English stopword list; tokens found in it are dropped by process_one_class.
ignore = stopwords.words("english")

# label: 1 for pos, 0 for neg
def process_one_class(label, data):
    """Tokenise every text in ``data``, drop stopwords, and attach ``label``.

    Args:
        label: class label stored with every text (1 for pos, 0 for neg).
        data: sequence of already-cleaned strings.

    Returns:
        List of ``(label, tokens)`` tuples, one per input text and in input
        order. ``tokens`` may be empty if a text consists only of stopwords.
    """
    # ``ignore`` is a list; a set makes the per-token membership test O(1).
    stop_words = set(ignore)

    processed = []
    for text in tqdm(data, desc=f"preprocess_{label}"):
        tokens = [tok for tok in tokenize(text) if tok not in stop_words]
        processed.append((label, tokens))

    return processed


def preprocess():
    """Build the labelled token lists for both classes.

    Reads the module-level ``pos_cleaned`` and ``neg_cleaned`` lists.

    Returns:
        List of ``(label, tokens)`` tuples: all positive texts (label 1)
        followed by all negative texts (label 0).
    """
    positives = process_one_class(1, pos_cleaned)
    negatives = process_one_class(0, neg_cleaned)

    return [*positives, *negatives]

In [7]:
# (label, tokens) pairs for the whole corpus: positives first, then negatives.
data = preprocess()

preprocess_1: 100%|██████████| 32937/32937 [00:04<00:00, 7031.39it/s]
preprocess_0: 100%|██████████| 31783/31783 [00:04<00:00, 7400.40it/s]


In [8]:
# Split the (label, tokens) pairs into two parallel lists:
# sentences[i] and labels[i] describe the same text.
sentences = [s[1] for s in data]
labels = [s[0] for s in data]

### Word2Vec from Gensim
Train a word2vec model using the gensim implementation

In [9]:
import multiprocessing
from gensim.models import Word2Vec

In [10]:
# Width of every Word2Vec vector, and of each row of the embedding matrix built from them.
EMBEDDING_DIMS = 300
# TODO: can try different dimensions later on

In [11]:
cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
model = Word2Vec(size=EMBEDDING_DIMS, workers=cores-1, max_vocab_size=100000)

# build vocab
model.build_vocab(sentences)

# train
%time model.train(sentences, total_examples=len(sentences), epochs=50)

CPU times: user 9min 30s, sys: 514 ms, total: 9min 31s
Wall time: 27.4 s


(34694418, 46480100)

### Vocabulary

In [32]:
# list of all the words word2vec has processed; position i in this list is the
# row index used for that word in the embedding matrix and in word_map.
# NOTE(review): `index2word` is the gensim 3.x attribute name (gensim 4 renamed
# it `index_to_key`) — confirm the pinned gensim version.
vocabulary = model.wv.index2word
vocab_len = len(vocabulary)

## Word2Vec Embedding to Matrix
The trained Word2Vec model holds one vector per word in its vocabulary; stack them into a matrix whose row `i` is the vector of `vocabulary[i]`.

In [38]:
import numpy as np

# n_rows : number of words
# n_cols : embedding dims
def create_embedding_matrix(vocabulary, model, n_rows, n_cols):
    """Stack the model's word vectors into a dense ``(n_rows, n_cols)`` matrix.

    Args:
        vocabulary: sequence of words; the word at position ``i`` fills row ``i``.
        model: object exposing ``model.wv[word]`` -> vector of length ``n_cols``.
        n_rows: number of rows to allocate (number of words).
        n_cols: number of columns to allocate (embedding dims).

    Returns:
        ``numpy.ndarray`` of zeros with the first ``len(vocabulary)`` rows
        overwritten by the corresponding word vectors.
    """
    word_vectors = model.wv
    matrix = np.zeros((n_rows, n_cols))

    for row, token in enumerate(vocabulary):
        matrix[row, :] = word_vectors[token]

    return matrix
        

# One row per vocabulary word, EMBEDDING_DIMS columns; row i is the vector of vocabulary[i].
embed_matrix = create_embedding_matrix(vocabulary, model, vocab_len, EMBEDDING_DIMS)

In [39]:
# Quick visual check of the matrix (numpy elides the middle rows/columns).
embed_matrix


array([[-0.09179415,  0.52475208,  0.24306419, ..., -0.28713584,
        -0.39636341,  0.41431141],
       [-0.47100335,  0.81594521,  0.49166444, ..., -0.27325055,
         0.6266045 ,  0.88453633],
       [-0.07136358,  0.39773157, -0.26612186, ...,  0.36736411,
        -0.70003915,  0.15846795],
       ...,
       [ 0.44517314,  0.90030897,  0.64907324, ..., -0.28547865,
         0.05625157,  0.55440789],
       [-0.09424435, -0.13661237, -0.0543666 , ..., -0.04453824,
         0.18002456, -0.13313498],
       [-0.08679727,  0.46403295, -0.07928399, ...,  0.01737124,
        -0.14444058,  0.0568588 ]])

## Encode sentences based on their word index from w2v model

In [50]:
# Token count of the longest sentence; used as the width of every encoded row.
max_seq_len = max(len(tokens) for tokens in sentences)
max_seq_len

135

In [53]:
# token -> row index in the embedding matrix (position in `vocabulary`)
word_map = {word: idx for idx, word in enumerate(vocabulary)}

# Sentinel index for out-of-vocabulary tokens.
# NOTE(review): -1 is not a valid lookup index for torch.nn.Embedding, and the
# zero padding written by encode() is indistinguishable from vocabulary index 0.
# Reserving dedicated PAD/OOV rows needs a coordinated change to the embedding
# matrix, this map and encode().
word_map["oov"] = -1

In [54]:
def encode(sentences, vocab=vocabulary, max_len=max_seq_len):
    """Turn tokenised sentences into a fixed-width matrix of word indices.

    Args:
        sentences: sequence of token lists.
        vocab: collection of known tokens; anything else is encoded as
            ``word_map["oov"]``.
        max_len: number of columns per row. Longer sentences are truncated to
            ``max_len`` tokens; shorter ones keep trailing zeros.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sentences), max_len)``, dtype int32.

    NOTE(review): rows are padded with 0, which is also the index of the first
    vocabulary word, and OOV tokens are encoded as -1 — confirm how the model
    is expected to treat both values.
    """
    encoded = np.zeros(shape=(len(sentences), max_len), dtype=np.int32)

    # `vocab` defaults to a list; a set turns the per-token membership test
    # from a linear scan into an O(1) lookup.
    known = set(vocab)
    oov_index = word_map["oov"]

    for i, sentence in enumerate(tqdm(sentences, desc="encode")):
        # Slice so a sentence longer than max_len is truncated rather than
        # raising IndexError on the column write.
        for j, token in enumerate(sentence[:max_len]):
            encoded[i, j] = word_map[token] if token in known else oov_index

    return encoded


# Integer matrix with one row per sentence and max_seq_len columns.
encoded_texts = encode(sentences)

encode: 100%|██████████| 64720/64720 [00:26<00:00, 2400.48it/s]


## Split data

In [57]:
from sklearn.model_selection import train_test_split

# 70 % of the rows go to training, the rest to test; the fixed random_state
# makes the shuffle repeatable for a given input order.
x_train, x_test, y_train, y_test = train_test_split(
    encoded_texts, labels, random_state=42, train_size=0.7
)

# Sanity check: both splits keep the full sequence width.
print(x_train.shape)
print(x_test.shape)

(45304, 135)
(19416, 135)


## CNN Model

In [58]:
import torch

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [59]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [22]:
"""
    embedding_dims - width of the word vectors; must be 300, the size of the
                     Word2Vec embeddings trained earlier in this notebook
    n_filters - number of convolution filters
    filter_size - width of each filter in tokens (3, 4, 7 etc.)
    hidden_dims - number of hidden dimensions
    """

class PolarityCLF(nn.Module):
    """CNN sentence-polarity classifier on top of pre-trained word embeddings.

    Pipeline: embedding lookup -> two stacked 1-D convolutions (ReLU) ->
    max-over-time pooling -> hidden linear layer (ReLU) -> one output logit.

    Args:
        embedding_dims: width of each word vector; must equal the number of
            columns of ``embeddings``.
        n_filters: number of convolution filters (output channels of each conv).
        filter_size: kernel width of the convolutions, in tokens.
        hidden_dims: number of units in the hidden linear layer.
        vocab_size: number of words in the vocabulary (stored for reference).
        embeddings: 2-D array-like (numpy array, nested list or tensor) with
            one row per word and ``embedding_dims`` columns.

    Raises:
        ValueError: if ``embeddings`` is not 2-D or its width differs from
            ``embedding_dims``.
    """

    def __init__(self, embedding_dims, n_filters, filter_size, hidden_dims, vocab_size, embeddings):
        super(PolarityCLF, self).__init__()

        self.embedding_dims = embedding_dims
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.hidden_dims = hidden_dims
        self.vocab_size = vocab_size

        # convert numpy style embeddings to a float32 tensor
        weights = torch.as_tensor(embeddings, dtype=torch.float32)
        if weights.dim() != 2 or weights.shape[1] != embedding_dims:
            raise ValueError(
                f"embeddings must be 2-D with {embedding_dims} columns, "
                f"got shape {tuple(weights.shape)}"
            )

        # from_pretrained freezes the weights by default (freeze=True).
        self.embeddings = nn.Embedding.from_pretrained(weights)

        # Conv1d consumes (batch, channels, length): the channels of the first
        # conv are the embedding dimensions, and it emits n_filters feature maps.
        self.conv1 = nn.Conv1d(in_channels=self.embedding_dims,
                               out_channels=self.n_filters,
                               kernel_size=self.filter_size)

        # The second conv is stacked on the first, so it reads n_filters channels.
        self.conv2 = nn.Conv1d(in_channels=self.n_filters,
                               out_channels=self.n_filters,
                               kernel_size=self.filter_size)

        self.fc1 = nn.Linear(self.n_filters, self.hidden_dims)
        self.fc2 = nn.Linear(self.hidden_dims, 1)

    def forward(self, x):
        """Score a batch of encoded sentences.

        Args:
            x: integer tensor of shape ``(batch, seq_len)`` holding row indices
                into the embedding matrix. ``seq_len`` must be at least
                ``2 * filter_size - 1`` so both convolutions have input left.

        Returns:
            Float tensor of shape ``(batch,)`` with one raw (pre-sigmoid) logit
            per sentence; pair it with ``nn.BCEWithLogitsLoss``.
        """
        # Cast to int64, the index dtype nn.Embedding accepts on every torch
        # version (the encoded matrix is int32).
        embedded = self.embeddings(x.long())        # (batch, seq_len, emb)
        features = embedded.permute(0, 2, 1)        # (batch, emb, seq_len)

        features = F.relu(self.conv1(features))     # (batch, n_filters, L1)
        features = F.relu(self.conv2(features))     # (batch, n_filters, L2)

        # Max-over-time pooling keeps the strongest response of each filter.
        pooled = torch.max(features, dim=2)[0]      # (batch, n_filters)

        hidden = F.relu(self.fc1(pooled))           # (batch, hidden_dims)
        return self.fc2(hidden).squeeze(1)          # (batch,)


Parameter containing:
tensor([[-0.0918,  0.5248,  0.2431,  ..., -0.2871, -0.3964,  0.4143],
        [-0.4710,  0.8159,  0.4917,  ..., -0.2733,  0.6266,  0.8845],
        [-0.0714,  0.3977, -0.2661,  ...,  0.3674, -0.7000,  0.1585],
        ...,
        [ 0.4452,  0.9003,  0.6491,  ..., -0.2855,  0.0563,  0.5544],
        [-0.0942, -0.1366, -0.0544,  ..., -0.0445,  0.1800, -0.1331],
        [-0.0868,  0.4640, -0.0793,  ...,  0.0174, -0.1444,  0.0569]])