## Dataset

Polarity dataset of movie reviews (Pang & Lee, ACL 2004).

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Data Directories

In [1]:
# Directories holding the positive ("pos") and negative ("neg") review files,
# given relative to the notebook's working directory.
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    """Read every file in ``category_path`` and return all of their lines.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). Entries that are not
    regular files -- e.g. the ``.ipynb_checkpoints`` directory Jupyter may
    create -- are skipped instead of crashing ``open``.

    Args:
        category_path: Directory containing the text files of one category.

    Returns:
        A flat list with the lines of all files, line endings included.
    """
    texts = []

    for fname in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, fname)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as f:
            texts.extend(f.readlines())

    return texts

In [3]:
# Collect the lines of every file in the positive and negative directories.
pos_texts = read_text_files(pos_path)
neg_texts = read_text_files(neg_path)

## Clean extra characters
Strip leading and trailing whitespace (newlines, tabs, spaces) from every line and lowercase it.

In [4]:
def clean(texts):
    """Return a new list with each text stripped of surrounding whitespace and lowercased.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of the normalised strings, in the same order as the input.
    """
    return [raw.strip().lower() for raw in texts]

In [5]:
# Normalised (stripped, lowercased) versions of the raw lines.
pos_cleaned = clean(pos_texts)
neg_cleaned = clean(neg_texts)

## Embedding - Word2Vec

In [6]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from tqdm import tqdm


# Remove stopwords from the already-cleaned sentence lists,
# then attach a class label to each sentence.

# English stopwords from NLTK; tokens found in this collection are dropped.
ignore = stopwords.words("english")

# label: 1 for pos, 0 for neg
def process_one_class(label, data):
    """Tokenize each text and drop stopwords, tagging every result with ``label``.

    Args:
        label: Class label attached to every sentence (1 for pos, 0 for neg).
        data: Sequence of cleaned text strings.

    Returns:
        A list of ``(label, tokens)`` tuples, one per input text, where
        ``tokens`` is the text's token list with stopwords removed.
    """
    # A membership test against a list is linear in its length; building a set
    # once per call makes every per-token lookup constant-time.
    stop_set = set(ignore)

    processed = []
    for text in tqdm(data, desc=f"preprocess_{label}"):
        kept = [tok for tok in tokenize(text) if tok not in stop_set]
        processed.append((label, kept))

    return processed


def preprocess():
    """Build the labelled dataset from the cleaned positive and negative sentences.

    Returns:
        A list of ``(label, tokens)`` tuples: every positive entry (label 1)
        followed by every negative entry (label 0).
    """
    labelled = list(process_one_class(1, pos_cleaned))
    labelled += process_one_class(0, neg_cleaned)
    return labelled

In [7]:
# Tokenize, remove stopwords and label both classes: a list of (label, tokens) tuples.
data = preprocess()

preprocess_1: 100%|██████████| 32937/32937 [00:04<00:00, 6811.08it/s]
preprocess_0: 100%|██████████| 31783/31783 [00:04<00:00, 7117.57it/s]


In [8]:
# Split the (label, tokens) pairs into two parallel lists.
sentences = [tokens for _, tokens in data]
labels = [label for label, _ in data]

### Word2Vec from Gensim

In [9]:
import gensim
import multiprocessing
import random
from gensim.models import Word2Vec

In [10]:
# Size of each word vector: 300 dims for the embedding.
EMBEDDING_DIMS = 300

In [11]:
cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
model = Word2Vec(size=EMBEDDING_DIMS, workers=cores-1, max_vocab_size=100000)

# build vocab
model.build_vocab(sentences)

# train
%time model.train(sentences, total_examples=len(sentences), epochs=50)

CPU times: user 9min 25s, sys: 508 ms, total: 9min 26s
Wall time: 27.2 s


(34695514, 46480100)

### Vocabulary

In [12]:
# Vocabulary kept by the trained model, and the number of words in it.
vocabulary = model.wv.vocab
vocab_len = len(vocabulary)

## Word2Vec Embedding to Matrix
The CNN needs the word vectors as a single matrix (one row per vocabulary word) to use as its embedding weights.

In [13]:
import numpy as np

def create_embedding_matrix(vocabulary, model, n_rows, n_cols):
    """Pack the word vectors into a dense ``(n_rows, n_cols)`` matrix.

    Row ``i`` holds the vector of the ``i``-th key of ``vocabulary`` (in
    iteration order); any rows past the last word stay zero.

    Args:
        vocabulary: Mapping whose keys are the vocabulary words.
        model: Model exposing word vectors through ``model.wv[word]``.
        n_rows: Number of rows (number of words).
        n_cols: Number of columns (embedding dims).

    Returns:
        A NumPy array of shape ``(n_rows, n_cols)``.
    """
    weights = np.zeros((n_rows, n_cols))
    vectors = model.wv

    for row, token in enumerate(vocabulary):
        weights[row] = vectors[token]

    return weights
        

# word_map reserves index 0 for OOV and numbers the vocabulary from 1, so
# prepend an all-zero row: row i + 1 then holds the vector of the i-th word and
# every encoded index (0 .. vocab_len) is a valid row of the matrix.
embed_matrix = np.vstack([
    np.zeros((1, EMBEDDING_DIMS)),
    create_embedding_matrix(vocabulary, model, vocab_len, EMBEDDING_DIMS),
])

## Create input matrix for CNN


### Create word -> idx mapping for vocab

In [16]:
# word -> integer index lookup, populated further below.
word_map = {}

# Vocabulary words in a fixed order; a word's position determines its index.
words = list(vocabulary)
len(words)

14741

In [17]:
# Number the vocabulary words from 1 upwards, in list order.
word_map.update((word, position) for position, word in enumerate(words, start=1))

# Index 0 is reserved for out-of-vocabulary tokens.
# NOTE(review): a literal token "oov" in the vocabulary would have its index
# overwritten by this sentinel entry.
word_map["oov"] = 0

### Encode texts

In [18]:
# Token count of the longest sentence.
max_seq_len = max(map(len, sentences))
max_seq_len

135

In [30]:
def encode(max_len, sentences):
    """Encode tokenized sentences as a zero-padded matrix of word indices.

    Each token is replaced by its index in the global ``word_map``; tokens that
    are not in the vocabulary get the ``"oov"`` index. Positions past the end
    of a sentence keep their initial value 0.

    Args:
        max_len: Number of columns of the output; longer sentences are truncated.
        sentences: Sequence of token lists.

    Returns:
        An ``int32`` array of shape ``(len(sentences), max_len)``.
    """
    encoded = np.zeros(shape=(len(sentences), max_len), dtype=np.int32)
    oov_index = word_map["oov"]

    for i, sentence in enumerate(tqdm(sentences, desc="encode")):
        # Truncate so a sentence longer than max_len cannot index out of bounds.
        for j, token in enumerate(sentence[:max_len]):
            # A dict lookup is constant-time; testing membership in the `words`
            # list would scan the whole vocabulary for every token.
            encoded[i, j] = word_map.get(token, oov_index)

    return encoded

In [32]:
# Integer-encode every sentence, using the longest sentence's length as the width.
encoded_texts = encode(max_seq_len, sentences)

encode: 100%|██████████| 64720/64720 [00:29<00:00, 2176.96it/s]


## CNN Model

In [33]:
import torch

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [35]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
class PolarityCLF(nn.Module):
    """CNN polarity classifier (work in progress: layers only, no ``forward`` yet).

    Args:
        embedding_dims: Size of each embedding vector.
        n_filters: Passed as ``in_channels`` to both convolutions.
        filter_size: Kernel size of both convolutions.
        hidden_dims: Stored on the instance; not used by any layer yet.
        vocab_size: Number of rows of the embedding table.
    """

    def __init__(self, embedding_dims, n_filters, filter_size, hidden_dims, vocab_size):
        super(PolarityCLF, self).__init__()

        self.embedding_dims = embedding_dims
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.hidden_dims = hidden_dims
        self.vocab_size = vocab_size

        # Use the constructor argument `embedding_dims`; the misspelled name
        # `embedding_dim` is undefined and raised NameError on construction.
        self.embeddings = nn.Embedding(vocab_size, embedding_dims)

        # NOTE(review): in_channels=n_filters / out_channels=1 looks swapped for
        # a convolution over embedded text (in_channels would normally be the
        # embedding size) -- confirm once forward() is written.
        self.conv1 = nn.Conv1d(in_channels=self.n_filters,
                               kernel_size=self.filter_size,
                               out_channels=1)

        self.conv2 = nn.Conv1d(in_channels=self.n_filters,
                               kernel_size=self.filter_size,
                               out_channels=1)


In [40]:
# Inspect the object through which the model's word vectors are looked up.
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f3bec00c820>

<gensim.models.word2vec.Word2Vec at 0x7f3bec00c790>