## Dataset

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Data Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

In [3]:
pos_texts = read_text_files(pos_path)
neg_texts = read_text_files(neg_path)

## Clean extra characters
new lines, tabs yada yada

In [4]:
def clean(texts):
    cleaned = []
    for text in texts:
        cleaned_text = text.strip().lower()
        cleaned.append(cleaned_text)

    return cleaned

In [5]:
pos_cleaned = clean(pos_texts)
neg_cleaned = clean(neg_texts)

## Embedding - Word2Vec

In [6]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from tqdm import tqdm


# remove stopwords, use the already cleaned lists
# then label

ignore = stopwords.words("english")

# label: 1 for pos, 0 for neg
def process_one_class(label, data):
    processed = list()
    for i in tqdm(range(len(data)), desc=f"preprocess_{label}"):
        text = data[i]
        raw = tokenize(text)
        cleaned = []
        
        for tok in raw:
            if tok not in ignore:
                cleaned.append(tok)
        

        processed.append((label, cleaned))
        
    return processed


def preprocess():
    data = [] # label, tokens
    
    pos_processed = process_one_class(1, pos_cleaned)
    neg_processed = process_one_class(0, neg_cleaned)
        
    data.extend(pos_processed)
    data.extend(neg_processed)
    
    return data

In [7]:
data = preprocess()

preprocess_1: 100%|██████████| 32937/32937 [00:04<00:00, 7000.89it/s]
preprocess_0: 100%|██████████| 31783/31783 [00:04<00:00, 7368.70it/s]


In [8]:
sentences = [s[1] for s in data]
labels = [s[0] for s in data]

### Word2Vec from Gensim
Train a word2vec model using the gensim implementation

In [9]:
import multiprocessing
from gensim.models import Word2Vec

In [10]:
# 300 dims for the embedding
EMBEDDING_DIMS = 300
# TODO: can try different dimensions later on

In [11]:
cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
model = Word2Vec(size=EMBEDDING_DIMS, workers=cores-1, max_vocab_size=100000)

# build vocab
model.build_vocab(sentences)

# train
%time model.train(sentences, total_examples=len(sentences), epochs=50)

CPU times: user 9min 26s, sys: 611 ms, total: 9min 27s
Wall time: 27.6 s


(34695673, 46480100)

### Vocabulary

In [12]:
# list of all the words word2vec has processed
vocabulary = model.wv.index2word
vocab_len = len(vocabulary)

## Word2Vec Embedding to Matrix
Word2Vec has vectors for each word in the dictionary it built

In [13]:
import numpy as np

# n_rows : number of words
# n_cols : embedding dims
def create_embedding_matrix(vocabulary, model, n_rows, n_cols):
    matrix = np.zeros((n_rows, n_cols))
    
    for idx, word in enumerate(vocabulary):
        embedding_vector = model.wv[word]
        matrix[idx] = embedding_vector
        
    return matrix
        

embed_matrix = create_embedding_matrix(vocabulary, model, vocab_len, EMBEDDING_DIMS)

In [14]:
embed_matrix


array([[-0.40927795, -0.43080142,  0.15101083, ...,  0.08582862,
        -0.22945747, -0.44214246],
       [-0.65913212, -0.68323773,  0.07078896, ...,  0.38974309,
        -0.03229431, -0.39852932],
       [-0.25950655, -0.08383306,  0.49412736, ..., -0.74423391,
        -1.20270228, -0.98595756],
       ...,
       [ 0.04885043,  0.11058627,  0.0613017 , ..., -0.52569211,
        -0.07075758, -0.72186452],
       [ 0.30743369,  0.35076639, -0.3995038 , ..., -0.16616501,
        -0.08174261,  0.31847289],
       [ 0.2969161 , -0.17900603,  0.2625545 , ...,  0.58485961,
         0.2599811 , -0.3364369 ]])

## Encode sentences based on their word index from w2v model

In [15]:
# find the max seq len first
max_seq_len = max([len(s) for s in sentences])
max_seq_len

135

In [16]:
word_map = dict()
for idx, word in enumerate(vocabulary):
    word_map[word] = idx

word_map["oov"] = -1

In [17]:
def encode(sentences, vocab=vocabulary, max_len=max_seq_len):
    encoded = np.zeros(shape=(len(sentences), max_len), dtype=np.int32)

    for i in tqdm(range(len(sentences)), desc="encode"):
        sentence = sentences[i]
        for j, token in enumerate(sentence):
            if token in vocab:
                encoded[i][j] = word_map[token]
            else:
                encoded[i][j] = word_map["oov"]

    return encoded


encoded_texts = encode(sentences)

encode: 100%|██████████| 64720/64720 [00:27<00:00, 2321.99it/s]


## Split data

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    encoded_texts, labels, random_state=42, train_size=0.7
)

print(x_train.shape)
print(x_test.shape)

(45304, 135)
(19416, 135)


## CNN Model

In [19]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [20]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [26]:
"""
    Allowed embedding dims = 300
    Since we have pre-trained embeddings for these values
    n_filters - number of filters for convolutions
    filter_size - size of the filters (3, 4, 7 etc.)
    hidden_dims - number of hidden dimensions
    """

class PolarityCLF(nn.Module):
    def __init__(self, embedding_dims, n_filters, filter_size, hidden_dims, vocab_size, embeddings, n_classes):
        super(PolarityCLF, self).__init__()
        
        self.embedding_dims = embedding_dims
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.hidden_dims = hidden_dims 
        self.vocab_size = vocab_size
        self.n_classes = n_classes

        # convert numpy style embeddings to tensors
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(embeddings)
        )
        
        # 2 conv1D layers
        self.conv1 = nn.Conv1d(in_channels=self.n_filters, 
                               kernel_size=self.filter_size,
                               out_channels=1)
        
        self.conv2 = nn.Conv1d(in_channels=self.n_filters, 
                               kernel_size=self.filter_size,
                               out_channels=1)

        # followed by a linear layer
        self.linear = nn.Linear(in_features=self.embedding_dims,
                                out_features=self.n_classes)
        
    def forward(self, x):
        # convert x to tensor first
        x = torch.LongTensor(x)
        x = self.embeddings(x)
        print(x)

clf = PolarityCLF(embedding_dims=135,
                  n_filters=50,
                  filter_size=3,
                  hidden_dims=50,
                  vocab_size=vocab_len,
                  embeddings=embed_matrix,
                  n_classes=2)

# clf = clf.to(device)
# print(clf)
clf(x_train)

IndexError: index out of range in self