## Dataset

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Data Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

In [3]:
pos_texts = read_text_files(pos_path)
neg_texts = read_text_files(neg_path)

## Clean extra characters
new lines, tabs yada yada

In [4]:
def clean(texts):
    cleaned = []
    for text in texts:
        cleaned_text = text.strip().lower()
        cleaned.append(cleaned_text)

    return cleaned

In [5]:
pos_cleaned = clean(pos_texts)
neg_cleaned = clean(neg_texts)

## Embedding - Word2Vec

In [6]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from tqdm import tqdm


# remove stopwords, use the already cleaned lists
# then label

ignore = stopwords.words("english")

# label: 1 for pos, 0 for neg
def process_one_class(label, data):
    processed = list()
    for i in tqdm(range(len(data)), desc=f"preprocess_{label}"):
        text = data[i]
        raw = tokenize(text)
        cleaned = []
        
        for tok in raw:
            if tok not in ignore:
                cleaned.append(tok)
        

        processed.append((label, cleaned))
        
    return processed


def preprocess():
    data = [] # label, tokens
    
    pos_processed = process_one_class(1, pos_cleaned)
    neg_processed = process_one_class(0, neg_cleaned)
        
    data.extend(pos_processed)
    data.extend(neg_processed)
    
    return data

In [7]:
data = preprocess()

preprocess_1: 100%|██████████| 32937/32937 [00:04<00:00, 7036.55it/s]
preprocess_0: 100%|██████████| 31783/31783 [00:04<00:00, 7376.42it/s]


In [8]:
sentences = [s[1] for s in data]
labels = [s[0] for s in data]

### Word2Vec from Gensim
Train a word2vec model using the gensim implementation

In [9]:
import multiprocessing
from gensim.models import Word2Vec

In [10]:
# 300 dims for the embedding
EMBEDDING_DIMS = 50
# TODO: can try different dimensions later on

In [11]:
cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
model = Word2Vec(size=EMBEDDING_DIMS, workers=cores-1, max_vocab_size=100000, min_count=1)

# build vocab
model.build_vocab(sentences)

# train
%time model.train(sentences, total_examples=len(sentences), epochs=50)

CPU times: user 6min 55s, sys: 627 ms, total: 6min 56s
Wall time: 24.6 s


(37399031, 46480100)

### Vocabulary

In [12]:
# list of all the words word2vec has processed
vocabulary = model.wv.index2word
vocab_len = len(vocabulary)

## Word2Vec Embedding to Matrix
Word2Vec has vectors for each word in the dictionary it built

In [13]:
import numpy as np

embed_matrix = model.wv.vectors

In [14]:
embed_matrix

array([[ 0.01894527, -0.1141565 ,  0.6068214 , ..., -0.13673158,
        -0.2572196 , -0.38790774],
       [-1.1172026 ,  0.4654107 , -0.2715074 , ..., -0.30207562,
        -1.3615185 , -1.0738413 ],
       [-0.78043205, -1.2733043 , -0.70380944, ..., -0.97919273,
        -0.50045604,  0.30972093],
       ...,
       [ 0.10908616,  0.08208456, -0.03539661, ..., -0.01122672,
        -0.19403192, -0.15186326],
       [ 0.00722521,  0.09293254,  0.00449636, ...,  0.33342472,
        -0.25800857,  0.2712339 ],
       [ 0.12569751,  0.11907349, -0.04390107, ..., -0.0576267 ,
        -0.20243907, -0.04907944]], dtype=float32)

## Encode sentences based on their word index from w2v model

In [15]:
# find the max seq len first
max_seq_len = max([len(s) for s in sentences])
max_seq_len

135

In [16]:
word_map = dict()
for idx, word in enumerate(vocabulary):
    word_map[word] = idx


In [17]:
def encode(sentences, vocab=vocabulary, max_len=max_seq_len):
    encoded = np.zeros(shape=(len(sentences), max_len), dtype=np.int32)

    for i in tqdm(range(len(sentences)), desc="encode"):
        sentence = sentences[i]
        for j, token in enumerate(sentence):
            encoded[i][j] = word_map[token]

    return encoded


encoded_texts = encode(sentences)

encode: 100%|██████████| 64720/64720 [00:00<00:00, 199154.80it/s]


## Split data

In [18]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    encoded_texts, labels, random_state=42, train_size=0.7
)

print(x_train.shape)
print(x_test.shape)

(45304, 135)
(19416, 135)


## CNN Model

In [19]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [20]:
x_train = torch.from_numpy(x_train).to(device)
y_train = torch.LongTensor(y_train).to(device)
x_test = torch.from_numpy(x_test).to(device)
y_test = torch.LongTensor(y_test).to(device)

In [21]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [25]:
"""
    Allowed embedding dims = 50
    Since we have pre-trained embeddings for these values
    n_filters - number of filters for convolutions
    filter_size - size of the filters (3, 4, 7 etc.)
    hidden_dims - number of hidden dimensions
"""

class PolarityCLF(nn.Module):
    def __init__(self, embedding_dims, n_filters, filter_size, hidden_dims, vocab_size, embeddings, n_classes, seq_len):
        super(PolarityCLF, self).__init__()
        
        self.embedding_dims = embedding_dims
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.hidden_dims = hidden_dims 
        self.vocab_size = vocab_size
        self.n_classes = n_classes

        # convert numpy style embeddings to tensors
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(embeddings)
        )
        
        # 2 conv1D layers
        self.conv1 = nn.Conv1d(in_channels=seq_len, 
                               kernel_size=self.filter_size,
                               out_channels=self.embedding_dims)
        
        self.conv2 = nn.Conv1d(in_channels=seq_len, 
                               kernel_size=self.filter_size,
                               out_channels=self.embedding_dims)

        # followed by a linear layer
        self.linear = nn.Linear(in_features=self.n_filters * self.filter_size,
                                out_features=2)
        
    def forward(self, input):
        x = self.embeddings(input)
        #print(x)
        
        x1 = self.conv1(x)
        x1 = F.relu(x)
        x1 = F.max_pool1d(x, x1.size(2))
        
        x2 = self.conv2(x)
        x2 = F.relu(x)
        x2 = F.max_pool1d(x, x2.size(2))
        
        
        out = torch.cat((x1, x2))
        out = self.linear(out)
        out = F.softmax(out)
        
        return out

clf = PolarityCLF(embedding_dims=50,
                  n_filters=50,
                  filter_size=3,
                  hidden_dims=50,
                  vocab_size=vocab_len,
                  embeddings=embed_matrix,
                  n_classes=2,
                  seq_len=max_seq_len)

clf = clf.to(device)
clf(x_train)

RuntimeError: mat1 dim 1 must match mat2 dim 0

In [23]:
clf

PolarityCLF(
  (embeddings): Embedding(46319, 50)
  (conv1): Conv1d(135, 50, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(135, 50, kernel_size=(3,), stride=(1,))
  (linear): Linear(in_features=150, out_features=2, bias=True)
)

In [24]:
# train model
epochs = 5
lr = 0.01

loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.Adam(clf.parameters(), lr=lr)
    
for e in tqdm(range(epochs), desc="train"):
    optim.zero_grad()
    
    pred = clf(x_train)
    loss = loss_fn(pred, y_train)
    loss.backward()
    optimizer.step()

train:   0%|          | 0/5 [00:00<?, ?it/s]

tensor([[[ 2.4784, -4.0802,  1.5148,  ...,  1.3475,  1.8960, -5.1509],
         [-0.4303,  1.5252,  0.3382,  ...,  1.0678, -0.5476, -0.6631],
         [-0.0090,  0.8426,  0.0135,  ...,  0.1287, -0.6149, -0.0516],
         ...,
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879]],

        [[-0.3459,  0.1862, -0.1965,  ..., -4.2330, -1.9948,  0.9856],
         [ 1.0964, -3.6491, -0.5099,  ...,  0.2980,  0.8740, -0.7166],
         [-0.0289,  0.8443,  0.2020,  ..., -3.2222,  0.3259, -1.1094],
         ...,
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879]],

        [[ 0.0071,  2.0632, -1.0950,  ...,  3.3501, -0.2430,  1.0507],
         [ 0.0581, -0.6721, -0.4441,  ...,  0




AttributeError: 'NoneType' object has no attribute 'log_softmax'