## Corpus

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Text file Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

## Preprocess
- tokenize
- Clean punctuation, stopwords
- Assign labels

In [18]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from string import punctuation

ignore = stopwords.words("english")

def remove_stopwords(tokens):
    x = [token for token in tokens if token not in ignore]
    return x

def remove_punctuation(tokens):
    x = [token for token in tokens if token not in punctuation]
    return x

def clean(tokens):
    x = remove_stopwords(text)
    x = remove_punctuation(x)

    return x

In [23]:
from tqdm import tqdm

# label -> 1 for pos and 0 for neg
def prepare_corpus():
    # dataset is a list of tuples
    # (label, tokens)
    corpus = list()

    # idx -> label
    text_paths = [neg_path, pos_path]
    for idx, path in enumerate(text_paths):
        texts = read_text_files(path)
        for i in tqdm(range(len(texts)), desc="prepare_corpus"):
            text = texts[i]
            # tokenize
            tokens = tokenize(text)
            
            # clean
            tokens = remove_punctuation(tokens)
            tokens = remove_stopwords(tokens)
            
            # append
            corpus.append((idx, tokens))

    return corpus

In [24]:
corpus = prepare_corpus()

prepare_corpus: 100%|██████████| 31783/31783 [00:04<00:00, 6665.31it/s]
prepare_corpus: 100%|██████████| 32937/32937 [00:05<00:00, 6459.36it/s]


## Embedding - Word2Vec

Can either try pretraiend embeddings from google news dataset or train a new embedding from the existing corpus. Let's try it on the existing corpus.

In [25]:
import multiprocessing
from gensim.models import Word2Vec

sentences = [s[1] for s in corpus]

# 150 dims for the embedding
# embedding dim will define the number of outputs per convolution!
EMBEDDING_DIMS = 150


cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
corpus_embedding = Word2Vec(
    vector_size=EMBEDDING_DIMS, 
    workers=cores-1, max_vocab_size=100000,
    min_count=0)

# build vocab
corpus_embedding.build_vocab(sentences)

# train for 50 epochs, can always change later!
%time corpus_embedding.train(sentences, total_examples=len(sentences), epochs=50)

Wall time: 15 s


(35077350, 37305350)

### Vocabulary

In [26]:
# list of all the words word2vec has processed
vocabulary = corpus_embedding.wv.index_to_key
vocab_len = len(vocabulary)

In [27]:
vocab_len

46291

## Word2Vec Embedding to Matrix
Word2Vec has vectors for each word in the dictionary it built

In [18]:
import numpy as np

embed_matrix = model.wv.vectors

In [19]:
embed_matrix

array([[ 0.86317396, -0.883756  ,  0.23260137, ..., -0.51179934,
        -0.3775267 , -0.5007696 ],
       [ 1.4009119 , -0.68947804,  0.30188733, ..., -0.42904943,
        -0.2594347 , -0.72123295],
       [ 0.7719407 , -0.3393777 ,  1.2076789 , ..., -0.2713661 ,
         0.44652554, -0.40827197],
       ...,
       [-0.06382576, -0.23063423,  0.6559784 , ..., -0.03027365,
         0.2418905 ,  0.06937668],
       [-0.03812286, -0.15068635, -0.05199979, ..., -0.49092948,
         0.36216962, -0.02147873],
       [-0.09177845,  0.29140413,  0.38027838, ..., -0.44980127,
        -0.36476412,  0.35823593]], dtype=float32)

## Encode sentences based on their word index from w2v model

In [20]:
# find the max seq len first
max_seq_len = max([len(s) for s in sentences])
max_seq_len

135

In [21]:
word_map = dict()
for idx, word in enumerate(vocabulary):
    word_map[word] = idx

In [22]:
def encode(sentences, vocab=vocabulary, max_len=max_seq_len):
    encoded = np.zeros(shape=(len(sentences), max_len), dtype=np.int32)

    for i in tqdm(range(len(sentences)), desc="encode"):
        sentence = sentences[i]
        for j, token in enumerate(sentence):
            encoded[i][j] = word_map[token]

    return encoded


encoded_texts = encode(sentences)

encode:   0%|          | 0/64720 [00:00<?, ?it/s]


KeyError: 'arthouse'

## Split data

In [18]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    encoded_texts, labels, random_state=42, train_size=0.7
)

print(x_train.shape)
print(x_test.shape)

(45304, 135)
(19416, 135)


## CNN Model

In [19]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [20]:
x_train = torch.from_numpy(x_train).to(device)
y_train = torch.LongTensor(y_train).to(device)
x_test = torch.from_numpy(x_test).to(device)
y_test = torch.LongTensor(y_test).to(device)

In [21]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [25]:
"""
    Allowed embedding dims = 50
    Since we have pre-trained embeddings for these values
    n_filters - number of filters for convolutions
    filter_size - size of the filters (3, 4, 7 etc.)
    hidden_dims - number of hidden dimensions
"""

class PolarityCLF(nn.Module):
    def __init__(self, embedding_dims, n_filters, filter_size, hidden_dims, vocab_size, embeddings, n_classes, seq_len):
        super(PolarityCLF, self).__init__()
        
        self.embedding_dims = embedding_dims
        self.n_filters = n_filters
        self.filter_size = filter_size
        self.hidden_dims = hidden_dims 
        self.vocab_size = vocab_size
        self.n_classes = n_classes

        # convert numpy style embeddings to tensors
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(embeddings)
        )
        
        # 2 conv1D layers
        self.conv1 = nn.Conv1d(in_channels=seq_len, 
                               kernel_size=self.filter_size,
                               out_channels=self.embedding_dims)
        
        self.conv2 = nn.Conv1d(in_channels=seq_len, 
                               kernel_size=self.filter_size,
                               out_channels=self.embedding_dims)

        # followed by a linear layer
        self.linear = nn.Linear(in_features=self.n_filters * self.filter_size,
                                out_features=2)
        
    def forward(self, input):
        x = self.embeddings(input)
        #print(x)
        
        x1 = self.conv1(x)
        x1 = F.relu(x)
        x1 = F.max_pool1d(x, x1.size(2))
        
        x2 = self.conv2(x)
        x2 = F.relu(x)
        x2 = F.max_pool1d(x, x2.size(2))
        
        
        out = torch.cat((x1, x2))
        out = self.linear(out)
        out = F.softmax(out)
        
        return out

clf = PolarityCLF(embedding_dims=50,
                  n_filters=50,
                  filter_size=3,
                  hidden_dims=50,
                  vocab_size=vocab_len,
                  embeddings=embed_matrix,
                  n_classes=2,
                  seq_len=max_seq_len)

clf = clf.to(device)
clf(x_train)

RuntimeError: mat1 dim 1 must match mat2 dim 0

In [23]:
clf

PolarityCLF(
  (embeddings): Embedding(46319, 50)
  (conv1): Conv1d(135, 50, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(135, 50, kernel_size=(3,), stride=(1,))
  (linear): Linear(in_features=150, out_features=2, bias=True)
)

In [24]:
# train model
epochs = 5
lr = 0.01

loss_fn = nn.CrossEntropyLoss()
optim = torch.optim.Adam(clf.parameters(), lr=lr)
    
for e in tqdm(range(epochs), desc="train"):
    optim.zero_grad()
    
    pred = clf(x_train)
    loss = loss_fn(pred, y_train)
    loss.backward()
    optimizer.step()

train:   0%|          | 0/5 [00:00<?, ?it/s]

tensor([[[ 2.4784, -4.0802,  1.5148,  ...,  1.3475,  1.8960, -5.1509],
         [-0.4303,  1.5252,  0.3382,  ...,  1.0678, -0.5476, -0.6631],
         [-0.0090,  0.8426,  0.0135,  ...,  0.1287, -0.6149, -0.0516],
         ...,
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879]],

        [[-0.3459,  0.1862, -0.1965,  ..., -4.2330, -1.9948,  0.9856],
         [ 1.0964, -3.6491, -0.5099,  ...,  0.2980,  0.8740, -0.7166],
         [-0.0289,  0.8443,  0.2020,  ..., -3.2222,  0.3259, -1.1094],
         ...,
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879],
         [ 0.0189, -0.1142,  0.6068,  ..., -0.1367, -0.2572, -0.3879]],

        [[ 0.0071,  2.0632, -1.0950,  ...,  3.3501, -0.2430,  1.0507],
         [ 0.0581, -0.6721, -0.4441,  ...,  0




AttributeError: 'NoneType' object has no attribute 'log_softmax'