## Corpus

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Text file Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

## Load data

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

## Preprocess
- tokenize
- Clean punctuation, stopwords
- Assign labels

In [18]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from string import punctuation

ignore = stopwords.words("english")

def remove_stopwords(tokens):
    x = [token for token in tokens if token not in ignore]
    return x

def remove_punctuation(tokens):
    x = [token for token in tokens if token not in punctuation]
    return x

def clean(tokens):
    x = remove_stopwords(text)
    x = remove_punctuation(x)

    return x

In [23]:
from tqdm import tqdm

# label -> 1 for pos and 0 for neg
def prepare_corpus():
    # dataset is a list of tuples
    # (label, tokens)
    corpus = list()

    # idx -> label
    text_paths = [neg_path, pos_path]
    for idx, path in enumerate(text_paths):
        texts = read_text_files(path)
        for i in tqdm(range(len(texts)), desc="prepare_corpus"):
            text = texts[i]
            # tokenize
            tokens = tokenize(text)
            
            # clean
            tokens = remove_punctuation(tokens)
            tokens = remove_stopwords(tokens)
            
            # append
            corpus.append((idx, tokens))

    return corpus

In [43]:
corpus = prepare_corpus()

prepare_corpus: 100%|██████████| 31783/31783 [00:04<00:00, 6839.70it/s]
prepare_corpus: 100%|██████████| 32937/32937 [00:05<00:00, 6329.68it/s]


In [28]:
corpus[0]

(0,
 ['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive'])

## Embedding - Word2Vec

Can either try pretraiend embeddings from google news dataset or train a new embedding from the existing corpus. Let's try it on the existing corpus.

In [30]:
import multiprocessing
from gensim.models import Word2Vec

sentences = [s[1] for s in corpus]

# 150 dims for the embedding
# embedding dim will define the number of outputs per convolution!
EMBEDDING_DIMS = 150


cores = multiprocessing.cpu_count()

# keep one cpu core free or some operating systems may kill the process :P
# ignore words which have a frequency < 5
corpus_embedding = Word2Vec(
    vector_size=EMBEDDING_DIMS, 
    workers=cores-1, max_vocab_size=100000,
    min_count=5)

# build vocab
corpus_embedding.build_vocab(sentences)

# train for 50 epochs, can always change later!
%time corpus_embedding.train(sentences, total_examples=len(sentences), epochs=50)

Wall time: 13.6 s


(32380340, 37305350)

### Vocabulary

In [31]:
# list of all the words word2vec has processed
vocabulary = corpus_embedding.wv.index_to_key
vocab_len = len(vocabulary)

In [32]:
vocab_len

14718

In [35]:
vocabulary[:10]

["'s", '``', 'film', "n't", 'movie', 'one', 'like', 'even', 'good', 'time']

## Encode all tokens with indices from embedding

In [64]:
def encode_corpus_tokens_with_embed_idx(corpus):
    encoded_corpus = list()
    for i in tqdm(range(len(corpus)), desc="encode_tokens_with_embed_idx"):
        idxs = []
        label, tokens = corpus[i]

        for token in tokens:
            try:
                idx = corpus_embedding.wv.key_to_index[token]
            except:
                # if token isn't in the vocab
                idx = 0

            idxs.append(idx)
        
        
        encoded_corpus.append((label, idxs))



    return encoded_corpus

In [81]:
encoded_corpus = encode_corpus_tokens_with_embed_idx(corpus)

encode_tokens_with_embed_idx: 100%|██████████| 64720/64720 [00:00<00:00, 317691.92it/s]


In [82]:
encoded_corpus[0][1]

[29, 18, 944, 4129, 45, 1854, 658, 3626, 1280]

## Padding

Left pad with 0

However we need a sequence length. 

In [83]:
# get max sequences length
max_seq_len = max(len(s) for s in sentences)
max_seq_len

85

In [90]:
import numpy as np

def pad_tokens(encoded_corpus, seq_len=max_seq_len):
    padded = np.zeros(
        (len(encoded_corpus), seq_len),
        dtype=np.int32
    )

    for i in tqdm(range(len(corpus)), desc="pad"):
        tokens = encoded_corpus[i][1]

        # nltk's stopwords are a bit agrressive, ignore token lists with 0 size
        if len(tokens) == 0:
            continue

        padded[i, -len(tokens):] = np.array(tokens)

    return padded

In [91]:
padded_tokens = pad_tokens(encoded_corpus)

pad: 100%|██████████| 64720/64720 [00:00<00:00, 389770.06it/s]


## Input and Labels?

In [92]:
X = padded_tokens # input
y = np.array([c[0] for c in encoded_corpus])  #label

In [93]:
print(X.shape)
print(y.shape)

(64720, 85)
(64720,)


## Split data

In [107]:

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.8
)

In [108]:
# https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.8, random_state=42)

In [109]:
print(f"x_train = {x_train.shape} # y_train = {y_train.shape}")
print(f"x_val = {x_val.shape} # y_val = {y_val.shape}")
print(f"x_test = {x_test.shape} # y_test = {y_test.shape}")

x_train = (41420, 85) # y_train = (41420,)
x_val = (10356, 85) # y_val = (10356,)
x_test = (12944, 85) # y_test = (12944,)


## Convert to TensorData

In [110]:
import torch
from torch.utils.data import TensorDataset

training_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

## DataLoader for Torch

Let torch handle the shuffling and etc yada stuff for MiniBatch

Why MiniBatch? Dataset is big and feeding everything at once won't generalize well. (Even if the machine can handle it!)

In [111]:
from torch.utils.data import DataLoader

# define a batch size
batch_size = 64

train_loader = DataLoader(training_data, shuffle=True, batch_size=64)
val_loader = DataLoader(val_data, shuffle=True, batch_size=64)
test_loader = DataLoader(test_data, shuffle=True, batch_size=64)