## Dataset

Polarity Dataset. Pang/Lee ACL 2004

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

## Data Directories

In [1]:
pos_path = "./review_polarity/txt_sentoken/pos"
neg_path = "./review_polarity/txt_sentoken/neg"

In [2]:
import os

def read_text_files(category_path):
    file_list = os.listdir(category_path)
    texts = []

    for fname in file_list:
        with open(os.path.join(category_path, fname), "r") as f:
            lines = f.readlines()
            texts.extend(lines)
    
    return texts

In [3]:
pos_texts = read_text_files(pos_path)
neg_texts = read_text_files(neg_path)

## Clean

In [4]:
def clean(texts):
    cleaned = []
    for text in texts:
        cleaned_text = text.strip().lower()
        cleaned.append(cleaned_text)

    return cleaned

In [5]:
pos_cleaned = clean(pos_texts)
neg_cleaned = clean(neg_texts)

In [6]:
# merge cleaned
merged = list()
merged.extend(pos_cleaned)
merged.extend(neg_cleaned)

## Vocab

In [28]:
from nltk.tokenize import word_tokenize as tokenize
from nltk.corpus import stopwords
from tqdm import tqdm

# ignore stop words
ignore = stopwords.words("english")

# returns a list of unique tokens
def get_words():
    tokens = list()

    for i in tqdm(range(len(merged)), desc="get_words"):
        toks = tokenize(merged[i])
        for t in toks:
            if not t in tokens and t not in ignore:
                tokens.append(t)

    return tokens

In [29]:
%time words = get_words(merged)

get_words: 100%|██████████| 64720/64720 [03:04<00:00, 351.62it/s]

CPU times: user 3min 3s, sys: 314 ms, total: 3min 3s
Wall time: 3min 4s





In [30]:
len(words)

46319

So a lot of words. Which also means a lot of dimensions. We need to compress them.

## Embedding

In [33]:
# remove stopwords, use the already cleaned lists
# then label

# label: 1 for pos, 0 for neg
def process_one_class(label, data):
    processed = list()
    for i in tqdm(range(len(data)), desc=f"preprocess_{label}"):
        text = data[i]
        raw = tokenize(text)
        cleaned = []
        
        for tok in raw:
            if tok not in ignore:
                cleaned.append(tok)
        

        processed.append((label, cleaned))
        
    return processed


def preprocess():
    data = [] # label, tokens
    
    pos_processed = process_one_class(1, pos_cleaned)
    neg_processed = process_one_class(0, neg_cleaned)
        
    data.extend(pos_processed)
    data.extend(neg_processed)
    
    return data

In [34]:
data = preprocess()

preprocess_1: 100%|██████████| 32937/32937 [00:04<00:00, 6799.94it/s]
preprocess_0: 100%|██████████| 31783/31783 [00:04<00:00, 7348.64it/s]


In [35]:
data[0]

(1,
 ['films',
  'adapted',
  'comic',
  'books',
  'plenty',
  'success',
  ',',
  'whether',
  "'re",
  'superheroes',
  '(',
  'batman',
  ',',
  'superman',
  ',',
  'spawn',
  ')',
  ',',
  'geared',
  'toward',
  'kids',
  '(',
  'casper',
  ')',
  'arthouse',
  'crowd',
  '(',
  'ghost',
  'world',
  ')',
  ',',
  "'s",
  'never',
  'really',
  'comic',
  'book',
  'like',
  'hell',
  '.'])

In [37]:
sentences = [s[1] for s in data]
labels = [s[0] for s in data]

## Create dataset with torch

