In [1]:
import sys
from pathlib import Path

# Convenience shortcut: `some_path.ls()` returns the entries of a directory as a list.
Path.ls = lambda directory: [*directory.iterdir()]

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def read_file(filepath: "str | Path" = "Movie_Reviews.txt", encoding: str = "utf-8") -> str:
    """Read a whole text file into memory and return it as a single string.

    Args:
        filepath: Location of the text file, given as a string or a ``Path``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The complete contents of the file.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
    """
    # read_text opens in text mode and closes the handle for us.
    return Path(filepath).read_text(encoding=encoding)

In [4]:
# Load the full review corpus once; the tokenization cell below works on this string.
movie_review_text = read_file(filepath="Movie_Reviews.txt")
# (size of the str object in bytes, number of characters)
sys.getsizeof(movie_review_text), len(movie_review_text)

(264133428, 132066677)

In [6]:
# !python -m spacy download en_core_web_sm # in case you forgot to download this earlier

## Based on Official NLTK docs

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Small sample with nested parentheses, brackets and quotes, used to see how
# word_tokenize splits punctuation before running it on the real corpus.
text = """
(How does it deal with this parenthesis?)  "It should be part of the
previous sentence." "(And the same with this one.)" ('And this one!')
"('(And (this)) '?)" [(and this. )]
"""
# print(sent_tokenize(text))
print(word_tokenize(text))

['(', 'How', 'does', 'it', 'deal', 'with', 'this', 'parenthesis', '?', ')', '``', 'It', 'should', 'be', 'part', 'of', 'the', 'previous', 'sentence', '.', "''", '``', '(', 'And', 'the', 'same', 'with', 'this', 'one', '.', ')', "''", '(', "'And", 'this', 'one', '!', "'", ')', '``', '(', "'", '(', 'And', '(', 'this', ')', ')', "'", '?', ')', "''", '[', '(', 'and', 'this', '.', ')', ']']


## Adapting to our Data

In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [12]:
%%time
# %%timeit -n 3
# Tokenize the entire corpus in a single call; this is the slow step of the
# notebook (see the timing printed below), so `tokens` is reused by later cells.
tokens = word_tokenize(movie_review_text)

CPU times: user 2min 53s, sys: 3.87 s, total: 2min 56s
Wall time: 3min


### Get a sense of Vocabulary size

In [13]:
from collections import Counter
# Tally how many times each distinct token occurs in the corpus.
token_cntr = Counter()
token_cntr.update(tokens)

In [15]:
# Vocabulary size: the number of distinct tokens held in the counter.
print(f"Unique Tokens: {len(token_cntr)}")

Unique Tokens: 301240


At ~300K, this is a large vocabulary and can lead to too much sparsity in our matrix computations. Let's try to reduce the vocabulary size while still retaining the maximum signal we can. 

Following common convention, we try these two approaches next:
1. Keep tokens with minimum frequency = 3
2. Lowercase all tokens and then set minimum frequency = 3

### Reducing our Vocabulary Size

In [16]:
# Minimum number of occurrences a token needs in order to stay in the vocabulary.
min_freq = 3

In [17]:
# Keep only tokens seen at least `min_freq` times.
# Note: this rebinds `token_cntr` to a plain dict, replacing the original Counter.
token_cntr = dict(
    (token, count) for token, count in token_cntr.items() if count >= min_freq
)

In [18]:
# Report how many distinct tokens remain after applying the min_freq cut-off.
print(
    f"After dropping all rare tokens, min_freq = {min_freq}, we have:\nUnique Tokens: {len(token_cntr)}"
)

After dropping all rare tokens, min_freq = 3, we have:
Unique Tokens: 102458


This is still larger than I'd like. Let's see if we can get a smaller vocabulary by lowercasing the tokens.

In [20]:
%time lowercase_token_cntr = Counter([token.lower() for token in tokens])

CPU times: user 7.72 s, sys: 698 ms, total: 8.42 s
Wall time: 8.43 s


In [21]:
# Vocabulary size after lowercasing, before any frequency filtering.
print(f"Unique Tokens: {len(lowercase_token_cntr)}")

Unique Tokens: 254678


In [22]:
# Keep only lowercased tokens seen at least `min_freq` times.
# Note: this rebinds `lowercase_token_cntr` to a plain dict, replacing the Counter.
lowercase_token_cntr = dict(
    (token, count)
    for token, count in lowercase_token_cntr.items()
    if count >= min_freq
)

In [23]:
# Report how many distinct lowercased tokens remain after the min_freq cut-off.
print(
    f"After dropping all rare tokens, min_freq = {min_freq}, we have:\nUnique Tokens: {len(lowercase_token_cntr)}"
)

After dropping all rare tokens, min_freq = 3, we have:
Unique Tokens: 86359
