In [1]:
from pathlib import Path
from typing import Union

# Monkey-patch `Path` with an `ls()` helper that returns a directory's entries as a list.
Path.ls = lambda self: [*self.iterdir()]

In [2]:
# Enable autoreload in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
def read_file(filepath: Union[str, Path] = "Movie_Reviews.txt", encoding: str = "utf-8") -> str:
    """Read an entire text file into memory and return its contents.

    Args:
        filepath: Location of the file, given as a string or a ``Path``.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
    """
    # Path() accepts both str and Path, so callers may pass either.
    return Path(filepath).read_text(encoding=encoding)

In [4]:
# Load the whole corpus once, then show (in-memory size of the str object, number of characters).
movie_review_text = read_file("Movie_Reviews.txt")
(movie_review_text.__sizeof__(), len(movie_review_text))

(264133428, 132066677)

# From the Official [Docs](https://stanfordnlp.github.io/stanza/installation_usage.html#getting-started)

In [5]:
import stanza
stanza.download('en')  # fetch the default English model package used by the neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 116kB [00:00, 5.18MB/s]                    
2020-05-21 14:46:09 INFO: Downloading default packages for language: en (English)...
2020-05-21 14:46:10 INFO: File exists: /Users/nirant/stanza_resources/en/default.zip.
2020-05-21 14:46:14 INFO: Finished downloading models and saved to /Users/nirant/stanza_resources.


In [6]:
# Build the default English pipeline; verbose=True logs each processor as it is loaded.
nlp = stanza.Pipeline('en', verbose=True)

2020-05-21 14:46:14 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-05-21 14:46:14 INFO: Use device: cpu
2020-05-21 14:46:14 INFO: Loading: tokenize
2020-05-21 14:46:14 INFO: Loading: pos
2020-05-21 14:46:15 INFO: Loading: lemma
2020-05-21 14:46:15 INFO: Loading: depparse
2020-05-21 14:46:16 INFO: Loading: ner
2020-05-21 14:46:17 INFO: Done loading processors!


In [7]:
# Inspect the loaded processors (a mapping of processor name -> processor object).
nlp.processors

{'tokenize': <stanza.pipeline.tokenize_processor.TokenizeProcessor at 0x14c04df10>,
 'pos': <stanza.pipeline.pos_processor.POSProcessor at 0x14c05dc10>,
 'lemma': <stanza.pipeline.lemma_processor.LemmaProcessor at 0x14d53d490>,
 'depparse': <stanza.pipeline.depparse_processor.DepparseProcessor at 0x14d2cef90>,
 'ner': <stanza.pipeline.ner_processor.NERProcessor at 0x1220da190>}

In [8]:
# Run the full pipeline on a two-sentence example to obtain an annotated document.
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

Stanza uses a nested data format (doc -> sentences -> words/tokens) 

In [9]:
# Flatten doc -> sentences -> words into a single list of word strings, then display it.
words = [word.text for sentence in doc.sentences for word in sentence.words]
words

['Barack',
 'Obama',
 'was',
 'born',
 'in',
 'Hawaii',
 '.',
 'He',
 'was',
 'elected',
 'president',
 'in',
 '2008',
 '.']

In [10]:
# Same traversal as for words, but over each sentence's tokens.
tokens = [token.text for sentence in doc.sentences for token in sentence.tokens]
print(tokens)

['Barack', 'Obama', 'was', 'born', 'in', 'Hawaii', '.', 'He', 'was', 'elected', 'president', 'in', '2008', '.']


## Adapting to our Data

### Disable Unused Components 
Stanza lets us choose which processors the `Pipeline` loads. Since we will be using it primarily for tokenization, we load only the tokenizer and skip everything else.

In [11]:
# Rebuild the pipeline with only the tokenizer. This rebinds `nlp`, replacing the full pipeline created earlier.
nlp = stanza.Pipeline("en", processors="tokenize", verbose=False)

In [12]:
# Confirm that only the tokenize processor is loaded now.
nlp.processors

{'tokenize': <stanza.pipeline.tokenize_processor.TokenizeProcessor at 0x11a05b950>}

In [13]:
%%time
# %%timeit -n 3
doc = nlp(movie_review_text[:100000])

CPU times: user 3min, sys: 8.72 s, total: 3min 9s
Wall time: 55.6 s


### Extract Bag of Tokens (Words)
Classical ML pipelines typically need a bag of tokens (words) for classification. Extracting it from the NLP library's output is usually one extra step, so we profile that step as well to get a better sense of how long it takes.

Since we need tokens, not words, we make a small change from the Example earlier.

In [14]:
%%time
tokens = []
for sentence in doc.sentences:
    for token in sentence.tokens:
        tokens.append(token.text)

CPU times: user 12.8 ms, sys: 809 µs, total: 13.6 ms
Wall time: 13.6 ms


### Get a sense of Vocabulary size

In [15]:
from collections import Counter
# Tally how often each distinct token string occurs.
token_cntr = Counter(tokens)

In [16]:
# Vocabulary size = number of distinct token strings counted above.
print(f"Unique Tokens: {len(token_cntr)}")

Unique Tokens: 4519


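Even this 100,000-character sample yields 4,519 unique tokens, and the vocabulary grows much larger on the full corpus (the ~270K figure originally quoted here presumably comes from a full-corpus run — TODO: confirm). A vocabulary this large can lead to too much sparsity in our matrix computations. Let's try to reduce the vocabulary size while still retaining the maximum signal we can.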

Using popular convention, we try these next:
1. Keep tokens with minimum frequency = 3
2. Lowercase all tokens and then set minimum frequency = 3

### Reducing our Vocabulary Size

In [None]:
# Minimum number of occurrences a token needs in order to stay in the vocabulary.
min_freq = 3

In [None]:
# Keep only tokens seen at least `min_freq` times; the result is a plain dict of token -> count.
token_cntr = {
    token: count
    for token, count in token_cntr.items()
    if count >= min_freq
}

In [None]:
# Report the vocabulary size that remains after the min_freq filter.
print(
    f"After dropping all rare tokens, min_freq = {min_freq}, we have:\nUnique Tokens: {len(token_cntr)}"
)

This is still larger than what I'd like. Let's see if we can get a smaller vocabulary with lowercase tokens.

In [None]:
%time lowercase_token_cntr = Counter([token.text.lower() for token in doc])

In [None]:
# Vocabulary size after lowercasing, before any frequency filter.
print(f"Unique Tokens: {len(lowercase_token_cntr)}")

In [None]:
# Keep only lowercase tokens seen at least `min_freq` times; the result is a plain dict of token -> count.
lowercase_token_cntr = {
    token: count
    for token, count in lowercase_token_cntr.items()
    if count >= min_freq
}

In [None]:
# Report the lowercase vocabulary size that remains after the min_freq filter.
print(
    f"After dropping all rare tokens, min_freq = {min_freq}, we have:\nUnique Tokens: {len(lowercase_token_cntr)}"
)