## Corpora, Tokens, and Types

#### Tokenizing text

In [2]:
import spacy
# Load spaCy's small English pipeline by its full package name. The bare 'en'
# shortcut link only exists in spaCy 2.x; spaCy 3 removed shortcut links, so
# spacy.load('en') fails there, while the package name works on both.
# Install the model with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence used by the tokenizer demos. The apostrophe in "don’t" is the
# typographic character U+2019 (’), not the ASCII apostrophe (').
text = "Mary, don’t slap the green witch"

In [6]:
# Lowercase the sentence, run it through the spaCy pipeline, and print every
# resulting token as a plain string.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [8]:
from nltk.tokenize import TweetTokenizer

In [9]:
# Example tweet whose hashtag, @-mention and emoticon run together without spaces.
# (No u'' prefix: every str literal is already unicode in Python 3.)
tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tweet

'Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)'

In [10]:
# Create an NLTK TweetTokenizer with default settings and print the tokens of the
# lowercased tweet. Note that `tokenize` holds the tokenizer *instance* (not a
# function); a later cell reuses it under this name.
tokenize = TweetTokenizer()
print(tokenize.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


In [11]:
# Tokenize the lowercased example sentence with the same TweetTokenizer instance,
# for comparison with the spaCy tokenization of the same text.
print(tokenize.tokenize(text.lower()))

['mary', ',', 'don', '’', 't', 'slap', 'the', 'green', 'witch']


#### Unigrams, Bigrams, Trigrams, …, N-grams

In [16]:
def n_gram(text, n):
    """Return every contiguous window of ``n`` items from ``text``.

    Args:
        text: A sliceable sequence. A list of tokens yields word n-grams;
            a string yields character n-grams.
        n: Window size; must be a positive integer.

    Returns:
        A list of the ``len(text) - n + 1`` consecutive slices of ``text``,
        in order of position, or an empty list when ``text`` has fewer
        than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 produces a list of empty slices and a
        # negative n produces arbitrary truncated slices -- neither are n-grams.
        raise ValueError('n must be a positive integer, got {!r}'.format(n))
    return [text[i:i + n] for i in range(len(text) - n + 1)]

In [17]:
# Hand-written list of tokens used as the input for the n-gram example.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']

In [18]:
# Build the trigrams (n = 3) of the token list; as the cell's last expression,
# the resulting list is displayed as the cell output.
n_gram(cleaned, 3)

[['mary', ',', "n't"],
 [',', "n't", 'slap'],
 ["n't", 'slap', 'green'],
 ['slap', 'green', 'witch'],
 ['green', 'witch', '.']]

#### Lemmatization

In [22]:
# Run the spaCy pipeline on a short sentence; the resulting `doc` is reused by
# the lemma, part-of-speech and noun-chunk cells that follow.
doc = nlp("he was running late")

In [23]:
# Print each token of `doc` next to its lemma (`token.lemma_`).
for token in doc:
    print('{} --> {}'.format(token, token.lemma_))

he --> -PRON-
was --> be
running --> run
late --> late


### Categorizing Sentences and Documents

#### Categorizing Words: POS Tagging

In [24]:
# Print each token of `doc` next to its part-of-speech tag (`token.pos_`).
for token in doc:
    print('{} --> {}'.format(token, token.pos_))

he --> PRON
was --> VERB
running --> VERB
late --> ADV


#### Categorizing Spans: Chunking and Named Entity Recognition

In [26]:
# Iterate over the noun chunks of `doc` and print each chunk with its label.
for chunk in doc.noun_chunks:
    print('{} --> {}'.format(chunk, chunk.label_))

he --> NP
