# NLP: Tokeniztion

- **Document:** A text that is single observation in your dataset. It can be as short as a tweet or as long as a novel.
- **Corpus:** A collection of documents that comprises your dataset.

## Make a Toy Corpus

In [1]:
corpus = ["The dog bit the man.",
          "Andrea doesn't believe it happened!",
          "The dog is dangerous.",
          "Truthfully, the man is more dangerous than the dog."]

## How to Tokenize My Documents?

In [2]:
doc1 = corpus[1]
print(doc1)

Andrea doesn't believe it happened!


In [3]:
doc1.split(' ')

['Andrea', "doesn't", 'believe', 'it', 'happened!']

In [4]:
doc0 = corpus[0]
doc0.split(' ')

['The', 'dog', 'bit', 'the', 'man.']

### `spaCy`

In [5]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
doc0 = nlp(doc0)
doc1 = nlp(doc1)

In [7]:
tokens = [token.lower_ for token in doc0 
          if token.lower_ not in STOP_WORDS]
print(tokens)

['dog', 'bit', 'man', '.']


In [8]:
tokens = [token.lower_ for token in doc1
          if token.lower_ not in STOP_WORDS]
print(tokens)

['andrea', "n't", 'believe', 'happened', '!']


### `scikit-learn`

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(lowercase=True, stop_words='english')
cv_tokenizer = cv.build_tokenizer()

## Vectorization

## `CountVectorizer`

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [12]:
corpus = ['The dog is brown',
          'The other dog is blue',
          'One dog is named Spot',
          'The other one is named Snoopy',
          'A third dog is red, and that dog is really named Lucy']

In [42]:
c = CountVectorizer()
X_trans = c.fit_transform(corpus).toarray()

In [14]:
pd.DataFrame(X_trans, columns=sorted(c.vocabulary_))

Unnamed: 0,and,blue,brown,dog,is,lucy,named,one,other,really,red,snoopy,spot,that,the,third
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0
1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0
2,0,0,0,1,1,0,1,1,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,1,1,1,0,0,1,0,0,1,0
4,1,0,0,2,2,1,1,0,0,1,1,0,0,1,0,1


In [43]:
X_trans.shape

(5, 16)

## TD-IDF

In [24]:
corpus

['The dog is brown',
 'The other dog is blue',
 'One dog is named Spot',
 'The other one is named Snoopy',
 'A third dog is red, and that dog is really named Lucy']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
tfidf = TfidfVectorizer(norm='l1')
X_trans = tfidf.fit_transform(corpus).toarray()

df = pd.DataFrame(X_trans, columns=sorted(tfidf.vocabulary_))
df

Unnamed: 0,and,blue,brown,dog,is,lucy,named,one,other,really,red,snoopy,spot,that,the,third
0,0.0,0.0,0.369058,0.207921,0.175858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247163,0.0
1,0.0,0.284382,0.0,0.160216,0.13551,0.0,0.0,0.0,0.229438,0.0,0.0,0.0,0.0,0.0,0.190454,0.0
2,0.0,0.0,0.0,0.160216,0.13551,0.0,0.190454,0.229438,0.0,0.0,0.0,0.0,0.284382,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.107575,0.0,0.151193,0.18214,0.18214,0.0,0.0,0.225758,0.0,0.0,0.151193,0.0
4,0.114292,0.0,0.0,0.128781,0.108922,0.114292,0.076543,0.0,0.0,0.114292,0.114292,0.0,0.0,0.114292,0.0,0.114292


In [32]:
from numpy.linalg import norm

In [30]:
print(df.loc[0].values)

[0.         0.         2.09861229 1.18232156 1.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.40546511 0.        ]


In [35]:
l2 = norm(df.loc[0].values, ord=2)
print(df.loc[0].values / l2)

[0.         0.         0.70835264 0.39907352 0.33753383 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.47439202 0.        ]


In [39]:
l1 = norm(df.loc[0].values, ord=1)
print(l1)
print(df.loc[0].values / l1)

1.919351998101726
[0.         0.         0.36905822 0.20792096 0.17585822 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.24716259 0.        ]


## HashingVectorizer

In [17]:
print(hash('blue') % 2**20)
print(hash('dog') % 2**20)
print(hash('is') % 2**20)

959044
842896
136009


In [18]:
from sklearn.feature_extraction.text import HashingVectorizer

In [22]:
hashing = HashingVectorizer(n_features=20, alternate_sign=False, norm=None)
X_trans = hashing.fit_transform(corpus).toarray()
pd.DataFrame(X_trans)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,2.0,0.0,2.0,0.0,0.0


## n-grams

In [51]:
cv = CountVectorizer(ngram_range=(1,2))
X_trans = cv.fit_transform(corpus).toarray()
df = pd.DataFrame(X_trans, columns=sorted(cv.vocabulary_))
df.columns = [col.replace(' ', '_') for col in df.columns]
df

Unnamed: 0,and,and_that,blue,brown,dog,dog_is,is,is_blue,is_brown,is_named,...,red_and,snoopy,spot,that,that_dog,the,the_dog,the_other,third,third_dog
0,0,0,0,1,1,1,1,0,1,0,...,0,0,0,0,0,1,1,0,0,0
1,0,0,1,0,1,1,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,1,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,0
4,1,1,0,0,2,2,2,0,0,0,...,1,0,0,1,1,0,0,0,1,1


In [52]:
print(df.shape)
df.info()

(5, 36)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 36 columns):
and             5 non-null int64
and_that        5 non-null int64
blue            5 non-null int64
brown           5 non-null int64
dog             5 non-null int64
dog_is          5 non-null int64
is              5 non-null int64
is_blue         5 non-null int64
is_brown        5 non-null int64
is_named        5 non-null int64
is_really       5 non-null int64
is_red          5 non-null int64
lucy            5 non-null int64
named           5 non-null int64
named_lucy      5 non-null int64
named_snoopy    5 non-null int64
named_spot      5 non-null int64
one             5 non-null int64
one_dog         5 non-null int64
one_is          5 non-null int64
other           5 non-null int64
other_dog       5 non-null int64
other_one       5 non-null int64
really          5 non-null int64
really_named    5 non-null int64
red             5 non-null int64
red_and         5 non-null int64
