# Tokenization:
- The process of splitting an input sequence into tokens (e.g. splitting the text on whitespace: "This is Andrew" -> ["This", "is", "Andrew"])



In [3]:
import nltk
# Sample sentence reused by the tokenizer demonstrations in the following cells.
text = "This is my friend's text!"

# NLTK's recommended word tokenizer (Treebank-style rules: possessives and
# punctuation are split off into their own tokens).
tokens = nltk.word_tokenize(text)

# Bare last expression so the notebook displays the resulting token list.
tokens

['This', 'is', 'my', 'friend', "'s", 'text', '!']

### There are different types of tokenization:
- WhiteSpace tokenization:

In [5]:
# Whitespace tokenization: split only where whitespace occurs, so punctuation
# stays attached to the neighbouring word.
whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
white_space_tokens = whitespace_tokenizer.tokenize(text)
print(f"Splitting by whitespaces: {white_space_tokens}\n")


Splitting by whitespaces: ['This', 'is', 'my', "friend's", 'text!']



- TreeBank tokenization:
    - split standard contractions, e.g. don't -> **do n't** and they'll -> **they 'll**
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

In [6]:
# Treebank tokenization: contractions/possessives and most punctuation marks
# are emitted as separate tokens.
treebank_tokenizer = nltk.tokenize.TreebankWordTokenizer()
tree_bank_tokens = treebank_tokenizer.tokenize(text)
print(f"TreeBank splitting: {tree_bank_tokens}\n")

TreeBank splitting: ['This', 'is', 'my', 'friend', "'s", 'text', '!']



- WordPunct tokenization:
    - Tokenizes a text into a sequence of alphabetic and non-alphabetic characters, using the regexp `\w+|[^\w\s]+`.

In [11]:
# WordPunct tokenization: alternate between runs of word characters and runs
# of punctuation.
# NOTE: operates on `text` from the first cell; run the notebook top to bottom
# so that `text` still holds the intended sample sentence.
word_punct_tokenizer = nltk.tokenize.WordPunctTokenizer()
word_punct_tokens = word_punct_tokenizer.tokenize(text)
print(f"Splitting on punctuation: {word_punct_tokens}\n")

Splitting on punctuation: ['This', 'is', 'my', 'friend', "'", 's', 'text', '!']



# Removing stop words:
- “Stop words” are the most common words in a language like “the”, “a”, “on”, “is”, “all”. These words do not carry important meaning and are usually removed from texts.

In [12]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Requires the NLTK "stopwords" corpus and the Punkt tokenizer models to be
# available locally (they can be fetched once with nltk.download).
input_str = "NLTK is a leading platform for building Python programs to work with human language data."

# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words("english"))
tokens = word_tokenize(input_str)

# NLTK's English stop-word list is all lowercase, so compare on the lowercased
# token; otherwise capitalised stop words such as "The" or "This" would
# slip through. The original casing of the kept tokens is preserved.
result = [token for token in tokens if token.lower() not in stop_words]
print(result)

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


# Token Normalization:
#### We may want to see the same tokens for different forms of the word:
    - wolf, wolves -> wolf
    - talk, talks -> talk
#### Stemming:
    - A process of removing and replacing suffixes to get to
    the root form of the word, which is known as the stem
#### Lemmatization:
    - Returns the base or dictionary form of a word,
    which is known as the lemma

In [10]:
# Use a dedicated name for this sample sentence. Rebinding the shared `text`
# variable here would silently change what the tokenizer cells earlier in the
# notebook operate on if any of them is re-run after this cell.
normalization_text = "feet wolves cats talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(normalization_text)

# The lemmatizer needs the NLTK WordNet corpus to be available locally.
stemm = nltk.stem.PorterStemmer()
lemma = nltk.stem.WordNetLemmatizer()

# Normalise each token independently, then re-join the results for display.
stemmed_sentence = " ".join(stemm.stem(token) for token in tokens)
# lemmatize() treats every word as a noun unless a part of speech is passed,
# which is why the verb "talked" comes back unchanged.
lemmatized_sentence = " ".join(lemma.lemmatize(token) for token in tokens)

print("Sentence: ", normalization_text)
print(f"Stemming sentence: {stemmed_sentence}")
print(f"Lemmatize sentece: {lemmatized_sentence}")

Sentence:  feet wolves cats talked
Stemming sentence: feet wolv cat talk
Lemmatize sentece: foot wolf cat talked
