# 1. Traditional NLP Preprocessing

## 1.1. Stop words

In [1]:
# One-time setup: download the NLTK stop-word corpus used by the cells below.
# When the corpus is already installed, NLTK only reports that it is up to date.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuwen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

<font color='red'>Question:</font> What are stopwords? Why do we need to remove them?

In [None]:
# Stop words are extremely common words such as "the", "and", and "or" that carry
# little information on their own and mostly add noise to the analysis.
# They are filtered out before or after processing natural-language text.
# Because they are of little value in helping select documents that match a user's need,
# they are excluded from the vocabulary entirely.

## 1.2 Stemming

In [4]:
from nltk.stem.snowball import EnglishStemmer

In [5]:
# Snowball stemmer for English; with ignore_stopwords=False stop words are stemmed like any other word.
stemmer = EnglishStemmer(ignore_stopwords=False)

In [6]:
stemmer.stem('walk')

'walk'

In [7]:
stemmer.stem('walking')

'walk'

In [8]:
stemmer.stem('walked')

'walk'

<font color='red'>Question:</font> What is stemming? Why is it helpful?

Stemming is the process of reducing a word to its root form (its stem): an algorithm removes
common word endings from English words, such as "ly," "es," "ed," and "s." For example,
in an analysis you may want to treat "carefully," "cared," "cares," and "caringly"
as "care" instead of as separate words.

This is helpful because the inflected variants of a word are then counted as a single vocabulary term.
Note that a stem is not always a dictionary word: the vectorizer in section 1.3 stores "every" as "everi".

Lemmatization

Lemmatization is the process of transforming a word to its dictionary base form (its lemma). For this you can use WordNet, a large lexical database of English words that are linked together by their semantic relationships. WordNet works like a thesaurus: it groups words together based on their meanings.

## 1.3 CountVectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

**Example**: Build a stem_tokenizer function

In [10]:
import re
def stem_tokenizer(text):
    """Split ``text`` into lowercase tokens and return their Snowball stems.

    Every character that is not an ASCII letter, a digit or a hyphen is
    replaced by a space; the result is lowercased and split on whitespace,
    and each token is passed through an English Snowball stemmer created
    with ``ignore_stopwords=True``.

    Returns a list of stemmed tokens (empty when ``text`` has no tokens).
    """
    snowball = EnglishStemmer(ignore_stopwords=True)
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", text)
    tokens = cleaned.lower().split()
    return list(map(snowball.stem, tokens))

**Example**: Initialize a vectorizer

In [11]:
# Bag-of-words counter: tokens come from stem_tokenizer and NLTK's English
# stop words are filtered out.
cv = CountVectorizer(
    tokenizer=stem_tokenizer,
    stop_words=stopwords.words('english'),
    lowercase=True,
    min_df=1,    # keep terms that occur in at least one document
    max_df=0.9,  # drop terms that occur in more than 90% of the documents
)

**Example**: Fit vectorizer using texts

In [12]:
# Toy corpus: nine short sentences about rabbits and ducks, one document per entry.
texts = [
    "Rabbit runs fast",
    "Rabbit runs very very fast",
    "Duck runs fast too",
    "Rabbit runs faster than Duck runs",
    "Duck runs slower than Rabbit runs",
    "Rabbit runs faster",
    "Duck runs slower",
    "Duck runs every day",
    "This duck runs than all the other ducks, but it still runs slower than a rabbit",
]

In [13]:
cv.fit(texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function stem_tokenizer at 0x000001E5EC1D82F0>,
                vocabulary=None)

**Example**: Vocabulary of the vectorizer

In [14]:
cv.vocabulary_

{'rabbit': 5,
 'fast': 3,
 'duck': 1,
 'faster': 4,
 'slower': 6,
 'everi': 2,
 'day': 0,
 'still': 7}

**Example**: Vectorized texts

In [15]:
import pandas as pd

In [16]:
cv.transform(texts)

<9x8 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [16]:
# Densify the sparse document-term matrix with toarray() so pandas receives a plain
# ndarray rather than the legacy numpy.matrix that todense() returns.
df = pd.DataFrame(cv.transform(texts).toarray())

In [17]:
# vocabulary_ maps each term to its column index, so sorting the terms by that
# index yields the column labels in matrix order.
df.columns = sorted(cv.vocabulary_, key=cv.vocabulary_.get)

In [18]:
df

Unnamed: 0,day,duck,everi,fast,faster,rabbit,slower,still
0,0,0,0,1,0,1,0,0
1,0,0,0,1,0,1,0,0
2,0,1,0,1,0,0,0,0
3,0,1,0,0,1,1,0,0
4,0,1,0,0,0,1,1,0
5,0,0,0,0,1,1,0,0
6,0,1,0,0,0,0,1,0
7,1,1,1,0,0,0,0,0
8,0,2,0,0,0,1,1,1


<font color='red'>Question:</font> How does **CountVectorizer** work? What is **Bag of Words**?

Call the fit() function in order to learn a vocabulary from one or more documents.
Call the transform() function on one or more documents as needed to encode each as a vector.

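An encoded vector is returned whose length equals the size of the vocabulary and which holds an integer count of the number of times each word appeared in the document. This representation is called a **Bag of Words**: each document is described only by which words it contains and how often, while the order of the words is discarded.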

One of the important concepts in text mining is the n-gram: a contiguous (co-occurring) sequence of n items taken from a given sequence of text. The items can be words, letters, or syllables.

<font color='red'>Question:</font> What do **min_df** and **max_df** do? Why do we want to remove the most and least frequent words?

**max_df**: float in range [0.0, 1.0] or int, default=1.0

When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents; if integer, an absolute count. This parameter is ignored if vocabulary is not None. For example, with `max_df=0.9` in the vectorizer above, the stem "run" occurs in every document and is therefore left out of the vocabulary.

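**min_df**: float in range [0.0, 1.0] or int, default=1

When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called the cut-off in the literature. If float, the parameter represents a proportion of documents; if integer, an absolute count. This parameter is ignored if vocabulary is not None. Terms that appear in almost every document do little to distinguish documents from one another, and terms that appear in very few documents are too sparse to generalize from, which is why both ends are commonly pruned.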

## 1.4 Ngrams

**Example**: Add ngrams in the vectorizer

In [19]:
# Same stemmed, stop-word-filtered counter as before, now also counting
# two- and three-token sequences and requiring each term to occur in at least two documents.
cv = CountVectorizer(
    tokenizer=stem_tokenizer,
    stop_words=stopwords.words('english'),
    lowercase=True,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=2,            # drop terms found in fewer than two documents
    max_df=0.9,          # drop terms found in more than 90% of the documents
)

In [20]:
cv.fit(texts)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=2,
                ngram_range=(1, 3), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function stem_tokenizer at 0x1a169a8440>,
                vocabulary=None)

In [21]:
cv.vocabulary_

{'rabbit': 5,
 'fast': 3,
 'rabbit run': 6,
 'run fast': 9,
 'rabbit run fast': 7,
 'duck': 0,
 'duck run': 1,
 'faster': 4,
 'run faster': 10,
 'rabbit run faster': 8,
 'slower': 13,
 'run slower': 11,
 'slower rabbit': 14,
 'duck run slower': 2,
 'run slower rabbit': 12}

In [22]:
# Densify the sparse n-gram count matrix with toarray() so pandas receives a plain
# ndarray rather than the legacy numpy.matrix that todense() returns.
df = pd.DataFrame(cv.transform(texts).toarray())

In [23]:
# vocabulary_ maps each n-gram to its column index, so sorting the n-grams by that
# index yields the column labels in matrix order.
df.columns = sorted(cv.vocabulary_, key=cv.vocabulary_.get)

In [24]:
df

Unnamed: 0,duck,duck run,duck run slower,fast,faster,rabbit,rabbit run,rabbit run fast,rabbit run faster,run fast,run faster,run slower,run slower rabbit,slower,slower rabbit
0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0
1,0,0,0,1,0,1,1,1,0,1,0,0,0,0,0
2,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0
3,1,1,0,0,1,1,1,0,1,0,1,0,0,0,0
4,1,1,1,0,0,1,1,0,0,0,0,1,1,1,1
5,0,0,0,0,1,1,1,0,1,0,1,0,0,0,0
6,1,1,1,0,0,0,0,0,0,0,0,1,0,1,0
7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,2,1,0,0,0,1,0,0,0,0,0,1,1,1,1


<font color='red'>Question:</font> What is ngram? What problem can it solve?

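An n-gram is a contiguous sequence of n tokens (words or characters); counting n-grams with n > 1 keeps some of the local word order that single-word counts discard. In `CountVectorizer`, `ngram_range` gives the lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. For example, an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams. Only applies if analyzer is not callable.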

## 1.5 TFIDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# TF-IDF vectorizer with scikit-learn's default tokenization (no stemming tokenizer is passed),
# filtering NLTK's English stop words.
tfidf = TfidfVectorizer(
    stop_words=stopwords.words('english'),
)

In [27]:
tfidf.fit(texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [28]:
# Densify the sparse TF-IDF matrix with toarray() so pandas receives a plain
# ndarray rather than the legacy numpy.matrix that todense() returns.
df = pd.DataFrame(tfidf.transform(texts).toarray())

In [29]:
# vocabulary_ maps each term to its column index, so sorting the terms by that
# index yields the column labels in matrix order.
df.columns = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

In [30]:
df

Unnamed: 0,day,duck,ducks,every,fast,faster,rabbit,runs,slower,still
0,0.0,0.0,0.0,0.0,0.750896,0.0,0.531611,0.391849,0.0,0.0
1,0.0,0.0,0.0,0.0,0.750896,0.0,0.531611,0.391849,0.0,0.0
2,0.0,0.531611,0.0,0.0,0.750896,0.0,0.0,0.391849,0.0,0.0
3,0.0,0.383134,0.0,0.0,0.0,0.622417,0.383134,0.564813,0.0,0.0
4,0.0,0.402638,0.0,0.0,0.0,0.0,0.402638,0.593566,0.568722,0.0
5,0.0,0.0,0.0,0.0,0.0,0.794357,0.488973,0.36042,0.0,0.0
6,0.0,0.531611,0.0,0.0,0.0,0.0,0.0,0.391849,0.750896,0.0
7,0.643201,0.334407,0.0,0.643201,0.0,0.0,0.0,0.24649,0.0,0.0
8,0.0,0.271489,0.522184,0.0,0.0,0.0,0.271489,0.400227,0.383476,0.522184


<font color='red'>Question:</font> What does TFIDF mean? Why is it designed in this way?

TF-IDF (term frequency–inverse document frequency) is a good statistical measure of the relevance of a term to a document within a collection of documents (a corpus).

sklearn provides a class, TfidfVectorizer, to calculate TF-IDF for text; however, by default it normalizes each term vector using L2
normalization, and the IDF is smoothed by adding one to the document
frequency to prevent zero divisions.