# <font color = 'pickle'> Import/install Libraries

In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
import sys


In [None]:
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')

    !pip install -U nltk -qq
    !pip install -U spacy -qq
    !python -m spacy download en_core_web_sm -qq

    basepath = '/content/drive/MyDrive/data'
    sys.path.append('/content/drive/MyDrive/data/custom-functions')
else:
    basepath = '/home/harpreet/Insync/google_drive_shaannoor/data'
    sys.path.append(
        '/home/harpreet/Insync/google_drive_shaannoor/data/custom-functions')


Mounted at /content/drive
2023-08-28 07:50:33.168994: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Import required libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import spacy         # For NLP preprocessing

# Import required nltk packages
import nltk
nltk.download('stopwords')  # Download the stopwords corpus
from nltk.corpus import stopwords as nltk_stopwords  # Stopwords corpus

# Import tweet tokenizer from nltk
from nltk.tokenize import TweetTokenizer

# Import CountVectorizer and TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import pathlib for managing file paths
from pathlib import Path

# import custom-preprocessor from python file
import custom_preprocessor_mod as cp

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Show the installed spaCy version (the recorded run used 3.6.1)
spacy.__version__


'3.6.1'

In [None]:
# Load spaCy's small English pipeline; the resulting `nlp` object is reused by
# the spaCy preprocessor/tokenizer helpers and the stop-word example further down.
nlp = spacy.load('en_core_web_sm')


# <font color = 'pickle'> Bag of Words (Sparse Embeddings)


## <font color = 'pickle'>**What is Bag of Words (BoW)?**</font>

A **bag-of-words** is a representation of text that describes the occurrence of words within a document <font color ='indianred'>**disregarding grammar and word order**</font>. It involves two steps:

    1. Create Vocabulary. Each word in the vocabulary becomes a feature (independent variable) used to represent a document.
    2. Score words (based on frequency or occurrence) to create Vectors.

## <font color = 'pickle'> **Why do you need to learn Bag of Words?**</font>

- Till now we have learnt how to pre-process the text data i.e clean the text data.
- Our final goal is to use text data in Machine Learning (ML) models. For example - we want to predict whether e-mail is a spam or not based on the text of the data.
- But ML models can understand only numbers. Therefore we need to convert text to vectors (numbers).
- The simple method of converting text to numbers is to use 'Bag of Words approach'






## <font color = 'pickle'> **Why do you need Bag of Words in age of LLMs?** </font>

<font color = 'indianred'> **Outstanding paper Award ACL 2023: Linear Classifier: An Often-Forgotten Baseline for Text Classification**</font>

*"Large-scale pre-trained language models such as BERT are popular solutions for text classification. Due to the superior performance of these advanced methods, nowadays, people often directly train them for a few epochs and deploy the obtained model. In this opinion paper, we point out that this way may only sometimes get satisfactory results. We argue the importance of running a simple baseline like linear classifiers on bag-of-words features along with advanced methods."*



## <font color = 'pickle'>**Learning Outcome** </font>
After completing this tutorial, you will know

1. What the bag-of-words approach is and how you can use it to represent text data.
2. What are different techniques to prepare a vocabulary and score words.
3. How to implement 'Bag-of-words' approach in python using sklearn.

# <font color = 'pickle'> **Tutorial Overview**</font>
 - Generating Vocab
 - Generating vectors using Vocab
     - Binary Vectorizer
     - Count Vectorizer
     - tfidf Vectorizer

 - Modifying Vocab
 - Example - IMDB Dataset


## <font color = 'pickle'> **Generating Vocab**

###  <font color = 'pickle'> **Dummy Corpus**

In [None]:
# Dummy corpus: three short, tweet-like documents. They contain punctuation,
# mixed case, #hashtags, @mentions and a contraction, which the different
# tokenizers shown later in the notebook handle differently.
Corpus = ["Count Vectorizer - for this vectorizer, scoring is done based on frequency. For this vectorizer frequency is key. @vectorizer #frequency @frequency, doesn’t",
          "tfidf vectorizer - for this vectorizer, scoring is done based on tfidf,  higher tfidf higher score #tfidf @vectorizer "  ,
          "Binary vectorizer - for this vectorizer, scoring is done based on presence of word. For this vectorizer, dummy is key #dummy @dummy @vectorizer "]


### <font color = 'pickle'>**Create an instance of Vectorizer**

In [None]:
# Create a CountVectorizer with all parameters left at their defaults
vectorizer = CountVectorizer()


The above code creates an instance of the `CountVectorizer` class from the `sklearn.feature_extraction.text` module. This class is used to convert a collection of text documents to a matrix of token counts.

It accomplishes this by
  1. tokenizing the input text
  2. creating a vocabulary of all the tokens found in the text
  3. encoding the text as a matrix of token counts based on this vocabulary.

The created instance vectorizer can then be used to fit the text data to the vocabulary and generate the token count matrix.

In [None]:
CountVectorizer??


### <font color = 'pickle'>**Fit Vectorizer on corpus to generate vocab**

In [None]:
# Fit the vectorizer on the corpus: this only learns the vocabulary,
# no document vectors are produced yet
vectorizer.fit(Corpus)


<font color = 'indianred'>**Vectorizer().fit() does the following**:
- lowercases your text
- uses utf-8 encoding
- performs tokenization (converts raw text to smaller units of text)
- uses word level tokenization (meaning each word is treated as a separate token) and  ignores single characters during tokenization ( words like ‘a’ and ‘I’ are removed)
- By default, the regular expression that is used to split the text and create tokens is : `"\b\w\w+\b"`.
  - This means it finds all sequences of at least two word characters (`\w`: letters, digits or underscores) that are delimited by word boundaries (`\b`).
  - It does not find single-letter words, and it splits tokens at apostrophes and periods: “doesn’t” becomes “doesn” (the lone “t” is dropped) and “bit.ly” becomes “bit” and “ly”. It does, however, match “h8ter” as a single word.
- Because the text is lowercased before tokenization, “soon”, “Soon”, and “sOon” all correspond to the same token (and therefore feature).
- It then creates a dictionary of unique words.
- The set of unique words is used as features in the CountVectorizer.

In [None]:
# Let us see the dictionary created: it maps each token to its column index
# in the vectors (the indices follow the alphabetical order of the tokens)
vectorizer.vocabulary_


{'count': 2,
 'vectorizer': 18,
 'for': 6,
 'this': 17,
 'scoring': 15,
 'is': 9,
 'done': 4,
 'based': 0,
 'on': 12,
 'frequency': 7,
 'key': 10,
 'doesn': 3,
 'tfidf': 16,
 'higher': 8,
 'score': 14,
 'binary': 1,
 'presence': 13,
 'of': 11,
 'word': 19,
 'dummy': 5}

In [None]:
# Every unique token in the learned vocabulary becomes one feature (column).
# Print the feature names followed by how many there are.
features = vectorizer.get_feature_names_out()
print(features, len(features), sep='\n')


['based' 'binary' 'count' 'doesn' 'done' 'dummy' 'for' 'frequency'
 'higher' 'is' 'key' 'of' 'on' 'presence' 'score' 'scoring' 'tfidf' 'this'
 'vectorizer' 'word']
20


## <font color = 'pickle'>**Generate Vectors using Vocab**

### <font color = 'pickle'>**Binary Vectorizer**

In [None]:
# binary=True records only presence/absence (1 or 0) of a term, not its count.
# fit() returns the fitted vectorizer, so creation and fitting can be chained.
binary_vectorizer = CountVectorizer(binary=True).fit(Corpus)
binary_vectorizer


- We can now call transform() method to transform documents in our corpus to vectors.
- <font color = 'dodgerblue'>**Each document**</font> will be represented by <font color = 'dodgerblue'>**vector of length equal to len(dictionary)**.</font>
- The vectors are stored in the form of a <font color = 'dodgerblue'>**sparse matrix**.</font>
- We can use <font color = 'dodgerblue'>**toarray()**</font> function to get complete matrix.
- Number of columns represent the number of features (len(vocab)).
- Number of rows represents the number of documents in the corpus.
- <font color = 'dodgerblue'>**For each row, the numbers displayed are 0 or 1 - indicating absence or presence of a word in a document.**

In [None]:
# Encode every document of the corpus with the fitted vocabulary;
# the result is a sparse document-term matrix
binary_vectors = binary_vectorizer.transform(Corpus)


In [None]:
# Printing a sparse matrix lists only the non-zero entries,
# one "(row, column)  value" line per entry
print('vectors in sparse format', binary_vectors, sep='\n')


vectors in sparse format
  (0, 0)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (0, 7)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (0, 15)	1
  (0, 17)	1
  (0, 18)	1
  (1, 0)	1
  (1, 4)	1
  (1, 6)	1
  (1, 8)	1
  (1, 9)	1
  (1, 12)	1
  (1, 14)	1
  (1, 15)	1
  (1, 16)	1
  (1, 17)	1
  (1, 18)	1
  (2, 0)	1
  (2, 1)	1
  (2, 4)	1
  (2, 5)	1
  (2, 6)	1
  (2, 9)	1
  (2, 10)	1
  (2, 11)	1
  (2, 12)	1
  (2, 13)	1
  (2, 15)	1
  (2, 17)	1
  (2, 18)	1
  (2, 19)	1


In [None]:
# Convert the sparse matrix to a regular (dense) array once and reuse it
# for both the values and the shape
binary_dense = binary_vectors.toarray()
print('\nbinary vectors in array(dense) format')
print(binary_dense)
print(f'\nThe shape of the binary vectors is : {binary_dense.shape}')



binary vectors in array(dense) format
[[1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 1 0]
 [1 0 0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 1 0]
 [1 1 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1 1]]

The shape of the binary vectors is : (3, 20)


In [None]:
# Create a dataframe for better visualization.
# The column labels are read from binary_vectorizer itself instead of the
# global `features` variable: that variable belongs to a different vectorizer
# and is overwritten later in the notebook, so relying on it would mislabel
# (or break) this cell when it is re-run out of order.
df_binary = pd.DataFrame(binary_vectors.toarray(),
                         columns=binary_vectorizer.get_feature_names_out())
df_binary


Unnamed: 0,based,binary,count,doesn,done,dummy,for,frequency,higher,is,key,of,on,presence,score,scoring,tfidf,this,vectorizer,word
0,1,0,1,1,1,0,1,1,0,1,1,0,1,0,0,1,0,1,1,0
1,1,0,0,0,1,0,1,0,1,1,0,0,1,0,1,1,1,1,1,0
2,1,1,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,1,1,1


### <font color = 'pickle'>**Count Vectorizer**
-  The vectors are stored in the form of a sparse matrix.
- Number of columns represent the number of features (len(vocab))
- Number of rows represents the number of documents in the corpus
- Thus, each document is represented by a vector of size of length of vocab.
- For each row, <font color = 'dodgerblue'>**the numbers displayed are the number of times a particular word has occurred in the document.**

In [None]:
# binary=False (the default) keeps the actual number of occurrences of each term
term_freq_vectorizer = CountVectorizer(binary=False)
# fit_transform() learns the vocabulary and builds the vectors in a single call
count_vectors = term_freq_vectorizer.fit_transform(Corpus)

# Densify once and reuse the array for both the values and the shape
count_dense = count_vectors.toarray()
print('count vectors in array (dense) format\n')
print(count_dense)
print(f'\nThe shape of the count vectors is : {count_dense.shape}')


count vectors in array (dense) format

[[1 0 1 1 1 0 2 4 0 2 1 0 1 0 0 1 0 2 4 0]
 [1 0 0 0 1 0 1 0 2 1 0 0 1 0 1 1 4 1 3 0]
 [1 1 0 0 1 3 2 0 0 2 1 1 1 1 0 1 0 2 4 1]]

The shape of the count vectors is : (3, 20)


In [None]:
# Tabular view of the count vectors: one row per document, one column per term
count_feature_names = term_freq_vectorizer.get_feature_names_out()
df_count = pd.DataFrame(data=count_vectors.toarray(), columns=count_feature_names)
df_count


Unnamed: 0,based,binary,count,doesn,done,dummy,for,frequency,higher,is,key,of,on,presence,score,scoring,tfidf,this,vectorizer,word
0,1,0,1,1,1,0,2,4,0,2,1,0,1,0,0,1,0,2,4,0
1,1,0,0,0,1,0,1,0,2,1,0,0,1,0,1,1,4,1,3,0
2,1,1,0,0,1,3,2,0,0,2,1,1,1,1,0,1,0,2,4,1


### <font color = 'pickle'>**tf-idf Vectorizer**</font>

- One measure of how important a word is, is its term frequency (tf): how frequently the word occurs in a document. We examined term frequency in the previous sections, where we used CountVectorizer to get the frequency of each word.
- But there may be words in a document, that occur many times but these words also occur in all other documents as well.
- Therefore the word might not be a good representation of the document.
- We can account for this by  <font color = 'dodgerblue'>giving more importance to words that occur in fewer documents using inverse document frequency </font> ((# Number of documents) / (Number of documents containing the word)).
- This can be <font color = 'dodgerblue'>combined with term frequency</font> to calculate a term’s tf-idf (the two quantities multiplied together), the frequency of a term adjusted for how rarely it is used.
- The idea of tf-idf is to <font color = 'dodgerblue'>find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents.</font>
- tf-idf gives more weight to the words that are important (i.e., occur more frequently) in a given document, but occur rarely in other documents.

In [None]:
# TfidfVectorizer with default settings; fit_transform() learns the vocabulary
# and idf weights and returns the tf-idf vectors in a single call
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(Corpus)

# Densify once and reuse the array for both the values and the shape
tfidf_dense = tfidf_vectors.toarray()
print('tfidf vectors in array (dense) format\n')
print(tfidf_dense)
print(f'\nThe shape of the tfidf vectors is : {tfidf_dense.shape}')


tfidf vectors in array (dense) format

[[0.10829999 0.         0.18336782 0.18336782 0.10829999 0.
  0.21659998 0.73347128 0.         0.21659998 0.13945595 0.
  0.10829999 0.         0.         0.10829999 0.         0.21659998
  0.43319995 0.        ]
 [0.11455596 0.         0.         0.         0.11455596 0.
  0.11455596 0.         0.3879202  0.11455596 0.         0.
  0.11455596 0.         0.1939601  0.11455596 0.77584039 0.11455596
  0.34366788 0.        ]
 [0.11874019 0.20104462 0.         0.         0.11874019 0.60313387
  0.23748039 0.         0.         0.23748039 0.15289962 0.20104462
  0.11874019 0.20104462 0.         0.11874019 0.         0.23748039
  0.47496077 0.20104462]]

The shape of the tfidf vectors is : (3, 20)


In [None]:
# Tabular view of the tf-idf vectors: one row per document, one column per term
# (values are rounded to 4 decimals for display only)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(data=tfidf_vectors.toarray(), columns=tfidf_feature_names)
df_tfidf.round(4)


Unnamed: 0,based,binary,count,doesn,done,dummy,for,frequency,higher,is,key,of,on,presence,score,scoring,tfidf,this,vectorizer,word
0,0.1083,0.0,0.1834,0.1834,0.1083,0.0,0.2166,0.7335,0.0,0.2166,0.1395,0.0,0.1083,0.0,0.0,0.1083,0.0,0.2166,0.4332,0.0
1,0.1146,0.0,0.0,0.0,0.1146,0.0,0.1146,0.0,0.3879,0.1146,0.0,0.0,0.1146,0.0,0.194,0.1146,0.7758,0.1146,0.3437,0.0
2,0.1187,0.201,0.0,0.0,0.1187,0.6031,0.2375,0.0,0.0,0.2375,0.1529,0.201,0.1187,0.201,0.0,0.1187,0.0,0.2375,0.475,0.201


### <font color = 'pickle'>**Understanding tfidf calculations**

By default <br>
$\text{tfidf}(w, d) = \text{tf(w, d)} * \text{idf(w)}$
<br>
$\text{idf(w)} = \log\big(\frac{N + 1}{N_w + 1}\big) + 1$
<br><br>
if smooth_idf = False (default is True):
<br>
$\text{idf(w)} = \log\big(\frac{N }{N_w}\big) + 1$
<br><br>
if sublinear_tf = True (default is False)
<br>
$\text{tf(w, d)} = \log(\text{tf(w, d)} ) + 1$

Here:<br>
- $\text{tf}(w, d)$ is number of times word $w$ appears in document $d$
<br>
- $\text{idf}(w)$ is inverse document frequency of word $w$
- $N$ is total number of documents
- $N_w$ is number of documents that contain word w

In [None]:
# Read the inverse document frequency learned for each feature (word) during
# fitting; the array has one value per vocabulary term, in feature-name order
term_idf = tfidf_vectorizer.idf_
term_idf


array([1.        , 1.69314718, 1.69314718, 1.69314718, 1.        ,
       1.69314718, 1.        , 1.69314718, 1.69314718, 1.        ,
       1.28768207, 1.69314718, 1.        , 1.69314718, 1.69314718,
       1.        , 1.69314718, 1.        , 1.        , 1.69314718])

In [None]:
# Create a dataframe for better visualization: idf values indexed by feature
# name, then rounded and transposed so the terms appear as columns
df_idf = pd.DataFrame(term_idf, index=tfidf_vectorizer.get_feature_names_out())
df_idf.round(4).T


Unnamed: 0,based,binary,count,doesn,done,dummy,for,frequency,higher,is,key,of,on,presence,score,scoring,tfidf,this,vectorizer,word
0,1.0,1.6931,1.6931,1.6931,1.0,1.6931,1.0,1.6931,1.6931,1.0,1.2877,1.6931,1.0,1.6931,1.6931,1.0,1.6931,1.0,1.0,1.6931


In [None]:
# Term-frequency table for the first document: one row per vocabulary term.

# Vocabulary terms, in the same order as the columns of the count matrix
feature_names_tf = term_freq_vectorizer.get_feature_names_out()

# Row 0 of the sparse count matrix, densified and flattened to one dimension
first_document_tf = count_vectors[0].toarray().ravel()

df_tf = pd.DataFrame(dict(features=feature_names_tf, tf=first_document_tf))
df_tf


Unnamed: 0,features,tf
0,based,1
1,binary,0
2,count,1
3,doesn,1
4,done,1
5,dummy,0
6,for,2
7,frequency,4
8,higher,0
9,is,2


Note: The `toarray` method is used to convert the sparse matrix into a dense numpy array, and `ravel` is used to flatten the resulting 2-dimensional array into a 1-dimensional array. This is necessary because pandas dataframes expect 1-dimensional arrays as values for the columns.

In [None]:
# Build a per-term table for the first document holding the idf weight and
# the (normalized) tf-idf score produced by TfidfVectorizer.
# Note: this rebinds `df_tfidf`, which previously held the full tf-idf matrix.
first_document_tfidf = tfidf_vectors[0].toarray().ravel()
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame({'features': feature_names_tfidf,
                         'idf': term_idf, 'norm_tfidf': first_document_tfidf})

# Combine the tf and tf-idf tables.
# The join key is stated explicitly so the merge cannot silently pick up any
# other column the two frames might share, and the result is sorted without
# in-place mutation so the cell is safe to re-run.
df = (
    pd.merge(left=df_tf, right=df_tfidf, on='features')
    .sort_values(by='norm_tfidf', ascending=False)
)

df


Unnamed: 0,features,tf,idf,norm_tfidf
7,frequency,4,1.693147,0.733471
18,vectorizer,4,1.0,0.4332
17,this,2,1.0,0.2166
6,for,2,1.0,0.2166
9,is,2,1.0,0.2166
2,count,1,1.693147,0.183368
3,doesn,1,1.693147,0.183368
10,key,1,1.287682,0.139456
12,on,1,1.0,0.1083
15,scoring,1,1.0,0.1083


**Observations from above results**
- The words 'frequency' and 'vectorizer' each occur 4 times in the document, and hence their term frequency is 4.
- Word 'vectorizer' occurs in every document and hence idf is 1 (log(1) + 1).
- norm_tfidf gives higher score to word 'frequency' than 'vectorizer'.
- norm_tfidf is not equal to idf * tf

Let us now understand how norm_tfidf is calculated:

In [None]:
# Raw tf-idf score (before any normalization): term frequency times idf
df['tfidf'] = df['tf'] * df['idf']


In [None]:
# Calculate the normalized tf-idf by hand: divide every raw tf-idf score by
# the Euclidean (L2) length of the document's tf-idf vector, i.e. the square
# root of the sum of squared scores.
df['sq_tfidf'] = df['tfidf'] ** 2
df['norm_tfidf_manually'] = df['tfidf'] / np.sqrt(df['sq_tfidf'].sum())

# Check that the manual calculation reproduces TfidfVectorizer's output, then
# show both columns side by side so the result is actually visible.
assert np.allclose(df['norm_tfidf'], df['norm_tfidf_manually'])
df[['features', 'tf', 'idf', 'tfidf', 'norm_tfidf', 'norm_tfidf_manually']]


## <font color = 'pickle'>**Modifying Vocab**

### <font color = 'pickle'>**Case sensitive**

In [None]:
# The lowercase argument is set to False to indicate that the text should
# not be converted to lowercase before tokenizing.
# The resulting vocab may contain the same word in different cases
# (here, for example, both 'For' and 'for').
vectorizer = CountVectorizer(lowercase=False)

# we can use fit_transform to use fit() and transform() in one step
vectors = vectorizer.fit_transform(Corpus)
vectorizer.vocabulary_


{'Count': 1,
 'Vectorizer': 3,
 'for': 8,
 'this': 19,
 'vectorizer': 20,
 'scoring': 17,
 'is': 11,
 'done': 6,
 'based': 4,
 'on': 14,
 'frequency': 9,
 'For': 2,
 'key': 12,
 'doesn': 5,
 'tfidf': 18,
 'higher': 10,
 'score': 16,
 'Binary': 0,
 'presence': 15,
 'of': 13,
 'word': 21,
 'dummy': 7}

### <font color = 'pickle'>**Filtering words based on frequency**

The `max_df`, `min_df`, and `max_features` parameters in the `CountVectorizer` class control the feature selection for the resulting term frequency (tf) vectors.

- `max_df`: This parameter sets the maximum threshold for the frequency of a term in the document collection. If a term has a document frequency (i.e., the number of documents that contain the term) higher than max_df, it will be ignored. <font color = 'dodgerblue' >**This parameter is used to filter out stop words (corpus specific) that appear in too many documents.** </font>

- min_df: This parameter sets the minimum threshold for the frequency of a term in the document collection. If a term has a document frequency lower than min_df, it will be ignored.  <font color = 'dodgerblue' >**This parameter is used to filter out rare words that appear in too few documents.**

- max_features: This parameter sets the maximum number of features (i.e., the maximum number of unique terms) that should be included in the resulting tf vectors. If the number of unique terms in the document collection is larger than max_features, the terms with the highest tf values will be kept and the others will be ignored.  <font color = 'dodgerblue' >**This parameter is used to reduce the dimensionality of the resulting tf vectors, which can help reduce the computational cost of downstream processing.**

By using the max_df, min_df, and max_features parameters, you can control the feature selection process and determine the most informative terms to include in the tf vectors.

In [None]:
# Drop rare words: min_df=2 keeps only the terms that occur in at least 2 documents
vectorizer = CountVectorizer(min_df=2).fit(Corpus)
vectorizer.vocabulary_


{'vectorizer': 8,
 'for': 2,
 'this': 7,
 'scoring': 6,
 'is': 3,
 'done': 1,
 'based': 0,
 'on': 5,
 'key': 4}

In [None]:
# Drop corpus-specific stop words: max_df=2 discards every term that occurs
# in more than 2 documents
vectorizer = CountVectorizer(max_df=2).fit(Corpus)
vectorizer.vocabulary_


{'count': 1,
 'frequency': 4,
 'key': 6,
 'doesn': 2,
 'tfidf': 10,
 'higher': 5,
 'score': 9,
 'binary': 0,
 'presence': 8,
 'of': 7,
 'word': 11,
 'dummy': 3}

In [None]:
# Keep only the most frequent words: max_features=5 retains the top 5 terms
# ranked by their term frequency across the whole corpus
vectorizer = CountVectorizer(max_features=5).fit(Corpus)
vectorizer.vocabulary_


{'vectorizer': 4, 'for': 0, 'this': 3, 'is': 1, 'tfidf': 2}

### <font color = 'pickle'>**Stop Words**

In [None]:
# We can also pass a list of stop words to CountVectorizer so that those words
# are excluded from the features

# Load NLTK's English stop-word list (the corpus is downloaded in the import cell)
nltk_stop_words = nltk_stopwords.words('english')

vectorizer = CountVectorizer(max_features=5, stop_words=nltk_stop_words)
vectorizer.fit(Corpus)
vectorizer.vocabulary_


{'vectorizer': 4, 'done': 1, 'based': 0, 'frequency': 2, 'tfidf': 3}

### <font color = 'pickle'>**Custom Tokenizer and Preprocessor**

#### <font color = 'pickle'>**nltk tokenizer**

In [None]:
# We can use a custom tokenizer, e.g. NLTK's TweetTokenizer, so that hashtags,
# @mentions and punctuation marks become features of their own

# Create an instance of the TweetTokenizer class
tweet_tokenizer = TweetTokenizer()

# Initialize the CountVectorizer with the custom tokenizer
# (a custom tokenizer is only used when analyzer = 'word').
# token_pattern=None states explicitly that the default regex is not used; this
# avoids scikit-learn's "token_pattern will not be used" warning and matches the
# spaCy-tokenizer cells further down.
vectorizer = CountVectorizer(
    analyzer='word', tokenizer=tweet_tokenizer.tokenize, token_pattern=None)

# Only the vocabulary is inspected here, so fitting is sufficient
vectorizer.fit(Corpus)
vectorizer.vocabulary_




{'count': 11,
 'vectorizer': 28,
 '-': 4,
 'for': 15,
 'this': 27,
 ',': 3,
 'scoring': 24,
 'is': 18,
 'done': 13,
 'based': 9,
 'on': 21,
 'frequency': 16,
 '.': 5,
 'key': 19,
 '@vectorizer': 8,
 '#frequency': 1,
 '@frequency': 7,
 'doesn': 12,
 '’': 30,
 't': 25,
 'tfidf': 26,
 'higher': 17,
 'score': 23,
 '#tfidf': 2,
 'binary': 10,
 'presence': 22,
 'of': 20,
 'word': 29,
 'dummy': 14,
 '#dummy': 0,
 '@dummy': 6}

#### <font color = 'pickle'>**spacy pre-processor and tokenizer**

In [None]:
def spacy_preprocessor(text):
    """Strip punctuation tokens from ``text`` using the spaCy pipeline ``nlp``.

    Args:
        text: Raw document string.

    Returns:
        The texts of all non-punctuation tokens, joined by single spaces.
    """
    # Run the pipeline and keep every token that spaCy does not flag as punctuation
    kept_tokens = (tok.text for tok in nlp(text) if not tok.is_punct)

    # Re-assemble the surviving tokens into one string
    return " ".join(kept_tokens)


In [None]:
# Spacy Tokenizer
def spacy_tokenizer(data):
    """Tokenize ``data`` with the spaCy pipeline ``nlp``.

    Whitespace-only tokens (produced for runs of extra spaces) are dropped so
    that they do not end up as meaningless entries in the vocabulary.

    Args:
        data: Document string to split into tokens.

    Returns:
        List with the text of every non-whitespace token, in document order.
    """
    doc = nlp(data)
    return [token.text for token in doc if not token.is_space]


In [None]:
# Custom preprocessor and spaCy tokenizer.
# Supplying a preprocessor replaces CountVectorizer's built-in preprocessing
# step (which is where lowercasing happens), so the vocabulary is
# case-sensitive here, e.g. it contains both 'For' and 'for'.
# token_pattern=None because the regex is not used when a tokenizer is given.
vectorizer = CountVectorizer(analyzer='word', preprocessor=spacy_preprocessor,
                             tokenizer=spacy_tokenizer, token_pattern=None)

# fit_transform() is used (fit() alone returns the vectorizer itself) so that
# `vectors` really holds the document-term matrix
vectors = vectorizer.fit_transform(Corpus)
vectorizer.vocabulary_


{'Count': 5,
 'Vectorizer': 7,
 'for': 12,
 'this': 24,
 'vectorizer': 25,
 'scoring': 22,
 'is': 15,
 'done': 10,
 'based': 8,
 'on': 19,
 'frequency': 13,
 'For': 6,
 'key': 16,
 '@vectorizer': 3,
 '@frequency': 2,
 'does': 9,
 'n’t': 17,
 'tfidf': 23,
 '  ': 0,
 'higher': 14,
 'score': 21,
 'Binary': 4,
 'presence': 20,
 'of': 18,
 'word': 26,
 'dummy': 11,
 '@dummy': 1}

#### <font color = 'pickle'>**custom preprocessor we created earlier**

In [None]:
# Instantiate the custom spaCy-based preprocessor from custom_preprocessor_mod
# with stop-word removal and lemmatization switched on and stemming switched off.
# NOTE(review): the exact cleaning steps are defined in custom_preprocessor_mod,
# which is not shown in this notebook - check that module for what else it removes.
custom_preprocessor = cp.SpacyPreprocessor(
    'en_core_web_sm', remove_stop=True, lemmatize=True, stemming=False)


In [None]:
def spacy_preprocessor(text):
    """Clean one document with ``custom_preprocessor``.

    Note: this intentionally redefines the ``spacy_preprocessor`` helper from
    the previous section so that the vectorizer below picks up the new cleaner.

    Args:
        text: Raw document string.

    Returns:
        The cleaned document as a single string.
    """
    # transform() works on a collection of documents, so the single text is
    # wrapped in a list and the returned pieces are joined back together
    return " ".join(custom_preprocessor.transform([text]))


In [None]:
# Custom preprocessor (the SpacyPreprocessor wrapper defined above) combined
# with the spaCy tokenizer.
# token_pattern=None because the regex is not used when a tokenizer is given.
vectorizer = CountVectorizer(analyzer='word', preprocessor=spacy_preprocessor,
                             tokenizer=spacy_tokenizer, token_pattern=None)

# fit_transform() is used (fit() alone returns the vectorizer itself) so that
# `vectors` really holds the document-term matrix
vectors = vectorizer.fit_transform(Corpus)
vectorizer.vocabulary_


{'count': 3,
 'vectorizer': 12,
 'scoring': 10,
 'base': 1,
 'frequency': 5,
 'key': 7,
 'tfidf': 11,
 '  ': 0,
 'high': 6,
 'score': 9,
 'binary': 2,
 'presence': 8,
 'word': 13,
 'dummy': 4}

#### <font color = 'pickle'>**token patterns with regular expressions**

In [None]:
# We can pass a regex to the argument token_pattern to define what counts as a token.
# r"[\S]+" matches runs of non-whitespace characters, i.e. a whitespace tokenizer:
# punctuation stays attached to the words (e.g. 'vectorizer,' and 'key.').
# This can be very useful if we have already cleaned the text
vectorizer = CountVectorizer(analyzer='word', token_pattern=r"[\S]+")

# Assign the encoded(transformed) vectors to a variable
vectors = vectorizer.fit_transform(Corpus)

vectorizer.vocabulary_


{'count': 9,
 'vectorizer': 28,
 '-': 3,
 'for': 13,
 'this': 27,
 'vectorizer,': 29,
 'scoring': 24,
 'is': 17,
 'done': 11,
 'based': 7,
 'on': 21,
 'frequency.': 15,
 'frequency': 14,
 'key.': 19,
 '@vectorizer': 6,
 '#frequency': 1,
 '@frequency,': 5,
 'doesn’t': 10,
 'tfidf': 25,
 'tfidf,': 26,
 'higher': 16,
 'score': 23,
 '#tfidf': 2,
 'binary': 8,
 'presence': 22,
 'of': 20,
 'word.': 30,
 'dummy': 12,
 'key': 18,
 '#dummy': 0,
 '@dummy': 4}

### <font color = 'pickle'>**ngrams**</font>

- Till now our features consists of single token. However, in some cases we may want to use sequence of tokens as features
- Consider the following corpus
 1. This item is good
 2. This item is not good
- Now  both the documents will have feature 'good' and 'not' will be an additional feature in document 2.
- For applications like sentiment analysis - it might be a good idea to consider 'not good' as a single token.

- We can use `ngram_range=(min_n, max_n)` in CountVectorizer to create features that consist of sequences of words.

- if we specify min_n = 2 and max_n = 3, we will get bigrams and trigrams as features.

In [None]:
# Bigram example: with ngram_range=(2, 2) every feature is a pair of
# consecutive tokens, so "not good" and "terribly good" become features.
min_n = 2
max_n = 2
bigram_range = (min_n, max_n)

text1 = ["This item is not good"]
text2 = ["This item is terribly good"]

# One word-level vectorizer per text so that each vocabulary can be shown separately
vectorizer1 = CountVectorizer(analyzer='word', ngram_range=bigram_range)
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=bigram_range)

# Only the learned vocabulary is needed here, so fitting is sufficient
features1 = vectorizer1.fit(text1).get_feature_names_out()
features2 = vectorizer2.fit(text2).get_feature_names_out()

# Print one feature per line under its heading
print('Features for text 1\n')
print(*features1, sep='\n')

print('\nFeatures for text 2\n')
print(*features2, sep='\n')


Features for text 1

is not
item is
not good
this item

Features for text 2

is terribly
item is
terribly good
this item


## <font color = 'pickle'>**Example : IMDB Data set**

### <font color = 'pickle'>**Import Data**

In [None]:
# Locate the IMDB movie-review CSV files (downloaded in the last lecture):
# <basepath>/datasets/aclImdb/train.csv and test.csv
base_folder = Path(basepath)
data_folder = base_folder / 'datasets'
imdb_folder = data_folder / 'aclImdb'
train_data = imdb_folder / 'train.csv'
test_data = imdb_folder / 'test.csv'


In [None]:
# Load both splits; the first CSV column is used as the row index
train_df = pd.read_csv(train_data, index_col=0)
test_df = pd.read_csv(test_data, index_col=0)

# Report the size of each split, then preview the training data
for split_name, split_df in (('Training', train_df), ('Test', test_df)):
    print(f'Shape of {split_name} data set is : {split_df.shape}')
print('\nTop five rows of Training data set:\n')
train_df.head()


Shape of Training data set is : (25000, 2)
Shape of Test data set is : (25000, 2)

Top five rows of Training data set:



Unnamed: 0,Reviews,Labels
0,Ever wanted to know just how much Hollywood co...,1
1,The movie itself was ok for the kids. But I go...,1
2,You could stage a version of Charles Dickens' ...,1
3,this was a fantastic episode. i saw a clip fro...,1
4,and laugh out loud funny in many scenes.<br />...,1


### <font color = 'pickle'>**Generating Vocab**</font>
- <font color = 'indianred'>**Vocab should be created only based on training dataset**</font>
- We will generate vocab using CountVectorizer
- <font color = 'indianred'>**Use fit_transform() on Training data set**.
- **Use only transform() on Test dataset**. This makes sure that we generate the vocab based only on the training dataset.

In [None]:
# Build the vocabulary from the TRAINING reviews only, ignoring NLTK's
# English stop words
nltk_stop_words = nltk_stopwords.words('english')
train_reviews = train_df['Reviews'].values

# fit() returns the fitted vectorizer, so creation and fitting can be chained
bag_of_word = CountVectorizer(stop_words=nltk_stop_words).fit(train_reviews)
bag_of_word


In [None]:
# Get the feature names (vocabulary terms) learned from the training reviews.
# Note: this rebinds `features`, which earlier held the dummy-corpus vocabulary.
features = bag_of_word.get_feature_names_out()


In [None]:
# Check the length of the vocab (number of features)
len(features)


74704

### <font color = 'pickle'>**Create vectors for reviews**

In [None]:
# Transform the training and test dataset with the vocabulary that was
# learned from the training data only (no re-fitting on the test set)
bow_vector_train = bag_of_word.transform(train_df['Reviews'].values)
bow_vector_test = bag_of_word.transform(test_df['Reviews'].values)


In [None]:
# Shape of the matrix for train dataset: the sparse-matrix repr shows
# (documents x features) and the number of stored non-zero entries
bow_vector_train


<25000x74704 sparse matrix of type '<class 'numpy.int64'>'
	with 2479678 stored elements in Compressed Sparse Row format>

In [None]:
# Shape of the matrix for test dataset: same number of columns as the training
# matrix, because both were encoded with the same vocabulary
bow_vector_test


<25000x74704 sparse matrix of type '<class 'numpy.int64'>'
	with 2385031 stored elements in Compressed Sparse Row format>

### <font color = 'pickle'>**Limit vocab using max_features**
We got 25k rows with roughly 75k features (74,704), but what if we want only the top 5k features?
We can do this by providing max_features parameter.

In [None]:
# Limit Vocab size using Max features
# This cell uses spaCy's stop-word collection (instead of the NLTK list used
# above); it is converted to a list before being passed to CountVectorizer.
# Note: `bag_of_word` is rebound here, replacing the full-vocabulary vectorizer.
spacy_stop_words = nlp.Defaults.stop_words
bag_of_word = CountVectorizer(
    max_features=5000, stop_words=list(spacy_stop_words))  # Max features

# Fit on training data
bag_of_word.fit(train_df['Reviews'].values)




In [None]:
# Transform the training and test dataset with the 5,000-term vocabulary.
# The test vectors must be built from test_df (not train_df); otherwise the
# "test" matrix is just a copy of the training matrix.
bow_vector_train = bag_of_word.transform(train_df['Reviews'].values)
bow_vector_test = bag_of_word.transform(test_df['Reviews'].values)


In [None]:
# Document representation: one row per training review, one column per vocabulary term.
# The frame is built directly from the sparse matrix; calling toarray() first would
# materialise a dense 25,000 x 5,000 int64 array (about 1 GB) just for a preview.
vocab = bag_of_word.get_feature_names_out()
pd.DataFrame.sparse.from_spmatrix(bow_vector_train, columns=vocab)


Unnamed: 0,00,000,10,100,11,12,13,13th,14,15,...,yesterday,york,young,younger,youth,zero,zizek,zombie,zombies,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24997,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
24998,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
