In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a vectorizer with default settings and show the learned token -> index mapping.
v = CountVectorizer().fit(["Shaun is looking for a job is"])
v.vocabulary_

{'shaun': 4, 'is': 1, 'looking': 3, 'for': 0, 'job': 2}

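**The numbers shown in the output are the feature (column) indices assigned to each token; they follow the alphabetical order of the tokens. Note that the single-letter word "a" does not appear in the vocabulary, and the repeated word "is" gets only one entry.**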

**To see the attributes and methods of CountVectorizer, type `v.` and press Tab. The parameter we use most here is `ngram_range`, which sets the sizes of the n-grams to extract. By default `ngram_range` is (1, 1), which means unigrams only.**

In [2]:
# ngram_range=(min_n, max_n): build a feature from every run of 1, 2 or 3 consecutive tokens.
v = CountVectorizer(ngram_range=(1, 3)).fit(["Shaun is looking for a job"])
v.vocabulary_

{'shaun': 9,
 'is': 2,
 'looking': 6,
 'for': 0,
 'job': 5,
 'shaun is': 10,
 'is looking': 3,
 'looking for': 7,
 'for job': 1,
 'shaun is looking': 11,
 'is looking for': 4,
 'looking for job': 8}

**The numbers shown in the output are the feature indices. They are shown because `vocabulary_` is a mapping from each n-gram in the vocabulary to its column index.**

**In the output, the first five lines are the unigrams, lines 6-9 are the bigrams, and lines 10-12 are the trigrams.**

In [3]:
# Same 1- to 3-gram setup, this time on a sentence that contains a special character ("-").
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Absolutely wonderful - silky and easy and comfortable"]
)
v.vocabulary_

{'absolutely': 0,
 'wonderful': 14,
 'silky': 11,
 'and': 3,
 'easy': 8,
 'comfortable': 7,
 'absolutely wonderful': 1,
 'wonderful silky': 15,
 'silky and': 12,
 'and easy': 5,
 'easy and': 9,
 'and comfortable': 4,
 'absolutely wonderful silky': 2,
 'wonderful silky and': 16,
 'silky and easy': 13,
 'and easy and': 6,
 'easy and comfortable': 10}

**A special character (-) is present in the input text, but it does not appear in the vocabulary because CountVectorizer's default tokenizer treats punctuation as a token separator and drops it before building the features.**

In [4]:
# Column (feature) names after applying n-grams.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` is used to keep the plain-list display.
v.get_feature_names_out().tolist()



['absolutely',
 'absolutely wonderful',
 'absolutely wonderful silky',
 'and',
 'and comfortable',
 'and easy',
 'and easy and',
 'comfortable',
 'easy',
 'easy and',
 'easy and comfortable',
 'silky',
 'silky and',
 'silky and easy',
 'wonderful',
 'wonderful silky',
 'wonderful silky and']

**Index numbers are not shown here because this output lists only the feature names (in index order), not the name-to-index mapping.**

In [5]:
# Inspect the vectorizer's `stop_words` setting. `v` was constructed with only `ngram_range`,
# and this cell displays no output, which is consistent with no stop-word list being configured.

v.stop_words

In [6]:
# Small toy corpus used for the preprocessing and bag-of-n-grams examples.
# NOTE(review): "Logi" in the third document looks like a typo for "Loki"; it is left unchanged
# because the recorded outputs contain 'logi' entries.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Logi is eating pizza"
]

In [7]:
# !python -m spacy download en_core_web_sm

In [8]:
# Code to remove stop words and punctuation marks from text and apply lemmatization on text.

import spacy

nlp = spacy.load("en_core_web_sm")  # load the en_core_web_sm English pipeline; preprocess() calls it on each text

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize what remains.

    The text is run through the module-level `nlp` pipeline. Every token that is
    flagged as a stop word or as punctuation is dropped; each remaining token is
    replaced by its lemma.

    Args:
        text: The raw string to clean.

    Returns:
        The lemmas of the kept tokens, in their original order, joined by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [9]:
preprocess("Thor ate pizza")

'thor eat pizza'

In [10]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [11]:
# Run every document of the corpus through preprocess(), keeping the original order.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Logi eat pizza']

In [12]:
# Learn unigram and bigram features (ngram_range 1 to 2) from the preprocessed corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 8,
 'eat': 0,
 'pizza': 6,
 'thor eat': 9,
 'eat pizza': 1,
 'loki': 4,
 'tall': 7,
 'loki tall': 5,
 'logi': 2,
 'logi eat': 3}

**This is the vocabulary learned from the preprocessed corpus.**

In [15]:
# Generate the bag-of-n-grams vector for a sample document using the vectorizer fitted above.

v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

**This is the vector representation of the first (preprocessed) document of the corpus.**

In [18]:
# Let's take a document that has out of vocabulary (OOV) term and see how bag of ngram generates vector out of it

In [19]:
# "hulk" does not occur in the texts the vectorizer was fitted on; compare with the "Thor eat pizza" vector.
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

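**"Hulk" is an out-of-vocabulary (OOV) term, so it is simply ignored: the n-grams "hulk" and "hulk eat" have no column in the vocabulary and contribute nothing to the vector. Only the known n-grams "eat", "pizza" and "eat pizza" are counted, which is why the columns for "thor" and "thor eat" are now 0. The output is not random, and no prediction is involved.**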

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Second toy corpus for the bigram-only example. This rebinds `corpus`, replacing the earlier list.
corpus = [
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document,"
]

In [22]:
vectorizer = CountVectorizer(ngram_range=(2, 2))  # min_n = max_n = 2: every feature is a pair of consecutive words (bigrams only)

In [23]:
# Fit the bigram vectorizer on the corpus and keep the transformed result in `x`.
x = vectorizer.fit_transform(corpus)

In [24]:
# Names of the bigram features learned by `vectorizer`.
vectorizer.get_feature_names_out()

array(['and this', 'document is', 'first document', 'is the', 'is this',
       'second document', 'the first', 'the second', 'the third',
       'third one', 'this document', 'this is', 'this the'], dtype=object)