## Count Vectorizer

In [8]:
#import count vectorizer and tfidf vectoriser
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import pandas as pd
train = ('The sky is blue.','The sun is bright.')
test = ('The sun in the sky is bright', 'We can see the shining sun, the bright sun.')
# instantiate the vectorizer object
# use analyzer as word and stop_words is english which are responsible for remove stop words and create word vocabulary
# fit_transform creates the vocabluary depending on the group of documents passed.
countvectorizer = CountVectorizer(analyzer='word' , stop_words='english')

terms = countvectorizer.fit_transform(train)
term_vectors  = countvectorizer.transform(test)
print(f"Term frequency built from train data: \n{terms}")
print(f"Vocaboluary created: \n {countvectorizer.vocabulary_}")
print(f"Feature names: \n {countvectorizer.get_feature_names_out()}")
# print(f"Feature names: \n {countvectorizer.get_feature_names()}") // deprecated

print(f"Coordinates which have non-zero entries in term frequency matrix: \n{term_vectors}")
print("Sparse Matrix form of test data term frequency: \n")
print(term_vectors.todense())

df = pd.DataFrame(term_vectors.toarray(), index = {'Doc1', 'Doc2'}, columns = countvectorizer.get_feature_names_out())
df


Term frequency built from train data: 
  (0, 2)	1
  (0, 0)	1
  (1, 3)	1
  (1, 1)	1
Vocaboluary created: 
 {'sky': 2, 'blue': 0, 'sun': 3, 'bright': 1}
Feature names: 
 ['blue' 'bright' 'sky' 'sun']
Coordinates which have non-zero entries in term frequency matrix: 
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (1, 1)	1
  (1, 3)	2
Sparse Matrix form of test data term frequency: 

[[0 1 1 1]
 [0 1 0 2]]


Unnamed: 0,blue,bright,sky,sun
Doc2,0,1,1,1
Doc1,0,1,0,2


## TfidfTransformer

In [2]:
tfidf = TfidfTransformer(norm='l2')
tfidf.fit(term_vectors)

print("\nVector of idf \n")
print(tfidf.idf_)
## idf is calculated by the formula = ln( D+1 /df + 1) + 1 | [ adding 1 in num/den/and after calculating natural log as well ]

tf_idf_matrix = tfidf.transform(term_vectors)
print("\nFinal tf-idf vectorizer matrix form :\n")
print(tf_idf_matrix.todense())
## tfidf is calculated as multipication of tf and idf. And then normalizing it by dividing the square root of 
## sum of squares of tfidf(i,j) 




Vector of idf 

[2.09861229 1.         1.40546511 1.        ]

Final tf-idf vectorizer matrix form :

[[0.         0.50154891 0.70490949 0.50154891]
 [0.         0.4472136  0.         0.89442719]]


In [3]:
# fit and transform can be done in one single step as well
tfidf_1 = TfidfTransformer(norm='l2')
tf_idf_matrix_1 = tfidf_1.fit_transform(term_vectors)

print("\nVector of idf \n")
print(tfidf_1.idf_)
print("\nFinal tf-idf vectorizer matrix form :\n")
print(tf_idf_matrix_1.todense())



Vector of idf 

[2.09861229 1.         1.40546511 1.        ]

Final tf-idf vectorizer matrix form :

[[0.         0.50154891 0.70490949 0.50154891]
 [0.         0.4472136  0.         0.89442719]]


## TfidfVectorizer

In [4]:
tfidfvectorizer = TfidfVectorizer(analyzer='word' , stop_words='english')
tfidfvectorizer.fit(train)

tfidf_term_vectors  = tfidfvectorizer.transform(test)

print(tfidfvectorizer.idf_)

print("Sparse Matrix form of test data : \n")
tfidf_term_vectors.todense()

[1.40546511 1.40546511 1.40546511 1.40546511]
Sparse Matrix form of test data : 



matrix([[0.        , 0.57735027, 0.57735027, 0.57735027],
        [0.        , 0.4472136 , 0.        , 0.89442719]])

In [5]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

cosine_similarity(tfidf_term_vectors, tfidf_term_vectors)



array([[1.        , 0.77459667],
       [0.77459667, 1.        ]])

In [6]:
from sklearn.metrics.pairwise import linear_kernel
linear_kernel(tfidf_term_vectors,tfidf_term_vectors)

array([[1.        , 0.77459667],
       [0.77459667, 1.        ]])

In [7]:
cosine_similarity(term_vectors, term_vectors)

array([[1.        , 0.77459667],
       [0.77459667, 1.        ]])