In [1]:
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

## Topics
- CountVectorizer
- TfidfVectorizer
- Cosine Similarity

In [2]:
# Three short, unrelated documents used as the toy corpus for the vectorizer demos.
fruits = (
    'fruits are quite healthy to eat, '
    'since we are living in a polluted world'
)
veg = (
    'Getting fresh and hygenic vegetables became quite difficult '
    'nowadays that too without pesticides'
)
tmu = (
    'sumit loves to study in tmu, he treats this place as a temple, '
    'he is very descent in classroom'
)

In [3]:
# Collect the documents into one list; row i of every matrix below corresponds to corpus[i].
corpus = [
    fruits,
    veg,
    tmu,
]

In [4]:
# Echo the corpus to confirm it holds the three sentences in order.
corpus

['fruits are quite healthy to eat, since we are living in a polluted world',
 'Getting fresh and hygenic vegetables became quite difficult nowadays that too without pesticides',
 'sumit loves to study in tmu, he treats this place as a temple, he is very descent in classroom']

## Applying CountVectorizer to the text data

In [5]:
from sklearn.feature_extraction import text

In [6]:
# Build a bag-of-words vocabulary and a sparse document-term count matrix.
# The corpus is supplied to fit_transform(); the `input` parameter only selects
# how documents are provided ('content', 'filename' or 'file') and must not be
# given the corpus itself.
cv = text.CountVectorizer()
matrix = cv.fit_transform(corpus)

In [7]:
# One row per document, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
pd.DataFrame(matrix.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,and,are,as,became,classroom,descent,difficult,eat,fresh,fruits,...,this,tmu,to,too,treats,vegetables,very,we,without,world
0,0,2,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,1,0,1
1,1,0,0,1,0,0,1,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,1,0,1,1,0,0,0,0,...,1,1,1,0,1,0,1,0,0,0


In [8]:
# ngram_range=(1, 1) keeps single tokens only (spelled out here to contrast
# with the wider ranges used in the following cells).
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv2 = text.CountVectorizer(ngram_range=(1, 1))
mat2 = cv2.fit_transform(corpus)

In [9]:
# Unigram counts as a labelled table.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
pd.DataFrame(mat2.toarray(), columns=cv2.get_feature_names_out())

Unnamed: 0,and,are,as,became,classroom,descent,difficult,eat,fresh,fruits,...,this,tmu,to,too,treats,vegetables,very,we,without,world
0,0,2,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,1,0,1
1,1,0,0,1,0,0,1,0,1,0,...,0,0,0,1,0,1,0,0,1,0
2,0,0,1,0,1,1,0,0,0,0,...,1,1,1,0,1,0,1,0,0,0


In [10]:
# Unigrams and bigrams: ngram_range=(1, 2) also adds each pair of adjacent
# tokens to the vocabulary.
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv3 = text.CountVectorizer(ngram_range=(1, 2))
mat3 = cv3.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(mat3.toarray(), columns=cv3.get_feature_names_out())

Unnamed: 0,and,and hygenic,are,are living,are quite,as,as temple,became,became quite,classroom,...,treats this,vegetables,vegetables became,very,very descent,we,we are,without,without pesticides,world
0,0,0,2,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
1,1,1,0,0,0,0,0,1,1,0,...,0,1,1,0,0,0,0,1,1,0
2,0,0,0,0,0,1,1,0,0,1,...,1,0,0,1,1,0,0,0,0,0


In [11]:
# Unigrams, bigrams and trigrams: ngram_range=(1, 3).
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv4 = text.CountVectorizer(ngram_range=(1, 3))
mat4 = cv4.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(mat4.toarray(), columns=cv4.get_feature_names_out())

Unnamed: 0,and,and hygenic,and hygenic vegetables,are,are living,are living in,are quite,are quite healthy,as,as temple,...,vegetables became quite,very,very descent,very descent in,we,we are,we are living,without,without pesticides,world
0,0,0,0,2,1,1,1,1,0,0,...,0,0,0,0,1,1,1,0,0,1
1,1,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,0,0,1,1,...,0,1,1,1,0,0,0,0,0,0


In [12]:
# Unigrams with scikit-learn's built-in English stop-word list removed
# before counting.
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv5 = text.CountVectorizer(ngram_range=(1, 1), stop_words='english')
mat5 = cv5.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(mat5.toarray(), columns=cv5.get_feature_names_out())

Unnamed: 0,classroom,descent,difficult,eat,fresh,fruits,getting,healthy,hygenic,living,...,place,polluted,quite,study,sumit,temple,tmu,treats,vegetables,world
0,0,0,0,1,0,1,0,1,0,1,...,0,1,1,0,0,0,0,0,0,1
1,0,0,1,0,1,0,1,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,1,1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,1,1,1,0,0


In [13]:
# As above, but max_features=10 caps the vocabulary at the 10 terms ranked
# highest by term frequency across the corpus.
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv6 = text.CountVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    max_features=10,
)
mat6 = cv6.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(mat6.toarray(), columns=cv6.get_feature_names_out())

Unnamed: 0,classroom,pesticides,place,polluted,quite,study,sumit,temple,tmu,treats
0,0,0,0,1,1,0,0,0,0,0
1,0,1,0,0,1,0,0,0,0,0
2,1,0,1,0,0,1,1,1,1,1


## TF-IDF (TfidfVectorizer)

In [14]:
# TF-IDF weights instead of raw counts, using the default unigram vocabulary.
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
tf = text.TfidfVectorizer()
m = tf.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(m.toarray(), columns=tf.get_feature_names_out())

Unnamed: 0,and,are,as,became,classroom,descent,difficult,eat,fresh,fruits,...,this,tmu,to,too,treats,vegetables,very,we,without,world
0,0.0,0.53965,0.0,0.0,0.0,0.0,0.0,0.269825,0.0,0.269825,...,0.0,0.0,0.205209,0.0,0.0,0.0,0.0,0.269825,0.0,0.269825
1,0.28196,0.0,0.0,0.28196,0.0,0.0,0.28196,0.0,0.28196,0.0,...,0.0,0.0,0.0,0.28196,0.0,0.28196,0.0,0.0,0.28196,0.0
2,0.0,0.0,0.224213,0.0,0.224213,0.224213,0.0,0.0,0.0,0.0,...,0.224213,0.224213,0.17052,0.0,0.224213,0.0,0.224213,0.0,0.0,0.0


In [15]:
# TF-IDF over bigrams only: ngram_range=(2, 2).
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
tf1 = text.TfidfVectorizer(ngram_range=(2, 2))
m1 = tf1.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(m1.toarray(), columns=tf1.get_feature_names_out())

Unnamed: 0,and hygenic,are living,are quite,as temple,became quite,descent in,difficult nowadays,eat since,fresh and,fruits are,...,this place,tmu he,to eat,to study,too without,treats this,vegetables became,very descent,we are,without pesticides
0,0.0,0.288675,0.288675,0.0,0.0,0.0,0.0,0.288675,0.0,0.288675,...,0.0,0.0,0.288675,0.0,0.0,0.0,0.0,0.0,0.288675,0.0
1,0.288675,0.0,0.0,0.0,0.288675,0.0,0.288675,0.0,0.288675,0.0,...,0.0,0.0,0.0,0.0,0.288675,0.0,0.288675,0.0,0.0,0.288675
2,0.0,0.0,0.0,0.242536,0.0,0.242536,0.0,0.0,0.0,0.0,...,0.242536,0.242536,0.0,0.242536,0.0,0.242536,0.0,0.242536,0.0,0.0


In [16]:
# TF-IDF over bigrams with English stop words removed, capped at 5 features.
# In the recorded output all 5 kept bigrams come from the third document, so
# the first two rows are entirely zero.
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
tf2 = text.TfidfVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    max_features=5,
)
m2 = tf2.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(m2.toarray(), columns=tf2.get_feature_names_out())

Unnamed: 0,descent classroom,study tmu,sumit loves,temple descent,tmu treats
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.447214,0.447214,0.447214,0.447214,0.447214


## Cosine Similarity

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Two short sentences for the similarity demo.
# NOTE: this rebinds `corpus`, replacing the three-document list used above.
corpus = [
    'This is sentence one',
    'This is another sentence',
]

In [23]:
# Count-vectorize the two sentences (rebinds `cv` and `matrix`).
# The corpus goes to fit_transform(); `input` only accepts 'content',
# 'filename' or 'file'.
cv = text.CountVectorizer()
matrix = cv.fit_transform(corpus)

In [24]:
# Term counts for the two sentences.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
pd.DataFrame(matrix.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,another,is,one,sentence,this
0,0,1,1,1,1
1,1,1,0,1,1


In [25]:
# Pairwise cosine similarity between the rows of `matrix`.
# In the count table above the two rows share three terms and each row has
# four unit counts (norm 2), so the off-diagonal value is 3 / (2 * 2) = 0.75.
cosine_similarity(matrix)

array([[1.  , 0.75],
       [0.75, 1.  ]])

In [27]:
# Print the function's docstring for reference (exploratory; can be dropped
# from a finished notebook).
help(cosine_similarity)

Help on function cosine_similarity in module sklearn.metrics.pairwise:

cosine_similarity(X, Y=None, dense_output=True)
    Compute cosine similarity between samples in X and Y.
    
    Cosine similarity, or the cosine kernel, computes similarity as the
    normalized dot product of X and Y:
    
        K(X, Y) = <X, Y> / (||X||*||Y||)
    
    On L2-normalized data, this function is equivalent to linear_kernel.
    
    Read more in the :ref:`User Guide <cosine_similarity>`.
    
    Parameters
    ----------
    X : ndarray or sparse array, shape: (n_samples_X, n_features)
        Input data.
    
    Y : ndarray or sparse array, shape: (n_samples_Y, n_features)
        Input data. If ``None``, the output will be the pairwise
        similarities between all samples in ``X``.
    
    dense_output : boolean (optional), default True
        Whether to return dense output even when the input is sparse. If
        ``False``, the output is sparse if both input arrays are sparse.
    
 