# Libraries

In [82]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# Documents & Corpus

In [83]:
# Three toy sentences used as the documents throughout this notebook.
S1, S2, S3 = (
    "sunshine state enjoy sunshine",
    "brown fox jump high, brown fox run",
    "sunshine state fox run fast",
)

# The corpus keeps the documents in S1, S2, S3 order, so row i of every
# matrix built from it corresponds to sentence i+1.
corpus = [S1, S2, S3]
corpus

['sunshine state enjoy sunshine',
 'brown fox jump high, brown fox run',
 'sunshine state fox run fast']

# BoW model


In [84]:
# Bag-of-words: count how often each vocabulary term occurs in each document,
# dropping scikit-learn's built-in English stop words.
boW = CountVectorizer(stop_words = 'english')
vectors = boW.fit_transform(corpus)  # document-term count matrix
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
features = boW.get_feature_names_out()
dense = vectors.todense()
denselistCorpus = dense.tolist()  # one list of term counts per document
denselistCorpus



[[0, 1, 0, 0, 0, 0, 0, 1, 2],
 [2, 0, 0, 2, 1, 1, 1, 0, 0],
 [0, 0, 1, 1, 0, 0, 1, 1, 1]]

In [67]:
# Shape of the BoW matrix: (number of documents, number of vocabulary terms).
vectors.shape

(3, 9)

In [63]:
# Tabulate the BoW counts: one row per document, one column per vocabulary term.
# Use denselistCorpus (built from the CountVectorizer output); the name
# `denselist` is only assigned later, in the TF section, so referencing it here
# raises NameError on a fresh Restart & Run All.
boW_df = pd.DataFrame(denselistCorpus, columns=features)
boW_df

Unnamed: 0,brown,enjoy,fast,fox,high,jump,run,state,sunshine
0,0,1,0,0,0,0,0,1,2
1,2,0,0,2,1,1,1,0,0
2,0,0,1,1,0,0,1,1,1


# TF model

In [70]:
# Term-frequency model: TfidfVectorizer with the IDF weighting switched off.
# NOTE(review): the rows displayed below have unit Euclidean length, which is
# presumably the vectorizer's default norm='l2' — confirm against the
# installed scikit-learn version.
tf = TfidfVectorizer(use_idf=False)
matrix = tf.fit_transform(corpus)

In [73]:
# Vocabulary of the TF model, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
features = tf.get_feature_names_out()
# Densify the TF matrix so it can be displayed and turned into a table.
dense = matrix.todense()
denselist = dense.tolist()  # one list of TF weights per document
dense

matrix([[0.        , 0.40824829, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.40824829, 0.81649658],
        [0.60302269, 0.        , 0.        , 0.60302269, 0.30151134,
         0.30151134, 0.30151134, 0.        , 0.        ],
        [0.        , 0.        , 0.4472136 , 0.4472136 , 0.        ,
         0.        , 0.4472136 , 0.4472136 , 0.4472136 ]])

In [75]:
# Wrap the term-frequency rows in a labelled table:
# one row per document, one column per vocabulary term.
tf_df = pd.DataFrame(data=denselist, columns=features)
tf_df

Unnamed: 0,brown,enjoy,fast,fox,high,jump,run,state,sunshine
0,0.0,0.408248,0.0,0.0,0.0,0.0,0.0,0.408248,0.816497
1,0.603023,0.0,0.0,0.603023,0.301511,0.301511,0.301511,0.0,0.0
2,0.0,0.0,0.447214,0.447214,0.0,0.0,0.447214,0.447214,0.447214


# IDF model


In [76]:
# Learn the inverse document frequencies from the BoW count matrix.
# fit() returns the fitted transformer itself, so it is chained onto the
# constructor instead of being bound to `matrix`: that name holds the TF
# matrix, and overwriting it with a TfidfTransformer object would break any
# re-run of the TF cells that call matrix.todense().
idf = TfidfTransformer(smooth_idf=True, use_idf=True).fit(vectors)

In [77]:
# One IDF weight per vocabulary term, labelled with the BoW vocabulary that the
# transformer was fitted on.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
df_idf = pd.DataFrame(idf.idf_, index=boW.get_feature_names_out(), columns=["IDF"])
# Ascending sort: terms with the lowest IDF value are listed first.
df_idf.sort_values(by=['IDF'])



Unnamed: 0,IDF
fox,1.287682
run,1.287682
state,1.287682
sunshine,1.287682
brown,1.693147
enjoy,1.693147
fast,1.693147
high,1.693147
jump,1.693147


# TF.IDF

In [78]:
# TF-IDF model: the same vectorizer class as the TF section, but with the IDF
# weighting enabled.
tfidf = TfidfVectorizer(use_idf=True) 
tfidf_vectors = tfidf.fit_transform(corpus)

In [79]:
# Select the TF-IDF rows of all three documents (S1, S2, S3).
# The matrix produced by the TF-IDF cell is named `tfidf_vectors`; the
# previously referenced `tfidf_vectorizer_vectors` is never defined in this
# notebook and raises NameError on a fresh kernel.
vectors_tfidf = tfidf_vectors[[0, 1, 2]]
dense = vectors_tfidf.todense()
dense

matrix([[0.        , 0.50689001, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.38550292, 0.77100584],
        [0.67070313, 0.        , 0.        , 0.51008702, 0.33535157,
         0.33535157, 0.25504351, 0.        , 0.        ],
        [0.        , 0.        , 0.54935123, 0.41779577, 0.        ,
         0.        , 0.41779577, 0.41779577, 0.41779577]])

In [80]:
# Transpose so that each row is a vocabulary term and each column holds that
# term's TF-IDF weight in one of the three documents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
df = pd.DataFrame(vectors_tfidf.T.todense(), index=tfidf.get_feature_names_out(), columns=["tfidf_S1", "tfidf_S2", "tfidf_S3"])
# Descending sort by S1, then S2, then S3 weight.
df.sort_values(by=["tfidf_S1", "tfidf_S2", "tfidf_S3"],ascending=False)



Unnamed: 0,tfidf_S1,tfidf_S2,tfidf_S3
sunshine,0.771006,0.0,0.417796
enjoy,0.50689,0.0,0.0
state,0.385503,0.0,0.417796
brown,0.0,0.670703,0.0
fox,0.0,0.510087,0.417796
high,0.0,0.335352,0.0
jump,0.0,0.335352,0.0
run,0.0,0.255044,0.417796
fast,0.0,0.0,0.549351


# Cosine Similarity

In [86]:
# Cosine similarity between the bag-of-words count vectors of documents 1 and 3.
# NOTE: S1 and S3 are rebound here from the sentence strings to count vectors.
S1, S3 = denselistCorpus[0], denselistCorpus[2]

print("S1:", S1)
print("S3:", S3)

# cos(theta) = (S1 . S3) / (||S1|| * ||S3||)
cosine = np.dot(S1, S3) / (norm(S1) * norm(S3))
print("Cosine Similarity:", cosine)

S1: [0, 1, 0, 0, 0, 0, 0, 1, 2]
S3: [0, 0, 1, 1, 0, 0, 1, 1, 1]
Cosine Similarity: 0.5477225575051661
