In [1]:
import numpy as np

def cosine_similarity(x, y):
    """Return the cosine similarity between two 1-D vectors.

    The similarity is ``dot(x, y) / (||x|| * ||y||)``, where ``||.||`` is the
    L2 norm.

    Parameters
    ----------
    x, y : array-like of real numbers
        Vectors of equal length. Lists, tuples and NumPy arrays are accepted;
        both are converted to float64 arrays before any arithmetic.

    Returns
    -------
    float or None
        The cosine similarity, ``0.0`` if either vector has zero magnitude
        (the same convention scikit-learn's ``cosine_similarity`` uses), or
        ``None`` if the two inputs differ in length.
    """
    # Ensure length of x and y are the same
    if len(x) != len(y):
        return None

    # Work on float arrays: plain lists do not support ``** 2``, and narrow
    # integer dtypes (e.g. int8) would silently overflow when squared.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    # Compute the dot product between x and y
    dot_product = np.dot(x, y)

    # Compute the L2 norms (magnitudes) of x and y
    magnitude_x = np.sqrt(np.sum(x**2))
    magnitude_y = np.sqrt(np.sum(y**2))

    # A zero vector has no direction; return 0.0 instead of evaluating 0/0,
    # which would yield nan and a RuntimeWarning.
    denominator = magnitude_x * magnitude_y
    if denominator == 0:
        return 0.0

    # Compute the cosine similarity
    return dot_product / denominator

In [2]:
# Three short example documents; each string is one document.
corpus = [
    "data science is one of the most important fields of science",
    "this is one of the best data science courses",
    "data scientists analyze data",
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer on the corpus and materialise the result as a dense
# NumPy array (one row per document, as the printed output shows).
X = (
    CountVectorizer()
    .fit_transform(corpus)
    .toarray()
)

print(X)

[[0 0 0 1 1 1 1 1 2 1 2 0 1 0]
 [0 1 1 1 0 0 1 0 1 1 1 0 1 1]
 [1 0 0 2 0 0 0 0 0 0 0 1 0 0]]


In [4]:
# Pairwise similarities using the hand-written cosine_similarity(x, y)
# defined at the top of the notebook.
# NOTE: a later cell rebinds the name `cosine_similarity` to scikit-learn's
# function, so this cell must run before that import.
cos_sim_1_2 = cosine_similarity(X[0], X[1])
cos_sim_1_3 = cosine_similarity(X[0], X[2])
cos_sim_2_3 = cosine_similarity(X[1], X[2])

print('Cosine Similarity between: ')
print('\tDocument 1 and Document 2: ', cos_sim_1_2)
print('\tDocument 1 and Document 3: ', cos_sim_1_3)
print('\tDocument 2 and Document 3: ', cos_sim_2_3)

Cosine Similarity between: 
	Document 1 and Document 2:  0.6885303726590962
	Document 1 and Document 3:  0.21081851067789195
	Document 2 and Document 3:  0.2721655269759087


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# `cosine_similarity` is now scikit-learn's pairwise version (imported in the
# previous cell). Given the two selected rows it returns a 2x2 matrix whose
# off-diagonal entries are the similarity between the two documents.
cos_sim_1_2 = cosine_similarity(X[[0, 1]])

print('Cosine Similarity between Document 1 and Document 2 is \n', cos_sim_1_2)


Cosine Similarity between Document 1 and Document 2 is 
 [[1.         0.68853037]
 [0.68853037 1.        ]]


In [13]:
# Pairwise similarity matrix (scikit-learn) for documents 1 and 3; the
# off-diagonal entries hold the similarity between the two.
cos_sim_1_3 = cosine_similarity(X[[0, 2]])

print('Cosine Similarity between Document 1 and Document 3 is \n', cos_sim_1_3)


Cosine Similarity between Document 1 and Document 3 is 
 [[1.         0.21081851]
 [0.21081851 1.        ]]


In [15]:
# Pairwise similarity matrix (scikit-learn) for documents 2 and 3; the
# off-diagonal entries hold the similarity between the two.
cos_sim_2_3 = cosine_similarity(X[[1, 2]])

print('Cosine Similarity between Document 2 and Document 3 is \n', cos_sim_2_3)


Cosine Similarity between Document 2 and Document 3 is 
 [[1.         0.27216553]
 [0.27216553 1.        ]]
