## Jaccard Similarity

In [1]:
# Two short example sentences to compare
doc1 = 'I like dogs.'
doc2 = 'I hate dogs.'

# Lowercase each sentence, split it on whitespace and keep the unique tokens.
# Punctuation is not stripped, so a token such as 'dogs.' keeps its full stop.
doc1_tokens, doc2_tokens = (set(text.lower().split()) for text in (doc1, doc2))

# Show both token sets side by side
print(doc1_tokens, doc2_tokens)

{'i', 'like', 'dogs.'} {'i', 'dogs.', 'hate'}


In [7]:
# Jaccard similarity: number of tokens the two documents share,
# divided by the number of distinct tokens across both documents.
shared_tokens = doc1_tokens & doc2_tokens
all_tokens = doc1_tokens | doc2_tokens

jaccard_sim = len(shared_tokens) / len(all_tokens)

print(jaccard_sim)

0.5


In [8]:
# Cosine
# Let's import text feature extraction TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the cosine similarity metric
from sklearn.metrics.pairwise import cosine_similarity


# The two example sentences, as a small corpus
docs = [
    'I like dogs.',
    'I hate dogs.',
]

# Build a TF-IDF vectorizer with its default settings
tfidf = TfidfVectorizer()

# Learn the vocabulary from the corpus and turn each sentence into a TF-IDF row
tfidf_vector = tfidf.fit_transform(docs)

# Pairwise cosine similarity between all rows; with a single argument the
# matrix is compared against itself, so entry [i, j] scores docs[i] vs docs[j]
cosine_sim = cosine_similarity(tfidf_vector)

# Show the similarity matrix
print(cosine_sim)

[[1.         0.33609693]
 [0.33609693 1.        ]]


In [9]:
import en_core_web_sm

# Load the language pipeline shipped in the en_core_web_sm package.
# The resulting `nlp` object is called on raw text in the cells below.
nlp = en_core_web_sm.load()

In [12]:
# Document Vectorization
# Run each sentence through the pipeline to get a processed document
doc1 = nlp('I like apples.')
doc2 = nlp('I like oranges.')

# Bare last expression so the notebook displays the similarity score
doc1.similarity(doc2)

  doc1.similarity(doc2)


0.9454207125669349

In [10]:
from scipy import spatial

# Document Vectorization
# Keep only the vector of each processed sentence
# (doc1 / doc2 now hold vectors, not the processed documents)
doc1 = nlp('I like apples.').vector
doc2 = nlp('I like oranges.').vector

# Cosine Similarity
# scipy returns the cosine *distance*, so subtract it from 1
result = 1 - spatial.distance.cosine(doc1, doc2)

print(result)

0.9454206228256226


## Search Engine

In [26]:
import en_core_web_sm
from numpy import dot
from numpy.linalg import norm
import numpy as np

# Load the en_core_web_sm pipeline again (it was already loaded earlier in the
# notebook) — presumably so this section can be run on its own.
nlp = en_core_web_sm.load()


# Prepare dataset: the small corpus of sentences that the query is matched against
doc_list = [
    "I love this sandwich.",
    "this is an amazing place!",
    "I feel very good about these beers.",
    "this is my best work.",
    "what an awesome view",
    "I do not like this restaurant",
    "I am tired of this stuff.",
    "I can't deal with this",
    "he is my sworn enemy!",
    "my boss is horrible.",
    "I hate this sandwich.",
]

In [27]:
# Start from an empty score list each time this cell runs;
# one similarity score per document is appended further down.
sim_list = []

# Read the search query typed by the user
query = input()

def cosine(vec1, vec2):
    """Return the cosine similarity between two vectors.

    The score is the dot product of the vectors divided by the product of
    their Euclidean norms.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional numeric vectors of the same length.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero norm.
    """
    denominator = norm(vec1) * norm(vec2)
    # A zero-norm vector would give 0/0 = NaN. NaN is sorted last by
    # np.argsort, so it would be ranked among the top results; report
    # "no similarity" instead.
    if denominator == 0:
        return 0.0
    return dot(vec1, vec2) / denominator

# The query is the same for every document, so run it through the pipeline
# once instead of re-processing it on each iteration.
vec1 = nlp(query).vector

# Score every document against the query, keeping doc_list order
for doc in doc_list:
    vec2 = nlp(doc).vector
    sim_score = cosine(vec1,vec2)
    sim_list.append(sim_score)
    

horrible


In [28]:
# Display the collected similarity scores (one per entry of doc_list, in the same order)
sim_list

[0.22444192,
 0.19761992,
 0.25994086,
 0.082769185,
 0.18947649,
 -0.08410258,
 0.2888038,
 -0.055111412,
 0.20689178,
 0.34992522,
 0.2285129]

In [29]:
# most similar: find the highest score, then the sentence at its first position
best_score = max(sim_list)
best_position = sim_list.index(best_score)
most_similar = doc_list[best_position]

print("\nMost Similar:\n", most_similar)


Most Similar:
 my boss is horrible.


In [35]:
# sorting most similar sentences
# argsort orders positions by ascending score; take the last five and flip
# them so the best match comes first
top_index = list(np.argsort(sim_list)[-5:][::-1])

In [36]:
# Print the top-ranked sentences, best match first
for rank_idx in top_index:
    sentence = doc_list[rank_idx]
    print(sentence)

my boss is horrible.
I am tired of this stuff.
I feel very good about these beers.
I hate this sandwich.
I love this sandwich.
