In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.spatial.distance import euclidean, cityblock
from sklearn.metrics.pairwise import cosine_similarity

# Corpus to search: five short sentences that share much of their vocabulary.
documents = [
    "Shipment of gold damaged in a fire",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
    "Purchased silver and gold arrived in a wooden truck",
    "The arrival of gold and silver shipment is delayed.",
]

# Free-text query that every document is scored against.
query = "gold silver truck"

# Fit the bag-of-words vocabulary on the corpus only; X holds one row of
# term counts per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Encode the query with the vocabulary learned above and densify it straight
# away, so later steps can treat it as an ordinary NumPy array.
query_vec = vectorizer.transform([query]).toarray()

# Densify the document-term matrix once (the original converted it twice) and
# flatten the single-row query once, so both SciPy distance functions receive
# plain 1-D vectors. Rows of a 2-D array are already 1-D, so no per-row
# flattening is needed.
doc_vectors = X.toarray()
query_flat = query_vec.ravel()

# Euclidean (L2) distance between the query and each document.
# This is a distance, not a similarity: smaller means more relevant.
euclidean_distances = [euclidean(query_flat, doc) for doc in doc_vectors]

# Manhattan (L1 / city-block) distance; again, smaller means more relevant.
manhattan_distances = [cityblock(query_flat, doc) for doc in doc_vectors]

# Cosine similarity between the query and every document, computed in one
# call; larger means more relevant. flatten() turns the single-row result
# into a 1-D array with one score per document.
cosine_similarities = cosine_similarity(query_vec, X).flatten()

# Bundle each document with its three scores so one record can be ranked by
# any measure. Record layout:
#   (document, euclidean distance, manhattan distance, cosine similarity)
documents_with_distances = list(zip(documents, euclidean_distances, manhattan_distances, cosine_similarities))

# How many of the best-ranked documents to report for each measure.
TOP_K = 2


def _rank_documents(score_index, *, descending=False):
    """Return the TOP_K records ordered by the score stored at score_index.

    Args:
        score_index: Position of the score inside each record (1, 2 or 3).
        descending: False for distances (smallest first), True for
            similarities (largest first).

    Returns:
        A list of at most TOP_K records. sorted() is stable, so records with
        equal scores keep their original corpus order.
    """
    ranked = sorted(
        documents_with_distances,
        key=lambda record: record[score_index],
        reverse=descending,
    )
    return ranked[:TOP_K]


def _print_ranking(heading, label, ranked, score_index):
    """Print a numbered report of ranked records.

    Args:
        heading: Measure name used in the section title.
        label: Measure name used in front of each printed score.
        ranked: Records to print, already in rank order.
        score_index: Position inside each record of the score to display.
    """
    print(f"Top {TOP_K} relevant documents using {heading}:")
    for rank, record in enumerate(ranked, 1):
        print(f"{rank}. Document: '{record[0]}'")
        print(f"   {label}: {record[score_index]:.4f}")
        print()


# Distances: the smallest value is the best match.
documents_sorted_by_euclidean = _rank_documents(1)
_print_ranking("Euclidean distance", "Euclidean Distance", documents_sorted_by_euclidean, 1)

documents_sorted_by_manhattan = _rank_documents(2)
_print_ranking("Manhattan distance", "Manhattan Distance", documents_sorted_by_manhattan, 2)

# Similarity: the largest value is the best match, so rank in descending order.
documents_sorted_by_cosine = _rank_documents(3, descending=True)
_print_ranking("Cosine similarity", "Cosine Similarity", documents_sorted_by_cosine, 3)


Top 2 relevant documents using Euclidean distance:
1. Document: 'Shipment of gold arrived in a truck'
   Euclidean Distance: 2.2361

2. Document: 'Purchased silver and gold arrived in a wooden truck'
   Euclidean Distance: 2.2361

Top 2 relevant documents using Manhattan distance:
1. Document: 'Shipment of gold arrived in a truck'
   Manhattan Distance: 5.0000

2. Document: 'Purchased silver and gold arrived in a wooden truck'
   Manhattan Distance: 5.0000

Top 2 relevant documents using Cosine similarity:
1. Document: 'Purchased silver and gold arrived in a wooden truck'
   Cosine Similarity: 0.6124

2. Document: 'Delivery of silver arrived in a silver truck'
   Cosine Similarity: 0.5774

