In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
import pandas as pd
# Corpus: five short documents, reported below as D1..D5 in list order.
documents = [
    "Shipment of gold damaged in a fire",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
    "Purchased silver and gold arrived in a wooden truck",
    "The arrival of gold and silver shipment is delayed",
]
# The query is kept as a one-element list so it can be concatenated to the corpus.
query = ["gold silver truck"]

# Raw term counts (TF) over corpus + query, so that both share a single vocabulary.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(documents + query).toarray()

# The query was appended last: its counts are the final row, the corpus is every row before it.
query_vector = term_counts[-1]
doc_vectors = term_counts[:-1]

# Score each document against the query; every metric yields one value per document.
query_rows = [query_vector]  # the pairwise helpers are given the query as a one-row collection
euclidean = euclidean_distances(doc_vectors, query_rows).flatten()
manhattan = manhattan_distances(doc_vectors, query_rows).flatten()
cosine = cosine_similarity(doc_vectors, query_rows).flatten()

# One row per document, one column per metric.
doc_labels = [f"D{position}" for position in range(1, len(documents) + 1)]
results = pd.DataFrame({
    "Document": doc_labels,
    "Euclidean Distance": euclidean,
    "Manhattan Distance": manhattan,
    "Cosine Similarity": cosine,
})

# Full rankings: (heading, column to sort on, ascending?, text printed after the table).
# Similarity is ranked high-to-low, distances low-to-high. The last table is followed
# by a single newline only, the first two by an extra blank line.
full_rankings = (
    ("Top documents by Cosine Similarity:", "Cosine Similarity", False, "\n\n"),
    ("Top documents by Euclidean Distance:", "Euclidean Distance", True, "\n\n"),
    ("Top documents by Manhattan Distance:", "Manhattan Distance", True, "\n"),
)
for heading, column, ascending, tail in full_rankings:
    print(heading)
    ranked = results.sort_values(by=column, ascending=ascending)
    print(ranked[["Document", column]], end=tail)

# Top-2 reports. The two distance reports show every column and end with a space,
# a newline and a blank line (hence sep="\n" between heading and table, end=" \n\n").
closest_by_euclidean = results.sort_values("Euclidean Distance").head(2)
print("Top 2 Documents by Euclidean Distance (lowest is best):", closest_by_euclidean, sep="\n", end=" \n\n")

closest_by_manhattan = results.sort_values("Manhattan Distance").head(2)
print("Top 2 Documents by Manhattan Distance (lowest is best):", closest_by_manhattan, sep="\n", end=" \n\n")

# The cosine report is restricted to the document label and its similarity score.
most_similar = results.sort_values("Cosine Similarity", ascending=False).head(2)
print("Top 2 Documents by Cosine Similarity (highest is best):")
print(most_similar[["Document", "Cosine Similarity"]])


Top documents by Cosine Similarity:
  Document  Cosine Similarity
3       D4           0.612372
1       D2           0.577350
2       D3           0.471405
4       D5           0.384900
0       D1           0.235702

Top documents by Euclidean Distance:
  Document  Euclidean Distance
2       D3            2.236068
3       D4            2.236068
1       D2            2.449490
0       D1            2.645751
4       D5            2.828427

Top documents by Manhattan Distance:
  Document  Manhattan Distance
2       D3                 5.0
3       D4                 5.0
1       D2                 6.0
0       D1                 7.0
4       D5                 8.0
Top 2 Documents by Euclidean Distance (lowest is best):
  Document  Euclidean Distance  Manhattan Distance  Cosine Similarity
2       D3            2.236068                 5.0           0.471405
3       D4            2.236068                 5.0           0.612372 

Top 2 Documents by Manhattan Distance (lowest is best):
  Document  

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
import numpy as np
import pandas as pd
# Corpus: five short documents, reported below as D1..D5 in list order.
documents = [
    "Shipment of gold damaged in a fire",
    "Delivery of silver arrived in a silver truck",
    "Shipment of gold arrived in a truck",
    "Purchased silver and gold arrived in a wooden truck",
    "The arrival of gold and silver shipment is delayed",
]
# Free-text query to rank the corpus against.
query = "gold silver truck"
# Vectorize corpus and query together (query last) so they share one vocabulary.
all_texts = [*documents, query]

# Term-frequency vectors; the vectorizer is asked to lowercase the text and to use
# its built-in English stop-word list.
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
X = vectorizer.fit_transform(all_texts).toarray()

# Split the count matrix: the first len(documents) rows are the corpus, the one
# remaining row is the query (kept 2-D, as a single-row matrix).
corpus_size = len(documents)
doc_vectors, query_vector = X[:corpus_size], X[corpus_size:]

# Apply each pairwise metric to (documents, query) and flatten the resulting
# column of scores into one value per document.
euclidean, manhattan, cosine = (
    pairwise_metric(doc_vectors, query_vector).flatten()
    for pairwise_metric in (euclidean_distances, manhattan_distances, cosine_similarity)
)

# One row per document, one column per metric.
doc_labels = [f"D{position}" for position in range(1, corpus_size + 1)]
results = pd.DataFrame({
    "Document": doc_labels,
    "Euclidean": euclidean,
    "Manhattan": manhattan,
    "Cosine": cosine,
})

# Top-2 report per metric: (heading, column to sort on, ascending?, text printed after the table).
# Distances rank low-to-high and are followed by a space, a newline and a blank line;
# cosine similarity ranks high-to-low and is followed by a single newline.
top2_reports = (
    ("Top 2 Documents by Euclidean Distance (lowest is best):", "Euclidean", True, " \n\n"),
    ("Top 2 Documents by Manhattan Distance (lowest is best):", "Manhattan", True, " \n\n"),
    ("Top 2 Documents by Cosine Similarity (highest is best):", "Cosine", False, "\n"),
)
for heading, column, ascending, tail in top2_reports:
    print(heading)
    print(results.sort_values(column, ascending=ascending).head(2), end=tail)



Top 2 Documents by Euclidean Distance (lowest is best):
  Document  Euclidean  Manhattan    Cosine
2       D3   1.732051        3.0  0.577350
3       D4   1.732051        3.0  0.707107 

Top 2 Documents by Manhattan Distance (lowest is best):
  Document  Euclidean  Manhattan    Cosine
2       D3   1.732051        3.0  0.577350
3       D4   1.732051        3.0  0.707107 

Top 2 Documents by Cosine Similarity (highest is best):
  Document  Euclidean  Manhattan    Cosine
3       D4   1.732051        3.0  0.707107
1       D2   2.000000        4.0  0.654654
