In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define the data
# Labelled training corpus: each record pairs a short text with its class.
training_data = [
    {"text": text, "class": label}
    for text, label in (
        ("AI advancements in deep learning.", "Technology"),
        ("New vaccines improve health.", "Health"),
        ("Machine learning for big data.", "Technology"),
        ("Benefits of a balanced diet.", "Health"),
    )
]

# The single unlabelled document whose class is predicted below.
test_data = "Deep learning in healthcare."

# Step 2: Prepare the text corpus
# Training texts come first and the test document is appended last, so the
# final row of the TF-IDF matrix always belongs to the test document.
corpus = [*(record["text"] for record in training_data), test_data]

# Step 3: Convert text to TF-IDF vectors
# NOTE(review): the vectorizer is fitted on the training texts AND the test
# document together, so the test text contributes to the vocabulary and to
# the IDF weights.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Step 4: Compute cosine similarity
n_training = len(training_data)
training_vectors = tfidf_matrix[:n_training]  # Training document vectors
test_vector = tfidf_matrix[n_training]  # Test document vector (last row)

# The result has a single row (one test document); keep that row so that
# similarities[i] is the score against training_data[i].
similarities = cosine_similarity(test_vector, training_vectors)[0]

# Step 5: Associate similarities with classes
# One record per training document; article_id is the 1-based position of
# the document in training_data.
results = [
    {"article_id": position + 1, "similarity": score, "class": training_data[position]["class"]}
    for position, score in enumerate(similarities)
]

# Step 6: Sort by similarity
# list.sort is stable (also with reverse=True), so documents with equal
# similarity keep their original training order.
results.sort(key=lambda record: record["similarity"], reverse=True)

# Step 7: Perform classification (k = 3)
k = 3
nearest_neighbors = results[:k]  # fewer than k records if the training set is smaller

# Majority voting
classes = [neighbor["class"] for neighbor in nearest_neighbors]
# Candidates are iterated in nearest-first order (dict.fromkeys preserves
# insertion order) and max() returns the first maximal item, so a tied vote is
# resolved deterministically in favour of the class of the closest neighbor.
# A set is deliberately avoided: its iteration order for strings can differ
# between interpreter runs, which would make tie-breaking non-reproducible.
predicted_class = max(dict.fromkeys(classes), key=classes.count)

# Print results
# Both report sections share one line format, so they are emitted by a single
# loop over (heading, rows) pairs: first every training document ranked by
# similarity, then only the k nearest neighbors used for the vote.
for heading, rows in (
    ("Similarity Scores:", results),
    ("\nNearest Neighbors:", nearest_neighbors),
):
    print(heading)
    for row in rows:
        print(f"Article {row['article_id']}: Similarity = {row['similarity']:.2f}, Class = {row['class']}")

print(f"\nPredicted Class for Test Document: {predicted_class}")


Similarity Scores:
Article 1: Similarity = 0.54, Class = Technology
Article 3: Similarity = 0.13, Class = Technology
Article 2: Similarity = 0.00, Class = Health
Article 4: Similarity = 0.00, Class = Health

Nearest Neighbors:
Article 1: Similarity = 0.54, Class = Technology
Article 3: Similarity = 0.13, Class = Technology
Article 2: Similarity = 0.00, Class = Health

Predicted Class for Test Document: Technology


In [None]:
# Inspect the corpus: the training texts followed by the test document.
corpus

['AI advancements in deep learning.',
 'New vaccines improve health.',
 'Machine learning for big data.',
 'Benefits of a balanced diet.',
 'Deep learning in healthcare.']

In [None]:
# Inspect the cosine similarity of the test document to each training document,
# in training_data order.
similarities

array([0.54499756, 0.        , 0.12822556, 0.        ])

In [None]:
# Inspect the TfidfVectorizer instance that was fitted on the corpus.
vectorizer

In [None]:
# Inspect the TF-IDF matrix returned by fit_transform on the corpus.
tfidf_matrix

<5x18 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [None]:
# Inspect the last row of tfidf_matrix, i.e. the vector of the test document.
test_vector

<1x18 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>