In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 1: Define the data
# Two small labelled corpora (two documents per class). These are the only
# training examples used to build the class centroids later on.
technology_docs = [
    "AI advancements in machine learning.",
    "New algorithms for big data analysis.",
]

health_docs = [
    "Meditation improves mental health.",
    "Health benefits of balanced nutrition.",
]

# The single unlabelled document that the notebook classifies.
test_doc = "AI for healthcare advancements."


In [None]:
# Step 2: Preprocessing and vectorization
# Build one corpus in a fixed order: technology docs, then health docs, then
# the test document last. The row slicing further down relies on this order.
n_tech, n_health = len(technology_docs), len(health_docs)
all_docs = [*technology_docs, *health_docs, test_doc]

# Tokenize and count terms with CountVectorizer. The vocabulary is fitted on
# the whole corpus (test document included), and the result is converted to a
# dense array with one row per document and one column per vocabulary term.
vectorizer = CountVectorizer()
term_matrix = vectorizer.fit_transform(all_docs).toarray()

# Column labels, plus the per-class / test rows of the count matrix.
feature_names = vectorizer.get_feature_names_out()
tech_matrix = term_matrix[:n_tech]
health_matrix = term_matrix[n_tech:n_tech + n_health]
test_vector = term_matrix[-1]  # the test document is the last row


In [None]:
# Step 3: Compute Rocchio weights
# alpha_p scales the mean vector of a class's own documents; alpha_n scales
# the mean vector of the other class, which is subtracted from it.
alpha_p = 1
alpha_n = 0.5

# Per-term average counts for each class (mean over document rows).
tech_mean = tech_matrix.mean(axis=0)
health_mean = health_matrix.mean(axis=0)

# Each prototype is pulled toward its own class mean and pushed away from the
# other one, so terms typical of the opposite class can end up negative.
tech_centroid = alpha_p * tech_mean - alpha_n * health_mean
health_centroid = alpha_p * health_mean - alpha_n * tech_mean


In [None]:
# Step 4: Compute cosine similarity
def cosine_sim(vector1, vector2):
    """Return the cosine similarity between two 1-D vectors as a scalar.

    Each vector is wrapped in a list so that scikit-learn's pairwise
    ``cosine_similarity`` receives two single-row inputs; the only entry of
    the resulting 1x1 matrix is returned.
    """
    pairwise = cosine_similarity([vector1], [vector2])
    return pairwise[0, 0]

# Score the test document against each class prototype, technology first.
tech_similarity, health_similarity = (
    cosine_sim(test_vector, centroid)
    for centroid in (tech_centroid, health_centroid)
)



In [None]:

# Step 5: Classification
# Assign the class whose centroid is strictly more similar to the test
# document; an exact tie falls through to "Health".
predicted_class = "Technology" if tech_similarity > health_similarity else "Health"

# Print results: the vocabulary, the vectors involved, both similarity scores
# (three decimals), and the final decision.
print("Feature Names:", feature_names)
print("Test Document Vector:", test_vector)
print("Technology Centroid:", tech_centroid)
print("Health Centroid:", health_centroid)
print(f"Similarity with Technology: {tech_similarity:.3f}")
print(f"Similarity with Health: {health_similarity:.3f}")
print(f"Predicted Class: {predicted_class}")


Feature Names: ['advancements' 'ai' 'algorithms' 'analysis' 'balanced' 'benefits' 'big'
 'data' 'for' 'health' 'healthcare' 'improves' 'in' 'learning' 'machine'
 'meditation' 'mental' 'new' 'nutrition' 'of']
Test Document Vector: [1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
Technology Centroid: [ 0.5   0.5   0.5   0.5  -0.25 -0.25  0.5   0.5   0.5  -0.5   0.   -0.25
  0.5   0.5   0.5  -0.25 -0.25  0.5  -0.25 -0.25]
Health Centroid: [-0.25 -0.25 -0.25 -0.25  0.5   0.5  -0.25 -0.25 -0.25  1.    0.    0.5
 -0.25 -0.25 -0.25  0.5   0.5  -0.25  0.5   0.5 ]
Similarity with Technology: 0.405
Similarity with Health: -0.202
Predicted Class: Technology
