# Python Implementation for Text Clustering Using K-Medoids

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction.text import TfidfVectorizer

# Small sample corpus to cluster
documents = [
    "The sky is blue and beautiful",
    "Love this blue and bright sky!",
    "The quick brown fox jumps over the lazy dog",
    "A king's breakfast has sausages, ham, and bacon",
    "Breakfast is the most important meal of the day",
    "The fox is quick and the sky is blue",
    "The lazy dog is sleeping"
]

# Step 1: vectorize the corpus with TF-IDF (English stop words removed)
# and densify the result so rows can be indexed as plain NumPy arrays.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_sparse = vectorizer.fit_transform(documents)
X = tfidf_sparse.toarray()

# Step 2: seed K-Medoids with k distinct, randomly drawn row indices of X
k = 3  # Number of clusters
n_documents = len(X)
initial_medoids = np.random.choice(n_documents, size=k, replace=False)
medoids = X[initial_medoids]

# Step 3: K-Medoids Algorithm
def kmedoids(X, medoids, max_iter=100):
    """Cluster the rows of ``X`` around medoids using cosine distance.

    Each iteration assigns every row of ``X`` to its nearest medoid, then
    replaces each medoid with the member of its cluster whose summed cosine
    distance to the other members is smallest. Iteration stops as soon as
    the medoids no longer change, or after ``max_iter`` rounds.

    Args:
        X: 2-D array of points, one per row (must support boolean-mask
            row indexing, e.g. a NumPy array).
        medoids: 2-D array-like of initial medoid vectors, one per row.
        max_iter: Maximum number of assign/update rounds.

    Returns:
        A ``(medoids, labels)`` tuple: the final medoid vectors and, for
        each row of ``X``, the index of its nearest final medoid.
    """
    for _ in range(max_iter):
        # Assignment step: nearest medoid for every point.
        labels = np.argmin(cosine_distances(X, medoids), axis=1)

        # Update step: pick the most central member of each cluster.
        new_medoids = []
        for i, current in enumerate(medoids):
            cluster_points = X[labels == i]
            if len(cluster_points) == 0:
                # An empty cluster keeps its previous medoid.
                new_medoids.append(current)
                continue
            total_distance = cosine_distances(cluster_points).sum(axis=1)
            new_medoids.append(cluster_points[np.argmin(total_distance)])
        new_medoids = np.array(new_medoids)

        converged = np.array_equal(medoids, new_medoids)
        medoids = new_medoids
        if converged:
            break

    # Always derive the labels from the medoids actually returned. Without
    # this, running out of iterations would return labels computed against
    # the previous medoids, and max_iter=0 would leave labels undefined.
    labels = np.argmin(cosine_distances(X, medoids), axis=1)
    return medoids, labels

# Run K-Medoids
final_medoids, labels = kmedoids(X, medoids)

# Step 4: Output Results
# Group the documents by the cluster label assigned to them.
clusters = {i: [] for i in range(k)}
for idx, label in enumerate(labels):
    clusters[label].append(documents[idx])

# Report the medoids the algorithm converged to, not the random initial
# seeds. Each final medoid is a row copied from X, so an exact row match
# recovers the index of the document it represents.
medoid_doc_indices = [
    int(np.flatnonzero((X == medoid).all(axis=1))[0])
    for medoid in final_medoids
]
print("Medoids:", [documents[i] for i in medoid_doc_indices])
for cluster_id, docs in clusters.items():
    print(f"\nCluster {cluster_id}:")
    for doc in docs:
        print(f"  {doc}")


Medoids: ['The quick brown fox jumps over the lazy dog', "A king's breakfast has sausages, ham, and bacon", 'The fox is quick and the sky is blue']

Cluster 0:
  The quick brown fox jumps over the lazy dog
  The lazy dog is sleeping

Cluster 1:
  A king's breakfast has sausages, ham, and bacon
  Breakfast is the most important meal of the day

Cluster 2:
  The sky is blue and beautiful
  Love this blue and bright sky!
  The fox is quick and the sky is blue
