<a href="https://colab.research.google.com/github/WayneGretzky1/CSCI-4521-Applied-Machine-Learning/blob/main/2_3_k_means_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load the data

In [1]:
# Download the 20news-bydate archive into the current working directory.
!wget "https://raw.githubusercontent.com/be-prado/csci4521/refs/heads/main/20news-bydate.tar.gz"

--2025-10-01 20:04:49--  https://raw.githubusercontent.com/be-prado/csci4521/refs/heads/main/20news-bydate.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14464277 (14M) [application/octet-stream]
Saving to: ‘20news-bydate.tar.gz’


2025-10-01 20:04:49 (122 MB/s) - ‘20news-bydate.tar.gz’ saved [14464277/14464277]



In [None]:
# Unpack the downloaded archive into the current working directory.
!tar -xf 20news-bydate.tar.gz

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import os

In [None]:
# Training directory created by unpacking the archive. It is resolved against
# the current working directory (where the archive is extracted) instead of a
# hard-coded absolute path, so the notebook also runs outside Colab; when the
# working directory is /content this is still "/content/20news-bydate-train/".
# The trailing separator is kept because the loading code builds paths as DIR + name.
DIR = os.path.join(os.getcwd(), "20news-bydate-train", "")
# Newsgroups to load. (The variable keeps its existing spelling because the
# loading cell refers to it by this name.)
catigories = ["comp.graphics","comp.sys.mac.hardware","talk.politics.misc","sci.space","misc.forsale"]

In [None]:
# Read every post of the selected newsgroups; posts[i] is the raw text and
# labels[i] is the newsgroup (directory name) it came from.
posts = []
labels = []
for c in catigories:
  category_dir = os.path.join(DIR, c)
  # List the directory once, in sorted order, so posts and labels are built
  # from the same listing and the document order is identical on every run.
  for f in sorted(os.listdir(category_dir)):
    # latin-1 maps every byte to a character, so no file fails to decode;
    # the with-block closes each file handle as soon as it has been read.
    with open(os.path.join(category_dir, f), encoding="latin-1") as fh:
      posts.append(fh.read())
    labels.append(c)
# array form allows selecting labels with index arrays / boolean masks later
labels = np.array(labels)

## Vectorize documents

In [None]:
#A TF-IDF Vectorizer with Stemming
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer whose tokens are reduced to their Snowball stems."""

    def build_analyzer(self):
        """Return an analyzer that stems each token produced by the parent analyzer."""
        # the parent's analyzer callable performs the tokenization
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # lazily yield the stem of every token the parent analyzer emits
            return (english_stemmer.stem(token) for token in base_analyzer(doc))

        return stemmed_tokens

# min_df=1 keeps every term that occurs in at least one document;
# stop_words='english' uses scikit-learn's built-in English stop-word list.
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
# vectorized tf-idf matrix where each row represents a document and each
# column represents a feature i.e. tf-idf score for each term
X_train = vectorizer.fit_transform(posts)

# (number of documents, number of distinct terms)
print(X_train.shape)

(2805, 31285)


## K-Means Clustering

In [None]:
# Cluster the TF-IDF document vectors with scikit-learn's k-means.
num_clusters = 20
# random_state pins the random initialisation so labels_ and cluster_centers_
# come out the same on every Restart & Run All.
km = KMeans(n_clusters = num_clusters, init = 'random', n_init = 10, verbose = 1,
            random_state = 0)
# Fit the model. Without this call the estimator has no labels_ or
# cluster_centers_ attributes, and it cannot predict.
km.fit(X_train)

In [None]:
km.labels_ # cluster index assigned to each sample; only available once the model has been fitted

In [None]:
km.labels_.shape # one label per sample the model was fitted on

In [None]:
km.cluster_centers_ # the actual cluster centers: one row per cluster, one column per feature

## Using the model

In [None]:
# Vectorize the sentence "Used Car for Sale!" with the already-fitted vectorizer.
# transform (not fit_transform) reuses the training vocabulary and IDF weights,
# so the result has the same columns as X_train.
new_post = "Used Car for Sale!"
new_post_vec = vectorizer.transform([new_post])


In [None]:
# Predict the cluster for the sentence "Used Car for Sale!".
# The sentence is vectorized inline, so this cell only needs the fitted
# vectorizer and the fitted k-means model. predict returns one label per
# input row, hence the [0].
new_post_cluster = km.predict(vectorizer.transform(["Used Car for Sale!"]))[0]
new_post_cluster


In [None]:
km.cluster_centers_[new_post_cluster] # we can manually compare the post's features to this cluster center

Which posts are also in my cluster?

In [None]:
# True for every training post that k-means placed in the new post's cluster
same_cluster_mask = km.labels_ == new_post_cluster
# positions of those posts
cluster_neighbor_indices = same_cluster_mask.nonzero()[0]
print(cluster_neighbor_indices[:10]) # first 10 cluster indices

Print the labels of the posts in the same cluster:

In [None]:
np.random.choice(labels[cluster_neighbor_indices],10) # labels of 10 posts drawn at random (with replacement) from this cluster

Ideally, one topic should clearly dominate this cluster, and that topic should describe the test prompt well.

We can also sort, and find the k-nearest neighbors within this cluster:

In [None]:
# Collect (distance, post text, label) for every post in the new post's cluster.
cluster_neighbors = []
# for each post in my cluster
for i in cluster_neighbor_indices:
  # Euclidean distance between the new post's vector and this post's vector
  dist = np.linalg.norm((new_post_vec - X_train[i]).toarray())
  # save the distance, the post, and the label into the list
  cluster_neighbors.append((dist, posts[i], labels[i]))
# Sort once, after the loop: re-sorting inside the loop repeated the work on
# every iteration without changing the final order.
# NOTE: tuples compare element by element, so the list is ordered by the first
# item of each tuple (our distances), nearest first.
cluster_neighbors = sorted(cluster_neighbors)
print(len(cluster_neighbors))

In [None]:
print(cluster_neighbors[0]) # the nearest neighbor within the cluster (first entry of the sorted list)

In [None]:
print(cluster_neighbors[len(cluster_neighbors)//2]) # the median neighbor within the cluster (middle entry of the sorted list)

In [None]:
print(cluster_neighbors[-1]) # the farthest neighbor still in the cluster (last entry of the sorted list)

## Word Clouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Convert to NumPy arrays so the cells below can pick out a cluster's posts and
# labels with boolean masks / index arrays.
posts = np.array(posts)
labels = np.array(labels)

In [None]:
# Draw one word cloud per cluster from the raw text of its posts.
for cluster_id in range(num_clusters):
  # positions of the posts assigned to this cluster
  member_rows = (km.labels_ == cluster_id).nonzero()[0]
  # merge the cluster's posts into one space-separated 'document'
  cluster_text = " ".join(posts[member_rows])
  # build the word cloud from that merged document
  cloud = WordCloud(width = 2048, height = 1080, background_color = 'white',
                    collocations = False)
  cloud.generate(cluster_text)
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis("off")
  plt.show()
  # report how many posts landed in this cluster
  print("cluster size: ",member_rows.size)
  # list the topics of the first 10 posts in the cluster
  print("-"+"\n-".join(labels[member_rows][:10]))

## Word Cloud (TF-IDF)

In [None]:
import pandas as pd

In [None]:
# Word clouds weighted by TF-IDF, using an unstemmed vocabulary so the words
# shown are readable.
unstemed_vectorizer = TfidfVectorizer(stop_words='english')
vecs = unstemed_vectorizer.fit_transform(posts)
feature_names = unstemed_vectorizer.get_feature_names_out()

for cluster_id in range(0,num_clusters):
  matching_indices = (km.labels_== cluster_id)
  # Total TF-IDF weight of every term over this cluster's posts, summed on the
  # sparse matrix. Converting the whole documents-by-vocabulary matrix to a
  # dense DataFrame first would allocate a very large array only to add up a
  # few rows of it.
  cluster_vecs = vecs[matching_indices.nonzero()[0]]
  term_totals = np.asarray(cluster_vecs.sum(axis=0)).ravel()
  # word -> summed TF-IDF; used in place of raw word frequencies
  word_tfidf = dict(zip(feature_names, term_totals))
  word_cloud1 = WordCloud(collocations = False, background_color = 'white', max_words=60,
                          width = 2048, height = 1080).generate_from_frequencies(word_tfidf)
  plt.imshow(word_cloud1, interpolation='bilinear')
  plt.axis("off")
  plt.show()
  print("cluster size: ",posts[matching_indices].size)
  # topics of 10 posts drawn at random from the cluster
  print("-"+"\n-".join(txt for txt in np.random.choice(labels[matching_indices],10)))

In [None]:
# same as above but with stems (can be a bit harder to interpret)

# Stemmed vocabulary of the vectorizer that produced X_train.
feature_names = vectorizer.get_feature_names_out()

for cluster_id in range(0,num_clusters):
  matching_indices = (km.labels_== cluster_id)
  # Total TF-IDF weight of every stem over this cluster's posts, summed on the
  # sparse matrix. Densifying X_train (2805 x 31285 values) would need roughly
  # 700 MB as float64 just to add up a few rows.
  cluster_vecs = X_train[matching_indices.nonzero()[0]]
  term_totals = np.asarray(cluster_vecs.sum(axis=0)).ravel()
  # stem -> summed TF-IDF; used in place of raw word frequencies
  word_tfidf = dict(zip(feature_names, term_totals))
  word_cloud1 = WordCloud(collocations = False, background_color = 'white', max_words=60,
                          width = 2048, height = 1080).generate_from_frequencies(word_tfidf)
  plt.imshow(word_cloud1, interpolation='bilinear')
  plt.axis("off")
  plt.show()
  print("cluster size: ",posts[matching_indices].size)
  # topics of 10 posts drawn at random from the cluster
  print("-"+"\n-".join(txt for txt in np.random.choice(labels[matching_indices],10)))

## Your turn!

Predict the cluster of your own sentence and print the word cloud of that cluster to see if your output seems sensible.