# K-means Clustering on text

See also the scikit-learn overview of clustering text documents with k-means:

https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#kmeans-sparse-high-dim


In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and convergence warnings raised by libraries — consider narrowing the filter.
warnings.filterwarnings("ignore")
%matplotlib inline

# Let pandas show up to 100 characters per cell before truncating text columns.
pd.set_option("display.max_colwidth",100)

## Load subset of "20 Newsgroups" dataset

In [2]:
# The six newsgroups used in this demo (a subset of the "20 Newsgroups" topics).
categories = [
    "misc.forsale",
    "sci.space",
    "sci.med",
    "talk.politics.misc",
    "rec.sport.baseball",
    "comp.graphics",
]

# Fetch the train split, then the test split, with identical settings;
# only `subset` differs between the two calls.
twenty_train, twenty_eval = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True)
    for split in ("train", "test")
)

# what's missing here?

In [3]:
# Number of documents in the training split.
len(twenty_train.data)

3418

In [4]:
# Number of documents in the test split (loaded with subset='test').
len(twenty_eval.data)

2276

In [5]:
# Distinct newsgroup names in the loaded subset (np.unique returns them sorted).
np.unique(twenty_train.target_names)

array(['comp.graphics', 'misc.forsale', 'rec.sport.baseball', 'sci.med',
       'sci.space', 'talk.politics.misc'], dtype='<U18')

In [6]:

# not really unsupervised ... but let's say we don't have these labels:
# (the true newsgroup ids are never passed to KMeans; they are kept only so the
# clusters can be compared against the real topics afterwards)
y_train = twenty_train.target

In [7]:
# Class balance: share of training documents carrying each true label.
# Names are padded to the longest one so the percentage column lines up
# (a fixed width of 15 is too narrow for e.g. "rec.sport.baseball").
name_width = max(len(name) for name in twenty_train.target_names)

for label in np.unique(y_train):
    print(f"Label {label} {twenty_train.target_names[label]:{name_width}}: {np.mean(y_train==label):.2%}")

Label 0 comp.graphics  : 17.09%
Label 1 misc.forsale   : 17.12%
Label 2 rec.sport.baseball: 17.47%
Label 3 sci.med        : 17.38%
Label 4 sci.space      : 17.35%
Label 5 talk.politics.misc: 13.60%


In [8]:
# Lookup from integer target id to newsgroup name. `enumerate` yields (id, name)
# pairs, so dict(enumerate(...)) gives {id: name} directly, with no loop
# variables that could be named the wrong way round.
topic_id2name = dict(enumerate(twenty_train.target_names))
print(topic_id2name)

{0: 'comp.graphics', 1: 'misc.forsale', 2: 'rec.sport.baseball', 3: 'sci.med', 4: 'sci.space', 5: 'talk.politics.misc'}


## Normalize and vectorize documents

In [9]:
# TF-IDF features: drop English stop words and ignore terms that occur in fewer
# than 10 training documents.
vectorizer = TfidfVectorizer(min_df=10, stop_words="english")

# fit_transform learns the vocabulary / IDF weights and vectorizes in a single pass,
# instead of tokenizing the whole training corpus twice (fit, then transform).
# The fitted `vectorizer` stays available for transforming other text later.
X = vectorizer.fit_transform(twenty_train.data)

X.shape

(3418, 6238)

In [10]:
# How many clusters should we ask for? In a truly unsupervised setting we would not
# know; this class demo uses 6, which matches the number of newsgroups loaded above.
kmeans = KMeans(
    n_clusters=6,
    n_init=1,
    random_state=0,
    verbose=True,
)

kmeans.fit(X)

Initialization complete
Iteration 0, inertia 6445.491086675302.
Iteration 1, inertia 3307.184025042653.
Iteration 2, inertia 3292.6499970125924.
Iteration 3, inertia 3283.777641972384.
Iteration 4, inertia 3279.0314059114044.
Iteration 5, inertia 3275.5375117967696.
Iteration 6, inertia 3271.3028846330203.
Iteration 7, inertia 3263.752860750047.
Iteration 8, inertia 3257.8943726723082.
Iteration 9, inertia 3256.452873363556.
Iteration 10, inertia 3255.618908543882.
Iteration 11, inertia 3255.1160465818266.
Iteration 12, inertia 3254.8624397862513.
Iteration 13, inertia 3254.7396221075674.
Iteration 14, inertia 3254.5817563951623.
Iteration 15, inertia 3254.5384648002537.
Iteration 16, inertia 3254.5138577641546.
Iteration 17, inertia 3254.5038798755313.
Converged at iteration 17: strict convergence.


In [11]:
# NOTE(review): KMeans.transform is documented as mapping X into cluster-distance space,
# so each row presumably holds the distance from one document to each centroid
# (smaller = closer) rather than a similarity "score" — confirm against the scikit-learn docs.
cluster_scores  = kmeans.transform(X) 

cluster_scores.shape

(3418, 6)

In [12]:
# Per-cluster values for a single, arbitrarily chosen document (row 42).
cluster_scores[42,:]

array([0.96548728, 1.00111135, 0.99948482, 1.05125624, 1.16804748,
       1.01998042])

In [13]:
# but what are we predicting?
# predict() yields one integer per document: the id of the cluster it is assigned to.
# These ids are cluster indices, not newsgroup labels.
cluster_id   = kmeans.predict(X) 
cluster_id.shape

(3418,)

In [14]:
# Contingency table: true newsgroup (rows) against assigned cluster (columns),
# with row/column totals in the "All" margins.
true_topics = pd.Series(y_train, name="True label").map(topic_id2name)
assigned_clusters = pd.Series(cluster_id, name="Cluster assignment")

pd.crosstab(index=true_topics, columns=assigned_clusters, margins=True)

Cluster assignment,0,1,2,3,4,5,All
True label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
comp.graphics,1,13,558,1,0,11,584
misc.forsale,5,11,560,7,0,2,585
rec.sport.baseball,504,16,74,2,0,1,597
sci.med,0,355,149,5,78,7,594
sci.space,0,37,127,90,0,339,593
talk.politics.misc,6,417,35,1,0,6,465
All,516,849,1503,106,78,366,3418


In [15]:
# Cluster that received most of the baseball posts in the crosstab above.
# Despite the name, this is a cluster id, not a newsgroup label.
BASEBALL_LABEL = 0

# Keep the raw documents in a NumPy array so they can be picked with a boolean mask.
texts = np.array(twenty_train.data)

in_baseball_cluster = cluster_id == BASEBALL_LABEL

# Preview the first three documents assigned to that cluster (first 500 chars each).
for text in texts[in_baseball_cluster][:3]:
    print("======================", text[:500], sep="\n")

From: sheehan@aludra.usc.edu (Joseph Sheehan)
Subject: Re: Young Catchers
Organization: University of Southern California, Los Angeles, CA
Lines: 120
NNTP-Posting-Host: aludra.usc.edu
Summary: Lopez is better than current Brave catchers!
Keywords: Solid != good

>In article <mssC50qA5.Dtv@netcom.com> mss@netcom.com (Mark Singer) writes:
>>In article <1993Apr5.151834.14257@cs.cornell.edu> tedward@cs.cornell.edu (Edward [Ted] Fischer) writes:

I'm still catching up from Spring Break, but bear with
From: stlouis@unixg.ubc.ca (Phill St. Louis)
Subject: Billy Taylor a Brave or Jay?
Organization: University of British Columbia, Vancouver, B.C., Canada
Lines: 8
Distribution: world
NNTP-Posting-Host: unixg.ubc.ca

Does anyone know where Billy Taylor is?  Richmond or Syracuse?  He was taken
by the Jays in the Rule V draft, but not kept on the roster.  Baseball Weekly
said that he was demoted to Syracuse, but a Toronto paper indicated that
the Braves took him back.  Is there an Atlanta fan, or a

## Topic similarity

Do we have a way to review which clusters are "similar"? 

In [16]:
# One centroid row per cluster, with one column per TF-IDF feature of X.
kmeans.cluster_centers_.shape

(6, 6238)

In [17]:
# Index of the reference cluster (note this refers to a cluster, not a labeled class).
CLUSTER_ID = 0

# Scale every centroid to unit L2 length, so that a dot product between two rows
# is their cosine similarity.
centroids = kmeans.cluster_centers_
centroid_lengths = np.linalg.norm(centroids, ord=2, axis=1, keepdims=True)
centroids_normed = centroids / centroid_lengths

some_centroid = centroids_normed[CLUSTER_ID]

print(some_centroid.shape)

(6238,)


In [18]:
# Which clusters are closest to the reference cluster (CLUSTER_ID, here cluster zero)?
# Both operands were scaled to unit L2 length, so each entry is a cosine similarity:
# higher means more similar, and the reference cluster scores 1.0 against itself.
np.dot(some_centroid, centroids_normed.T)

array([1.        , 0.53044333, 0.5268671 , 0.24080106, 0.1811999 ,
       0.34367238])

In [19]:
# Cosine similarity between the reference centroid and every cluster centroid.
# These are similarities, not distances: a higher value means a closer cluster.
similarities = np.dot(some_centroid, centroids_normed.T)

for other_cluster, similarity in enumerate(similarities):
    print(f"{other_cluster}: {similarity:.4f}")

0: 1.0000
1: 0.5304
2: 0.5269
3: 0.2408
4: 0.1812
5: 0.3437


## Peek at cluster results - "Subject"

In [20]:
# strip out "Subject" line as quick summary
def _extract_subject(text):
    """Return a post's subject without the "Subject:" prefix or leading "Re:" markers.

    Only the first line that starts with "Subject:" is used, so lines that merely
    mention the word "Subject" are never picked up. Returns "" when the post has
    no such line, instead of raising IndexError.
    """
    for line in text.split("\n"):
        if line.startswith("Subject:"):
            subject = line[len("Subject:"):].strip()
            # Drop reply markers only at the front ("Re: Re: foo" -> "foo"); a "Re:"
            # in the middle of a subject is part of the title and is kept.
            while subject.startswith("Re:"):
                subject = subject[len("Re:"):].strip()
            return subject
    return ""


subjects = [_extract_subject(text) for text in texts]

# One row per training document: its subject line and the cluster it was assigned to.
df_subjects = pd.DataFrame({
    "texts":subjects,
    "cluster_label":cluster_id,
})

# showing sample of 10 rows from 3 clusters
# (picking 3 to keep it small and readable)
# The shifted labels are built with .assign() on a new frame rather than by mutating
# a filtered/sampled frame in place, which avoids pandas' chained-assignment pitfalls.
# NOTE: the ids shown here are 1-based, i.e. one higher than the 0-based cluster ids
# used everywhere else in this notebook.
df_samp = (
    df_subjects[df_subjects.cluster_label.isin([0,1,2])]
    .sample(10, random_state=102)
    .assign(cluster_label=lambda df: df.cluster_label + 1)  # for display - start index at 1 not zero
)
df_samp

Unnamed: 0,texts,cluster_label
2849,FOR SALE: CHEAP LOGIC BOARDS!!! (update),3
2286,"Ampex 456 2"" Recording Tape For Sale",3
2083,The Bob Dylan Baseball Abstract,1
2974,Patient-Physician Diplomacy,2
3296,"Defensive Averages 1988-1992, Third Base",1
3042,Dana-Faber Cancer Institute,3
14,Ryan rumor...,1
2923,MS-Windows graphics viewer?,3
1842,Jack Morris,1
3409,Candida Albicans: what is it?,3


In [21]:
# Raw cluster assignments: one 0-based cluster id per training document.
cluster_id

array([1, 5, 0, ..., 1, 1, 0], dtype=int32)