Load and clean newsgroup data

In [8]:
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]
groups = fetch_20newsgroups(subset='all', categories=categories)
labels = groups.target
label_names = groups.target_names
def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True

from nltk.corpus import names
all_names = set(names.words())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
data_cleaned = []
for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for word in doc.split() if is_letter_only(word) and word not in all_names)
    data_cleaned.append(doc_cleaned)

Convert cleaned data into count vectors. We use minimum and maximum document frequency

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english", max_features=None, max_df=0.5, min_df=2)
data = count_vector.fit_transform(data_cleaned)

Cluster data into four groups

In [10]:
from sklearn.cluster import KMeans
k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

Check sizes of resulting clusters

In [11]:
clusters = kmeans.labels_
from collections import Counter
print(Counter(clusters))

Counter({3: 3360, 0: 17, 1: 7, 2: 3})


Note that most samples congested into one big cluster (3)

TfIdfVectorizer diminishes the weight of common terms occuring frequently, qnd emphasizes terms that rarely occur

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=None, max_df=0.5, min_df=2)
data = tfidf_vector.fit_transform(data_cleaned)
kmeans.fit(data)
clusters = kmeans.labels_
print(Counter(clusters))

Counter({3: 1480, 0: 782, 1: 615, 2: 510})


Examine each cluster, what they contain and the top 10 terms

In [14]:
import numpy as np
cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}
terms = tfidf_vector.get_feature_names()
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
    counter = Counter(cluster_label[cluster])
    print('cluster_{}: {} samples'.format(cluster, len(index_list)))
    for label_index, count in sorted(counter.items(), key = lambda x: x[1], reverse=True):
        print('{}: {} samples'.format(label_names[label_index], count))
    print('Top 10 terms:')
    for ind in centroids[cluster].argsort()[-10:]:
        print(' %s' % terms[ind], end="")
    print()

cluster_0: 782 samples
comp.graphics: 733 samples
sci.space: 44 samples
alt.atheism: 4 samples
talk.religion.misc: 1 samples
Top 10 terms:
 computer need know thanks looking university program file graphic image
cluster_1: 615 samples
alt.atheism: 365 samples
talk.religion.misc: 247 samples
comp.graphics: 2 samples
sci.space: 1 samples
Top 10 terms:
 article moral think morality jesus people wa say christian god
cluster_2: 510 samples
sci.space: 508 samples
alt.atheism: 1 samples
comp.graphics: 1 samples
Top 10 terms:
 just zoology moon hst nasa mission wa launch shuttle space
cluster_3: 1480 samples
sci.space: 434 samples
alt.atheism: 429 samples
talk.religion.misc: 380 samples
comp.graphics: 237 samples
Top 10 terms:
 new people think know like ha just university article wa
