In [10]:
import numpy as np

In [1]:
#Loading, stemming and creating new data set
from sklearn.datasets import fetch_20newsgroups
categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
groups = fetch_20newsgroups(subset='all',categories=categories)

labels = groups.target
label_names = groups.target_names
def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
        return True
from nltk.corpus import names
all_names = set(names.words())
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
data_cleaned = []
for doc in groups.data:
    doc = doc.lower()
    doc_cleaned = ' '.join(lemmatizer.lemmatize(word) for
        word in doc.split() if is_letter_only(word)
        and word not in all_names)
    data_cleaned.append(doc_cleaned)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english",max_features=None, max_df=0.5, min_df=2)
data = count_vector.fit_transform(data_cleaned)

In [3]:
#clustering
from sklearn.cluster import KMeans
k = 4
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(data)

KMeans(n_clusters=4, random_state=42)

In [4]:
clusters = kmeans.labels_
from collections import Counter
print(Counter(clusters))


Counter({0: 3365, 3: 12, 2: 7, 1: 3})


count-based features are not sufficiently representative so  the better approach is to use a  frequency-inverse
document frequency (tf-idf)

##### replace CountVectorizer with TfidfVectorizer from scikit-learn

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vector = TfidfVectorizer(stop_words='english',max_features=None, max_df=0.5, min_df=2)

In [6]:
data = tfidf_vector.fit_transform(data_cleaned)
kmeans.fit(data)
clusters = kmeans.labels_

In [7]:
print(Counter(clusters))

Counter({3: 1580, 1: 1026, 2: 720, 0: 61})


In [13]:
#"""Examining waht the  clusters contain and finding the top 10 terms """

cluster_label = {i: labels[np.where(clusters == i)] for i in range(k)}
terms = tfidf_vector.get_feature_names()
kmeans.fit(data)
centroids = kmeans.cluster_centers_
for cluster, index_list in cluster_label.items():
        counter = Counter(cluster_label[cluster])
        print('cluster_{}: {} samples'.format(cluster, len(index_list)))
        for label_index, count in sorted(counter.items(),key=lambda x: x[1], reverse=True):
            print('{}: {} samples'.format(label_names[label_index], count))
        print('Top 10 terms:')
        for ind in centroids[cluster].argsort()[-10:]:
            print(' %s' % terms[ind], end="")
        print()


cluster_0: 61 samples
sci.space: 61 samples
Top 10 terms:
 dunn resembles svr3 work utzoo zoo spencer zoology toronto henry
cluster_1: 1026 samples
alt.atheism: 638 samples
talk.religion.misc: 381 samples
sci.space: 5 samples
comp.graphics: 2 samples
Top 10 terms:
 morality don sandvik jesus say christian people com wa god
cluster_2: 720 samples
sci.space: 700 samples
comp.graphics: 10 samples
talk.religion.misc: 7 samples
alt.atheism: 3 samples
Top 10 terms:
 launch moon alaska shuttle gov digex wa access nasa space
cluster_3: 1580 samples
comp.graphics: 961 samples
talk.religion.misc: 240 samples
sci.space: 221 samples
alt.atheism: 158 samples
Top 10 terms:
 know wa nntp host posting graphic file com image university


#### Topic modeling using NMF


In [14]:
from sklearn.decomposition import NMF
t = 20
nmf = NMF(n_components=t, random_state=42)

In [15]:
data = count_vector.fit_transform(data_cleaned)

In [16]:
nmf.fit(data)



NMF(n_components=20, random_state=42)

In [17]:
nmf.components_

array([[0.05199448, 0.        , 0.00016396, ..., 0.        , 0.        ,
        0.00066426],
       [0.00028246, 0.        , 0.00103722, ..., 0.        , 0.00046909,
        0.00136239],
       [0.        , 0.        , 0.00016499, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.00315889, 0.        , 0.01103702, ..., 0.00100776, 0.00735967,
        0.00162324],
       [0.        , 0.        , 0.00024811, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00456599, ..., 0.        , 0.00022907,
        0.        ]])

In [18]:
terms = count_vector.get_feature_names()

In [20]:
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic {}:" .format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))

    
    
    
    

Topic 0:
free program quality version format color gif file image jpeg
Topic 1:
graphics image server ftp pub file send graphic ray mail
Topic 2:
doe christian people believe belief religious religion atheism god atheist
Topic 3:
wa venus atmosphere sun surface moon solar spacecraft earth planet
Topic 4:
program include ha user software analysis processing data tool image
Topic 5:
shall wa said son unto christ mcconkie father lord god
Topic 6:
data venture service year ha market commercial space satellite launch
Topic 7:
day christian psalm people said prophecy ha wa matthew jesus
Topic 8:
program software format ha sgi package ftp available image data
Topic 9:
unified space motion book star physicist physical universe theory larson
Topic 10:
research sci group international national telescope satellite shuttle list space
Topic 11:
jpl available mission astronaut shuttle center data gov space nasa
Topic 12:
year magi new zarathushtra war book did time people wa
Topic 13:
form ha false 

#### Topic modeling using LDA

In [21]:
from sklearn.decomposition import LatentDirichletAllocation

In [22]:
t = 20

In [23]:
lda = LatentDirichletAllocation(n_components=t, learning_method='batch',random_state=42)

In [24]:
data = count_vector.fit_transform(data_cleaned)

In [25]:
lda.fit(data)

LatentDirichletAllocation(n_components=20, random_state=42)

In [26]:
lda.components_

array([[2.78477238, 0.05      , 0.05      , ..., 0.05      , 0.05      ,
        0.05      ],
       [0.05      , 0.05      , 1.24460302, ..., 0.05      , 0.05      ,
        0.05      ],
       [0.05      , 0.05      , 0.3395871 , ..., 0.05      , 0.05      ,
        0.05      ],
       ...,
       [0.05000002, 0.05      , 0.05      , ..., 0.05      , 0.05      ,
        0.05      ],
       [0.05      , 0.05      , 0.05      , ..., 0.05      , 0.05      ,
        0.05      ],
       [0.05      , 0.05      , 1.02314724, ..., 0.05      , 0.05      ,
        0.05      ]])

In [27]:
terms = count_vector.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic {}:" .format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))

Topic 0:
say uk know ha life matthew just brian wa jesus
Topic 1:
oort nntp host distribution search know posting just gopher gamma
Topic 2:
conference nntp posting cost host year space wa digex access
Topic 3:
graphic software available ftp data program format jpeg file image
Topic 4:
computer help looking graphic know thanks posting university nntp host
Topic 5:
point host nntp posting solntze wpd livesey sgi com wa
Topic 6:
atheist doe morality moral religion think don god say people
Topic 7:
host know lis posting wa space university uiuc god cobb
Topic 8:
science send national cs center mail sci gov nasa space
Topic 9:
time people think like keith caltech just wa don com
Topic 10:
book ray star god wa energy physical larson universe theory
Topic 11:
don say jesus ha people know christian bible wa god
Topic 12:
wa ha zoology zoo satellite space spencer launch toronto henry
Topic 13:
rushdie shall ha christ people lord jesus law god wa
Topic 14:
know host nntp people posting just don