# Clustering with the 20-Newsgroups Dataset
Authors: R. Edwards, J.Giles, N. Velzboer, E. Whitney and you! <br>
Purpose: Exploring different clustering approaches with the 20 newsgroup dataset, comprising of user questions and posts on different web forums (open source: http://qwone.com/~jason/20Newsgroups/)

In [48]:
# Import Relavent Libraries

## Python libraries 
import string
import time

## Data Manipulation
import pandas as pd
import numpy as np

## Plotting Data
import matplotlib.pyplot as plt

## Feature selection
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Dimensionality Reduction 
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from collections import Counter

## Clustering 
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture

## Get rid of pesky warnings
import warnings
warnings.filterwarnings('ignore')

## Set-up
Reading in data, creating pandas dataframe

In [23]:
# Sklearn maintains offers this data as part of their inbuilt learning sets.
#  We'll download the 'training' set,  however ignore the name 'train'. They 
# offer the data split into training and testing sets if you are trying to apply some classification approach 
# May take up to a minute to download
newsgroups_train = fetch_20newsgroups(subset='train')

In [24]:
# Pull data into pandas dataframe. Non essential, but pandas dataframes are nice to work with
df_text_ng=pd.DataFrame(data={'text':newsgroups_train.data}, columns=['text'])

# Take subset of dataset to prevent memory/kernel/processing issues
row_lim = 20
df_text_ng = df_text_ng.iloc[0:row_lim,:]

## Cleaning and pre-processing
We will apply the following pre-processing steps <br>
<ol>
<li> Remove unwanted characters
<li> Tokenisation and case sensitive 
<li> Stopwords
<li> Lemmitisation 
<ul>

In [25]:
# Standard NLTK stopwords list
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 
            'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 
            'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
            'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 
            'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 
            'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 
            'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 
            'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 
            'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 
            'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 
            'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 
            'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
            'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 
            'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
            "weren't", 'won', "won't", 'wouldn', "wouldn't"]
# Add extras you want to remove here
custom_stops = ['subject', 'from', 'to', 'article', 'summary','nntp', 'posting',
                'host' ]
# Join two lists
our_stopwords = stopwords + custom_stops


In [26]:
pd.set_option('display.max_colwidth', 1000)
# Lower case
df_text_ng['text_lower'] = df_text_ng['text'].str.lower()
# Remove \n (newlines), and strings with emails or .com IP style addresses
df_text_ng['text_noemails'] = df_text_ng['text_lower'].replace(r"\n"," ",
                              regex=True).replace(r"\S*[@.]\S*\s?","", regex=True)
# Remove anything except letters and spaces
df_text_ng['text_punct'] = df_text_ng['text_noemails'].replace(r"[^a-z ]"," ",
                              regex=True)
# Split string into lists of words on the whitespace
df_text_ng['text_tokens'] = df_text_ng['text_punct'].str.split()
# Remove words not in our prederived list
df_text_ng['text_stop'] = df_text_ng['text_tokens'].apply(lambda x: [word 
                             for word in x if word not in our_stopwords])
# Rejoin into a string 
df_text_ng['text_clean'] = df_text_ng['text_stop'].apply(lambda x: ' '.join([word for word in x]))

In [27]:
print("Original record 1: \n\n")
print(df_text_ng['text'][0:1])
print("\n")
print("Cleaned record 1: \n\n")
print(df_text_ng['text_clean'][0:1])

Original record 1: 


0    From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n
Name: text, dtype: object


Cleaned record 1: 


0    thing car organization university maryland college park lines wondering anyone could enlighten car saw door sports car looked late early called doors really addition front bump

## Feature engineering
Below we'll create the three possible sets that may be used (There are other <br>methods
but these are the voting options) <br>
<ol>
<li> Set 1: Bag of words with Unigrams
<li> Set 2: Bag of words with Unigrams and bigrams
<li> Set 3: TF-IDF with unigrams
</ol> <br>
Uncomment the set you would like to create

In [28]:
#Uncomment this cell for Set 1
bogS1 = CountVectorizer()
X = bogS1.fit_transform(df_text_ng['text_clean'])
print(X.toarray())
print("First 20 features:....\n")
print(bogS1.get_feature_names()[0:20])


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 2 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
First 20 features:....

['ab', 'able', 'abraham', 'abs', 'absolute', 'absurd', 'abuse', 'acceleration', 'acceptance', 'access', 'accessories', 'accident', 'accidental', 'accidentally', 'accidents', 'accuracy', 'achieved', 'acquisition', 'acrv', 'active']


In [29]:
#Uncomment this cell for Set 2
bogS2 = CountVectorizer(ngram_range=(1,2))
X = bogS2.fit_transform(df_text_ng['text_clean'])
print("First 20 features:....\n")
print(bogS2.get_feature_names()[0:20])

First 20 features:....

['ab', 'able', 'able amplify', 'able import', 'able point', 'abraham', 'abraham moses', 'abs', 'abs security', 'absolute', 'absolute best', 'absolute gets', 'absolute moral', 'absurd', 'absurd scsi', 'abuse', 'abuse tiff', 'acceleration', 'acceleration clock', 'acceleration higher']


In [30]:
#Uncomment this cell for Set 3
tfidfvect = TfidfVectorizer()
X = tfidfvect.fit_transform(df_text_ng['text_clean'])
print("First 20 features:....\n")
print(tfidfvect.get_feature_names()[0:20])

First 20 features:....

['ab', 'able', 'abraham', 'abs', 'absolute', 'absurd', 'abuse', 'acceleration', 'acceptance', 'access', 'accessories', 'accident', 'accidental', 'accidentally', 'accidents', 'accuracy', 'achieved', 'acquisition', 'acrv', 'active']


## Dimensionality reduction 

### LDA

In [31]:
n_components = 20

In [134]:
# LDA
print("Performing dimensionality reduction using LSA...")

start = time.time()

lda = LatentDirichletAllocation(n_components=n_components, random_state=0, evaluate_every=1)
lda_model = lda.fit(X)
X_DR = lda.fit_transform(X)

end = time.time()
duration = np.round(end - start)
print("LDA duration: %i" %(duration))
# plt.plot(X_DR.max(axis=1),'o')

Performing dimensionality reduction using LSA...
LDA duration: 0


### LSA

In [135]:
# LSA
## Prepare data for clustering

print("Performing dimensionality reduction using LSA...")

start = time.time()

# # Vectorizer results are normalized, which makes KMeans behave as
# # spherical k-means for better results. Since LSA/SVD results are
# # not normalized, we have to redo the normalization.
svd = TruncatedSVD(n_components=n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X_array = X.toarray()
X_DR = lsa.fit_transform(X)

end = time.time()
duration = np.round(end - start)
print("LDA duration: %i" %(duration))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
     int(explained_variance * 100)))

# Plot the explained variances
features = range(svd.n_components)

plt.bar(features, svd.explained_variance_ratio_, color='black')
plt.xlabel('LSA features')
plt.ylabel('variance %');


Performing dimensionality reduction using LSA...
LDA duration: 0
Explained variance of the SVD step: 100%


### PCA

In [35]:
# PCA

## Prepare data for clustering

print("Performing dimensionality reduction using PCA...")

start = time.time()

pca = PCA(n_components=n_components)
X_DR = pca.fit_transform(X.toarray())
principalDf = pd.DataFrame(data = X_DR)

print("Explained variance of the PCA: {}%".format(
     int(pca.explained_variance_ratio_.sum() * 100)))

end = time.time()
duration = np.round(end - start)
print("LDA duration: %i" %(duration))

# Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_, color='black')
plt.xlabel('PCA features')
plt.ylabel('variance %');
print('The shape of X_DR is: ', X_DR.shape)
#plt.xticks(features)


Performing dimensionality reduction using PCA...
Explained variance of the PCA: 99%
LDA duration: 0
The shape of X_DR is:  (20, 20)


## Clustering

### Kmeans

In [235]:
# Initialise Modelling Parameters

## Number of CLusters
K = 2

## Method for Initialisation
init='k-means++' ## k-means++ selects initial cluster centers for k-mean clustering in a smart way to speed up convergence.

## The number of initializations to perform. The best results are kept.
n_init=10

In [238]:
## Create model instance
model_instance = KMeans(n_clusters = K,
                        init=init,
                        n_init=n_init
                       )

## Fit data to model
model = model_instance.fit(X_DR)

## Get labels for evaluation
labels = model.labels_

### Gaussian Mixture Models

In [225]:
# Initialise Modelling Parameters

## Number of CLusters
K = 5

## String describing the type of covariance parameters to use.
covariance_type='full' ## Full means each component has its own general covariance matrix

## The number of initializations to perform. The best results are kept.
n_init=10

In [237]:
## Create model instance
model_instance = GaussianMixture(n_components=K
                                , n_init=n_init
                                , covariance_type=covariance_type
                                )

## Fit data to model
model = model_instance.fit(X_DR)

## Get labels for evaluation
labels = model.predict(X_DR)

### DBSCAN

In [231]:
# Initialise Modelling Parameters

## The maximum distance between two samples for one to be considered as in the neighborhood of the other.
eps = 1.3

## The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself.
min_samples = 2

In [241]:
## Create model instance
model_instance = DBSCAN(eps=eps
                        , min_samples=min_samples
                        )

## Fit data to model
model = model_instance.fit(X_DR)

## Get labels for evaluation
labels = model.labels_

## Evaluation

In [242]:
# Evaluation
## Silhouette Score
silhouette_score = metrics.silhouette_score(X_DR
                                            , labels
                                           )
print(silhouette_score)

## Davies-Bouldin Score
davies_bouldin_score = metrics.davies_bouldin_score(X_DR
                                                    , labels
                                                   )

print(davies_bouldin_score)

0.004528616739895276
2.069880711696186
