## Clustering text using NLP and K-means clustering

In [1]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# The stop-word list feeds the stop-word filter; word_tokenize additionally needs
# the Punkt tokenizer models ("punkt" on older NLTK releases, "punkt_tab" on newer
# ones). Fetch them all so the notebook also runs in a fresh environment instead
# of failing with a LookupError at tokenization time.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the raw articles dataset (path is relative to the working directory).
ARTICLES_CSV = "dataset/articles_data.csv"
df_raw = pd.read_csv(ARTICLES_CSV)

In [23]:
# Preview the first rows of the raw data to see which columns are available.
# NOTE(review): this cell's execution count (23) is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
df_raw.head()

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,WASHINGTON (Reuters) - The National Transporta...,0.0,0.0,0.0,2528.0,0.0
1,1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,The States jobless rate fell to 5.2 per cent l...,0.0,6.0,10.0,2.0,0.0
2,2,the-irish-times,The Irish Times,Deirdre McQuillan,"Louise Kennedy AW2019: Long coats, sparkling t...",Autumn-winter collection features designer’s g...,https://www.irishtimes.com/\t\t\t\t\t\t\t/life...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T14:40:00Z,Louise Kennedy is showing off her autumn-winte...,1.0,,,,
3,3,al-jazeera-english,Al Jazeera English,Al Jazeera,North Korean footballer Han joins Italian gian...,Han is the first North Korean player in the Se...,https://www.aljazeera.com/news/2019/09/north-k...,https://www.aljazeera.com/mritems/Images/2019/...,2019-09-03T17:25:39Z,"Han Kwang Song, the first North Korean footbal...",0.0,0.0,0.0,7.0,0.0
4,4,bbc-news,BBC News,BBC News,UK government lawyer says proroguing parliamen...,"The UK government's lawyer, David Johnston arg...",https://www.bbc.co.uk/news/av/uk-scotland-4956...,https://ichef.bbci.co.uk/news/1024/branded_new...,2019-09-03T14:39:21Z,,0.0,0.0,0.0,0.0,0.0


## Data preprocessing

In [4]:
_CUSTOM_STOPWORDS = None  # Lazily built stop-word set, shared by all calls


def _get_custom_stopwords():
    """Return NLTK's English stop words plus corpus-specific extras, built once."""
    global _CUSTOM_STOPWORDS
    if _CUSTOM_STOPWORDS is None:
        # Loading the NLTK word list for every document is wasteful; cache it.
        _CUSTOM_STOPWORDS = frozenset(stopwords.words("english")) | {
            "news",
            "new",
            "top",
        }
    return _CUSTOM_STOPWORDS


def preprocess_text(corpus):
    """Pre-process corpus and generate tokens

    The text is lowercased, bracketed fragments such as "[+913 chars]" are
    dropped, whitespace is collapsed, ellipses (and the truncated word in front
    of them) are removed, intra-word dashes become spaces and punctuation is
    stripped. The result is tokenized, and stop words, purely numeric tokens
    and tokens of length <= 1 are discarded.

    Args:
        corpus: corpus to tokenize. Any value is accepted; it is converted
            with ``str()`` first.

    Returns:
        Tokenized corpus (list of lowercase string tokens).
    """
    text = str(corpus).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation

    custom_stopwords = _get_custom_stopwords()
    # Single pass: drop stop words, digit-only tokens and short tokens
    return [
        token
        for token in word_tokenize(text)
        if token not in custom_stopwords and not token.isdigit() and len(token) > 1
    ]

In [5]:
text_columns = ["title", "description", "content"]

# Work on a copy so df_raw stays untouched. Blank out missing values in *every*
# text column, not just "content": the later astype(str) would otherwise turn a
# missing title/description into the literal string "nan", which then leaks
# into the article text and its tokens.
df = df_raw.copy()
df[text_columns] = df[text_columns].fillna("")

df["content"]

0        WASHINGTON (Reuters) - The National Transporta...
1        The States jobless rate fell to 5.2 per cent l...
2        Louise Kennedy is showing off her autumn-winte...
3        Han Kwang Song, the first North Korean footbal...
4                                                         
                               ...                        
10432    Growth in the U.S. economys vast services sect...
10433    ZURICH/HONG KONG (Reuters) - The announcement ...
10434                                                     
10435                                                     
10436                                                     
Name: content, Length: 10437, dtype: object

In [6]:
# Cast all text columns to str in one go, then inspect the resulting schema.
df = df.astype({col: str for col in text_columns})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10437 entries, 0 to 10436
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       10437 non-null  int64  
 1   source_id                        10437 non-null  object 
 2   source_name                      10437 non-null  object 
 3   author                           9417 non-null   object 
 4   title                            10437 non-null  object 
 5   description                      10437 non-null  object 
 6   url                              10436 non-null  object 
 7   url_to_image                     9781 non-null   object 
 8   published_at                     10436 non-null  object 
 9   content                          10437 non-null  object 
 10  top_article                      10435 non-null  float64
 11  engagement_reaction_count        10319 non-null  float64
 12  engagement_comment

In [7]:
# Build one "article" string per row: title, description and content joined by " | "
df["article"] = [
    " | ".join(parts)
    for parts in df[text_columns].itertuples(index=False, name=None)
]
df["article"]

0        NTSB says Autopilot engaged in 2018 California...
1        Unemployment falls to post-crash low of 5.2% |...
2        Louise Kennedy AW2019: Long coats, sparkling t...
3        North Korean footballer Han joins Italian gian...
4        UK government lawyer says proroguing parliamen...
                               ...                        
10432    Drop in US service sector activity raises econ...
10433    Banker defections pose challenge for Credit Su...
10434    A 5-year-old cancer survivor donates 3,000 toy...
10435    Fateful Connection | A detective is haunted by...
10436    Love, Hate & Obsession | Who wanted one-time m...
Name: article, Length: 10437, dtype: object

In [8]:
# Clean and tokenize every article; each row ends up holding a list of tokens.
df["tokens"] = df["article"].map(preprocess_text)
df["tokens"]

0        [ntsb, says, autopilot, engaged, california, t...
1        [unemployment, falls, post, crash, low, latest...
2        [louise, kennedy, aw2019, long, coats, sparkli...
3        [north, korean, footballer, han, joins, italia...
4        [uk, government, lawyer, says, proroguing, par...
                               ...                        
10432    [drop, us, service, sector, activity, raises, ...
10433    [banker, defections, pose, challenge, credit, ...
10434    [year, old, cancer, survivor, donates, toys, c...
10435    [fateful, connection, detective, haunted, case...
10436    [love, hate, obsession, wanted, one, time, mil...
Name: tokens, Length: 10437, dtype: object

In [9]:
# Remove duplicates after preprocessing. np.unique reports the index of the first
# occurrence of each distinct token list, ordered by the sorted values, so the
# rows come out re-ordered rather than in their original order.
_, first_seen = np.unique(df["tokens"], return_index=True)
df = df.iloc[first_seen]

# Keep only rows that still have tokens, and only the columns used downstream
has_tokens = df["tokens"].map(len) > 0
df = df.loc[has_tokens, ["article", "tokens"]]

print(f"Original data: {df_raw.shape}")
print(f"Pre-processed data: {df.shape}")

Original data: (10437, 15)
Pre-processed data: (9882, 2)


### Looking at the vocabulary

In [10]:
from collections import Counter 

In [11]:
# Raw article texts and their token lists, kept as parallel arrays
docs = df["article"].values
tokenized_docs = df["tokens"].values

# Corpus-wide token frequencies
vocab = Counter(token for doc_tokens in tokenized_docs for token in doc_tokens)

In [12]:
# Show the ten most frequent tokens left after preprocessing.
vocab.most_common(10)

[('us', 2757),
 ('said', 2519),
 ('year', 1781),
 ('president', 1756),
 ('trump', 1705),
 ('world', 1620),
 ('says', 1511),
 ('one', 1418),
 ('two', 1284),
 ('first', 1195)]

### Vectorization using word embeddings

In [13]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Each document is represented by the mean of the embeddings of its tokens
    that are present in the model's vocabulary. Documents without any known
    token are represented by a zero vector of length ``model.vector_size``.

    Args:
        list_of_docs: List of documents, each one an iterable of tokens.
        model: Gensim's Word Embedding; ``model.wv`` is used for membership
            tests and vector lookups, ``model.vector_size`` for the fallback.

    Returns:
        List of document vectors (one per input document, in input order).
    """
    keyed_vectors = model.wv
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Fresh array per document so the fallbacks never alias each other
            features.append(np.zeros(model.vector_size))
    return features

In [14]:
# Train a Word2Vec embedding on the tokenized articles. The fixed seed and the
# single worker are presumably there to keep training repeatable.
model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    workers=1,
    seed=42,
)

In [15]:
# Sanity-check the embedding: list the nearest neighbours of "trump".
model.wv.most_similar("trump")

[('trumps', 0.988541841506958),
 ('president', 0.9746480584144592),
 ('donald', 0.9274919629096985),
 ('ivanka', 0.9203823804855347),
 ('impeachment', 0.9195769429206848),
 ('pences', 0.9152195453643799),
 ('avlon', 0.9148270487785339),
 ('biden', 0.9146018624305725),
 ('breitbart', 0.9143953323364258),
 ('vice', 0.9067230224609375)]

In [16]:
# Embed every document, then show the number of documents and the vector length.
vectorized_docs = vectorize(tokenized_docs, model=model)
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

In [17]:
# NOTE(review): this cell re-defines vectorize, which is already defined in an
# earlier cell of this notebook; one of the two definitions can be dropped.
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Each document is represented by the mean of the embeddings of its tokens
    that are present in the model's vocabulary. Documents without any known
    token are represented by a zero vector of length ``model.vector_size``.

    Args:
        list_of_docs: List of documents, each one an iterable of tokens.
        model: Gensim's Word Embedding; ``model.wv`` is used for membership
            tests and vector lookups, ``model.vector_size`` for the fallback.

    Returns:
        List of document vectors (one per input document, in input order).
    """
    keyed_vectors = model.wv
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Fresh array per document so the fallbacks never alias each other
            features.append(np.zeros(model.vector_size))
    return features

In [18]:
# NOTE(review): same call, data and parameters as the earlier Word2Vec training
# cell; this retrains and rebinds `model` after vectorized_docs was already
# computed from the first model. Consider removing one of the two cells.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=42)

In [19]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False, random_state=42):
    """Generate clusters.

    Fits a MiniBatchKMeans model on X and prints the number of clusters, the
    overall silhouette coefficient and the inertia. Optionally prints size and
    average/min/max silhouette value per cluster, best average first.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Seed forwarded to MiniBatchKMeans so that cluster ids are
            stable between runs. Defaults to 42; pass None for a
            non-deterministic initialisation.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_

    # The overall silhouette coefficient is the mean of the per-sample values,
    # so compute the (expensive) per-sample values once and reuse them.
    sample_silhouette_values = silhouette_samples(X, labels)

    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        cluster_stats = []
        for cluster_id in range(k):
            values = sample_silhouette_values[labels == cluster_id]
            if values.size == 0:
                # A cluster that received no samples has no silhouette values;
                # min()/max() on an empty array would raise.
                continue
            cluster_stats.append(
                (
                    cluster_id,
                    values.shape[0],
                    values.mean(),
                    values.min(),
                    values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first
        cluster_stats.sort(key=lambda stat: stat[2], reverse=True)
        for s in cluster_stats:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [20]:
# Cluster the document vectors into 50 groups and print silhouette statistics
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=50,
    print_silhouette_values=True,
)

# One row per document: raw text, space-joined tokens and assigned cluster id
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

  f"MiniBatchKMeans is known to have a memory leak on "


For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3556.5088681217776
Silhouette values:
    Cluster 1: Size:55 | Avg:0.36 | Min:-0.00 | Max: 0.56
    Cluster 36: Size:30 | Avg:0.34 | Min:0.03 | Max: 0.54
    Cluster 22: Size:78 | Avg:0.33 | Min:0.08 | Max: 0.51
    Cluster 5: Size:105 | Avg:0.33 | Min:0.03 | Max: 0.53
    Cluster 27: Size:78 | Avg:0.29 | Min:-0.15 | Max: 0.49
    Cluster 10: Size:149 | Avg:0.28 | Min:-0.05 | Max: 0.52
    Cluster 18: Size:52 | Avg:0.28 | Min:-0.07 | Max: 0.50
    Cluster 29: Size:101 | Avg:0.26 | Min:-0.01 | Max: 0.47
    Cluster 35: Size:68 | Avg:0.22 | Min:-0.00 | Max: 0.39
    Cluster 21: Size:131 | Avg:0.21 | Min:-0.07 | Max: 0.42
    Cluster 7: Size:197 | Avg:0.20 | Min:-0.05 | Max: 0.40
    Cluster 44: Size:68 | Avg:0.18 | Min:-0.05 | Max: 0.37
    Cluster 3: Size:106 | Avg:0.17 | Min:-0.01 | Max: 0.34
    Cluster 33: Size:133 | Avg:0.17 | Min:-0.09 | Max: 0.36
    Cluster 41: Size:110 | Avg:0.16 | Min:-0.01 | Max: 0.37
    Cluster 14: Siz

In [21]:
print("Top terms per cluster (based on centroids):")
# Iterate over the fitted centroids rather than a hard-coded count, so this cell
# stays correct when the number of clusters is changed.
for i, centroid in enumerate(clustering.cluster_centers_):
    # Words whose embeddings are closest to the cluster centroid
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    tokens_per_cluster = "".join(f"{term} " for term, _similarity in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: beirut hughes drugmaker interfax shore 
Cluster 1: sharpie noaa forecasters assertions claim 
Cluster 2: studio perhaps score sequel retirement 
Cluster 3: leo pm delay varadkar coveney 
Cluster 4: pointed argument fentanyl terror blamed 
Cluster 5: category humberto landfall charleston wrath 
Cluster 6: suspend occupied parties swinson matteo 
Cluster 7: popularity access ai tips likes 
Cluster 8: aides undermine erdogan delegation vizcarra 
Cluster 9: jury amber knife officers neighbour 
Cluster 10: islands tornadoes coastal flooding charleston 
Cluster 11: daughter assaulting disappearance sexually indiana 
Cluster 12: lift escalation crisis wider imports 
Cluster 13: planned resolve presented fargo boeing 
Cluster 14: zelensky volodymyr whistleblowers ukrainian impeach 
Cluster 15: panel tweet cnnin declaration strategist 
Cluster 16: appearances weekends decade haul century 
Cluster 17: winning summer goals takes wta 
Cluster 

In [22]:
# Rank all documents by Euclidean distance to the chosen cluster's centroid and
# print the ten closest ones.
test_cluster = 40
centroid = clustering.cluster_centers_[test_cluster]
distances = np.linalg.norm(np.asarray(vectorized_docs) - centroid, axis=1)
most_representative_docs = np.argsort(distances)

for doc_idx in most_representative_docs[:10]:
    print(docs[doc_idx])
    print("-------------")

Nationwide raids target Russian opposition activists after Moscow poll, says Alexei Navalny | Russian authorities have staged nationwide raids on the regional offices of opposition leader Alexei Navalny, as well as the homes of dozens of staff and supporters, his team said Thursday. | 
-------------
Credit Suisse Executive Steps Down Amid Spying Scandal | Pierre-Olivier Bouée, the chief executive officer, had hired an investigator to track the head of wealth management who left for a rival bank. | Urs Rohner, the chairman of Credit Suisse, said in a news conference Tuesday morning that the surveillance of Mr. Khan was wrong. 
The measure that was taken was disproportionate and did not reflect the criteria and standards by which we measure our own work… [+913 chars]
-------------
New EU chief 'to sleep in her office' | Ursula von der Leyen's is planning to live in a tiny apartment attached to her Brussels office, her staff has told AFP. | The incoming European Commission President Ursu