# Word2Vec Article Clustering

## Preparing Environment

In [4]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK stopword list used to build `custom_stopwords` later on.
# NOTE(review): `word_tokenize` normally needs NLTK tokenizer data as well, which is
# not downloaded here — confirm it is already present in the target environment.
nltk.download("stopwords")

# Seed every random number generator this notebook touches directly.
SEED = 42
random.seed(SEED)
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here should only reach child processes, not the running kernel — export it before
# launching Jupyter if hash-stable runs are required.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading data

In [8]:
# Load the raw articles; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('articles_data.csv')
# Peek at the first two rows to check the columns were parsed.
df_raw.head(2)

Unnamed: 0.1,Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,The National Transportation Safety Board said ...,https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,WASHINGTON (Reuters) - The National Transporta...,0.0,0.0,0.0,2528.0,0.0
1,1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,The States jobless rate fell to 5.2 per cent l...,0.0,6.0,10.0,2.0,0.0


## Preparing data

In [6]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw document and split it into filtered tokens.

    Args:
        text: Raw text; it is coerced with ``str`` before any processing.
        tokenizer: Callable that receives the cleaned string and returns tokens.
        stopwords: Container of tokens to drop (only membership tests are used).

    Returns:
        List of tokens that are not stopwords, are not purely digits, and are
        longer than one character.
    """
    # Ordered (pattern, replacement) pairs; the order matters because later
    # patterns run on the output of earlier ones.
    substitutions = (
        (r"\[(.*?)\]", ""),  # drop "[+XYZ chars]" markers
        (r"\s+", " "),  # collapse whitespace runs into one space
        (r"\w+…|…", ""),  # drop the ellipsis and the word cut off by it
        (r"(?<=\w)-(?=\w)", " "),  # split hyphenated words
        (f"[{re.escape(string.punctuation)}]", ""),  # strip ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in tokenizer(cleaned):
        if token in stopwords:
            continue
        if token.isdigit() or len(token) <= 1:
            continue
        kept.append(token)
    return kept

In [9]:
# English stopwords plus a few corpus-specific boilerplate terms.
custom_stopwords = set(stopwords.words("english") + ["news", "new", "top"])
text_columns = ["title", "description", "content"]

df = df_raw.copy()

# Blank out missing values in *every* text column before casting to str.
# Casting first can turn a missing title/description into the literal string
# "nan", which would then survive cleaning as a bogus token.
for col in text_columns:
    df[col] = df[col].fillna("").astype(str)

# Create text column based on title, description, and content
df["text"] = df[text_columns].apply(lambda x: " | ".join(x), axis=1)
df["tokens"] = df["text"].map(lambda x: clean_text(x, word_tokenize, custom_stopwords))

# Remove documents whose token lists are duplicated after preprocessing.
# np.unique returns the positions of the first occurrence of each distinct
# token list, ordered by the sorted token lists, so the rows are reordered too.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx, :]

# Remove empty values and keep relevant columns
df = df.loc[df.tokens.map(lambda x: len(x) > 0), ["text", "tokens"]]

docs = df["text"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (10437, 15)
Pre-processed dataframe: (9882, 2)


In [12]:
def vectorize(list_of_docs, model):
    """Generate one vector per document by averaging its word embeddings.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Gensim's Word Embedding; only ``model.wv`` (membership test and
            lookup by token) and ``model.vector_size`` are used.

    Returns:
        List with one vector per document, in input order. A document with no
        token present in ``model.wv`` gets a zero vector of length
        ``model.vector_size``.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already guarantees the lookup succeeds, so no
        # KeyError handling is needed around it.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.mean(vectors, axis=0))
        else:
            # Allocate a fresh zero vector per document so callers never end
            # up sharing (and accidentally mutating) one array.
            features.append(np.zeros(model.vector_size))
    return features

## Training model

In [10]:
# Train 100-dimensional embeddings on the tokenized corpus with a fixed seed.
# NOTE(review): workers=1 is presumably chosen for repeatable training — confirm;
# fully repeatable runs may also need PYTHONHASHSEED set before the kernel starts.
model = Word2Vec(sentences=tokenized_docs, vector_size=100, workers=1, seed=SEED)

In [11]:
# Sanity check: list the nearest neighbours of one token in the trained embedding.
model.wv.most_similar("trump")

[('trumps', 0.9885429739952087),
 ('president', 0.9746477603912354),
 ('donald', 0.9274908304214478),
 ('ivanka', 0.9203856587409973),
 ('impeachment', 0.9195814728736877),
 ('pences', 0.9152246713638306),
 ('avlon', 0.914821207523346),
 ('biden', 0.9146034121513367),
 ('breitbart', 0.9144060015678406),
 ('vice', 0.906724750995636)]

In [13]:
# Build one averaged embedding per document.
vectorized_docs = vectorize(tokenized_docs, model=model)
# Show (number of documents, vector length) as a quick shape check.
len(vectorized_docs), len(vectorized_docs[0])

(9882, 100)

In [14]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans. The default
            (None) keeps the previous behaviour of not passing a seed.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")

    # The overall silhouette coefficient is the mean of the per-sample values,
    # so when the per-sample values are needed anyway, compute them once
    # instead of running the pairwise-distance computation twice.
    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        silhouette_avg = sample_silhouette_values.mean()
    else:
        sample_silhouette_values = None
        silhouette_avg = silhouette_score(X, labels)

    print(f"Silhouette coefficient: {silhouette_avg:0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.shape[0] == 0:
                # A centroid with no assigned samples has no statistics to
                # report; min()/max() on an empty array would raise.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

In [15]:
# Fit the clustering on the document vectors and print per-cluster silhouettes.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs, k=50, mb=500, print_silhouette_values=True
)

# One row per document: raw text, space-joined tokens, and assigned cluster id.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": [" ".join(doc_tokens) for doc_tokens in tokenized_docs],
        "cluster": cluster_labels,
    }
)

For n_clusters = 50
Silhouette coefficient: 0.11
Inertia:3571.503939450964
Silhouette values:
    Cluster 38: Size:55 | Avg:0.37 | Min:0.00 | Max: 0.55
    Cluster 35: Size:75 | Avg:0.32 | Min:0.08 | Max: 0.50
    Cluster 39: Size:95 | Avg:0.27 | Min:0.02 | Max: 0.46
    Cluster 9: Size:93 | Avg:0.27 | Min:-0.00 | Max: 0.44
    Cluster 25: Size:94 | Avg:0.25 | Min:-0.06 | Max: 0.48
    Cluster 24: Size:56 | Avg:0.24 | Min:-0.03 | Max: 0.47
    Cluster 6: Size:180 | Avg:0.24 | Min:-0.11 | Max: 0.51
    Cluster 3: Size:152 | Avg:0.23 | Min:-0.06 | Max: 0.45
    Cluster 46: Size:110 | Avg:0.22 | Min:-0.10 | Max: 0.47
    Cluster 29: Size:34 | Avg:0.22 | Min:-0.05 | Max: 0.45
    Cluster 42: Size:174 | Avg:0.22 | Min:-0.00 | Max: 0.39
    Cluster 4: Size:108 | Avg:0.19 | Min:-0.13 | Max: 0.41
    Cluster 12: Size:45 | Avg:0.19 | Min:0.02 | Max: 0.39
    Cluster 36: Size:196 | Avg:0.17 | Min:-0.03 | Max: 0.37
    Cluster 27: Size:84 | Avg:0.16 | Min:-0.05 | Max: 0.33
    Cluster 19: Size:40

In [16]:
print("Most representative terms per cluster (based on centroids):")
# Iterate over the centroids the fitted model actually has rather than a
# hard-coded count, so this cell keeps working when k is changed above.
for i in range(len(clustering.cluster_centers_)):
    # Vocabulary entries closest to the centroid in embedding space.
    most_representative = model.wv.most_similar(
        positive=[clustering.cluster_centers_[i]], topn=5
    )
    # Each term is followed by a space, matching the original line format.
    tokens_per_cluster = "".join(f"{t[0]} " for t in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: december plunged total decided baker 
Cluster 1: wireless economists jpmorgan export sentiment 
Cluster 2: tweet praised insults prisoner criticized 
Cluster 3: delay mps referendum leo jo 
Cluster 4: serial trying shocked passenger contained 
Cluster 5: opens television orleans corps nationwide 
Cluster 6: charleston islands flooding ravaged carolinas 
Cluster 7: closed mission region missiles blamed 
Cluster 8: militants probable iraqi baghdad targeting 
Cluster 9: squad warm foursomes qualifying argentina 
Cluster 10: success usually fitness product edge 
Cluster 11: escalation responses assembly resume lift 
Cluster 12: putin rouhani vladimir presidents tayyip 
Cluster 13: aides ukrainian vizcarra congressional volodymyr 
Cluster 14: land keeping evil japanese highly 
Cluster 15: abused founding longtime resolution coup 
Cluster 16: stabbing fatally charged neighbor murdering 
Cluster 17: body murdering apartmen

In [17]:
test_cluster = 29

# Rank every document by Euclidean distance to the chosen centroid, closest first.
most_representative_docs = np.linalg.norm(
    vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1
).argsort()

# Show the three documents nearest to the centroid.
for d in most_representative_docs[:3]:
    print(docs[d], "-------------", sep="\n")

The Latest: Pakistan predicts bloodbath in Kashmir | Get breaking national and world news, broadcast video coverage, and exclusive interviews. Find the top news online at ABC news. | The Latest on the U.N. General Assembly's annual gathering of world leaders (all times local):
11:34 a.m.
Pakistani Prime Minister Imran Khan has denounced India's crackdown in Kashmir and warned of a "bloodbath" in the disputed region.
Khan said Friday at… [+3543 chars]
-------------
Embattled Israeli PM fights for survival in do-over election | Get breaking national and world news, broadcast video coverage, and exclusive interviews. Find the top news online at ABC news. | A visibly frantic Prime Minister Benjamin Netanyahu is in the fight of his political life as the country heads to national elections for the second time this year.
With Netanyahu locked in a razor tight race and facing the likelihood of criminal corruption c… [+7219 chars]
-------------
Israeli PM convenes Cabinet in West Bank ahead of 