In [1]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
import gensim.downloader as api

from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score

# Fetch the NLTK resources needed for stopword removal and tokenization.
for nltk_resource in ("stopwords", "punkt"):
    nltk.download(nltk_resource)

# Seed every random source used by this notebook from one constant.
SEED = 42
# NOTE(review): the running interpreter presumably reads PYTHONHASHSEED only at
# startup, so this assignment likely affects child processes only - confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/trkosire/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/trkosire/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def clean_text(text, tokenizer, stopwords):
    """Normalize a raw string and split it into filtered tokens.

    Args:
        text: Raw value to process; it is converted with ``str`` first.
        tokenizer: Callable that turns the cleaned string into tokens.
        stopwords: Container of tokens that must be discarded.

    Returns:
        List of tokens that are not stopwords, are not made only of
        digits, and are longer than one character.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        (r"\[(.*?)\]", ""),  # Drop bracketed fragments such as [+XYZ chars]
        (r"\s+", " "),  # Collapse runs of whitespace
        (r"\w+…|…", ""),  # Drop ellipses (and the word cut off before them)
        (r"(?<=\w)-(?=\w)", " "),  # Split words joined by a dash
        (punctuation_class, ""),  # Strip ASCII punctuation
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # A token survives only if it passes every filter.
    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]


# English stopwords handed to clean_text, plus the name of the text column.
custom_stopwords = set(stopwords.words("english"))
text_columns = ["label"]

# Work on a copy so the raw frame stays available for comparison below.
df_raw = pd.read_csv("final_data_exploration.csv")
df = df_raw.copy()

# Missing labels become empty strings and everything is coerced to str
# before tokenizing.
df["label"] = df["label"].fillna("").astype(str)
df["tokens"] = df["label"].map(
    lambda label: clean_text(label, word_tokenize, custom_stopwords)
)

# Drop rows whose token lists are duplicates. np.unique yields the index of
# the first occurrence of each distinct token list, ordered by the sorted
# token lists, so the rows are also reordered here.
_, idx = np.unique(df["tokens"], return_index=True)
df = df.iloc[idx]

# Keep only rows that still have tokens, and only the relevant columns.
df = df.loc[df["tokens"].map(bool), ["label", "tokens"]]

docs = df["label"].values
tokenized_docs = df["tokens"].values

print(f"Original dataframe: {df_raw.shape}")
print(f"Pre-processed dataframe: {df.shape}")

Original dataframe: (23301, 7)
Pre-processed dataframe: (5718, 2)


In [5]:
# Train the embedding on the token lists. workers=1 and seed=SEED are
# presumably set so that training is repeatable - confirm.
model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    workers=1,
    seed=SEED,
)

In [9]:
# Sanity check of the embedding: nearest neighbours of one known term.
model.wv.most_similar(positive=["grinder"], topn=10)

[('machine', 0.708974301815033),
 ('saw', 0.688444197177887),
 ('tool', 0.608670711517334),
 ('welding', 0.5565173029899597),
 ('workpiece', 0.5380322337150574),
 ('drill', 0.5340321660041809),
 ('cutter', 0.5282828211784363),
 ('handheld', 0.5199535489082336),
 ('hammer', 0.5189434289932251),
 ('drilling', 0.5157067179679871)]

In [10]:
def vectorize(list_of_docs, model):
    """Generate vectors for list of documents using a Word Embedding

    Each document is represented by the mean of the embeddings of its
    in-vocabulary tokens. A document with no in-vocabulary tokens gets a
    zero vector of length ``model.vector_size``.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Gensim's Word Embedding. It must expose ``wv`` (supporting
            ``in`` and item lookup) and ``vector_size``.

    Returns:
        List of document vectors, one per input document, in input order.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already guards the lookup, so no KeyError
        # handler is needed around it.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Allocate the fallback only when it is used; a fresh array per
            # document keeps the returned vectors independent of each other.
            features.append(np.zeros(model.vector_size))
    return features

# Embed every tokenized document, then display
# (number of documents, length of the first document vector).
vectorized_docs = vectorize(list_of_docs=tokenized_docs, model=model)
(len(vectorized_docs), len(vectorized_docs[0]))

(5718, 100)

In [11]:
def mbkmeans_clusters(
    X,
    k,
    mb,
    print_silhouette_values,
    random_state=None,
):
    """Generate clusters and print Silhouette metrics using MBKmeans

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Optional seed forwarded to MiniBatchKMeans. The
            default (None) keeps the estimator's own default behaviour.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(
        n_clusters=k, batch_size=mb, random_state=random_state
    ).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")

    # The per-sample silhouettes are computed once; the overall coefficient
    # is their mean, so the pairwise-distance pass is not run a second time.
    sample_silhouette_values = silhouette_samples(X, labels)
    print(f"Silhouette coefficient: {sample_silhouette_values.mean():0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # A cluster with no assigned samples has no silhouette
                # statistics; min()/max() would raise on the empty array.
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

In [14]:
# Cluster the document vectors and print the silhouette diagnostics.
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs, k=10, mb=500, print_silhouette_values=True
)

# One row per document: original text, space-joined tokens, assigned cluster.
df_clusters = pd.DataFrame(
    {
        "text": docs,
        "tokens": list(map(" ".join, tokenized_docs)),
        "cluster": cluster_labels,
    }
)

  super()._check_params_vs_input(X, default_n_init=3)


For n_clusters = 10
Silhouette coefficient: 0.07
Inertia:10.056236125048681
Silhouette values:
    Cluster 3: Size:118 | Avg:0.33 | Min:-0.09 | Max: 0.50
    Cluster 8: Size:42 | Avg:0.31 | Min:0.09 | Max: 0.52
    Cluster 2: Size:179 | Avg:0.29 | Min:0.04 | Max: 0.51
    Cluster 7: Size:102 | Avg:0.26 | Min:0.07 | Max: 0.39
    Cluster 5: Size:129 | Avg:0.22 | Min:0.02 | Max: 0.38
    Cluster 6: Size:66 | Avg:0.19 | Min:-0.05 | Max: 0.38
    Cluster 9: Size:164 | Avg:0.17 | Min:-0.04 | Max: 0.32
    Cluster 1: Size:3246 | Avg:0.06 | Min:-0.16 | Max: 0.25
    Cluster 0: Size:749 | Avg:0.02 | Min:-0.15 | Max: 0.12
    Cluster 4: Size:923 | Avg:-0.01 | Min:-0.15 | Max: 0.08


In [16]:
print("Most representative terms per cluster (based on centroids):")
# Iterate over the fitted centroids rather than a hard-coded range(10), so the
# report always matches the number of clusters the model was trained with.
for i, centroid in enumerate(clustering.cluster_centers_):
    # Vocabulary terms whose embeddings are closest to this centroid.
    most_representative = model.wv.most_similar(positive=[centroid], topn=5)
    tokens_per_cluster = "".join(f"{term} " for term, _ in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

Most representative terms per cluster (based on centroids):
Cluster 0: saw machine tool grinder drill 
Cluster 1: saw machine drill tool grinder 
Cluster 2: tool machine saw drill grinder 
Cluster 3: machine saw tool grinder drill 
Cluster 4: drill machine saw tool pres 
Cluster 5: metal machine saw tool drill 
Cluster 6: lock machine drill saw tool 
Cluster 7: cutter machine saw drilling tool 
Cluster 8: bulb light machine drill head 
Cluster 9: screw saw machine drill head 
