# Clustering

## Task 1: Data Preprocessing

In [1]:
import pandas as pd
import re

# Load the raw reviews and merge each review's summary and body into a single
# text column; a missing summary or body is treated as an empty string.
df = pd.read_csv('../datasets/reviews.csv').assign(
    full_review=lambda frame: frame["Summary"].fillna("") + " " + frame["Text"].fillna("")
)

# Non-greedy match of anything between "<" and the next ">" on the same line.
_HTML_TAG = re.compile(r"<.*?>")


def remove_html(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space."""
    return _HTML_TAG.sub(" ", text)

# Replace HTML tags in every merged review with spaces.
df["full_review"] = df["full_review"].map(remove_html)

# Quick sanity check: corpus size and one example review.
print("Number of reviews:", df.shape[0])
print("Sample review:\n", df["full_review"].iloc[4])


Number of reviews: 10000
Sample review:
 Great Taste . . . and I want to congratulate the graphic artist for putting the entire product name on such a small box.  The ad men must have really thought long and hard.  But seriously, I love the product. The taste was refreshing and I thought that the taste was pleasing with no aftertaste.  Not too sweet and Goldilocks would have stopped right there if there was a choice of 3.  Easy to use as you just pour the contents into a 16 oz bottle of water and shake.  Mixed well, no granulation, you really couldn't ask for anything more and Lipton like a good drug dealer knows what they are doing, give me a taste for free and have me for life.  5 stars.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stopwords held in a set; preprocess() tests tokens for membership in it.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in ``stop_words``, are
    dropped; the remaining tokens are lemmatised with ``lemmatizer``.
    """
    words = nltk.word_tokenize(text.lower())
    lemmas = (
        # Optional 2: lemmatize
        lemmatizer.lemmatize(word)
        for word in words
        # Keep only letters, and (Optional 1) remove stopwords
        if word.isalpha() and word not in stop_words
    )
    return " ".join(lemmas)

# Build the cleaned-text column used by the Bag-of-Words representation.
df["clean_review"] = df["full_review"].map(preprocess)
print("Cleaned review sample:\n", df["clean_review"].iloc[0])

Cleaned review sample:
 crunchy good sandwich cooky tried couple brand sandwich cooky best bunch crunchy true texture real cooky might think filling make bit sweet mean satisfied sweet tooth sooner chocolate version glutino good true chocolatey taste something brand


Feature representations

1. Bag of Words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Optional 3: remove rare and overly common terms.
# min_df=50 drops terms that occur in fewer than 50 reviews;
# max_df=0.70 drops terms that occur in more than 70% of reviews.
vectorizer = CountVectorizer(max_df=0.70, min_df=50)

X_BoW = vectorizer.fit_transform(df["clean_review"])
print("Bag-of-Words matrix shape:", X_BoW.shape)
#bow_df = pd.DataFrame(X_BoW.toarray(), columns=vectorizer.get_feature_names_out())
#bow_df.head()


Bag-of-Words matrix shape: (10000, 1144)


2. BERT

In [5]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model 'all-MiniLM-L6-v2'
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute sentence embeddings from the "full_review" column (the merged review text),
# not from the tokenised/lemmatised "clean_review" column
embeddings = model.encode(df["full_review"].tolist(), show_progress_bar=True)

# Convert to numpy array if needed
import numpy as np
X_bert = np.array(embeddings)

print("BERT embedding shape:", X_bert.shape)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 313/313 [00:08<00:00, 36.53it/s]


BERT embedding shape: (10000, 384)


3. UMAP

In [6]:
from umap import UMAP
from sklearn.preprocessing import StandardScaler

# Standardise the embedding features before handing them to UMAP.
X_scaled = StandardScaler().fit_transform(X_bert)

# Reduce to 10D. UMAP is stochastic, so the seed is fixed: without it the reduced
# embedding, and every clustering score computed on it, changes from run to run.
umap = UMAP(n_components=10, random_state=42)
X_bert_UMAP = umap.fit_transform(X_scaled)
print("BERT reduced embedding shape:", X_bert_UMAP.shape)



BERT reduced embedding shape: (10000, 10)


## Task 2: Clustering and Evaluation

1. K-means clustering

In [7]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

def kmeans_clustering(X, k=10):
    """Cluster ``X`` with K-means and score the partition.

    Parameters
    ----------
    X : feature matrix passed to ``KMeans.fit_predict`` and ``silhouette_score``.
    k : int
        Number of clusters.

    Returns
    -------
    tuple
        ``(silhouette, labels)``.
    """
    # Fixed seed, like the GMM and spectral helpers, so repeated runs of the
    # grid search give the same labels and silhouette.
    kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
    labels = kmeans.fit_predict(X)
    sil = silhouette_score(X, labels)
    return sil, labels


2. Agglomerative clustering

In [8]:
from sklearn.cluster import AgglomerativeClustering

def agglomerative_clustering(X, k=10, linkage='complete', metric="euclidean"):
    """Cluster ``X`` hierarchically and score the partition.

    ``k``, ``linkage`` and ``metric`` are forwarded to
    ``AgglomerativeClustering``. Note that ``silhouette_score`` is called
    without a metric argument, so the score does not follow ``metric``.

    Returns
    -------
    tuple
        ``(silhouette, labels)``.
    """
    settings = {"n_clusters": k, "linkage": linkage, "metric": metric}
    assignments = AgglomerativeClustering(**settings).fit_predict(X)
    quality = silhouette_score(X, assignments)
    return quality, assignments


3. Gaussian Mixture Models

In [9]:
from sklearn.mixture import GaussianMixture

def gmm_clustering(X, k=10):
    """Fit a ``k``-component Gaussian mixture (seed 42) to ``X`` and score it.

    Returns
    -------
    tuple
        ``(silhouette, labels)``, where ``labels`` comes from ``fit_predict``.
    """
    mixture = GaussianMixture(random_state=42, n_components=k)
    assignments = mixture.fit_predict(X)
    return silhouette_score(X, assignments), assignments


4. Spectral clustering

In [10]:
from sklearn.cluster import SpectralClustering

def spectral_clustering(X, k=10, affinity='rbf', n_neighbors=10):
    """Cluster ``X`` with spectral clustering and score the partition.

    Parameters
    ----------
    X : feature matrix passed to ``SpectralClustering.fit_predict``.
    k : int
        Number of clusters.
    affinity : str
        Forwarded to ``SpectralClustering``.
    n_neighbors : int
        Forwarded to ``SpectralClustering``.

    Returns
    -------
    tuple
        ``(silhouette, labels)`` on success. On any failure the error is
        printed and ``(None, None)`` is returned, so callers that unpack the
        result (as ``test_model`` does) see a missing score instead of hitting
        a ``TypeError`` on a bare ``None``.
    """
    try:
        sc = SpectralClustering(n_clusters=k, affinity=affinity, random_state=42, assign_labels='kmeans', n_neighbors=n_neighbors)
        labels = sc.fit_predict(X)
        sil = silhouette_score(X, labels)
    except Exception as e:
        # Best-effort: one failing configuration must not abort a whole grid search.
        print(f"Spectral clustering with k={k} failed: {e}")
        return None, None
    return sil, labels


5. DBSCAN clustering

In [11]:
from sklearn.cluster import DBSCAN

def dbscan_clustering(X, eps_value=0.3, min_samples=5, metric="euclidean"):
    """Cluster ``X`` with DBSCAN and score the partition.

    Parameters
    ----------
    X : feature matrix passed to ``DBSCAN.fit_predict``.
    eps_value : float
        Passed to ``DBSCAN`` as ``eps``.
    min_samples : int
        Forwarded to ``DBSCAN``.
    metric : str
        Forwarded to ``DBSCAN``.

    Returns
    -------
    tuple
        ``(silhouette, labels)`` when more than one cluster (not counting the
        noise label ``-1``) is found. The silhouette is computed on the full
        label vector, noise points included. Otherwise a message is printed
        and ``(None, None)`` is returned, so callers that unpack the result
        (as ``test_model`` does) see a missing score instead of hitting a
        ``TypeError`` on a bare ``None``.
    """
    db = DBSCAN(eps=eps_value, min_samples=min_samples, metric=metric)
    labels = db.fit_predict(X)
    # Label -1 marks noise and does not count as a cluster.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    if n_clusters <= 1:
        print(f"[DBSCAN] with eps={eps_value}, insufficient clusters (only 1 or noise).")
        return None, None
    sil = silhouette_score(X, labels)
    return sil, labels


## Compare everything

In [5]:
from itertools import product

results = []

def test_model(model_func, X, param_grid):
    best_score = -1
    best_params = {}
    all_combinations = list(product(*param_grid.values()))

    for combo in all_combinations:
        params = dict(zip(param_grid.keys(), combo))
        try:
            score, _ = model_func(X, **params)
            if score is not None and score > best_score:
                best_score = score
                best_params = params
            results.append({
                "data_name": data_name,
                "model_name": model_name,
                "params": params,
                "silhouette": score
            })
            print(f"Params: {params}, silhouette: {score:.4f}" if score else f"Params: {params}, failed")
        except Exception as e:
            results.append({
                "data_name": data_name,
                "model_name": model_name,
                "params": params,
                "silhouette": None,
                "error": str(e)
            })
            print(f"Params: {params}, error: {e}")

    return best_params, best_score

In [13]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

# Feature matrices to evaluate in this run, keyed by representation name.
# Comment entries in or out to choose which representations are tested.
X_dict = {
    # "UMAP": X_bert_UMAP,
    "BoW": X_BoW,
    # "BERT": X_bert,
}

# Per-representation options; "normalize_input" decides whether the matrix is
# passed through sklearn's normalize() before clustering.
X_param_grid = {
    "BoW": {"normalize_input": False},
    "BERT": {"normalize_input": True},
    "UMAP": {"normalize_input": False}
}

# Maps a model name to the clustering helper that implements it.
model_functions = {
    "kmeans": kmeans_clustering,
    "agglomerative": agglomerative_clustering,
    "gmm": gmm_clustering,
    "spectral": spectral_clustering,
    "dbscan": dbscan_clustering
}

# Hyper-parameter grids, keyed by model name. Each inner key is a keyword
# argument of the matching clustering helper and each value is the list of
# candidates to try. Only the models that are not commented out are run.
parameters = {
    # "kmeans": {
    #     "k": [4, 6, 8, 10],
    # },
    # "gmm": {
    #     "k": [4, 6, 8, 10],
    # },
    "agglomerative": {
        "k": [4, 6, 8, 10],
        "linkage": ["complete", "average", "single"],
        "metric": ["euclidean", "manhattan", "cosine"],
    },
    # "spectral": {
    #     "k": [4, 6, 8, 10],
    #     "affinity": ["rbf", "nearest_neighbors"],
    #     #"n_neighbors": [10, 20, 30],
    # },
    "dbscan": {
        "eps_value": [0.3, 0.5, 0.7],
        "min_samples": [3, 8],
        "metric": ["euclidean", "manhattan"],
    }
}



# Run every configured model on every configured feature matrix.
# `data_name` and `model_name` are also read as globals by test_model().
for data_name, X in X_dict.items():
    normalize_flag = X_param_grid.get(data_name)["normalize_input"]
    if normalize_flag:
        X = normalize(X)
    for model_name, param_grid in parameters.items():
        print(f"\nTesting {model_name} on {data_name}")
        model_func = model_functions[model_name]

        # The sparse BoW matrix is densified for agglomerative clustering and GMM.
        if data_name == "BoW" and model_name in ("agglomerative", "gmm"):
            X1 = X.toarray()
        else:
            X1 = X

        best_params, best_score = test_model(model_func, X1, param_grid)
        print(f"Best for {model_name} on {data_name}: {best_params} with silhouette={best_score:.4f}")
        print("========================================")

# Save results: append this run's rows to any previously saved results.
new_df = pd.DataFrame(results)
try:
    df_results = pd.read_csv("performance_results.csv")
except FileNotFoundError:
    # First run: there is no earlier file to extend.
    df_results = new_df
else:
    # Keep the combined frame under `df_results`, not `df`: `df` holds the
    # reviews, and the combined frame is the one that must be written back.
    df_results = pd.concat([df_results, new_df], ignore_index=True)

df_results.to_csv("performance_results.csv", index=False)
print("Results saved to 'performance_results.csv'")



Testing agglomerative on BoW
Params: {'k': 4, 'linkage': 'complete', 'metric': 'euclidean'}, silhouette: 0.7321
Params: {'k': 4, 'linkage': 'complete', 'metric': 'manhattan'}, silhouette: 0.7514
Params: {'k': 4, 'linkage': 'complete', 'metric': 'cosine'}, silhouette: -0.1609
Params: {'k': 4, 'linkage': 'average', 'metric': 'euclidean'}, silhouette: 0.7698
Params: {'k': 4, 'linkage': 'average', 'metric': 'manhattan'}, silhouette: 0.6909
Params: {'k': 4, 'linkage': 'average', 'metric': 'cosine'}, silhouette: -0.1987
Params: {'k': 4, 'linkage': 'single', 'metric': 'euclidean'}, silhouette: 0.7517
Params: {'k': 4, 'linkage': 'single', 'metric': 'manhattan'}, silhouette: 0.6909
Params: {'k': 4, 'linkage': 'single', 'metric': 'cosine'}, silhouette: -0.2188
Params: {'k': 6, 'linkage': 'complete', 'metric': 'euclidean'}, silhouette: 0.7165
Params: {'k': 6, 'linkage': 'complete', 'metric': 'manhattan'}, silhouette: 0.7120
Params: {'k': 6, 'linkage': 'complete', 'metric': 'cosine'}, silhouette:

KeyboardInterrupt: 

In [24]:
# Load the accumulated results and turn them into a LaTeX table.
df_results = pd.read_csv("performance_results.csv")
# The "error" column only exists when a run has failed; tolerate its absence.
df_results = df_results.drop(columns=["error"], errors="ignore")
# Rows without a silhouette are failed runs and are left out of the table.
df_results = df_results.dropna(subset=["silhouette"])
df_results['silhouette'] = df_results['silhouette'].round(3)
df_results['model_name'] = df_results['model_name'].str.replace('_clustering', '', regex=False)

# Rounding alone still renders as 0.372000; format floats to three decimals.
print(df_results.to_latex(float_format="%.3f"))

\begin{tabular}{llllr}
\toprule
 & data_name & model_name & params & silhouette \\
\midrule
0 & UMAP & kmeans & {'k': 4} & 0.372000 \\
1 & UMAP & kmeans & {'k': 6} & 0.434000 \\
2 & UMAP & kmeans & {'k': 8} & 0.455000 \\
3 & UMAP & kmeans & {'k': 10} & 0.405000 \\
4 & UMAP & gmm & {'k': 4} & 0.509000 \\
5 & UMAP & gmm & {'k': 6} & 0.429000 \\
6 & UMAP & gmm & {'k': 8} & 0.465000 \\
7 & UMAP & gmm & {'k': 10} & 0.424000 \\
8 & UMAP & agglomerative & {'k': 4, 'linkage': 'complete', 'metric': 'euclidean'} & 0.352000 \\
9 & UMAP & agglomerative & {'k': 4, 'linkage': 'complete', 'metric': 'manhattan'} & 0.355000 \\
10 & UMAP & agglomerative & {'k': 4, 'linkage': 'complete', 'metric': 'cosine'} & 0.516000 \\
11 & UMAP & agglomerative & {'k': 4, 'linkage': 'average', 'metric': 'euclidean'} & 0.500000 \\
12 & UMAP & agglomerative & {'k': 4, 'linkage': 'average', 'metric': 'manhattan'} & 0.521000 \\
13 & UMAP & agglomerative & {'k': 4, 'linkage': 'average', 'metric': 'cosine'} & 0.523000 \\
14 