In [9]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
import torch
from transformers import RobertaTokenizer, RobertaModel
import tools as tl
from tqdm import tqdm

### Checking for Apple Silicon GPU (MPS) support (Macs only)

In [10]:
# Sanity check for PyTorch's Metal (MPS) backend: when it is available,
# allocate a one-element tensor on that device and show it; otherwise just
# report that the backend is missing.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


### Loading dataset

In [11]:
# Fetch every post of the 20 newsgroups corpus, asking scikit-learn to strip
# headers, footers and quoted replies from each message.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Translate the integer targets into their newsgroup names up front, then
# build the frame with one row per post.
label_names = [newsgroups.target_names[idx] for idx in newsgroups.target]
df = pd.DataFrame({'text': newsgroups.data, 'label': label_names})

# Optional: to experiment with only a couple of groups, filter here, e.g.
#   df[df['label'].isin(['rec.motorcycles', 'rec.sport.baseball'])]

# Work with the first 2000 posts only.
df = df.iloc[:2000]


In [12]:
# Number of sampled posts per newsgroup, most frequent first
df.loc[:, 'label'].value_counts()

label
soc.religion.christian      120
comp.windows.x              118
rec.motorcycles             114
comp.sys.mac.hardware       111
sci.electronics             109
comp.sys.ibm.pc.hardware    109
sci.crypt                   106
rec.sport.hockey            104
sci.space                   103
comp.os.ms-windows.misc     103
sci.med                     102
comp.graphics               102
rec.sport.baseball          101
rec.autos                    93
alt.atheism                  88
talk.politics.guns           88
talk.politics.mideast        87
misc.forsale                 87
talk.religion.misc           78
talk.politics.misc           77
Name: count, dtype: int64

### Creating Embeddings

In [13]:
# Load the tokenizer and encoder weights from the same pretrained checkpoint
checkpoint = 'distilroberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

# Embed every post with the project helper in `tools`
# (NOTE(review): batching and pooling strategy live in tl.generate_embeddings
# and are not visible from this notebook).
print("Generating embeddings...")
texts = df['text'].tolist()
embeddings = tl.generate_embeddings(texts, tokenizer, model)
print("Embeddings generated!")

Generating embeddings...


Generating Embeddings: 100%|██████████| 125/125 [01:58<00:00,  1.05batch/s]

Embeddings generated!





### Clustering 

In [17]:
print("Reducing dimensionality and clustering...")
# Ask the project helper for 20 clusters (one per newsgroup in the label
# counts above) and attach the assignment to each post.
df['cluster'] = clusters = tl.perform_clustering(embeddings, n_clusters=20)
print("Clustering complete!")

Reducing dimensionality and clustering...
Clustering complete!


In [18]:

# Show how many posts landed in each cluster
print("Cluster distribution:")
print(df['cluster'].value_counts())

# Save the text/cluster pairs to a CSV for further analysis.
# NOTE(review): the folder is named "amazon_reviews" although this notebook
# clusters the 20 newsgroups data; the path is kept unchanged so existing
# consumers of the file keep working, but confirm it is intentional.
output_path = '../outputs/amazon_reviews/clustered_reviews.csv'

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

df[['text', 'cluster']].to_csv(output_path, index=False)
print("Results saved to clustered_reviews.csv")

Cluster distribution:
cluster
11    190
6     148
13    133
14    131
4     128
9     128
1     127
5     120
10    113
19    112
18    108
3     107
15    101
0      81
2      75
8      53
7      47
12     39
17     31
16     28
Name: count, dtype: int64
Results saved to clustered_reviews.csv


In [19]:
# True label distribution again, for comparison with the cluster sizes printed above
df['label'].value_counts(sort=True)

label
soc.religion.christian      120
comp.windows.x              118
rec.motorcycles             114
comp.sys.mac.hardware       111
sci.electronics             109
comp.sys.ibm.pc.hardware    109
sci.crypt                   106
rec.sport.hockey            104
sci.space                   103
comp.os.ms-windows.misc     103
sci.med                     102
comp.graphics               102
rec.sport.baseball          101
rec.autos                    93
alt.atheism                  88
talk.politics.guns           88
talk.politics.mideast        87
misc.forsale                 87
talk.religion.misc           78
talk.politics.misc           77
Name: count, dtype: int64

In [20]:
from sklearn.metrics import adjusted_rand_score

# Ground truth is the newsgroup name in 'label'; the prediction is the
# cluster id stored in 'cluster'.
true_labels, predicted_clusters = df['label'], df['cluster']

# Adjusted Rand Index between the two partitions: chance-corrected, so values
# near 0 indicate agreement no better than random and 1.0 a perfect match.
ari_score = adjusted_rand_score(true_labels, predicted_clusters)
print(f"Adjusted Rand Index (ARI): {ari_score}")

Adjusted Rand Index (ARI): 0.03809044659587082


In [None]:
from sklearn.metrics import silhouette_score

# Bring the embeddings to a NumPy array whatever form they arrive in:
# a torch tensor is detached from autograd and moved to the CPU first,
# anything else (e.g. an ndarray or nested list) goes through np.asarray.
if isinstance(embeddings, torch.Tensor):
    X = embeddings.detach().cpu().numpy()
else:
    X = np.asarray(embeddings)

# Fit KMeans for every candidate K in 6..24 and record the silhouette score
# of the resulting partition (computed on the same matrix X).
silhouette_by_k = {}
for k in tqdm(range(6, 25)):
    km_model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    y = km_model.fit_predict(X)
    silhouette_by_k[k] = silhouette_score(X, y)

# Collect the scores once into a frame indexed by K with a single 'SIL' column
cluster_results_km = pd.DataFrame(
    {'SIL': pd.Series(silhouette_by_k, dtype=float)}
).rename_axis('K')

# The K with the highest silhouette score wins
best_k = cluster_results_km['SIL'].idxmax()
best_silhouette_score = cluster_results_km['SIL'].max()

print(f"Best K: {best_k}, Best Silhouette Score: {best_silhouette_score}")

100%|██████████| 19/19 [00:06<00:00,  2.74it/s]

Best K: 7, Best Silhouette Score: 0.1515936702489853



