In [1]:
import json
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import torch
from transformers import RobertaTokenizer, RobertaModel
import tools as tl

  from .autonotebook import tqdm as notebook_tqdm


### For Mac: check that the MPS (Apple GPU) backend is available

In [2]:
# Probe PyTorch's MPS backend; when it is available, allocate a one-element
# tensor on that device and print it as a quick smoke test.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


### Defining Functions

In [3]:
def read_json_array(path):
    """Load a line-delimited JSON file into a list of decoded records.

    Despite the name, the file is not parsed as one JSON array: every
    non-blank line is decoded on its own with ``json.loads``.

    Args:
        path: Location of the file to read.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Blank or whitespace-only lines (e.g. extra newlines at the end of
            # the file) would make json.loads raise, so skip them.
            if not stripped:
                continue
            records.append(json.loads(stripped))
    return records

### Loading dataset

In [4]:
# Build the review and metadata file paths under the dataset folder.
# NOTE(review): meta_path is not used by any cell visible here — confirm it is needed.
dataset_path = '../datasets/amazon_reviews/'
data_path, meta_path = (
    dataset_path + file_name
    for file_name in ("Musical_Instruments.json", "meta_Musical_Instruments.json")
)

# Parse the review records, then expose the review body under a generic
# 'text' column, which the later cells read.
data = read_json_array(data_path)
df = pd.DataFrame(data).assign(text=lambda frame: frame['reviewText'])

In [5]:
# Keep only the leading 1000 rows so the demo runs quickly
df = df.iloc[:1000]

In [6]:
# Sanity check: number of rows kept after subsampling
df.shape[0]

1000

### Creating Embeddings

In [7]:
# Load the pretrained tokenizer and model from the same checkpoint
checkpoint = 'distilroberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

# Hand the review texts to the project helper in `tools`, which returns the embeddings
print("Generating embeddings...")
review_texts = df['text'].tolist()
embeddings = tl.generate_embeddings(review_texts, tokenizer, model)
print("Embeddings generated!")


Generating embeddings...


Generating Embeddings: 100%|██████████| 63/63 [00:28<00:00,  2.24batch/s]

Embeddings generated!





### Clustering 

In [None]:
# Run the project clustering helper on the embeddings and store the values it
# returns in a 'cluster' column.
# NOTE(review): the saved value_counts output further down lists only clusters
# 0-4, which does not match n_clusters=15 — presumably stale output; re-run to confirm.
n_clusters = 15
print("Reducing dimensionality and clustering...")
clusters = tl.perform_clustering(embeddings, algo='kmeans', n_clusters=n_clusters)
df['cluster'] = clusters
print("Clustering complete!")

Reducing dimensionality and clustering...
Clustering complete!


In [9]:

# Show how many reviews fall into each cluster
print("Cluster distribution:")
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)

# Write the text/cluster pairs to CSV (without the index) for later analysis
output_csv = '../outputs/amazon_reviews/clustered_reviews.csv'
clustered_reviews = df[['text', 'cluster']]
clustered_reviews.to_csv(output_csv, index=False)
print("Results saved to clustered_reviews.csv")

Cluster distribution:
cluster
0    436
1    220
2    192
4     78
3     74
Name: count, dtype: int64
Results saved to clustered_reviews.csv
