In [1]:
import json
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import torch
from transformers import RobertaTokenizer, RobertaModel

  from .autonotebook import tqdm as notebook_tqdm


### For Macs

In [2]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


### Defining Functions

In [3]:

# Function to generate embeddings
def generate_embeddings(texts, tokenizer, model, batch_size=8):
    """
    Generate embeddings for a list of texts using DistilRoBERTa with mean pooling.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model = model.to(device)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device

        with torch.no_grad():
            outputs = model(**inputs)
            # Use mean pooling for sentence representation
            # Exclude padding tokens by creating a mask
            attention_mask = inputs['attention_mask']
            token_embeddings = outputs.last_hidden_state  # Shape: (batch_size, seq_length, hidden_size)

            # Compute mean of embeddings, taking into account the attention mask
            masked_embeddings = token_embeddings * attention_mask.unsqueeze(-1)  # Shape: (batch_size, seq_length, hidden_size)
            # Sum embeddings and count non-padding tokens
            sum_embeddings = masked_embeddings.sum(dim=1)  # Shape: (batch_size, hidden_size)
            count_embeddings = attention_mask.sum(dim=1)  # Shape: (batch_size)

            # Avoid division by zero and compute mean
            mean_embeddings = sum_embeddings / count_embeddings.unsqueeze(-1).clamp(min=1e-9)  # Shape: (batch_size, hidden_size)
            embeddings.append(mean_embeddings)

    return torch.cat(embeddings, dim=0)

def reduce_dimensionality(embeddings, n_components=50):
    """
    Reduce the dimensionality of embeddings using PCA.
    """
    embeddings_np = embeddings.cpu().numpy()  # Move tensor to CPU and convert to NumPy
    pca = PCA(n_components=n_components)
    reduced_embeddings = pca.fit_transform(embeddings_np)
    return reduced_embeddings

def perform_clustering(embeddings, n_clusters=5):
    """
    Cluster embeddings using KMeans.
    """
    reduced_embeddings = reduce_dimensionality(embeddings)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(reduced_embeddings)
    return clusters


### Loading dataset

In [4]:
def get_data(path):
    df = pd.DataFrame()
    news = []
    labels = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                # Specify the encoding when opening the file
                with open(os.path.join(root, file), 'r', encoding='latin1') as f:
                    content = f.read()
                    articles = content.strip().split('\n\n')
                    for article in articles:
                        clean_article = article.strip()
                        if clean_article:  # Ensure non-empty articles
                            news.append(clean_article)
                            labels.append(os.path.basename(root))
    df = pd.DataFrame({'text': news, 'label': labels})
    return df

path = '../datasets/news'
# Get list of pathnames & labels
df = get_data(path)


In [5]:
len(df)

291086

In [6]:
# Taking only first 1000 for demo 
df = df.head(1000)

In [7]:
len(df)

1000

### Creating Embeddings

In [8]:
# Initialize distilroberta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')
# Generate embeddings
print("Generating embeddings...")
embeddings = generate_embeddings(df['text'].tolist(), tokenizer, model)
print("Embeddings generated!")


Generating embeddings...
Embeddings generated!


### Clustering 

In [9]:
print("Reducing dimensionality and clustering...")
clusters = perform_clustering(embeddings)
df['cluster'] = clusters
print("Clustering complete!")

Reducing dimensionality and clustering...
Clustering complete!


In [10]:

# Visualizing or interpreting the clusters
print("Cluster distribution:")
print(df['cluster'].value_counts())

# Save results to a CSV for further analysis
df[['text', 'cluster']].to_csv('../outputs/amazon_reviews/clustered_reviews.csv', index=False)
print("Results saved to clustered_reviews.csv")

Cluster distribution:
cluster
0    271
4    256
2    244
3    144
1     85
Name: count, dtype: int64
Results saved to clustered_reviews.csv
