In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
import torch
import umap
from sklearn.svm import SVC
import tools as tl 

In [3]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


### Loading Dataset

In [4]:
# Load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

df_train = pd.DataFrame({'text': newsgroups_train.data, 'label': newsgroups_train.target})
df_test = pd.DataFrame({'text': newsgroups_test.data, 'label': newsgroups_test.target})

df_train['label'] = df_train['label'].map(lambda x: newsgroups_train.target_names[x])
df_test['label'] = df_test['label'].map(lambda x: newsgroups_test.target_names[x])

df_train = df_train[df_train['label'].isin(['rec.sport.baseball', 'comp.graphics', 'sci.space', 'talk.religion.misc'])]
df_test = df_test[df_test['label'].isin(['rec.sport.baseball', 'comp.graphics', 'sci.space', 'talk.religion.misc'])]


In [5]:
len(df_train), len(df_test)

(2151, 1431)

In [6]:
df_train = df_train.sample(300)
df_test = df_test.sample(100)
len(df_train), len(df_test)

(300, 100)

### Embedding text

In [7]:
# Initialize distilroberta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')
# Generate embeddings
print("Generating embeddings...")
embeddings_train = tl.generate_embeddings(df_train['text'].tolist(), tokenizer, model)
embeddings_test = tl.generate_embeddings(df_test['text'].tolist(), tokenizer, model)

embeddings_test = torch.tensor(embeddings_test)
embeddings_train = torch.tensor(embeddings_train)
print("Embeddings generated!")

Generating embeddings...


Generating Embeddings: 100%|██████████| 19/19 [00:16<00:00,  1.17batch/s]
Generating Embeddings: 100%|██████████| 7/7 [00:05<00:00,  1.31batch/s]

Embeddings generated!



  embeddings_test = torch.tensor(embeddings_test)
  embeddings_train = torch.tensor(embeddings_train)


### Reducing Dimenions + training SVM

In [28]:
print("Reducing dimensionality...")
reduction_algo = 'none'
reduced_embeddings_train = tl.reduce_dimensionality(embeddings_train, algo=reduction_algo, n_components=50)
reduced_embeddings_test = tl.reduce_dimensionality(embeddings_test, algo=reduction_algo, n_components=50)
print("Complete")

# Fit an SVM model to the reduced embeddings
svm_model = SVC(kernel='linear')
svm_model.fit(reduced_embeddings_train, df_train['label'])

# Predict the labels
predicted_labels = svm_model.predict(reduced_embeddings_test)

# Add the predicted labels to the dataframe
df_test['cluster'] = predicted_labels
df_test['embedding'] = reduced_embeddings_test.tolist()

df_train['embedding'] = reduced_embeddings_train.tolist()

Reducing dimensionality...
Complete


### Evaluation

In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision
precision = precision_score(df_test['label'], df_test['cluster'], average='weighted')
print(f"Precision: {precision * 100:.2f}%")

# Calculate recall
recall = recall_score(df_test['label'], df_test['cluster'], average='weighted')
print(f"Recall: {recall * 100:.2f}%")

# Calculate F1 score
f1 = f1_score(df_test['label'], df_test['cluster'], average='weighted')
print(f"F1 Score: {f1 * 100:.2f}%")

Precision: 82.80%
Recall: 81.00%
F1 Score: 81.22%


In [30]:
from sklearn.metrics import silhouette_score

X = df_train['embedding'].apply(lambda x: np.array(x)).to_list()
y = df_train['label']
ss = silhouette_score(X, y)
print(f"Silhouette Score: {ss}")

Silhouette Score: 0.011120393408278308
