# Load datasets, clean, and split

In [36]:
from datasets import load_dataset
ds = load_dataset("mteb/amazon_polarity", cache_dir="caches/")
train_ds = ds["train"]
test_ds = ds["test"]

In [37]:
import re
def clean_text(row):
    text = row["text"]
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    # Remove non-printable characters
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    # Replace multiple spaces/newlines with single space
    text = re.sub(r"\s+", " ", text)
    # Optionally lowercase
    text = text.strip()  # Don't lowercase if case matters
    return {
        "text": text
    }

In [38]:
train_valid = train_ds.train_test_split(test_size=0.01, seed=42)
test_valid = test_ds.train_test_split(test_size=0.01, seed=42)

train_ds_reduced = train_valid["test"]
test_ds_reduced = test_valid["test"]

train_ds_reduced = train_ds_reduced.map(clean_text)
test_ds_reduced = test_ds_reduced.map(clean_text)

print(f"Train DS length: {len(train_ds_reduced)}")
print(f"Test DS length: {len(test_ds_reduced)}")

Train DS length: 36000
Test DS length: 4000


In [39]:
train_ds_reduced = train_ds_reduced.shuffle(seed=42).select(range(4281))

In [23]:
train_ds_reduced

Dataset({
    features: ['label', 'text', 'label_text'],
    num_rows: 1000
})

In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", cache_dir="caches/", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="caches/")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [41]:
def get_tokens(batch):
  tokens =  tokenizer(
    batch["text"],
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=256
  )
  return tokens

In [42]:
tokenized_dataset = train_ds_reduced.map(get_tokens, batch_size=16, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map: 100%|██████████| 4281/4281 [00:00<00:00, 6861.68 examples/s]


In [43]:
final_dataset_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

final_dataset_train = final_dataset_split["train"]
final_dataset_test = final_dataset_split["test"]

In [44]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=default_data_collator)

for batch in dl:
    print(batch['input_ids'].shape)  # ❌ Likely to crash or be malformed
    break

torch.Size([4, 256])


In [45]:
from gc import callbacks
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=10,
  per_device_train_batch_size=8,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=50,
  load_best_model_at_end=True,
  metric_for_best_model="eval_accuracy",
  greater_is_better=True,
  eval_strategy="epoch",
  warmup_ratio=0.1,
  weight_decay=0.01,
  learning_rate=3e-5,
  lr_scheduler_type="linear",
)

In [46]:
from transformers import Trainer, default_data_collator
from torch.optim import AdamW
import evaluate
import numpy as np
from transformers import EarlyStoppingCallback, Trainer
accuracy_metric = evaluate.load("accuracy")
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )

    acc = accuracy_score(labels, preds)

    return {
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": f1,
    }
  
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=final_dataset_train,
  eval_dataset=final_dataset_test,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
  optimizers=(AdamW(model.parameters(), lr=5e-5), None),
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01)]
)

In [47]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2985,0.29469,0.897436,0.942105,0.84434,0.890547
2,0.224,0.362331,0.90676,0.943299,0.863208,0.901478
3,0.0517,0.469772,0.90676,0.887387,0.929245,0.907834




TrainOutput(global_step=1446, training_loss=0.23768880027623263, metrics={'train_runtime': 1006.0304, 'train_samples_per_second': 38.289, 'train_steps_per_second': 4.791, 'total_flos': 1520255677870080.0, 'train_loss': 0.23768880027623263, 'epoch': 3.0})

In [48]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)
from transformers import Trainer
import pandas as pd
CLASS_NAMES = ["negative", "positive"]  # adjust if you have more

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # overall (macro) metrics
    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # per-class metrics
    per_class = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)
    p_cls, r_cls, f1_cls, support_cls = per_class

    # Flatten per-class metrics into scalars in the returned dict
    metrics = {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
    }
    for idx, name in enumerate(CLASS_NAMES):
        metrics[f"{name}_precision"] = p_cls[idx]
        metrics[f"{name}_recall"]    = r_cls[idx]
        metrics[f"{name}_f1"]        = f1_cls[idx]
        metrics[f"{name}_support"]   = support_cls[idx]

    return metrics
test_ds_reduced = test_ds_reduced.map(get_tokens, batch_size=16, batched=True)
if "labels" not in test_ds_reduced.column_names:
    test_ds_reduced = test_ds_reduced.rename_column("label", "labels")

evaluation_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_ds_reduced,
    data_collator=default_data_collator,
    optimizers=(AdamW(model.parameters(), lr=5e-5), None),
    compute_metrics=compute_metrics,
)
metrics = evaluation_trainer.evaluate()
pred_out = evaluation_trainer.predict(test_ds_reduced)

logits = torch.tensor(pred_out.predictions, dtype=torch.float32)
probs = torch.nn.functional.softmax(logits, dim=1) 
preds  = pred_out.predictions.argmax(axis=1)
labels = pred_out.label_ids



print("\n=== Confusion Matrix ===")
print(confusion_matrix(labels, preds))

print("\n=== Classification Report ===")
print(classification_report(labels, preds, target_names=CLASS_NAMES, digits=4))

print("=== Scalar metrics returned by Trainer ===")
for k, v in metrics.items():
    print(f"{k:20s}: {v:.4f}")

manual_metrics = compute_metrics((logits.numpy(), labels))

# === Add clean row ===
row = {
    "eval_loss": pred_out.metrics.get("test_loss", None),
    "eval_accuracy": manual_metrics["accuracy"],
    "eval_macro_f1": manual_metrics["macro_f1"],
    "eval_macro_precision": manual_metrics["macro_precision"],
    "eval_macro_recall": manual_metrics["macro_recall"],
    "eval_negative_precision": manual_metrics["negative_precision"],
    "eval_negative_recall": manual_metrics["negative_recall"],
    "eval_negative_f1": manual_metrics["negative_f1"],
    "eval_negative_support": manual_metrics["negative_support"],
    "eval_positive_precision": manual_metrics["positive_precision"],
    "eval_positive_recall": manual_metrics["positive_recall"],
    "eval_positive_f1": manual_metrics["positive_f1"],
    "eval_positive_support": manual_metrics["positive_support"],
    "data_amount": len(train_ds_reduced),
    "method": "Randomized",  # or "Randomized", etc.
}

df = pd.DataFrame([row])
print("\n=== Cleaned DataFrame Row ===")
print(df)







=== Confusion Matrix ===
[[1893   69]
 [ 259 1779]]

=== Classification Report ===
              precision    recall  f1-score   support

    negative     0.8796    0.9648    0.9203      1962
    positive     0.9627    0.8729    0.9156      2038

    accuracy                         0.9180      4000
   macro avg     0.9212    0.9189    0.9179      4000
weighted avg     0.9219    0.9180    0.9179      4000

=== Scalar metrics returned by Trainer ===
eval_loss           : 0.3156
eval_model_preparation_time: 0.0008
eval_accuracy       : 0.9180
eval_macro_f1       : 0.9179
eval_macro_precision: 0.9212
eval_macro_recall   : 0.9189
eval_negative_precision: 0.8796
eval_negative_recall: 0.9648
eval_negative_f1    : 0.9203
eval_negative_support: 1962.0000
eval_positive_precision: 0.9627
eval_positive_recall: 0.8729
eval_positive_f1    : 0.9156
eval_positive_support: 2038.0000
eval_runtime        : 90.7454
eval_samples_per_second: 44.0790
eval_steps_per_second: 5.5100

=== Cleaned DataFrame Row

In [49]:
excel_path = "results.xlsx"
sheet_name = "results"

# Load existing or create new
try:
    existing_df = pd.read_excel(excel_path, sheet_name=sheet_name)
    df = pd.concat([existing_df, df], ignore_index=True)
except FileNotFoundError:
    pass

# Save back
df.to_excel(excel_path, index=False, sheet_name=sheet_name)

In [51]:
model.save_pretrained("save/model/randomized_p1_5000")
tokenizer.save_pretrained("save/tokenizer/randomized_p1_5000")
print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.


In [None]:
from datasets import Dataset
import numpy as np

# Assume train_ds is your full unshuffled dataset
train_size = len(train_ds)
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(train_size)

# Get first 5000
selected_indices = shuffled_indices[:5000]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from sklearn.preprocessing import RobustScaler
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# === Step 0: Recover selected indices ===
# Assuming you originally had: train_ds.shuffle(seed=42).select(range(5000))
# And mean_pooled_bert_embeddings are computed from the full train_ds before shuffle

full_dataset_size = len(mean_pooled_bert_embeddings)
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(full_dataset_size)
selected_indices = shuffled_indices[:1000]

# === Step 1: Clustering ===
scaler = RobustScaler()
label_umap = UMAP(n_components=50, random_state=42)
visual_umap = UMAP(n_components=2, random_state=42)
clusterer = HDBSCAN(min_cluster_size=200)

mean_pooled_bert_embeddings = np.load("bert_mean_pooled_embeddings.npy")
label_embeddings = label_umap.fit_transform(mean_pooled_bert_embeddings)
label_embeddings = visual_umap.fit_transform(label_embeddings)
label_embeddings = scaler.fit_transform(label_embeddings)

labels = clusterer.fit_predict(label_embeddings)

# === Step 2: Plot all points (faded) and selected points (highlighted) ===
X_umap = label_embeddings  # Already 2D, no need to recompute

plt.figure(figsize=(12, 10))
palette = sns.color_palette("colorblind", n_colors=len(set(labels)))

# Plot all samples with cluster labels, low alpha
sns.scatterplot(
    x=X_umap[:, 0], y=X_umap[:, 1],
    hue=labels,
    palette=palette,
    legend='full',
    alpha=0.3,
    s=30
)

# Highlight selected samples
selected_X = X_umap[selected_indices]
selected_labels = labels[selected_indices]
sns.scatterplot(
    x=selected_X[:, 0], y=selected_X[:, 1],
    hue=selected_labels,
    palette=palette,
    edgecolor="black",
    linewidth=0.6,
    alpha=1.0,
    s=70,
    legend=False  # Suppress duplicate legend
)

plt.title("HDBSCAN Clusters with Selected Training Samples Highlighted")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()