# Load datasets, clean, and split

In [1]:
from datasets import load_dataset
ds = load_dataset("mteb/amazon_polarity", cache_dir="caches/")
train_ds = ds["train"]
test_ds = ds["test"]

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
import re
def clean_text(row):
    text = row["text"]
    # Remove HTML tags
    text = re.sub(r"<.*?>", " ", text)
    # Remove non-printable characters
    text = re.sub(r"[\x00-\x1F\x7F]", " ", text)
    # Replace multiple spaces/newlines with single space
    text = re.sub(r"\s+", " ", text)
    # Optionally lowercase
    text = text.strip()  # Don't lowercase if case matters
    return {
        "text": text
    }

In [30]:
train_valid = train_ds.train_test_split(test_size=0.01, seed=42)
test_valid = test_ds.train_test_split(test_size=0.01, seed=42)

train_ds_reduced = train_valid["test"]
test_ds_reduced = test_valid["test"]

train_ds_reduced = train_ds_reduced.map(clean_text)
test_ds_reduced = test_ds_reduced.map(clean_text)

print(f"Train DS length: {len(train_ds_reduced)}")
print(f"Test DS length: {len(test_ds_reduced)}")

Train DS length: 36000
Test DS length: 4000


In [31]:
from datasets import concatenate_datasets
import math
def select_stratified(dataset, num_samples):
  concat_ds = []
  positive_ds = dataset.filter(lambda x: x["label"] == 1)
  negative_ds = dataset.filter(lambda x: x["label"] == 0)
  
  positive_count = len(positive_ds)
  negative_count = len(negative_ds)
  
  print(f"Positive samples: {positive_count}, Negative samples: {negative_count}")
  
  total = len(dataset)
  positive_ratio = positive_count / total if total > 0 else 0
  negative_ratio = negative_count / total if total > 0 else 0
  
  positive_samples = math.ceil(num_samples * positive_ratio) if math.ceil(num_samples * positive_ratio) < positive_count else positive_count
  negative_samples = math.ceil(num_samples * negative_ratio) if math.ceil(num_samples * negative_ratio) < negative_count else negative_count
  
  positive_subset = positive_ds.shuffle(seed=42).select(range(positive_samples))
  negative_subset = negative_ds.shuffle(seed=42).select(range(negative_samples))
  
  concat_ds.append(positive_subset)
  concat_ds.append(negative_subset)
  print(f"Ratio - Positive: {positive_ratio:.2f}, Negative: {negative_ratio:.2f}")
  print(f"Selected {positive_samples} positive samples and {negative_samples} negative samples.")
  
  
  return concatenate_datasets(concat_ds)

In [32]:
train_ds_reduced = select_stratified(train_ds_reduced, 4285)
train_ds_reduced

Positive samples: 18089, Negative samples: 17911
Ratio - Positive: 0.50, Negative: 0.50
Selected 2154 positive samples and 2132 negative samples.


Dataset({
    features: ['label', 'text', 'label_text'],
    num_rows: 4286
})

In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", cache_dir="caches/", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="caches/")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [34]:
def get_tokens(batch):
  tokens =  tokenizer(
    batch["text"],
    truncation=True,
    padding="max_length",
    return_tensors="pt",
    max_length=256
  )
  return tokens

In [35]:
tokenized_dataset = train_ds_reduced.map(get_tokens, batch_size=16, batched=True)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map: 100%|██████████| 4286/4286 [00:00<00:00, 6545.71 examples/s]


In [37]:
final_dataset_split = tokenized_dataset.train_test_split(test_size=0.15, seed=42)

final_dataset_train = final_dataset_split["train"]
final_dataset_test = final_dataset_split["test"]

In [38]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=default_data_collator)

for batch in dl:
    print(batch['input_ids'].shape)  # ❌ Likely to crash or be malformed
    break

torch.Size([4, 256])


In [39]:
from gc import callbacks
from transformers import TrainingArguments

training_args = TrainingArguments(
  num_train_epochs=3,
  per_device_train_batch_size=8,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=50,
  load_best_model_at_end=True,
  metric_for_best_model="eval_accuracy",
  greater_is_better=True,
  eval_strategy="epoch",
  warmup_ratio=0.1,
  weight_decay=0.01,
  learning_rate=3e-5,
  lr_scheduler_type="linear",
)

In [40]:
from transformers import Trainer, default_data_collator
from torch.optim import AdamW
import evaluate
import numpy as np
from transformers import EarlyStoppingCallback, Trainer
accuracy_metric = evaluate.load("accuracy")
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )

    acc = accuracy_score(labels, preds)

    return {
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": f1,
    }
  
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=final_dataset_train,
  eval_dataset=final_dataset_test,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
  optimizers=(AdamW(model.parameters(), lr=5e-5), None),
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01)]
)

In [41]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2583,0.355544,0.909798,0.94898,0.86646,0.905844
2,0.0933,0.386014,0.92224,0.919753,0.925466,0.922601
3,0.0159,0.387572,0.92535,0.930818,0.919255,0.925




TrainOutput(global_step=1368, training_loss=0.20423141235148, metrics={'train_runtime': 1097.2711, 'train_samples_per_second': 9.96, 'train_steps_per_second': 1.247, 'total_flos': 1437770362014720.0, 'train_loss': 0.20423141235148, 'epoch': 3.0})

In [42]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)
from transformers import Trainer
import pandas as pd
CLASS_NAMES = ["negative", "positive"]  # adjust if you have more

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # overall (macro) metrics
    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # per-class metrics
    per_class = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)
    p_cls, r_cls, f1_cls, support_cls = per_class

    # Flatten per-class metrics into scalars in the returned dict
    metrics = {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
    }
    for idx, name in enumerate(CLASS_NAMES):
        metrics[f"{name}_precision"] = p_cls[idx]
        metrics[f"{name}_recall"]    = r_cls[idx]
        metrics[f"{name}_f1"]        = f1_cls[idx]
        metrics[f"{name}_support"]   = support_cls[idx]

    return metrics
test_ds_reduced = test_ds_reduced.map(get_tokens, batch_size=16, batched=True)
if "labels" not in test_ds_reduced.column_names:
    test_ds_reduced = test_ds_reduced.rename_column("label", "labels")

evaluation_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_ds_reduced,
    data_collator=default_data_collator,
    optimizers=(AdamW(model.parameters(), lr=5e-5), None),
    compute_metrics=compute_metrics,
)
metrics = evaluation_trainer.evaluate()
pred_out = evaluation_trainer.predict(test_ds_reduced)

logits = torch.tensor(pred_out.predictions, dtype=torch.float32)
probs = torch.nn.functional.softmax(logits, dim=1) 
preds  = pred_out.predictions.argmax(axis=1)
labels = pred_out.label_ids



print("\n=== Confusion Matrix ===")
print(confusion_matrix(labels, preds))

print("\n=== Classification Report ===")
print(classification_report(labels, preds, target_names=CLASS_NAMES, digits=4))

print("=== Scalar metrics returned by Trainer ===")
for k, v in metrics.items():
    print(f"{k:20s}: {v:.4f}")

manual_metrics = compute_metrics((logits.numpy(), labels))

# === Add clean row ===
row = {
    "method": "Randomized",
    "stage": 1,
    "data_amount": len(train_ds_reduced),
    "eval_loss": pred_out.metrics.get("test_loss", None),
    "eval_accuracy": manual_metrics["accuracy"],
    "eval_macro_precision": manual_metrics["macro_precision"],
    "eval_macro_recall": manual_metrics["macro_recall"],
    "eval_macro_f1": manual_metrics["macro_f1"],
    "eval_negative_precision": manual_metrics["negative_precision"],
    "eval_negative_recall": manual_metrics["negative_recall"],
    "eval_negative_f1": manual_metrics["negative_f1"],
    "eval_positive_precision": manual_metrics["positive_precision"],
    "eval_positive_recall": manual_metrics["positive_recall"],
    "eval_positive_f1": manual_metrics["positive_f1"],
}

df = pd.DataFrame([row])
print("\n=== Cleaned DataFrame Row ===")
print(df)







=== Confusion Matrix ===
[[1835  127]
 [ 145 1893]]

=== Classification Report ===
              precision    recall  f1-score   support

    negative     0.9268    0.9353    0.9310      1962
    positive     0.9371    0.9289    0.9330      2038

    accuracy                         0.9320      4000
   macro avg     0.9319    0.9321    0.9320      4000
weighted avg     0.9320    0.9320    0.9320      4000

=== Scalar metrics returned by Trainer ===
eval_loss           : 0.3480
eval_model_preparation_time: 0.0011
eval_accuracy       : 0.9320
eval_macro_f1       : 0.9320
eval_macro_precision: 0.9319
eval_macro_recall   : 0.9321
eval_negative_precision: 0.9268
eval_negative_recall: 0.9353
eval_negative_f1    : 0.9310
eval_negative_support: 1962.0000
eval_positive_precision: 0.9371
eval_positive_recall: 0.9289
eval_positive_f1    : 0.9330
eval_positive_support: 2038.0000
eval_runtime        : 96.5473
eval_samples_per_second: 41.4300
eval_steps_per_second: 5.1790

=== Cleaned DataFrame Row

In [43]:
excel_path = "results.xlsx"
sheet_name = "results"

# Load existing or create new
try:
    existing_df = pd.read_excel(excel_path, sheet_name=sheet_name)
    df = pd.concat([existing_df, df], ignore_index=True)
except FileNotFoundError:
    pass

# Save back
df.to_excel(excel_path, index=False, sheet_name=sheet_name)

In [44]:
model.save_pretrained("save/model/randomized_p1_5000")
tokenizer.save_pretrained("save/tokenizer/randomized_p1_5000")
print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.


In [None]:
from datasets import Dataset
import numpy as np

# Assume train_ds is your full unshuffled dataset
train_size = len(train_ds)
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(train_size)

# Get first 5000
selected_indices = shuffled_indices[:5000]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from sklearn.preprocessing import RobustScaler
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# === Step 0: Recover selected indices ===
# Assuming you originally had: train_ds.shuffle(seed=42).select(range(5000))
# And mean_pooled_bert_embeddings are computed from the full train_ds before shuffle

full_dataset_size = len(mean_pooled_bert_embeddings)
rng = np.random.default_rng(seed=42)
shuffled_indices = rng.permutation(full_dataset_size)
selected_indices = shuffled_indices[:1000]

# === Step 1: Clustering ===
scaler = RobustScaler()
label_umap = UMAP(n_components=50, random_state=42)
visual_umap = UMAP(n_components=2, random_state=42)
clusterer = HDBSCAN(min_cluster_size=200)

mean_pooled_bert_embeddings = np.load("bert_mean_pooled_embeddings.npy")
label_embeddings = label_umap.fit_transform(mean_pooled_bert_embeddings)
label_embeddings = visual_umap.fit_transform(label_embeddings)
label_embeddings = scaler.fit_transform(label_embeddings)

labels = clusterer.fit_predict(label_embeddings)

# === Step 2: Plot all points (faded) and selected points (highlighted) ===
X_umap = label_embeddings  # Already 2D, no need to recompute

plt.figure(figsize=(12, 10))
palette = sns.color_palette("colorblind", n_colors=len(set(labels)))

# Plot all samples with cluster labels, low alpha
sns.scatterplot(
    x=X_umap[:, 0], y=X_umap[:, 1],
    hue=labels,
    palette=palette,
    legend='full',
    alpha=0.3,
    s=30
)

# Highlight selected samples
selected_X = X_umap[selected_indices]
selected_labels = labels[selected_indices]
sns.scatterplot(
    x=selected_X[:, 0], y=selected_X[:, 1],
    hue=selected_labels,
    palette=palette,
    edgecolor="black",
    linewidth=0.6,
    alpha=1.0,
    s=70,
    legend=False  # Suppress duplicate legend
)

plt.title("HDBSCAN Clusters with Selected Training Samples Highlighted")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()