In [None]:
!pip install transformers datasets evaluate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install datasets --upgrade

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
from datasets import load_dataset

# Load the "sms_spam" dataset and rename its "label" column to "labels",
# the column name the later cells of this notebook refer to.
dataset = load_dataset("sms_spam").rename_column("label", "labels")

In [None]:
# Display the loaded dataset object to check its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sms', 'labels'],
        num_rows: 5959
    })
    test: Dataset({
        features: ['sms', 'labels'],
        num_rows: 1490
    })
})

In [None]:
# Preview the 80/20 hold-out split WITHOUT rebinding `dataset`.
# The balancing cell below performs the same split (same test_size and seed);
# assigning the result here as well would split the data twice on a top-to-bottom
# run, shrinking the training data and silently discarding the first test split.
dataset["train"].train_test_split(test_size=0.2, seed=42)

In [None]:
from datasets import concatenate_datasets
from random import choices

# Split once into train / held-out test. The guard keeps this cell idempotent:
# if `dataset` already has a "test" split (an earlier cell created it, or this
# cell is being re-run), splitting again would shrink the training data and
# throw away the existing held-out set.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Balance ONLY the training split; the test split is left exactly as it is.
train_set = dataset["train"]
spam = train_set.filter(lambda x: x["labels"] == 1)
ham = train_set.filter(lambda x: x["labels"] == 0)

n_ham, n_spam = len(ham), len(spam)
if n_spam == 0:
    # Nothing to replicate: fail with a clear message instead of a division error.
    raise ValueError("Training split contains no spam examples; cannot upsample.")

if n_spam >= n_ham:
    # Spam is already at least as frequent as ham, so no replication is needed.
    upsampled_spam = spam
else:
    # Repeat the spam rows as many whole times as fit, then top up with a
    # seeded random subset so both classes end up with exactly n_ham rows.
    full_copies, remainder = divmod(n_ham, n_spam)
    pieces = [spam] * full_copies
    if remainder:
        pieces.append(spam.shuffle(seed=42).select(range(remainder)))
    upsampled_spam = concatenate_datasets(pieces)

balanced_train = concatenate_datasets([ham, upsampled_spam]).shuffle(seed=42)

# Replace only the training split; dataset["test"] is not modified.
dataset["train"] = balanced_train


Filter:   0%|          | 0/4459 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4459 [00:00<?, ? examples/s]

In [None]:
# Show the training rows whose label is 0 (the rows the balancing cell calls "ham").
dataset["train"].filter(lambda row: row["labels"] == 0)

Filter:   0%|          | 0/5959 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'labels'],
    num_rows: 3113
})

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the tokenizer is loaded from it here and
# the same name is reused later in the notebook.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "sms" text of ``example`` with the notebook's tokenizer.

    Truncation is enabled (``truncation=True``); no padding argument is passed.

    Args:
        example: Mapping with an "sms" entry holding the text(s) to encode.

    Returns:
        The object returned by ``tokenizer(...)`` for those texts.
    """
    encoded = tokenizer(example["sms"], truncation=True)
    return encoded


# Apply the tokenizer to every split, passing rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5959 [00:00<?, ? examples/s]

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

In [None]:
# Switch the tokenized splits to the "torch" format, restricted to these columns.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized_datasets.set_format(type="torch", columns=model_columns)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint as a sequence classifier with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
import evaluate

# Accuracy metric object used by `compute_metrics` below.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    Args:
        eval_pred: Pair unpacked as ``(logits, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the arg-max (over the last axis)
        of the logits against the labels.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    return accuracy.compute(predictions=predicted, references=references)

# Training configuration collected in one mapping, then passed to TrainingArguments.
training_config = dict(
    output_dir="bert-finetuned-spam",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Logging settings; report_to="none" is passed through unchanged.
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# Assemble the Trainer from the model, its arguments, the tokenized splits and
# the metric callback defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `tokenizer=` is deprecated on Trainer and emits a warning;
    # `processing_class=` is its replacement and receives the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning and show the value returned by `train()` as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0004,0.013315,0.997315
2,0.0001,0.020788,0.995973
3,0.0001,0.02137,0.995973


TrainOutput(global_step=1119, training_loss=0.01941134078657859, metrics={'train_runtime': 302.5653, 'train_samples_per_second': 59.085, 'train_steps_per_second': 3.698, 'total_flos': 587378986650240.0, 'train_loss': 0.01941134078657859, 'epoch': 3.0})

In [None]:
# Write the trained model and the tokenizer files to the same directory.
final_dir = "bert-spam-ham-final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

('bert-spam-ham-final/tokenizer_config.json',
 'bert-spam-ham-final/special_tokens_map.json',
 'bert-spam-ham-final/vocab.txt',
 'bert-spam-ham-final/added_tokens.json',
 'bert-spam-ham-final/tokenizer.json')

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved directory and try one message.
saved_dir = "bert-spam-ham-final"
clf = pipeline("text-classification", model=saved_dir, tokenizer=saved_dir)

smoke_message = "Hey, are we still meeting today?"
print(clf(smoke_message))

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9999207258224487}]


In [None]:
# Run the trainer on the test split and print the shapes of the returned
# predictions and label ids.
predictions = trainer.predict(tokenized_datasets["test"])
logits_shape = predictions.predictions.shape
labels_shape = predictions.label_ids.shape
print(logits_shape, labels_shape)

(1115, 2) (1115,)


In [None]:
import numpy as np

# Predicted class per row: index of the largest value along the last axis.
preds = predictions.predictions.argmax(axis=-1)

In [None]:
import evaluate

# Score the test predictions with task-agnostic classification metrics.
# The GLUE/MRPC metric belongs to a paraphrase benchmark, not to this
# spam/ham task. The combined metric still reports "accuracy" and "f1", and
# adds "precision" and "recall".
metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
metric.compute(predictions=preds, references=predictions.label_ids)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.9937219730941704, 'f1': 0.9761092150170648}

In [None]:
from transformers import pipeline

# Rebuild the pipeline from the saved directory and classify two example messages.
model_dir = "bert-spam-ham-final"
clf = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

for message in (
    "You’ve won a free boat!",
    "Hey, can we reschedule the meeting to 3 PM?",
):
    print(clf(message))


Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9920870661735535}]
[{'label': 'LABEL_0', 'score': 0.9991400241851807}]


In [None]:
from collections import Counter
# Count how many training rows carry each label value.
train_label_counts = Counter(dataset["train"]["labels"])
print(train_label_counts)


Counter({0: 3861, 1: 598})


In [None]:
# Readable names for the label strings printed by the pipeline cells.
label_map = dict(LABEL_1="SPAM", LABEL_0="HAM")


In [None]:
# Example messages to run through the classifier.
samples = [
    "Win a FREE car now by clicking here!",
    "URGENT! You’ve won $1000. Call now.",
    "Hey, meeting moved to 4 PM.",
    "Can you send the report by EOD?",
]

# Print each message with its mapped label and the score to two decimals.
for message in samples:
    top = clf(message)[0]
    verdict = label_map[top["label"]]
    confidence = top["score"]
    print(f"{message} → {verdict} ({confidence:.2f})")


Win a FREE car now by clicking here! → SPAM (1.00)
URGENT! You’ve won $1000. Call now. → HAM (0.99)
Hey, meeting moved to 4 PM. → HAM (1.00)
Can you send the report by EOD? → HAM (1.00)
