# 1. Env Setup

In [1]:
!pip install -q transformers datasets torch scikit-learn

# 2. Import Libraries

In [3]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 3. Load Dataset

In [4]:
df = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print(df.sample(5))
print(f"Spam ratio: {df['label'].mean()*100:.2f}%")

      label                                               text
3427      0  \Hi darlin did youPhone me? Im atHome if youwa...
532       0                  Gudnite....tc...practice going on
1176      0  Did he say how fantastic I am by any chance, o...
330       0                             K.k:)apo k.good movie.
4769      0  CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA C...
Spam ratio: 13.41%


# 4. Split into train/test

In [5]:
train_df, test_df = train_test_split(df, test_size = 0.2, random_state=42)

train_df.shape, test_df.shape

((4457, 2), (1115, 2))

# 5. Convert Pandas -> HuggingFace Dataset

In [6]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# 6. Tokenization

In [7]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenizer_fn(batch):
    return tokenizer(batch['text'], truncation=True, padding="max_length", max_length=128)

train_dataset = train_dataset.map(tokenizer_fn, batched=True)
test_dataset = test_dataset.map(tokenizer_fn, batched=True)

Map:   0%|          | 0/4457 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

# 7. Format Dataset for Pytorch

In [8]:
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 8. Load Pretrained Model

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 9. Freeze Layers

In [10]:
for param in model.distilbert.transformer.layer[:-2]:
    for p in param.parameters():
        p.requires_grad = False

# 10. Define Evaluation Metrics

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# 11. Training Arguments

In [22]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir='./logs',
    logging_steps=50,
    report_to="none"
)

# 12. Trainer Setup

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


# 13. Train the Model

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0293,0.031549,0.993722,0.976271
2,0.0137,0.026409,0.993722,0.976431
3,0.0015,0.027513,0.993722,0.976431


TrainOutput(global_step=1674, training_loss=0.03071006675547917, metrics={'train_runtime': 97.0868, 'train_samples_per_second': 137.722, 'train_steps_per_second': 17.242, 'total_flos': 442805396857344.0, 'train_loss': 0.03071006675547917, 'epoch': 3.0})

# 14. Save Model

In [25]:
trainer.save_model("./spam-model")
tokenizer.save_pretrained("./spam-model")

('./spam-model/tokenizer_config.json',
 './spam-model/special_tokens_map.json',
 './spam-model/vocab.txt',
 './spam-model/added_tokens.json',
 './spam-model/tokenizer.json')

# 15. Evaluate on Test Set

In [26]:
preds = trainer.predict(test_dataset)
pred_labels = np.argmax(preds.predictions, axis=-1)
acc = accuracy_score(preds.label_ids, pred_labels)
f1 = f1_score(preds.label_ids, pred_labels)

print(f"Test Accuracy: {acc:.4f}")
print(f"Test F1 Score: {f1:.4f}")

Test Accuracy: 0.9937
Test F1 Score: 0.9764


# 16. Test Custom Messages

In [27]:
clf = pipeline("text-classification", model="./spam-model", tokenizer=tokenizer)

print(clf("Win a free vacation now! Click this link to claim your prize."))
print(clf("Hey, are we meeting for lunch today?"))

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.9991590976715088}]
[{'label': 'LABEL_0', 'score': 0.9993010759353638}]
