In [1]:
!pip install transformers torch


Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import random
import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# ---- Configuration ----
RANDOM_SEED = 42
DATA_PATH = 'sms_spam.csv' # expects the common two-column CSV: label,text
NUM_EXAMPLES = 200
TRAIN_SIZE = 150
TEST_SIZE = 50
MODEL_NAME = 'distilbert-base-uncased'
EPOCHS = 3
BATCH_SIZE = 16
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f41a9993d90>

In [7]:
# ---- Utility functions ----
KEYWORDS = ['free','win','winner','claim','offer','urgent','credit','prize','limited','cash']

def load_first_200(path):
    # Expect CSV with header: label,text OR tab-separated
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset file not found at {path}. Please place the SMS Spam dataset as '{path}'.")
    # try common separators
    try:
        df = pd.read_csv(path, encoding='latin-1')
    except Exception:
        df = pd.read_csv(path, sep='\t', header=None, names=['label','text'], encoding='latin-1')
    if 'v1' in df.columns and 'v2' in df.columns:
        # UCI format often has columns v1 (label) and v2 (text)
        df = df.rename(columns={'v1':'label','v2':'text'})
    df = df[['label','text']].dropna()
    df = df.iloc[:NUM_EXAMPLES].reset_index(drop=True)
    return df
    
def preprocess_text(s):
    return str(s).strip().lower()
    
def baseline_keyword_predict(texts):
    preds = []
    for t in texts:
        t_low = t.lower()
        if any(k in t_low for k in KEYWORDS):
            preds.append('spam')
        else:
            preds.append('ham')
    return preds

In [13]:
# ---- Main pipeline ----
def run_pipeline(data_path=DATA_PATH):
    df = load_first_200(data_path)
    df['text_clean'] = df['text'].apply(preprocess_text)
   
    # split first 150 -> train, last 50 -> test
    train_df = df.iloc[:TRAIN_SIZE].reset_index(drop=True)
    test_df = df.iloc[TRAIN_SIZE:TRAIN_SIZE+TEST_SIZE].reset_index(drop=True)
    
    # Baseline
    baseline_preds = baseline_keyword_predict(test_df['text_clean'].tolist())
    acc_baseline = accuracy_score(test_df['label'], baseline_preds)
    p, r, f, _ = precision_recall_fscore_support(test_df['label'], baseline_preds, labels=['spam','ham'], average=None)
    
    print('Baseline results:')
    print('Accuracy:', acc_baseline)
    print('Precision (spam):', p[0])
    print('Recall (spam):', r[0])
    print('F1 (spam):', f[0])
    
    # Simple TF-IDF + Logistic Regression (lightweight alternative to full finetune)
    vectorizer = TfidfVectorizer(max_features=2000)
    X_train = vectorizer.fit_transform(train_df['text_clean'].tolist())
    X_test = vectorizer.transform(test_df['text_clean'].tolist())
    y_train = (train_df['label'] == 'spam').astype(int)
    y_test = (test_df['label'] == 'spam').astype(int)
    
    lr = LogisticRegression(max_iter=500)
    lr.fit(X_train, y_train)
    lr_preds = lr.predict(X_test)
    acc_lr = accuracy_score(y_test, lr_preds)
    p_lr, r_lr, f_lr, _ = precision_recall_fscore_support(y_test, lr_preds, average='binary')
    
    print('\nTF-IDF + Logistic Regression results:')
    print('Accuracy:', acc_lr)
    print('Precision (spam):', p_lr)
    print('Recall (spam):', r_lr)
    print('F1 (spam):', f_lr)

# DistilBERT fine-tune 
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
        # Prepare datasets
        def tokenize_fn(examples):
            return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
        
        train_texts = train_df['text_clean'].tolist()
        train_labels = (train_df['label'] == 'spam').astype(int).tolist()
        test_texts = test_df['text_clean'].tolist()
        test_labels = (test_df['label'] == 'spam').astype(int).tolist()
        
        class SimpleDataset(torch.utils.data.Dataset):
            def __init__(self, texts, labels):
                self.texts = texts
                self.labels = labels
            def __len__(self):
                return len(self.texts)
            def __getitem__(self, idx):
                return {'text': self.texts[idx], 'label': self.labels[idx]}
        
        train_dataset = SimpleDataset(train_texts, train_labels)
        test_dataset = SimpleDataset(test_texts, test_labels)
        
        # Tokenize datasets lazily in collator
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            preds = np.argmax(logits, axis=-1)
            acc = (preds == labels).mean()
            p, r, f, _ = precision_recall_fscore_support(labels, preds, average='binary')
            return {'accuracy': acc, 'precision': p, 'recall': r, 'f1': f}
        
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=EPOCHS,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            evaluation_strategy='epoch',
            save_strategy='no',
            logging_strategy='epoch',
            learning_rate=2e-5,
            weight_decay=0.01,
            seed=RANDOM_SEED,
            disable_tqdm=False,
        )
        
        # Convert SimpleDataset to tokenized format for Trainer
        def collate_tokenized(batch):
            texts = [b['text'] for b in batch]
            labels = [b['label'] for b in batch]
            tokenized = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
            tokenized['labels'] = torch.tensor(labels)
            return tokenized
       
        # Small wrapper to work with Trainer
        class HFWrapper(torch.utils.data.Dataset):
            def __init__(self, df):
                self.texts = df['text_clean'].tolist()
                self.labels = (df['label'] == 'spam').astype(int).tolist()
            def __len__(self):
                return len(self.texts)
            def __getitem__(self, idx):
                return {'text': self.texts[idx], 'label': self.labels[idx]}
        
        hf_train = HFWrapper(train_df)
        hf_eval = HFWrapper(test_df)
        
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=hf_train,
            eval_dataset=hf_eval,
            tokenizer=tokenizer,
            data_collator=collate_tokenized,
            compute_metrics=compute_metrics,
        )
        
        trainer.train()
        eval_res = trainer.evaluate()
        print('\nDistilBERT fine-tune results:')
        print(eval_res)
    
    except Exception as e:
        print('\nSkipping DistilBERT fine-tune (maybe no internet or heavy compute). Error:')
        print(e)
    
    # Print a few qualitative examples where methods disagree
    print('\nQualitative examples (test set):')
    for i in range(min(10, len(test_df))):
        text = test_df.loc[i,'text']
        true = test_df.loc[i,'label']
        base = baseline_keyword_predict([text])[0]
        lr_pred = 'spam' if lr.predict(vectorizer.transform([preprocess_text(text)]))[0]==1 else 'ham'
        print(f"- Text: {text}\n True: {true}\n Baseline: {base}\n TF-IDF+LR: {lr_pred}\n")

if __name__ == '__main__':
    run_pipeline()

Baseline results:
Accuracy: 0.8
Precision (spam): 0.3333333333333333
Recall (spam): 0.42857142857142855
F1 (spam): 0.375

TF-IDF + Logistic Regression results:
Accuracy: 0.86
Precision (spam): 0.0
Recall (spam): 0.0
F1 (spam): 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Skipping DistilBERT fine-tune (maybe no internet or heavy compute). Error:
TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

Qualitative examples (test set):
- Text: The wine is flowing and i'm i have nevering..
 True: ham
 Baseline: spam
 TF-IDF+LR: ham

- Text: Yup i thk cine is better cos no need 2 go down 2 plaza mah.
 True: ham
 Baseline: ham
 TF-IDF+LR: ham

- Text: Ok... Ur typical reply...
 True: ham
 Baseline: ham
 TF-IDF+LR: ham

- Text: As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
 True: ham
 Baseline: ham
 TF-IDF+LR: ham

- Text: You are everywhere dirt, on the floor, the windows, even on my shirt. And sometimes when i open my mouth, you are all that comes flowing out. I dream of my world without you, then half my chores are out too. A time of joy for me, lots of tv shows i.ll see. But i guess like all things you just must