In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from tqdm import tqdm


In [2]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.2-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.3
    Uninstalling transformers-4.50.3:
      Successfully uninstalled transformers-4.50.3
Successfully installed transformers-4.51.2


In [3]:
import transformers
print(transformers.__version__)
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AutoTokenizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

4.51.2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
import kagglehub

path = kagglehub.dataset_download("venky73/spam-mails-dataset")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/venky73/spam-mails-dataset?dataset_version_number=1...


100%|██████████| 1.86M/1.86M [00:01<00:00, 1.76MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/venky73/spam-mails-dataset/versions/1


In [5]:
import os
file_path = os.path.join(path, "spam_ham_dataset.csv")
df = pd.read_csv(file_path)

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
important_words = {"free", "win", "cash", "prize", "urgent", "offer", "call", "credit", "loan", "guarantee"}

In [7]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9!$ ]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words or word in important_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [9]:
df['processed_text'] = df['text'].apply(preprocess_text)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label'], test_size=0.2, random_state=42)


In [11]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [12]:
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)
print("Na\u00efve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Naïve Bayes Accuracy: 0.9497584541062802
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       742
           1       0.91      0.91      0.91       293

    accuracy                           0.95      1035
   macro avg       0.94      0.94      0.94      1035
weighted avg       0.95      0.95      0.95      1035



In [13]:
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9806763285024155
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.97      0.97      0.97       293

    accuracy                           0.98      1035
   macro avg       0.98      0.98      0.98      1035
weighted avg       0.98      0.98      0.98      1035



In [14]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        return encoding["input_ids"].squeeze(), encoding["attention_mask"].squeeze(), torch.tensor(self.labels[idx])


In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
train_dataset = SpamDataset(X_train.tolist(), y_train.tolist())
test_dataset = SpamDataset(X_test.tolist(), y_test.tolist())
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# New Section

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)
optimizer = Adam(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

print("DistilBERT Accuracy:", correct / total)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 259/259 [01:30<00:00,  2.86it/s, loss=0.00945]
Epoch 2: 100%|██████████| 259/259 [01:29<00:00,  2.89it/s, loss=0.00325]
Epoch 3: 100%|██████████| 259/259 [01:29<00:00,  2.89it/s, loss=0.502]


DistilBERT Accuracy: 0.9594202898550724


In [17]:
def predict_spam(model, tokenizer, text, device):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        output = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(output.logits, dim=1).item()
        return "Spam" if prediction == 1 else "Ham"

sample_text = "Don't forget to submit your report by EOD today."
print("Prediction:", predict_spam(model, tokenizer, sample_text, device))

Prediction: Ham


In [18]:
test_messages = [
    "Get a free loan with 0% interest now!",
    "Hey, how are you doing today?",
    "Urgent! Your account has been compromised. Click here to secure it.",
    "Let's meet for lunch at 1 PM."
]

for msg in test_messages:
    print(f"Message: {msg} -> Prediction: {predict_spam(model, tokenizer, msg, device)}")


Message: Get a free loan with 0% interest now! -> Prediction: Spam
Message: Hey, how are you doing today? -> Prediction: Ham
Message: Urgent! Your account has been compromised. Click here to secure it. -> Prediction: Spam
Message: Let's meet for lunch at 1 PM. -> Prediction: Ham


In [20]:
model.save_pretrained("spam_classifier_model")
tokenizer.save_pretrained("spam_classifier_model")

# Zip and download
!zip -r spam_classifier_model.zip spam_classifier_model
import shutil
from google.colab import files
shutil.make_archive("spam_classifier_model", 'zip', "spam_classifier_model")
files.download("spam_classifier_model.zip")


  adding: spam_classifier_model/ (stored 0%)
  adding: spam_classifier_model/config.json (deflated 43%)
  adding: spam_classifier_model/tokenizer_config.json (deflated 75%)
  adding: spam_classifier_model/special_tokens_map.json (deflated 42%)
  adding: spam_classifier_model/tokenizer.json (deflated 71%)
  adding: spam_classifier_model/model.safetensors (deflated 8%)
  adding: spam_classifier_model/vocab.txt (deflated 53%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#from google.colab import files
#import shutil

#shutil.make_archive("spam_classifier_model", 'zip', "spam_classifier_model")
#files.download("spam_classifier_model.zip")


In [None]:
#from google.colab import files
#files.download('/content/spam_classifier_model/vocab.txt')


In [None]:
#from google.colab import drive
#import shutil

# Mount your drive
#drive.mount('/content/drive')

# Copy the folder
#shutil.copytree('/content/spam_classifier_model', '/content/drive/MyDrive/spam_classifier_model')
