In [10]:
!pip install pandas numpy scikit-learn nltk transformers datasets torch tqdm
!python -m nltk.downloader punkt stopwords




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kingp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kingp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import pandas as pd

# Relative path, resolved against the kernel's current working directory.
DATA_PATH = "spam_ham_dataset.csv"

df = pd.read_csv(DATA_PATH)

# Preview the first rows to check the columns that were loaded.
df.head()


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [12]:
# Text label -> integer class used by every model below.
LABEL_TO_INT = {"ham": 0, "spam": 1}

# Variant of the dataset that stores the text label in `label_text`.
if "label_text" in df.columns:
    df["label"] = df["label_text"].map(LABEL_TO_INT)

# Encode text labels. Values that are not keys of the mapping become NaN in
# `encoded` and are filled back in from the current `label` column.
encoded = df["label"].map(LABEL_TO_INT)
df["label"] = encoded.fillna(df["label"])

# Cast to a plain integer dtype (this raises if a value cannot be cast).
df["label"] = df["label"].astype(int)

# Display the distinct label values as a sanity check.
df["label"].unique()


array([0, 1])

In [13]:
import re, html
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# NLTK's English stop-word list, stored as a set; preprocess_for_tf tests
# every token for membership in it.
STOPWORDS = set(stopwords.words("english"))
# One Porter stemmer instance shared by all preprocess_for_tf calls.
PS = PorterStemmer()

def preprocess_for_tf(text):
    """Normalise one message into a space-joined string of stemmed tokens.

    Steps, in order: HTML-unescape and lowercase ``str(text)``; replace
    URLs, e-mail addresses, HTML tags and digit runs with placeholder
    words; extract word tokens; drop tokens found in ``STOPWORDS``; stem the
    remaining tokens with ``PS``.

    Returns:
        str: the stemmed tokens joined by single spaces.
    """
    # (pattern, placeholder) pairs, applied strictly in this order.
    # NOTE(review): the URL pattern runs before the tag pattern and its \S+
    # can consume a tag's closing '>' (e.g. in href="http://...">) — confirm
    # whether that matters for this corpus.
    substitutions = (
        (r'(https?://\S+)', ' urltoken '),
        (r'www\.\S+', ' urltoken '),
        (r'\S+@\S+', ' emailtoken '),
        (r'<[^>]+>', ' htmltag '),
        (r'\d+', ' numtoken '),
    )

    cleaned = html.unescape(str(text)).lower()
    for pattern, placeholder in substitutions:
        cleaned = re.sub(pattern, placeholder, cleaned)

    stems = []
    for word in re.findall(r'\b\w+\b', cleaned):
        if word in STOPWORDS:
            continue  # stop words are filtered before stemming
        stems.append(PS.stem(word))
    return " ".join(stems)

def preprocess_for_bert(text):
    """Return ``str(text)`` with HTML entities decoded and outer whitespace removed.

    No lowercasing, token replacement or stemming is applied here.
    """
    decoded = html.unescape(str(text))
    return decoded.strip()

# Build both text variants from the raw `text` column: one for the TF-IDF
# model and one for the BERT tokenizer.
for column, cleaner in (
    ("tf_text", preprocess_for_tf),
    ("bert_text", preprocess_for_bert),
):
    df[column] = df["text"].apply(cleaner)


In [14]:
def count_urls(s):
    """Return how many URL-like substrings ``str(s)`` contains.

    A match is either ``http://`` / ``https://`` followed by non-whitespace
    characters, or ``www.`` followed by non-whitespace characters.

    Matching is case-insensitive so that ``HTTP://...`` and ``WWW....`` are
    counted too. This keeps the count consistent with ``preprocess_for_tf``,
    which lowercases the text before applying the same patterns.
    """
    return len(re.findall(r'https?://\S+|www\.\S+', str(s), flags=re.IGNORECASE))
def count_html_tags(s):
    """Return the number of ``<...>`` spans found in ``str(s)``.

    A span is a ``<``, one or more characters other than ``>``, then ``>``.
    """
    tag_matches = re.finditer(r'<[^>]+>', str(s))
    return sum(1 for _ in tag_matches)
def count_exclaims(s):
    """Return the number of ``!`` characters in ``str(s)``."""
    return sum(1 for ch in str(s) if ch == '!')
def num_words(s):
    """Return the number of word tokens (runs of ``\\w`` characters) in ``s``.

    Missing values (``None`` or a float NaN) count as zero words. Without
    this guard they would be stringified to "None" / "nan" and reported as a
    one-word message, whereas the sibling counters yield 0 for those inputs.
    Any other non-string input is converted with ``str()`` first.
    """
    # `s != s` is only true for NaN; the isinstance check keeps the test cheap
    # and avoids comparing arbitrary objects with themselves.
    if s is None or (isinstance(s, float) and s != s):
        return 0
    return len(re.findall(r'\w+', str(s)))
def has_unsubscribe(s):
    """Return 1 if lowercased ``str(s)`` contains "unsubscribe", else 0."""
    lowered = str(s).lower()
    return 1 if "unsubscribe" in lowered else 0

# Hand-crafted per-message features, all computed from the raw `text` column.
# The dict order is the order in which the columns are added to `df`.
feature_extractors = {
    "num_urls": count_urls,
    "num_html_tags": count_html_tags,
    "num_exclaims": count_exclaims,
    "num_words": num_words,
    "has_unsubscribe": has_unsubscribe,
}

for feature_name, extractor in feature_extractors.items():
    df[feature_name] = df["text"].apply(extractor)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Inputs for the baseline: stemmed text as features, integer label as target.
X, y = df["tf_text"], df["label"]

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Unigram + bigram TF-IDF. The vectorizer is fitted on the training part
# only; the hold-out part is transformed with the fitted vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=10000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_hold_tfidf = tfidf.transform(X_hold)

# `fit` returns the estimator itself, so `lr` is the fitted model.
lr = LogisticRegression(max_iter=2000).fit(X_train_tfidf, y_train)

y_pred = lr.predict(X_hold_tfidf)

print(classification_report(y_hold, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.98      0.98       735
           1       0.96      0.96      0.96       300

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035



In [16]:
from sklearn.metrics import roc_curve, auc

# Take column 1 of the predicted probabilities as the score for label 1.
hold_proba = lr.predict_proba(X_hold_tfidf)
y_prob_lr = hold_proba[:, 1]

# ROC curve on the hold-out set, then the area under it.
fpr, tpr, _ = roc_curve(y_hold, y_prob_lr)
lr_auc = auc(fpr, tpr)
print("LR AUC:", lr_auc)


LR AUC: 0.9969931972789114


In [22]:
from sklearn.model_selection import train_test_split

# Only the lightly cleaned text and the label are needed from here on.
bert_frame = df[["bert_text", "label"]]

# 90/10 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    bert_frame, test_size=0.1, random_state=42, stratify=df["label"]
)

# Display the size of each part.
(len(train_df), len(test_df))


(4653, 518)

In [23]:
from transformers import DistilBertTokenizerFast

# Load the fast DistilBERT tokenizer for the "distilbert-base-uncased"
# checkpoint. encode_batch reads this notebook-level `tokenizer`.
# NOTE(review): the recorded run of this cell ended in an OSError while
# loading torch's c10.dll, so `tokenizer` may not exist in that session —
# confirm the torch installation before relying on later cells.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def encode_batch(batch):
    """Tokenize every entry of ``batch["bert_text"]`` in one tokenizer call.

    ``batch["bert_text"]`` must provide ``.tolist()``. The notebook-level
    ``tokenizer`` is called with padding enabled, truncation to at most 256
    tokens, and ``return_tensors="pt"``; its result is returned unchanged.
    """
    texts = batch["bert_text"].tolist()
    options = dict(
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    return tokenizer(texts, **options)


OSError: [WinError 1114] A dynamic link library (DLL) initialization routine failed. Error loading "C:\Users\kingp\anaconda3\Lib\site-packages\torch\lib\c10.dll" or one of its dependencies.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmailDataset(Dataset):
    """Dataset that tokenizes one e-mail per item, on access.

    Args:
        df: table with a ``"bert_text"`` column (the texts) and a ``"label"``
            column (integer class ids); both are copied to plain lists.
        tokenizer: optional callable invoked as
            ``tokenizer(text, padding="max_length", truncation=True,
            max_length=..., return_tensors="pt")`` and returning a mapping of
            tensors with a leading dimension of size 1. When omitted, the
            notebook-level ``tokenizer`` is looked up at item-access time,
            which is the original behaviour.
        max_length: length every item is padded/truncated to (default 256,
            the value that was previously hard-coded).

    Each item is a dict holding the tokenizer's outputs with the leading
    dimension squeezed away, plus a ``"labels"`` entry (``torch.long`` scalar).
    """

    def __init__(self, df, tokenizer=None, max_length=256):
        self.texts = df["bert_text"].tolist()
        self.labels = df["label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of e-mails in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the e-mail at ``idx`` and attach its label."""
        # Prefer the injected tokenizer; otherwise fall back to the global
        # one so existing `EmailDataset(df)` calls keep working.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        encoded = tok(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a leading dimension of size 1; drop it so
        # a DataLoader can stack the items into a batch itself.
        item = {key: value.squeeze(0) for key, value in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split in an EmailDataset (train first, then test).
train_dataset, test_dataset = (
    EmailDataset(split) for split in (train_df, test_df)
)
