## Import libraries

In [None]:
import pandas as pd # to handle dataframes/tables
import numpy as np  # to handle numerical operations
import torch # to handle tensor operations
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # to evaluate model performance
from xgboost import XGBClassifier # XGBoost classifier
from transformers import AutoTokenizer, AutoModel # To use Advanced NLP models and tokenizers

In [2]:
# Read the spam dataset from disk.
# The file is decoded as latin-1 so that special characters in it are handled.
df_raw = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

# Preview the first five rows.
df_raw.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label/text columns under descriptive names.
# `rename` ignores keys that are not present, so re-running this cell after
# the columns have already been renamed is a no-op instead of a KeyError on
# the no-longer-existing 'v1'/'v2' columns.
df_raw = df_raw.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']]

df_raw.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Work on an independent deep copy so that df_raw itself is left untouched
df_clean = df_raw.copy(deep=True)

## Advanced NLP Techniques

In [5]:
# Advanced NLP Techniques for Spam Detection
# using BERT embeddings + Logistic Regression and XGBoost

# 1. Load pre-trained BERT model and tokenizer
# The tokenizer lives under a model-specific name: later cells rebind the
# generic `tokenizer` name to other models' tokenizers, and the embedding
# function below must keep using the one that matches `bert_model`.
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = bert_tokenizer  # generic name kept bound for existing references
bert_model = AutoModel.from_pretrained('bert-base-uncased')
bert_model.eval()   # set to eval mode

# 2. Function to get BERT embeddings for one text
def get_bert_embeddings(text):
    """Return a mean-pooled BERT embedding for `text` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    `bert_model` with gradients disabled, and the last hidden state is
    averaged over the token dimension.

    Parameters
    ----------
    text : str
        The message to embed. NOTE(review): a list of strings should also
        work because padding is enabled and pad positions are excluded from
        the average below -- confirm before relying on it.

    Returns
    -------
    numpy.ndarray
        For a single string, a 1-D vector of length hidden_size (size-1
        dimensions are squeezed away).
    """
    inputs = bert_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    with torch.no_grad():
        outputs = bert_model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions do not
    # dilute the average. For a single unpadded text the mask is all ones and
    # this reduces to a plain mean over the token dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

# 3. Embed every message with BERT and collect the vectors in one array
embeddings = np.array(list(map(get_bert_embeddings, df_clean['message'])))

# 4. Turn the text labels into integers: ham -> 0, spam -> 1
labels = df_clean['label'].map({'ham': 0, 'spam': 1}).to_numpy()

# 5. Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, stratify=labels, test_size=0.2, random_state=42
)

# 6. Helper to evaluate a model
def eval_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Calls `model.predict(X_test)` once and compares the predictions against
    `y_test` with each metric in turn.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1), in that order.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# 7. BERT + Logistic Regression (fit() hands back the fitted estimator)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 8. BERT + XGBoost
xgb_model = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=4,
    subsample=0.9, colsample_bytree=0.9,
    eval_metric="logloss", random_state=42, n_jobs=-1,
).fit(X_train, y_train)

# Score both classifiers on the held-out split, keyed by display name
results = {
    "BERT + Logistic Regression": eval_model(log_reg, X_test, y_test),
    "BERT + XGBoost": eval_model(xgb_model, X_test, y_test),
}

# 9. Show metrics: one row per model, one column per metric
metrics_names = ["Accuracy", "Precision", "Recall", "F1-Score"]
bert_results_df = pd.DataFrame(
    list(results.values()), index=list(results.keys()), columns=metrics_names
)
bert_results_df


2025-12-26 00:29:27.494488: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
BERT + Logistic Regression,0.992825,0.986207,0.959732,0.972789
BERT + XGBoost,0.989238,0.992806,0.926174,0.958333


In [6]:
# Advanced NLP Techniques for Spam Detection
# using DistilBERT embeddings + Logistic Regression and XGBoost 

# 1. Load pre-trained DistilBERT model and tokenizer
# The tokenizer lives under a model-specific name: other cells rebind the
# generic `tokenizer` name to other models' tokenizers, and the embedding
# function below must keep using the one that matches `distilbert_model`.
model_name = "distilbert-base-uncased"
distilbert_tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = distilbert_tokenizer  # generic name kept bound for existing references
distilbert_model = AutoModel.from_pretrained(model_name)
distilbert_model.eval()   # set to eval mode

# 2. Function to get DistilBERT embeddings for one text
def get_distilbert_embeddings(text):
    """Return a mean-pooled DistilBERT embedding for `text` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    `distilbert_model` with gradients disabled, and the last hidden state is
    averaged over the token dimension.

    Parameters
    ----------
    text : str
        The message to embed. NOTE(review): a list of strings should also
        work because padding is enabled and pad positions are excluded from
        the average below -- confirm before relying on it.

    Returns
    -------
    numpy.ndarray
        For a single string, a 1-D vector of length hidden_size (size-1
        dimensions are squeezed away).
    """
    inputs = distilbert_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions do not
    # dilute the average. For a single unpadded text the mask is all ones and
    # this reduces to a plain mean over the token dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

# 3. Embed every message with DistilBERT and collect the vectors in one array
embeddings = np.array(list(map(get_distilbert_embeddings, df_clean['message'])))

# 4. Turn the text labels into integers: ham -> 0, spam -> 1
labels = df_clean['label'].map({'ham': 0, 'spam': 1}).to_numpy()

# 5. Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, stratify=labels, test_size=0.2, random_state=42
)

# 6. Helper to evaluate a model
def eval_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Calls `model.predict(X_test)` once and compares the predictions against
    `y_test` with each metric in turn.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1), in that order.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# 7. DistilBERT + Logistic Regression (fit() hands back the fitted estimator)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 8. DistilBERT + XGBoost
xgb_model = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=4,
    subsample=0.9, colsample_bytree=0.9,
    eval_metric="logloss", random_state=42, n_jobs=-1,
).fit(X_train, y_train)

# Score both classifiers on the held-out split, keyed by display name
results = {
    "DistilBERT + Logistic Regression": eval_model(log_reg, X_test, y_test),
    "DistilBERT + XGBoost": eval_model(xgb_model, X_test, y_test),
}

# 9. Show metrics as DataFrame: one row per model, one column per metric
metrics_names = ["Accuracy", "Precision", "Recall", "F1-Score"]
distilbert_results_df = pd.DataFrame(
    list(results.values()), index=list(results.keys()), columns=metrics_names
)
distilbert_results_df


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
DistilBERT + Logistic Regression,0.991928,0.986111,0.95302,0.969283
DistilBERT + XGBoost,0.990135,0.985915,0.939597,0.962199


In [7]:
# Advanced NLP Techniques for Spam Detection
# using RoBERTa embeddings + Logistic Regression and XGBoost

# 1. Load pre-trained RoBERTa model and tokenizer
# The tokenizer lives under a model-specific name: other cells rebind the
# generic `tokenizer` name to other models' tokenizers, and the embedding
# function below must keep using the one that matches `roberta_model`.
model_name = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = roberta_tokenizer  # generic name kept bound for existing references
roberta_model = AutoModel.from_pretrained(model_name)
roberta_model.eval()   # set to eval mode

# 2. Function to get RoBERTa embeddings for one text
def get_roberta_embeddings(text):
    """Return a mean-pooled RoBERTa embedding for `text` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    `roberta_model` with gradients disabled, and the last hidden state is
    averaged over the token dimension.

    Parameters
    ----------
    text : str
        The message to embed. NOTE(review): a list of strings should also
        work because padding is enabled and pad positions are excluded from
        the average below -- confirm before relying on it.

    Returns
    -------
    numpy.ndarray
        For a single string, a 1-D vector of length hidden_size (size-1
        dimensions are squeezed away).
    """
    inputs = roberta_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions do not
    # dilute the average. For a single unpadded text the mask is all ones and
    # this reduces to a plain mean over the token dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

# 3. Embed every message with RoBERTa and collect the vectors in one array
embeddings = np.array(list(map(get_roberta_embeddings, df_clean['message'])))

# 4. Turn the text labels into integers: ham -> 0, spam -> 1
labels = df_clean['label'].map({'ham': 0, 'spam': 1}).to_numpy()

# 5. Stratified 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, stratify=labels, test_size=0.2, random_state=42
)

# 6. Helper to evaluate a model
def eval_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Calls `model.predict(X_test)` once and compares the predictions against
    `y_test` with each metric in turn.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1), in that order.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# 7. RoBERTa + Logistic Regression (fit() hands back the fitted estimator)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 8. RoBERTa + XGBoost
xgb_model = XGBClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=4,
    subsample=0.9, colsample_bytree=0.9,
    eval_metric="logloss", random_state=42, n_jobs=-1,
).fit(X_train, y_train)

# Score both classifiers on the held-out split, keyed by display name
results = {
    "RoBERTa + Logistic Regression": eval_model(log_reg, X_test, y_test),
    "RoBERTa + XGBoost": eval_model(xgb_model, X_test, y_test),
}

# 9. Show metrics as DataFrame: one row per model, one column per metric
metrics_names = ["Accuracy", "Precision", "Recall", "F1-Score"]
roberta_results_df = pd.DataFrame(
    list(results.values()), index=list(results.keys()), columns=metrics_names
)
roberta_results_df


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
RoBERTa + Logistic Regression,0.992825,1.0,0.946309,0.972414
RoBERTa + XGBoost,0.992825,1.0,0.946309,0.972414
