## Import libaries

In [38]:
import pandas as pd # to handle dataframes/tables
import numpy as np  # to handle numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer # to convert text to TF-IDF features
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # to evaluate model performance
from xgboost import XGBClassifier # XGBoost classifier

from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV


In [25]:
# load dataset
# 'encoding' parameter is used to handle special characters in the dataset
df_raw = pd.read_csv('spam.csv', encoding='latin-1')

# display first 5 rows of the dataset
df_raw.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [26]:
# remove unnecessary columns
df_raw = df_raw[['v1', 'v2']]

# rename columns for better understanding
df_raw.columns = ['label', 'message']

df_raw.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# copy the dataframe to a new variable for cleaning
df_clean = df_raw.copy()

In [28]:
# function to clean and tokenize the text data
def clean_text(text):
    # convert text to lowercase
    text = text.lower()
    # remove punctuation and special characters
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    # tokenize the text into words
    tokens = text.split()
    return tokens


In [29]:
# apply the cleaning function to the 'message' column
df_clean['tokens'] = df_clean['message'].apply(clean_text)

# display the cleaned dataframe
df_clean.head()

Unnamed: 0,label,message,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


In [30]:
df_clean.shape

(5572, 3)

In [31]:
# Convert tokens to TF-IDF features
# create a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# fit and transform the tokens column into TF-IDF features
X = vectorizer.fit_transform(df_clean['tokens'].apply(lambda x: ' '.join(x)))

# display the shape of the TF-IDF matrix
X.shape

(5572, 9523)

In [32]:
# Train XGBoost Classifier

# encode labels: 'ham' as 0 and 'spam' as 1
df_clean['label_encoded'] = df_clean['label'].map({'ham': 0, 'spam': 1})
y = df_clean['label_encoded']
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 
# create an XGBoost classifier
xgb_classifier = XGBClassifier(
  n_estimators=200,
  max_depth=4,
  learning_rate=0.1,
  subsample=0.9,
  colsample_bytree=0.9,
  random_state=42,
  n_jobs=-1,
  eval_metric="logloss")
# train the classifier
xgb_classifier.fit(X_train, y_train)
# make predictions on the test set
y_pred = xgb_classifier.predict(X_test) 
# evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.9830
Precision: 0.9852
Recall: 0.8867
F1 Score: 0.9333


In [35]:
# Train XGBoost using Cross-validation for more accuracy and avoid overfiting
# Stratified K-Fold cross-validation

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to track
scoring = {
  "accuracy": "accuracy",
  "precision": "precision",
  "recall": "recall",
  "f1": "f1"
}

# run cross-validation on training dats only
cv_results = cross_validate(
  xgb_classifier,
  X_train,
  y_train,
  cv=cv,
  scoring=scoring,
  n_jobs=-1,
  return_train_score=False
)

# show fold-by-fold scores
cv_scores_df = pd.DataFrame({
  "accuracy": cv_results["test_accuracy"],
  "precision": cv_results["test_precision"],
  "recall": cv_results["test_recall"],
  "f1": cv_results["test_f1"],
})

cv_scores_df

Unnamed: 0,accuracy,precision,recall,f1
0,0.969731,0.934579,0.833333,0.881057
1,0.977578,0.980769,0.85,0.910714
2,0.974186,0.961538,0.840336,0.896861
3,0.967452,0.978723,0.773109,0.86385
4,0.98541,0.964912,0.92437,0.944206


In [36]:
# Mean scores across folds
cv_scores_df.mean()

accuracy     0.974872
precision    0.964105
recall       0.844230
f1           0.899338
dtype: float64

In [42]:
# Hyperparameter tunning (RandomizedSearchCV)
xgb_for_search = XGBClassifier(
  objective="binary:logistic",
  eval_metric="logloss",
  random_state=42,
  n_jobs=-1
)

# Hyperparameter search space
param_dist = {
  "n_estimators": [100, 200, 300, 400],
  "max_depth": [3, 4, 5, 6],
  "learning_rate": [0.01, 0.05, 0.1, 0.2],
  "subsample": [0.7, 0.8, 0.9, 1.0],
  "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
  "gamma": [0, 0.5, 1.0]
}

# Randomized search (f1-score as main metric)
random_search = RandomizedSearchCV(
  estimator=xgb_for_search,
  param_distributions=param_dist,
  n_iter=20,
  scoring="f1",
  cv=cv,
  verbose=1,
  n_jobs=-1,
  random_state=42
)

# Run hyperparameter search on training data
random_search.fit(X_train, y_train)

# best model after tuning
best_xgb = random_search.best_estimator_

random_search.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [40]:
# Evaluate tuned model on unseen test data
y_pred_test = best_xgb.predict(X_test)

# compute metrics
# Compute metrics
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)

test_results = {
    "Accuracy": test_accuracy,
    "Precision": test_precision,
    "Recall": test_recall,
    "F1-Score": test_f1
}

test_results

# display
pd.DataFrame(test_results, index=["XGBoost (tuned on TF-IDF)"])


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
XGBoost (tuned on TF-IDF),0.98296,0.964539,0.906667,0.934708
