## Importing the Dataset

In [78]:
import kagglehub
uciml_sms_spam_collection_dataset_path = kagglehub.dataset_download('uciml/sms-spam-collection-dataset')

print('Data source import complete.')

Data source import complete.


In [79]:
import os
content = os.listdir(uciml_sms_spam_collection_dataset_path)

print(uciml_sms_spam_collection_dataset_path)
for item in content:
    print(item)

/kaggle/input/sms-spam-collection-dataset
spam.csv


## Importing the Libraries

In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import re

In [81]:
trainPath = r"/kaggle/input/sms-spam-collection-dataset/spam.csv"
trainData = pd.read_csv(trainPath, encoding='latin-1')
trainData.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Cleaning the Dataset

In [82]:
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

trainData = trainData.drop(columns=columns_to_drop, axis=1)

In [83]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Downloading the NLTK Packages

In [84]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [85]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

## Cleaning the messages in the v2 column

In [86]:
def clean_text_characters(text):
    if isinstance(text, str):
        # Remove numbers, punctuation, and symbols
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Optionally, convert to lowercase here or in remove_stopwords
        text = text.lower()
        return text
    return ""

def remove_stopwords(text):
    words = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(words)

def lemmatize_text(text):
    words = [lemmatizer.lemmatize(word) for word in text.split()]
    return " ".join(words)

trainData['v2'] = trainData['v2'].apply(clean_text_characters).apply(remove_stopwords).apply(lemmatize_text)

In [87]:
trainData.head()

Unnamed: 0,v1,v2
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


## Splitting the Dataset into train and test set

In [88]:
label_encoder = LabelEncoder()

X = trainData['v2'] # Features
y_original = trainData['v1']# Label
y_encoded = label_encoder.fit_transform(y_original)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [89]:
vector = TfidfVectorizer()

X_train_tfidf = vector.fit_transform(X_train)
X_test_tfidf = vector.transform(X_test)

## Intitalizing the Models

In [90]:
mnb_model = MultinomialNB()
log_reg = LogisticRegression(random_state=42)
svm_model = SVC(random_state=42)

## Training the Models

In [91]:
print("Training Multinomial Naive Bayes...")
mnb_model.fit(X_train_tfidf, y_train)
print("Multinomial Naive Bayes training complete.")

print("Training Logistic Regression...")
log_reg.fit(X_train_tfidf, y_train)
print("Logistic Regression training complete.")

print("Training SVM...")
svm_model.fit(X_train_tfidf, y_train)
print("SVM training complete.")

Training Multinomial Naive Bayes...
Multinomial Naive Bayes training complete.
Training Logistic Regression...
Logistic Regression training complete.
Training SVM...
SVM training complete.


## Making the Prediciton

In [92]:
print("Making predictions on test data...")
y_pred_log_reg = log_reg.predict(X_test_tfidf)
y_pred_mnb = mnb_model.predict(X_test_tfidf)
y_pred_svm = svm_model.predict(X_test_tfidf)
print("Predictions complete.")

Making predictions on test data...
Predictions complete.


## Evaluating the Models

In [93]:
print("\n--- Model Evaluation ---")

def evaluate_model(model_name, y_true, y_pred):
    print(f"\n----{model_name}----")
    print("Accuracy Score:")
    print(f"{accuracy_score(y_true, y_pred)* 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print('-' *30)

model = mnb_model
evaluate_model("Naive Bayes", y_test, y_pred_mnb)

model = log_reg
evaluate_model("Logistic Regression", y_test, y_pred_log_reg)

model = svm_model
evaluate_model("Support Vector Machine", y_test, y_pred_svm)


--- Model Evaluation ---

----Naive Bayes----
Accuracy Score:
96.68%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
[[965   0]
 [ 37 113]]
------------------------------

----Logistic Regression----
Accuracy Score:
94.80%
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.64      0.77       150

    accuracy                           0.95      1115
   macro avg       0.95      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115

Confusion Matrix:
[[961   4]
 [ 54  96]]
------------------------------

----Support Vector Machine----
Ac

# Conclusion

*   We tested three models to detect spam messages: Naive Bayes, Logistic Regression, and Support Vector Machine (SVM).
*   All models did a good job overall, but **SVM was the best**.
*   SVM was most accurate and did the best job of catching spam without mistakenly marking important messages as spam.
*   Naive Bayes was also good, especially because it never marked a real message as spam.
*   For this task, **SVM is the top pick**.