In [1]:
import kagglehub

# Download the latest version of the SMS Spam Collection dataset. kagglehub
# returns the local directory holding the files, which is echoed for reference.
dataset_handle = "uciml/sms-spam-collection-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Path to dataset files: /kaggle/input/sms-spam-collection-dataset


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np

In [3]:
# spam.csv sits directly inside the downloaded dataset directory.
data_file_path = f"{path}/spam.csv"
# latin-1 is requested explicitly -- presumably the file does not decode
# cleanly as UTF-8 (TODO confirm).
df = pd.read_csv(
    data_file_path,
    encoding='latin-1',
)

In [None]:
# Preview the first five rows of the frame as the cell's displayed output.
df.head(n=5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarise column names, non-null counts and dtypes; this is what reveals
# which columns are almost entirely empty and can be discarded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
# Discard the three almost-empty "Unnamed" columns and give the remaining two
# columns descriptive names.
#
# This cell reassigns `df`, so it must be safe to run more than once:
# `errors="ignore"` turns the drop into a no-op for columns that are already
# gone (a plain drop would raise KeyError on a second run), and rename()
# silently skips keys that are no longer present.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
df = df.drop(columns=unused_columns, errors="ignore")
df = df.rename(columns={"v1": "label", "v2": "message"})

print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
# Features are the raw message texts; targets are the ham/spam labels.
X, y = df['message'], df['label']

# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set size: {X_train.shape[0]} messages")
print(f"Testing set size: {X_test.shape[0]} messages")


Training set size: 4457 messages
Testing set size: 1115 messages


In [7]:
# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f'\nText data transformed into a feature matrix of shape: {X_train_tfidf.shape}')


Text data transformed into a feature matrix of shape: (4457, 5000)


In [8]:
print("     Multinomial Naive Bayes     ")

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix and
# predict labels for the held-out test matrix.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy Score: {accuracy_nb: .4f}")

# Emit the report exactly once, and in its own print() call: passing it as a
# second argument makes print() insert a separator space in front of the
# report, which pushes the header row out of alignment with the table body.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_nb))

     Multinomial Naive Bayes     
Accuracy Score:  0.9722

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.80      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.80      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [11]:
print("LOGISTIC REGRESSION")

# Build the classifier and train it in one expression (fit() returns the
# estimator itself), then predict labels for the held-out test matrix.
lr_model = LogisticRegression(
    solver='liblinear',
    random_state=42,
    max_iter=1000,
).fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# Overall accuracy first, followed by the per-class breakdown.
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy Score: {accuracy_lr: .4f}")

print("\nClassification Report:\n")
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

LOGISTIC REGRESSION
Accuracy Score:  0.9704

Classification Report:

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [13]:
# Side-by-side recap of the two test-set accuracies computed earlier.
print("SUMMARY")
print(f"Multinomial Naive Bayes Accuracy: {accuracy_nb: .4f}")
print(f"Logistic Regression Accuracy: {accuracy_lr: .4f}")

# Smoke test on one hand-written message: it has to pass through the same
# fitted vectorizer before the logistic regression model can score it.
new_message = ["WINNER! You have won a free iPhone. Click the link now!"]
new_message_tfidf = tfidf_vectorizer.transform(new_message)
predicted_labels = lr_model.predict(new_message_tfidf)
prediction = predicted_labels[0]

# Map the predicted label to the upper-case tag shown to the reader.
if prediction == 'spam':
    verdict = 'SPAM'
else:
    verdict = 'HAM'

print(f"\nTesting message: '{new_message[0]}'")
print(f"Logistic Regression Prediction: {verdict}")

SUMMARY
Multinomial Naive Bayes Accuracy:  0.9722
Logistic Regression Accuracy:  0.9704

Testing message: 'WINNER! You have won a free iPhone. Click the link now!'
Logistic Regression Prediction: HAM
