In [22]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


### 1. Load and Prepare Dataset

We load the SMS Spam Collection dataset (`spam.csv`) using pandas. 
Only the required columns are kept — `v1` (label) and `v2` (message) — and renamed to `label` and `message` respectively for clarity.


In [23]:
df = pd.read_csv("spam.csv", encoding='latin-1') # Read CSV
df = df[['v1', 'v2']]  # Selecting required columns
df.columns = ['label', 'message']  # Rename for clarity
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1}) #Convert Labels to 0 (ham) and 1 (spam)

### 2. Clean the Text Messages

We define a `clean_text` function to preprocess messages by:
- Converting to lowercase
- Removing digits and punctuation
- Stripping extra spaces

The cleaned messages are stored in a new column `clean_msg`.


In [25]:
def clean_text(msg):
    msg = msg.lower()  # Convert to lowercase
    msg = re.sub(r'\d+', '', msg)  # remove digits
    msg = msg.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    msg = msg.strip()  # remove extra spaces
    return msg

df['clean_msg'] = df['message'].apply(clean_text)

###  3. Vectorize Messages using TF-IDF

We convert the cleaned text messages into numeric vectors using `TfidfVectorizer`.  
Stopwords are removed automatically, and the resulting features (`X`) are used for model training.  
The target labels (`y`) are taken from the `label_num` column.


In [26]:
tfidf = TfidfVectorizer(stop_words='english') # Initialize TF-IDF with stopword removal
X = tfidf.fit_transform(df['clean_msg'])  # Independent variable
y = df['label_num']  # Dependent variable

###  4. Train-Test Split

We split the dataset into training and testing sets using an 80-20 ratio.  
This allows us to train the model on one portion of the data and test its performance on unseen data.  
Setting `random_state=42` ensures reproducibility.


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 5. Train the Spam Detection Model

We use the `Multinomial Naive Bayes` classifier, which is well-suited for text classification problems.  
The model is trained using the TF-IDF vectors from the training set.


In [28]:
model = MultinomialNB() # Create the Naive Bayes model
model.fit(X_train, y_train) # Train on training data

### 6. Evaluate the Model

We evaluate the model performance using:
- **Accuracy**: Overall correctness
- **Confusion Matrix**: Shows true/false positives and negatives
- **Classification Report**: Includes precision, recall, and F1-score for both classes


In [29]:
y_pred = model.predict(X_test) # Predict on test set

print("Accuracy:", accuracy_score(y_test, y_pred))  # Overall accuracy
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)) # TP, TN, FP, FN
print("Classification Report:\n", classification_report(y_test, y_pred)) # Precision, Recall, F1

Accuracy: 0.968609865470852
Confusion Matrix:
 [[965   0]
 [ 35 115]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115



###  Model Evaluation Results

- **Accuracy**: 96.86%
- The model is highly accurate at identifying **ham** messages (Recall = 1.00).
- It correctly identifies **77% of spam** messages (Recall = 0.77).
- Precision for spam is 1.00, meaning there are no false alarms.
- Overall, the model performs very well, with room for improvement in capturing more spam messages.


### 7. Predict on Custom Messages

We define a helper function `predict_spam()` to clean a new message, vectorize it using the trained TF-IDF model, and predict whether it is `SPAM` or `NOT SPAM`.

We test it on two sample messages.


In [30]:
def predict_spam(msg):
    msg = clean_text(msg) # Clean the message
    vec = tfidf.transform([msg]) # Convert to TF-IDF vector
    pred = model.predict(vec)[0] # Predict using trained model
    return "SPAM" if pred == 1 else "NOT SPAM"

# Test
print(predict_spam("Congratulations! You’ve won a free voucher."))
print(predict_spam("Hello! Can we meet at 5 PM?"))

SPAM
NOT SPAM


### 8. Save Model and Vectorizer

We save the trained spam classifier and the TF-IDF vectorizer using `joblib`.  
These `.pkl` files can be loaded later for making predictions in a web app without retraining.


In [31]:
import joblib
joblib.dump(model, "spam_model.pkl") # Save trained model
joblib.dump(tfidf, "tfidf_vectorizer.pkl") # Save TF-IDF vectorizer

['tfidf_vectorizer.pkl']