1. IMPORTS

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re
import string
import warnings

2. DATASET LOADING

In [6]:
# Load the raw dataset. Only the read itself can raise FileNotFoundError, so the
# try block is limited to that single call.
try:
    df = pd.read_csv('spam.csv', encoding='latin-1')
except FileNotFoundError:
    print("Error: The file 'spam.csv' was not found.")
    print("Please make sure the file is in the same directory as the script.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that shuts the notebook kernel down (and reports success when run as a
    # script). Raising stops execution here and keeps the traceback visible.
    raise

# Keep only the two columns used below and give them shorter names.
# A missing 'Category' or 'Message' column surfaces as a KeyError here.
df = df[['Category', 'Message']].rename(
    columns={'Category': 'label', 'Message': 'text'}
)

print("Dataset 'spam.csv' loaded successfully.")
print("Columns renamed to 'label' and 'text'.")
print("-" * 50)
print("First 5 rows of the dataset:")
print(df.head())
print("-" * 50)

Dataset 'spam.csv' loaded successfully.
Columns renamed to 'label' and 'text'.
--------------------------------------------------
First 5 rows of the dataset:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
--------------------------------------------------


3. DATA PREPROCESSING

In [7]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call to clean_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """
    Normalize a raw message before vectorization.

    Steps, in order:
    1. Converts the text to lowercase.
    2. Removes HTML-like tags (anything between '<' and the next '>' on the
       same line).
    3. Removes ASCII punctuation (the characters in ``string.punctuation``).

    Whitespace is left untouched.

    Parameters
    ----------
    text : str or any
        The raw message. ``None`` and float NaN (how pandas represents an
        empty cell) are treated as an empty message; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove HTML tags using a simple (non-greedy) regex
    text = re.sub(r'<.*?>', '', text)

    # Remove all punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Optional steps such as stop word removal and stemming/lemmatization
    # could be added here; they are intentionally not part of this function.

    return text

# Apply the cleaning function to the 'text' column of the dataframe.
df['cleaned_text'] = df['text'].apply(clean_text)

# Map the labels to numerical values (0 for ham, 1 for spam).
# The mapping is applied only while the column still holds the raw labels:
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on the already-numeric column would erase all labels.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Fail loudly instead of training on missing labels (unknown label values, or
# labels lost by an earlier run of this cell).
if df['label'].isna().any():
    raise ValueError(
        "Column 'label' contains values other than 'ham'/'spam'; "
        "reload the dataset and check the 'Category' column."
    )

print("Data preprocessing complete.")
print("First 5 rows with cleaned text:")
print(df[['text', 'cleaned_text', 'label']].head())
print("-" * 50)

Data preprocessing complete.
First 5 rows with cleaned text:
                                                text  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                        cleaned_text  label  
0  go until jurong point crazy available only in ...      0  
1                            ok lar joking wif u oni      0  
2  free entry in 2 a wkly comp to win fa cup fina...      1  
3        u dun say so early hor u c already then say      0  
4  nah i dont think he goes to usf he lives aroun...      0  
--------------------------------------------------


4. DATA SPLITTING

In [8]:
# Features are the cleaned messages; targets are the numeric labels.
X, y = df['cleaned_text'], df['label']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("-" * 50)


Training set size: 4457 samples
Testing set size: 1115 samples
--------------------------------------------------


5. FEATURE EXTRACTION (Bag of Words)



In [9]:
# Bag of Words (BoW): CountVectorizer turns each message into a row of
# per-token counts.
count_vectorizer = CountVectorizer()

# The vocabulary is learned from the training messages only (`fit_transform`).
# The test messages are then encoded with that same vocabulary (`transform`).
X_train_vectorized = count_vectorizer.fit_transform(X_train)
X_test_vectorized = count_vectorizer.transform(X_test)

print("Feature extraction complete using CountVectorizer (Bag of Words).")
vocabulary_size = len(count_vectorizer.get_feature_names_out())
print(f"Number of unique words (features) found: {vocabulary_size}")
print("-" * 50)

Feature extraction complete using CountVectorizer (Bag of Words).
Number of unique words (features) found: 8420
--------------------------------------------------


6. MODEL TRAINING

In [10]:
# Create the Multinomial Naive Bayes classifier required by the assignment and
# fit it on the vectorized training messages and their labels.
# `fit` returns the estimator itself, so creation and training can be chained.
model = MultinomialNB().fit(X_train_vectorized, y_train)

print("Multinomial Naive Bayes model training complete.")
print("-" * 50)

Multinomial Naive Bayes model training complete.
--------------------------------------------------


7. MODEL EVALUATION

In [11]:
# Predict a label for every message in the held-out test set.
y_pred = model.predict(X_test_vectorized)

# Accuracy: share of test messages whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on Test Data: {accuracy:.4f}")

# Per-class precision, recall and F1-score.
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
print(report)

# Confusion matrix: counts of true/false positives and negatives, which shows
# where the model's mistakes fall.
print("\nConfusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)
print("-" * 50)

Model Accuracy on Test Data: 0.9874

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       966
        Spam       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115


Confusion Matrix:
[[966   0]
 [ 14 135]]
--------------------------------------------------


In [12]:
# ==============================================================================
# 8. PREDICTION ON A NEW EMAIL
# ==============================================================================
def predict_email_type(email_text):
    """
    Classify one raw email as "Spam" or "Ham".

    The text is cleaned with `clean_text`, encoded with the already-fitted
    `count_vectorizer`, and passed to the trained `model`.
    """
    # `transform` expects a collection of documents, so the single cleaned
    # message is wrapped in a list.
    features = count_vectorizer.transform([clean_text(email_text)])

    # One document in, one prediction out.
    predicted_label = model.predict(features)[0]

    # Label 1 is spam; anything else is reported as ham.
    return "Spam" if predicted_label == 1 else "Ham"

# Sample messages used to demonstrate the prediction function.
new_email_1 = "You have won a free iPhone. Click this link to claim your prize now!"
new_email_2 = "Hey John, remember to send the meeting agenda before the call tomorrow."
new_email_3 = "URGENT! Your account has been suspended. Verify your details immediately."
new_email_4 = "Dinner tonight? Let me know if you're free."

print("\n--- Model Predictions on New Emails ---")
# One numbered line per sample, in the order defined above.
sample_emails = (new_email_1, new_email_2, new_email_3, new_email_4)
for position, email in enumerate(sample_emails, start=1):
    print(f"{position}. '{email}' is predicted as: {predict_email_type(email)}")
print("-" * 50)



--- Model Predictions on New Emails ---
1. 'You have won a free iPhone. Click this link to claim your prize now!' is predicted as: Spam
2. 'Hey John, remember to send the meeting agenda before the call tomorrow.' is predicted as: Ham
3. 'URGENT! Your account has been suspended. Verify your details immediately.' is predicted as: Spam
4. 'Dinner tonight? Let me know if you're free.' is predicted as: Ham
--------------------------------------------------
