# Bag of Words with Lemmatization - Spam Classification Assignment

In [1]:
# Step 1: Install and import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vishalrathod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vishalrathod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vishalrathod\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Step 2: Sample dataset (You can replace this with a CSV file if needed)
data = {
    "label": ["ham", "spam", "ham", "ham", "spam"],
    "message": [
        "Hey, are we still meeting today?",
        "Congratulations! You have won a free ticket to Bahamas. Reply WIN to claim.",
        "I'll call you later.",
        "Are you available for the meeting tomorrow?",
        "WINNER!! You have been selected for a free iPhone. Text 'YES' to win!"
    ]
}
df = pd.DataFrame(data)
df


Unnamed: 0,label,message
0,ham,"Hey, are we still meeting today?"
1,spam,Congratulations! You have won a free ticket to...
2,ham,I'll call you later.
3,ham,Are you available for the meeting tomorrow?
4,spam,WINNER!! You have been selected for a free iPh...


In [3]:
# Step 3: Preprocessing function with lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

df['clean_message'] = df['message'].apply(preprocess)
df[['message', 'clean_message']]


Unnamed: 0,message,clean_message
0,"Hey, are we still meeting today?",hey still meeting today
1,Congratulations! You have won a free ticket to...,congratulation free ticket bahamas reply win c...
2,I'll call you later.,call later
3,Are you available for the meeting tomorrow?,available meeting tomorrow
4,WINNER!! You have been selected for a free iPh...,winner selected free iphone text win


In [4]:
# Step 4: Convert labels to binary
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})


In [5]:
# Step 5: Bag of Words model using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['clean_message'])
y = df['label_num']


In [6]:
# Step 6: Train/Test split and model training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [7]:
# Step 7: Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Confusion Matrix:
 [[0 0]
 [2 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

Accuracy Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
