In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Path of the file to load, relative to the root of the Kaggle dataset.
file_path = "spam.csv"

# Load the latest version of the dataset as a pandas DataFrame.
# `dataset_load` is used instead of `load_dataset`: the stored output of this
# cell shows a warning raised at the `load_dataset` call, and the kagglehub
# README documents `dataset_load` as the current entry point.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "uciml/sms-spam-collection-dataset",
    file_path,
    # pandas_kwargs is forwarded to the pandas reader; the file is decoded as
    # latin-1 (presumably because it is not valid UTF-8 -- TODO confirm).
    # Other supported arguments (e.g. sql_query) are described in the
    # documentation:
    # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
    pandas_kwargs={'encoding': 'latin-1'}
)

# Print the label on its own line so the DataFrame header row stays aligned
# with its columns.
print("First 5 records:")
print(df.head())

  from .autonotebook import tqdm as notebook_tqdm
  df = kagglehub.load_dataset(


First 5 records:      v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [3]:
# Rich-display preview of the first rows of the freshly loaded frame
# (columns seen in the output: v1 = label, v2 = message text, plus three
# "Unnamed" columns that are NaN in these rows).
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Encode the target column in place: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them, mapping the already-encoded column would turn every label into NaN.
# Any value outside this mapping still becomes NaN, as before.
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [5]:
# Fetch the NLTK resources used by the preprocessing cell below. Per the stored
# output, packages that are already present are reported as up-to-date.
# The value of the last call is what the cell displays (True in the stored run).

# Stop-word lists, read later through stopwords.words('english').
nltk.download('stopwords')
# WordNet corpus -- presumably required by WordNetLemmatizer; verify against NLTK docs.
nltk.download('wordnet')
# Tokenizer models -- presumably required by nltk.word_tokenize; both the
# 'punkt' and 'punkt_tab' variants are fetched (likely to cover different
# NLTK versions -- TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Shared, module-level helpers for preprocess_text: built once here so they
# are not re-created for every message.
# English stop words kept in a set for constant-time membership checks.
stop_words = set(stopwords.words('english'))
# Lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every ASCII punctuation character. Built once
# so it is not recomputed for each message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise a raw message into a space-separated string of lemmas.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    tokenize, drop English stop words, lemmatize each remaining token.

    Parameters
    ----------
    text : str or None
        Raw message. ``None`` and float ``NaN`` (pandas' missing-value marker
        in object columns) are treated as an empty message; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` when nothing
        remains after cleaning.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()                        # Convert to lowercase
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    text = re.sub(r'\d+', '', text)            # Remove numbers

    # Nothing left to tokenize: skip the tokenizer entirely.
    if not text.strip():
        return ''

    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" reaches the filter as "dont" (visible in the
    # preprocessed sample output) and presumably no longer matches the
    # apostrophised stop-word entries. Left unchanged here because altering it
    # would change the features the model is trained on.
    tokens = nltk.word_tokenize(text, language='english')
    processed_tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]
    return ' '.join(processed_tokens)

In [7]:
# Clean every raw message once and store the result next to the original text.
df['processed_message'] = df['v2'].apply(preprocess_text)

# Side-by-side look at raw vs. cleaned text for the first rows.
print("\n--- Data After Preprocessing ---")
preview_columns = ['v2', 'processed_message']
print(df[preview_columns].head())

# Model inputs: cleaned text as the feature, encoded label as the target.
X, y = df['processed_message'], df['v1']


--- Data After Preprocessing ---
                                                  v2  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4           nah dont think go usf life around though  


In [8]:
# Hold out 20% of the messages for evaluation. The split is seeded for
# reproducibility and stratified on the label so both parts keep the same
# ham/spam proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Fit the TF-IDF vocabulary and weights on the training text only, then reuse
# the fitted vectorizer on the test text so no test information leaks in.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nShape of TF-IDF training matrix: {X_train_tfidf.shape}")


Shape of TF-IDF training matrix: (4457, 5000)


In [9]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
print("\n--- Training the Model... ---")
# fit() returns the estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
print("Model training complete.")


--- Training the Model... ---
Model training complete.


In [10]:
# Predict labels for the held-out messages.
y_pred = nb_classifier.predict(X_test_tfidf)

# Compute all metrics first, then report them.
accuracy = accuracy_score(y_test, y_pred)
# Counts of true/false positives and negatives.
confusion = confusion_matrix(y_test, y_pred)
# Precision, recall and f1-score for each class.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Ham (0)', 'Spam (1)'],
)

print(f"\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f} ({accuracy:.2%})")

print("\nConfusion Matrix:")
print(confusion)

print("\nClassification Report:")
print(report)


--- Model Evaluation ---
Accuracy: 0.9641 (96.41%)

Confusion Matrix:
[[965   1]
 [ 39 110]]

Classification Report:
              precision    recall  f1-score   support

     Ham (0)       0.96      1.00      0.98       966
    Spam (1)       0.99      0.74      0.85       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115



In [11]:
def predict_message(message):
    """Classify a single raw message with the trained pipeline.

    The message goes through the same preprocess_text cleaning as the training
    data, is vectorized with the already-fitted tfidf_vectorizer, and is then
    scored by nb_classifier.

    Returns "Spam" when the predicted label equals 1, otherwise "Ham".
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([preprocess_text(message)])
    predicted_label = nb_classifier.predict(features)[0]

    if predicted_label == 1:
        return "Spam"
    return "Ham"


# Two hand-written messages used as a smoke test of predict_message: the first
# is written to look like spam, the second like an ordinary (ham) message.
spam_example = "URGENT! You have won a 1 week FREE membership in our $10,000 Prize Jackpot! Text the word: CLAIM to No: 81010"
ham_example = "Hi, can we reschedule our meeting to 1 PM tomorrow?"

# Show each message with the label the model assigns to it; the trailing "\n"
# in the first print leaves a blank line between the two results.
print(f"Message: '{spam_example}'\nPrediction: {predict_message(spam_example)}\n")
print(f"Message: '{ham_example}'\nPrediction: {predict_message(ham_example)}")


Message: 'URGENT! You have won a 1 week FREE membership in our $10,000 Prize Jackpot! Text the word: CLAIM to No: 81010'
Prediction: Spam

Message: 'Hi, can we reschedule our meeting to 1 PM tomorrow?'
Prediction: Ham
