### importing libraries

In [19]:
### importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
import re
import string
import pickle
import os

# To display plots inline
%matplotlib inline

### reading the csv

In [20]:
# --- 1. Load the dataset ---
# Make sure 'spam.csv' is in the same directory as this notebook
print("Loading the dataset...")
try:
    sms = pd.read_csv('spam.csv', encoding='latin-1')
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please make sure it's in the same directory.")
    exit()

Loading the dataset...
Dataset loaded successfully!


###  dropping the unwanted columns and renaming the remaining columns

In [21]:
### dropping the unwanted columns and renaming the remaining columns
cols_to_drop = ['Unnamed: 2','Unnamed: 3','Unnamed: 4']
sms.drop(cols_to_drop, axis=1, inplace=True)
sms.columns = ['label','message']
print("\nFirst 5 rows of the cleaned data:")
print(sms.head())


First 5 rows of the cleaned data:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [22]:
# --- 2. Data Cleaning and Preprocessing ---
### This is a new, crucial step to improve model accuracy.
### The preprocessing function removes noise and prepares the text for vectorization.
print("\nPreprocessing text...")
def preprocess_text(text):
    """
    Cleans the input text by:
    1. Lowercasing.
    2. Removing punctuation.
    3. Removing numbers.
    4. Removing extra whitespace.
    """
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove extra spaces and strip leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


Preprocessing text...


In [23]:
# Apply the preprocessing function to the entire dataset
sms['message'] = sms['message'].apply(preprocess_text)
print("Text preprocessing complete.")
print("\nFirst 5 rows after preprocessing:")
print(sms.head())

Text preprocessing complete.

First 5 rows after preprocessing:
  label                                            message
0   ham  go until jurong point crazy available only in ...
1   ham                            ok lar joking wif u oni
2  spam  free entry in a wkly comp to win fa cup final ...
3   ham        u dun say so early hor u c already then say
4   ham  nah i dont think he goes to usf he lives aroun...


### converting words into vectors and splitting data into train/test

In [24]:
### converting words into vectors and splitting data into train/test
# We will use TfidfVectorizer, which gives more weight to words that are rare but
# appear frequently in specific classes (like 'prize' or 'winner' in spam).
print("\nVectorizing messages and splitting data...")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


Vectorizing messages and splitting data...


In [25]:
# Create a TfidfVectorizer instance
tv = TfidfVectorizer(stop_words='english')

# Fit and transform the messages
X = tv.fit_transform(sms['message'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, sms['label'], test_size=0.3, random_state=101)
print("Vectorization and data split complete.")

Vectorization and data split complete.


## training the model (I also tried Tfidf Vectorizer but in this case CountVectorizer is giving better results)

In [26]:
### training the model
print("\nTraining the Multinomial Naive Bayes model...")
mnb = MultinomialNB()
mnb.fit(X_train, y_train)


Training the Multinomial Naive Bayes model...


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [27]:
# Check model accuracy
train_accuracy = mnb.score(X_train, y_train)
test_accuracy = mnb.score(X_test, y_test)

print(f"Training accuracy is --> {train_accuracy * 100:.2f}%")
print(f"Test accuracy is --> {test_accuracy * 100:.2f}%")
print("Model training complete.")

Training accuracy is --> 97.10%
Test accuracy is --> 96.95%
Model training complete.


In [28]:
### Saving the trained model and vectorizer
print("\nSaving the model and vectorizer...")
# Define the filenames for the model and vectorizer
model_filename = 'mnb_model.pkl'
vectorizer_filename = 'tfidf_vectorizer.pkl'


Saving the model and vectorizer...


In [29]:
# Save the trained Multinomial Naive Bayes model
with open(model_filename, 'wb') as model_file:
    pickle.dump(mnb, model_file)
    print(f"Model saved to '{os.path.abspath(model_filename)}'")

Model saved to 'c:\Users\payal\Downloads\SMS-Spam-Detector-main\SMS-Spam-Detector-main\mnb_model.pkl'


### live predictor 

In [30]:
# Save the TfidfVectorizer object
with open(vectorizer_filename, 'wb') as vectorizer_file:
    pickle.dump(tv, vectorizer_file)
    print(f"Vectorizer saved to '{os.path.abspath(vectorizer_filename)}'")

print("Process complete! You can now use the new .pkl files with your Flask app.")

Vectorizer saved to 'c:\Users\payal\Downloads\SMS-Spam-Detector-main\SMS-Spam-Detector-main\tfidf_vectorizer.pkl'
Process complete! You can now use the new .pkl files with your Flask app.
