# Step 1: Import Required Libraries

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
import joblib

# Step 2: Load the Dataset

In [2]:
# Location of the SMS spam CSV, relative to the notebook's working directory
dataset_path = 'spam.csv'

# Read the CSV, decoding it as latin-1
df = pd.read_csv(dataset_path, encoding='latin-1')

# Preview the first rows to see which columns the file provides
print("\n******** First 5 rows of the dataset ********\n")
print(df.head())

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n******** Dataset Info ********\n")
df.info()

# Summary statistics for every column, including the non-numeric ones
print("\n******** Dataset Description ********\n")
print(df.describe(include='all'))



******** First 5 rows of the dataset ********

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

******** Dataset Info ********

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  1

# Step 3: Data Cleaning and Preprocessing

In [3]:
# Narrow the frame to the label and text columns, giving both readable names
df = df.loc[:, ['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Count nulls per column to confirm nothing is missing
print("\n********Missing Values ********\n")
print(df.isna().sum())

# Show how many messages fall under each class (ham vs. spam)
print("\n******** Label Distribution ********\n")
print(df['label'].value_counts())

# Replace the text labels with integers: ham -> 0, spam -> 1
df = df.assign(label=df['label'].map({'ham': 0, 'spam': 1}))

# Preview the frame after cleaning
print("\n******** Cleaned Dataset ********\n")
print(df.head())



********Missing Values ********

label      0
message    0
dtype: int64

******** Label Distribution ********

label
ham     4825
spam     747
Name: count, dtype: int64

******** Cleaned Dataset ********

   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


# Step 4: Text Cleaning

In [4]:
# Single Porter stemmer instance, reused for every token that gets stemmed
ps = PorterStemmer()

# Make sure the NLTK stopword corpus is available locally before it is used
nltk.download('stopwords')

# Cache for the English stopword set so the corpus is only loaded once,
# instead of being rebuilt for every word of every message.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a message for the bag-of-words model.

    Steps, in order: lowercase the text, delete all characters found in
    ``string.punctuation``, split on whitespace, drop English stopwords and
    stem the remaining words with the module-level Porter stemmer ``ps``.

    Punctuation is stripped before the stopword filter runs, so a contraction
    such as "don't" is compared against the stopword list as "dont".

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed, stopword-free words joined by single spaces.
    """
    stop_words = _english_stopwords()

    # Lowercase first, then strip punctuation in a single pass
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Set membership keeps the stopword check O(1) per word
    return ' '.join(ps.stem(word) for word in text.split() if word not in stop_words)

# Run every message through clean_text, overwriting the raw text column
df['message'] = df['message'].map(clean_text)

# Preview the first rows now that the messages have been normalised
print("\n******** Cleaned Messages ********\n")
print(df.head())


[nltk_data] Downloading package stopwords to C:\Users\SURAJ
[nltk_data]     NATE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



******** Cleaned Messages ********

   label                                            message
0      0  go jurong point crazi avail bugi n great world...
1      0                              ok lar joke wif u oni
2      1  free entri 2 wkli comp win fa cup final tkt 21...
3      0                u dun say earli hor u c alreadi say
4      0          nah dont think goe usf live around though


# Step 5: Feature Extraction

In [5]:
# Initialize the CountVectorizer for Bag-of-Words
cv = CountVectorizer()

# Fit and transform the message column to create the feature matrix.
# The result is kept as the sparse matrix CountVectorizer returns: almost all
# entries of a documents-by-vocabulary count matrix are zero, and both
# train_test_split and MultinomialNB accept sparse input, so converting to a
# dense array only costs memory.
# NOTE(review): the vocabulary is fitted on all messages, before the
# train/test split performed in a later cell, so test messages contribute
# vocabulary entries. Consider fitting on the training messages only.
X = cv.fit_transform(df['message'])

# Extract labels as the target variable
y = df['label']

# Display the shape of the feature matrix (documents x vocabulary size)
print("\nFeature Matrix Shape : ", X.shape)



Feature Matrix Shape :  (5572, 8038)


# Step 6: Split the Dataset

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each feature partition
print("\nTraining Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)


Training Set Shape: (4457, 8038)
Testing Set Shape: (1115, 8038)


# Step 7: Train the Model

In [7]:
# Create a multinomial Naive Bayes classifier and fit it to the training split
# in one step (fit() returns the estimator itself)
model = MultinomialNB().fit(X_train, y_train)

# Signal that fitting has finished
print("\nModel training completed.")



Model training completed.


# Step 8: Evaluate the Model

In [8]:
# Run the fitted classifier over the held-out features
y_pred = model.predict(X_test)

# Fraction of test messages whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)

# Print each remaining report under its own heading: first the confusion
# matrix, then the per-class precision/recall/F1 summary
for heading, build_report in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(build_report(y_test, y_pred))



Model Accuracy: 0.9775784753363229

Confusion Matrix:
[[952  13]
 [ 12 138]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.92      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# Step 9: Test with Custom Inputs

In [9]:
# Define a function to predict if a message is spam or not
def predict_spam(message):
    """Classify a single raw message as "Spam" or "Not Spam".

    The message is normalised with ``clean_text``, vectorised with the fitted
    ``cv`` vocabulary and scored by the trained ``model``.

    Parameters
    ----------
    message : str
        Raw, uncleaned message text.

    Returns
    -------
    str
        "Spam" when the model predicts label 1, otherwise "Not Spam".
    """
    # Apply the same preprocessing used on the training messages
    cleaned = clean_text(message)

    # transform() expects an iterable of documents, hence the one-element list;
    # the sparse result is passed to the model as-is (no dense conversion needed)
    features = cv.transform([cleaned])

    # predict() returns one label per input row; take the single label
    # explicitly rather than relying on the truthiness of an array comparison
    predicted_label = model.predict(features)[0]
    return "Spam" if predicted_label == 1 else "Not Spam"

# Smoke-test predict_spam with one hand-written message and print the label
# it returns ("Spam" or "Not Spam") under a heading
sample_message = "Congratulations! You've won a free gift. Claim now!"
print("\nCustom Prediction : ")
print(predict_spam(sample_message))



Custom Prediction : 
Spam


# Step 10: Save the Model for Deployment

In [10]:
# Persist both objects needed at inference time: the fitted classifier and
# the CountVectorizer holding the vocabulary it was trained with
for artifact, filename in (
    (model, 'spam_detector_surajnate_model.pkl'),
    (cv, 'vectorizer.pkl'),
):
    joblib.dump(artifact, filename)

print("\nModel and vectorizer saved for deployment.")



Model and vectorizer saved for deployment.


The user interface for this model is deployed as a Streamlit app, available at: https://email-spam-detection-by-surajnate.streamlit.app/