<a href="https://colab.research.google.com/github/TamoshreeDey/SpamEmailClassificationMachineLearningModel/blob/main/SpamEmailClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK resources this notebook relies on. "stopwords" feeds the
# stop-word set and "wordnet" backs WordNetLemmatizer; "punkt" backs
# word_tokenize, which is imported but not called in the cells shown here.
for resource in ("stopwords", "punkt"):
    nltk.download(resource)
# Kept as the final bare expression so the cell still echoes the call's result.
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# CSV holding the e-mail corpus (absolute path inside the Colab runtime).
path = "/content/dataset/combined_data.csv"
dataset = pd.read_csv(path)

# Features are the raw message text; targets are the values of the "label"
# column (predict_spam later in this notebook treats label 1 as spam).
X, y = dataset["text"], dataset["label"]

print(X)

0        ounce feather bowl hummingbird opec moment ala...
1        wulvob get your medircations online qnb ikud v...
2         computer connection from cnn com wednesday es...
3        university degree obtain a prosperous future m...
4        thanks for all your answers guys i know i shou...
                               ...                        
83443    hi given a date how do i get the last date of ...
83444    now you can order software on cd or download i...
83445    dear valued member canadianpharmacy provides a...
83446    subscribe change profile contact us long term ...
83447    get the most out of life ! viagra has helped m...
Name: text, Length: 83448, dtype: object


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Label-based lookup: this prints the row whose index label is 2 (not the
# third row of the training split), so it relies on that row being in X_train.
print(X_train.loc[2])

 computer connection from cnn com wednesday escapenumber may escapenumber escapenumber escapenumber escapenumber pm edt in this report next generation toys read brain waves google expands personalization youtube wins webby old fashioned rabbit ears laptop review next generation toys read brain waves a startup company aims to add more realistic elements to video games by using brain wave reading technology to help game developers make gaming more realistic http www cnn com escapenumber tech fun games escapenumber escapenumber mind reading toys ap index html google expands personalization google is stepping up efforts to allow its users to personalize how they search the web http www cnn com escapenumber tech internet escapenumber escapenumber google personalization reut index html youtube wins webby the co founders youtube are among the winners of the annual webby online achievement awards http www cnn com escapenumber tech internet escapenumber escapenumber webby awards ap index html o

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Built once instead of on every call: preprocess_text is mapped over every
# e-mail in the corpus, and neither the punctuation table nor the digit pattern
# depends on the input.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
_DIGITS_RE = re.compile(r"\d+")


def preprocess_text(text):
    """Normalise one e-mail into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase, strip runs of digits, delete ASCII punctuation
    (characters are removed, not replaced by spaces), split on whitespace,
    drop English stop words, and lemmatise each remaining token.

    Args:
        text: The raw message. Normally a ``str``. ``None`` and float NaN
            (which is what an empty CSV field becomes when loaded with pandas)
            are treated as an empty message; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    # Guard against missing values so a single empty row cannot abort a
    # Series.map over the whole corpus with AttributeError on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = _DIGITS_RE.sub("", text)
    text = text.translate(_PUNCT_TABLE)

    # Punctuation is already gone, so a plain whitespace split is used for
    # tokenisation rather than word_tokenize.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(tokens)

# Run the same cleaning routine over both splits so the text later passed to
# the vectorizer is normalised identically for training and evaluation.
X_train, X_test = [part.map(preprocess_text) for part in (X_train, X_test)]

In [None]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text; the test split is never passed to a
# fit call here.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# and their labels.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [None]:
# Score the held-out split once and reuse the predictions for every metric.
y_pred = model.predict(X_test_tfidf)

# Overall fraction of correctly classified e-mails.
print("Accuracy:", accuracy_score(y_test, y_pred))

# Per-class precision/recall/F1 table, followed by the confusion matrix.
for metric in (classification_report, confusion_matrix):
    print(metric(y_test, y_pred))

Accuracy: 0.976452965847813
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      7938
           1       0.99      0.97      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690

[[7846   92]
 [ 301 8451]]


In [None]:
def predict_spam(email):
    """Classify a single raw e-mail with the fitted vectorizer and model.

    The message goes through ``preprocess_text`` and the module-level
    ``vectorizer`` before being scored by the module-level ``model``.

    Args:
        email: Raw e-mail text.

    Returns:
        ``"Spam"`` when the predicted label equals 1, otherwise ``"Not Spam"``.
    """
    cleaned = preprocess_text(email)
    # transform expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"

# Smoke-test the classifier on two hand-written messages: an ordinary meeting
# reminder and an account-security notice asking the reader to verify.
sample_emails = [
    "Subject: Meeting Reminder: Project Discussion at 3 PM Body: Hi Tamoshree, Just a quick reminder about our project discussion today at 3 PM. We'll go over the key milestones and next steps. Let me know if you have any updates before the meeting. Looking forward to it! Best regards, Riya",
    '''Subject: Unusual Login Attempt Detected on Your Account
Body:

Dear User,

We noticed an unusual login attempt on your account from a new device in Tokyo, Japan. If this was you, no action is required. If not, please secure your account immediately.

🔗 Verify Your Account Now

Failure to act within 24 hours may result in account suspension.

Regards,
Security Team''',
]
for sample in sample_emails:
    print(predict_spam(sample))

Not Spam
Spam


In [None]:
import joblib

# Persist the fitted vectorizer together with the classifier: predictions on
# new text need both, as predict_spam shows.
artifacts = (
    (vectorizer, "vectorizer.joblib"),
    (model, "spam_classifier.joblib"),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("Model and vectorizer saved successfully! 🎉")

Model and vectorizer saved successfully! 🎉


In [None]:
# Colab-only export step: `google.colab` is imported here, so this cell is
# expected to run inside a Colab runtime.
from google.colab import files
# The leading "!" runs the line as a shell command; it bundles the two saved
# joblib files into a single archive.
!zip spam_classifier.zip vectorizer.joblib spam_classifier.joblib
# Hand the archive to the Colab download helper.
files.download("spam_classifier.zip")


updating: vectorizer.joblib (deflated 56%)
updating: spam_classifier.joblib (deflated 50%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>