In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold

import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from the
# libraries imported above. Consider narrowing it to the specific warning.
warnings.filterwarnings("ignore")

In [3]:
# Toy corpus of 50 hand-written (text, label) pairs used for both training and
# evaluation. Label 1 marks spam and 0 marks legitimate mail (the detector
# defined later maps a predicted 1 to "Spam"). Entries alternate 1, 0, 1, 0, ...
# so the two classes are perfectly balanced (25 each).
email_data = [
    ("Congratulations! You won a $1000 gift card.", 1),
    ("Meeting at 3 PM tomorrow in the office.", 0),
    ("Get cheap loans instantly! No credit check required.", 1),
    ("Your Amazon order has been shipped.", 0),
    ("Limited time offer! Click here to claim your prize.", 1),
    ("Can we discuss the project report today?", 0),
    ("You have been selected for an exclusive deal!", 1),
    ("Family dinner is planned for Saturday evening.", 0),
    ("Earn $5000 working from home. Apply now!", 1),
    ("Please review the attached document before our call.", 0),
    ("Urgent: Update your account information to avoid suspension.", 1),
    ("Thanks for your help with the presentation.", 0),
    ("Win a free vacation to the Bahamas! Act now.", 1),
    ("Do you want to grab lunch tomorrow?", 0),
    ("Final warning: Your account will be deactivated.", 1),
    ("Your package will be delivered by 5 PM.", 0),
    ("Exclusive offer: Save 80% on luxury watches.", 1),
    ("Don't forget to submit your assignment.", 0),
    ("Claim your free gift card before it's too late.", 1),
    ("I'll send you the meeting notes shortly.", 0),
    ("Huge discounts on electronics! Limited stock available.", 1),
    ("Can you check the project proposal draft?", 0),
    ("Important: Verify your identity to secure your account.", 1),
    ("Looking forward to catching up this weekend.", 0),
    ("You've won a free iPhone! Click to claim now.", 1),
    ("Please approve the budget request for Q4.", 0),
    ("Make money fast! No investment required.", 1),
    ("Happy Birthday! Have a wonderful day.", 0),
    ("Urgent: Confirm your email to receive your prize.", 1),
    ("Let's finalize the agenda for the team meeting.", 0),
    ("Free trial of premium services! Sign up today.", 1),
    ("Thank you for attending the seminar.", 0),
    ("Act now: Get rich quick with this secret method!", 1),
    ("Here's the invoice for your records.", 0),
    ("Limited stock: Order now and save big!", 1),
    ("Can you help me with this Excel formula?", 0),
    ("Win big cash prizes! Enter the sweepstakes now.", 1),
    ("Are you available for a quick call later?", 0),
    ("Exclusive deal just for you! Don't miss out.", 1),
    ("We appreciate your feedback on our service.", 0),
    ("Click here to unlock your full credit score.", 1),
    ("Team meeting rescheduled to 2 PM.", 0),
    ("New crypto investment opportunity! Double your money.", 1),
    ("Check out the latest updates on the project.", 0),
    ("Urgent: Pay now to avoid legal action.", 1),
    ("Can you join the Zoom call at 10 AM?", 0),
    ("Earn rewards for every purchase you make!", 1),
    ("Let's prepare for the quarterly review meeting.", 0),
    ("Special offer: Lose weight fast with this one trick.", 1),
    ("I'll send over the revised report by EOD.", 0),
]


In [29]:
# Pull out just the message text from each (text, label) pair.
emails = [text for text, _label in email_data]

In [30]:
# Echo the list to eyeball the extracted texts.
emails

['Congratulations! You won a $1000 gift card.',
 'Meeting at 3 PM tomorrow in the office.',
 'Get cheap loans instantly! No credit check required.',
 'Your Amazon order has been shipped.',
 'Limited time offer! Click here to claim your prize.',
 'Can we discuss the project report today?',
 'You have been selected for an exclusive deal!',
 'Family dinner is planned for Saturday evening.',
 'Earn $5000 working from home. Apply now!',
 'Please review the attached document before our call.',
 'Urgent: Update your account information to avoid suspension.',
 'Thanks for your help with the presentation.',
 'Win a free vacation to the Bahamas! Act now.',
 'Do you want to grab lunch tomorrow?',
 'Your package will be delivered by 5 PM.',
 'Exclusive offer: Save 80% on luxury watches.',
 "Don't forget to submit your assignment.",
 "Claim your free gift card before it's too late.",
 "I'll send you the meeting notes shortly.",
 'Huge discounts on electronics! Limited stock available.',
 'Can you c

In [36]:
# Bag-of-words encoder; stop_words="english" turns on scikit-learn's built-in
# English stop-word filtering so common function words are not counted.
vector = CountVectorizer(stop_words="english")

In [37]:
# Learn the vocabulary from every e-mail and encode them as a document-term
# count matrix.
# NOTE(review): the vocabulary is fit on all rows, including the ones that are
# later held out by train_test_split, so that test split is not fully unseen.
text_to_vec = vector.fit_transform(emails)

In [39]:
# Dense view of the count matrix, for inspection only.
text_to_vec.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [41]:
# Mapping of each learned token to its column index in the count matrix.
vector.vocabulary_

{'congratulations': 29,
 'won': 162,
 '1000': 1,
 'gift': 60,
 'card': 21,
 'meeting': 88,
 'pm': 101,
 'tomorrow': 146,
 'office': 95,
 'cheap': 24,
 'loans': 82,
 'instantly': 69,
 'credit': 30,
 'check': 25,
 'required': 117,
 'amazon': 8,
 'order': 97,
 'shipped': 133,
 'limited': 80,
 'time': 144,
 'offer': 94,
 'click': 27,
 'claim': 26,
 'prize': 105,
 'discuss': 38,
 'project': 107,
 'report': 115,
 'today': 145,
 'selected': 128,
 'exclusive': 50,
 'deal': 34,
 'family': 51,
 'dinner': 36,
 'planned': 100,
 'saturday': 123,
 'evening': 48,
 'earn': 43,
 '5000': 2,
 'working': 164,
 'home': 64,
 'apply': 9,
 'review': 119,
 'attached': 13,
 'document': 39,
 'urgent': 152,
 'update': 150,
 'account': 4,
 'information': 68,
 'avoid': 16,
 'suspension': 139,
 'thanks': 143,
 'help': 63,
 'presentation': 104,
 'win': 161,
 'free': 59,
 'vacation': 153,
 'bahamas': 17,
 'act': 5,
 'want': 156,
 'grab': 61,
 'lunch': 85,
 'final': 54,
 'deactivated': 33,
 'package': 98,
 'delivered':

In [42]:
# Pull out the 0/1 class label from each (text, label) pair, in corpus order.
labels = [label for _text, label in email_data]

In [43]:
# Echo the label list to check it lines up with the e-mails.
labels

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0]

In [44]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
model = MultinomialNB()

In [45]:
# Train on the full corpus (no rows are held out at this stage).
model.fit(text_to_vec, labels)

In [47]:
# Accuracy on the very rows the model was fit on: this is a training score,
# not an estimate of how well the model generalises.
model.score(text_to_vec, labels)

1.0

In [49]:
# One-element batch holding the message to classify. This sentence also
# appears verbatim in email_data, so it exercises the pipeline rather than
# probing unseen text.
new_mail = np.asarray(["Check out the latest updates on the project."])

In [50]:
# Encode with the already-fitted vocabulary (transform, not fit_transform), so
# the columns line up with the matrix the model was trained on.
new_mail_vec = vector.transform(new_mail)

In [52]:
# Dense count row for the sample, for inspection only.
new_mail_vec.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [53]:
# Predicted class label for the encoded sample.
model.predict(new_mail_vec)

array([0])

In [56]:
def spam_mail_detector(mail, vectorizer=None, classifier=None):
    """Classify a single e-mail text as ``"Spam"`` or ``"Not Spam"``.

    Parameters
    ----------
    mail : str
        Raw text of the e-mail to classify.
    vectorizer : object with a ``transform`` method, optional
        Fitted text vectorizer used to encode ``mail``. When omitted, the
        notebook-level ``vector`` is used.
    classifier : object with a ``predict`` method, optional
        Fitted classifier applied to the encoded text. When omitted, the
        notebook-level ``model`` is used.

    Returns
    -------
    str
        ``"Spam"`` when the predicted label equals 1, otherwise ``"Not Spam"``.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing one-argument calls keep working while other fitted pairs
    # (e.g. a model trained on a split) can be supplied explicitly.
    if vectorizer is None:
        vectorizer = vector
    if classifier is None:
        classifier = model

    # transform() expects an iterable of documents, so wrap the single text in
    # a list; no NumPy array is needed for that.
    features = vectorizer.transform([mail])
    prediction = classifier.predict(features)
    return "Spam" if prediction[0] == 1 else "Not Spam"

In [57]:
# Try the detector on a lottery-style message that is not in the corpus.
spam_mail_detector("Hey there, Congratulations you win lottery of 10000000")

'Spam'

In [58]:
# Try the detector on an ordinary announcement that is not in the corpus.
spam_mail_detector("Hello Students, Your tomorrows session will be only in offline mode.")

'Not Spam'

In [59]:
# Keep 75% of the rows for training and hold out the rest for evaluation; the
# fixed seed makes the shuffle repeatable.
# NOTE(review): text_to_vec was encoded with a vocabulary learned from every
# e-mail, so the held-out rows have already influenced the feature space.
X_tr, X_te, y_tr, y_te = train_test_split(
    text_to_vec,
    labels,
    train_size=0.75,
    random_state=453,
)

In [60]:
# Fresh classifier for the hold-out experiment, separate from `model`, which
# was fit on every row.
model2 = MultinomialNB()

In [61]:
# Fit on the training split and report accuracy on that same split.
# fit() hands back the estimator itself, so the two calls can be chained.
model2.fit(X_tr, y_tr).score(X_tr, y_tr)

1.0

In [63]:
# Evaluate on the held-out split. accuracy_score's signature is
# (y_true, y_pred), so the ground-truth labels go first; keeping that order
# matters as soon as an asymmetric metric (precision, recall, a confusion
# matrix) is swapped in here.
y_pr = model2.predict(X_te)
accuracy_score(y_te, y_pr)

0.8461538461538461