In [57]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import Pipeline

# Assessing Dataset

In [4]:
# Load the raw dataset; later cells read its `Category` and `Message` columns.
df = pd.read_csv('spam.csv')

In [6]:
# Encode the label as 0/1 (1 = spam, 0 = anything else) with a vectorized
# comparison instead of calling a Python lambda once per row.
df['spam'] = (df.Category == 'spam').astype('int64')

In [10]:
# Drop the text label now that the 0/1 `spam` column exists.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because `Category` was already gone.
df = df.drop(columns='Category', errors='ignore')

#### Sample Emails

In [73]:
# Preview the first few messages.
# Iterating over `.head()` is positional, so it works whatever the index looks
# like and simply stops early when the frame has fewer rows; the previous
# `df.Message[i]` label lookup raised KeyError in both of those situations.
N_PREVIEW = 19
for i, message in enumerate(df.Message.head(N_PREVIEW)):
    print('Email no.:', i)
    print(message)
    print('----------------------------------------')

Email no.: 0
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
----------------------------------------
Email no.: 1
Ok lar... Joking wif u oni...
----------------------------------------
Email no.: 2
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
----------------------------------------
Email no.: 3
U dun say so early hor... U c already then say...
----------------------------------------
Email no.: 4
Nah I don't think he goes to usf, he lives around here though
----------------------------------------
Email no.: 5
FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
----------------------------------------
Email no.: 6
Even my brother is not like to speak with me. They treat me like aids patent.
---------------------------

In [12]:
# Summary statistics; because `spam` is 0/1, its mean is the fraction of spam rows.
df.describe()

Unnamed: 0,spam
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [13]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Message  5572 non-null   object
 1   spam     5572 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


# Applying Vector Transformation

In [14]:
# Token-count vectorizer with default settings.
v = CountVectorizer()

In [19]:
# Learn the vocabulary and build the message-by-term count matrix in one step.
# NOTE(review): the vocabulary is fitted on every message, including rows that
# are later held out as the test set, so that held-out score is mildly optimistic.
emails = v.fit_transform(df.Message)

In [26]:
# (number of messages, vocabulary size)
emails.shape

(5572, 8709)

In [29]:
# One feature name per matrix column, so this length should equal emails.shape[1].
v.get_feature_names_out().shape

(8709,)

In [36]:
# Dense, labelled view of the count matrix: one column per vocabulary term.
# NOTE(review): `.toarray()` materialises every cell of the sparse matrix
# (5572 x 8709 here), which costs far more memory than the sparse form.
df2 = pd.DataFrame(emails.toarray(), columns = v.get_feature_names_out())
df2.head(3)

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training on the Transformed Data

In [34]:
# Multinomial Naive Bayes classifier with default hyperparameters.
model = MultinomialNB()

In [38]:
# Hold out 25% of the messages for evaluation.
# - Split the sparse count matrix directly: this avoids training on the dense
#   copy, and the model is then fitted on the same kind of input
#   (`v.transform(...)` output) that it is later asked to predict on, so
#   scikit-learn has no feature-name mismatch to warn about.
# - `random_state` makes the split, and therefore the reported score,
#   reproducible across runs.
# - `stratify` keeps the spam/ham ratio the same in both parts; only about 13%
#   of the rows are spam.
X_train, X_test, y_train, y_test = tts(
    emails, df.spam, test_size=0.25, random_state=42, stratify=df.spam
)

In [41]:
# Fit the classifier on the training portion only.
model.fit(X_train, y_train)

In [42]:
# Mean accuracy on the held-out rows. With roughly 13% spam, always predicting
# ham would already score about 0.87, so read the number against that baseline.
model.score(X_test, y_test)

0.9820531227566404

### Sample Spam Email ↓↓↓

In [46]:
# Hand-written spam example used to sanity-check the trained model.
# NOTE(review): the name `list` shadows Python's builtin `list` for the rest of
# the session; renaming it needs matching changes in every cell that reads it.
list = ['''
        
Subject: URGENT! You've WON $1 MILLION in the Nigerian Lottery! 🇳🇬
Dearest [Recipient Name],

Congratulations! You have been randomly selected as the GRAND PRIZE WINNER of the prestigious Nigerian National Lottery!

Your lucky ticket number, 1234-5678-9012, drawn in our recent mega-draw, entitles you to a life-changing sum of $1,000,000 (USD)! Imagine the possibilities: a luxurious beachside villa, a fleet of exotic cars, endless shopping sprees... the world is your oyster!

But hurry! This incredible offer is time-sensitive. To claim your fortune, you must act FAST! Simply follow these THREE EASY STEPS:

Respond to this email IMMEDIATELY with your full name, address, and phone number. We need to verify your identity as the rightful winner.
Pay a processing fee of just $999.99 USD to cover administrative costs and international transfer charges. We accept all major credit cards and Bitcoin!
Sit back and relax! Your winnings will be delivered to your doorstep within 48 hours of processing your claim.
Don't miss out on this once-in-a-lifetime opportunity! Act now and secure your future wealth! Remember, fortune favors the bold!

Sincerely,

The Nigerian National Lottery Team

P.S. This is a LIMITED-TIME OFFER! Delay not, or risk losing your chance to claim your $1 MILLION prize!

P.P.S. Please disregard any warnings you may receive about this email. They are from jealous rivals who don't want you to experience the joys of wealth! ''']


In [47]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# columns line up with the vocabulary learned earlier.
new_list = v.transform(list)

#### The model correctly predicts the spam email

In [48]:
# Predicted label: 1 = spam, 0 = ham.
model.predict(new_list)

array([1], dtype=int64)

In [55]:
# Per-class probabilities; columns follow `model.classes_` (presumably [0, 1], i.e. ham then spam).
model.predict_proba(new_list)

array([[1.98913139e-15, 1.00000000e+00]])

### Sample Ham Email ↓↓↓

In [50]:
# Hand-written legitimate ("ham") example used to sanity-check the trained model.
list2 = ['''

Subject: Re: Hiking Trip - Gear Recommendations? ⛰️
Hi [Friend's Name],

Hope you're doing well!

So excited about our upcoming hiking trip to [National Park Name]! Can't wait to explore the trails, take in the scenery, and roast marshmallows under the stars.

Speaking of the trip, I wanted to follow up on your question about gear recommendations. I've been doing some research and here are a few things I'd suggest based on what I know about the park and the time of year we're going:

Hiking boots: A good pair of waterproof boots is essential for tackling the varied terrain. I've been using these [Brand name] boots for a while now and they've held up great in all conditions. They're lightweight, comfortable, and have good ankle support.
Image of Hiking bootsOpens in a new window
www.switchbacktravel.com
Hiking boots
Backpack: Choose a backpack that's comfortable and fits well. I recommend trying on a few different sizes and weights to find one that feels right. I use a [Brand name] 30L backpack for day hikes, which is perfect for carrying water, snacks, sunscreen, and other essentials.

Hiking backpack
Layers: The weather in the mountains can change quickly, so it's important to pack layers. A base layer, a fleece or mid-layer, and a waterproof jacket should do the trick. I also like to bring a beanie and gloves, just in case.
Water bottle and snacks: Stay hydrated and fueled throughout the hike with plenty of water and snacks. I usually pack a reusable water bottle, Trail Mix, and some granola bars.
Sunscreen and insect repellent: Don't forget to protect yourself from the sun and bugs! Choose a broad-spectrum sunscreen with SPF 30 or higher, and apply insect repellent with DEET.
Let me know if you have any other questions about gear or the trip in general. I'm happy to help in any way I can!

Looking forward to hitting the trails with you!

Best,


''']

In [51]:
# Vectorize the ham sample with the already-fitted vocabulary.
new2_list = v.transform(list2)

#### The model correctly predicts the ham email

In [52]:
# Predicted label: 1 = spam, 0 = ham.
model.predict(new2_list)

array([0], dtype=int64)

In [56]:
# Per-class probabilities; columns follow `model.classes_` (presumably [0, 1], i.e. ham then spam).
model.predict_proba(new2_list)

array([[1.00000000e+00, 1.75184216e-65]])

# Using Pipeline to Transform and Train Data

In [58]:
# Chain vectorization and classification so both are fitted together and raw
# text can be passed straight to fit / score / predict.
pipe = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizer()),
        ('model', MultinomialNB()),
    ]
)

In [59]:
# Split the raw text this time: the pipeline vectorizes internally, so its
# vocabulary is learned from the training messages only.
# `random_state` makes the split (and the reported score) reproducible, and
# `stratify` preserves the spam/ham ratio in both parts.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier split.
X_train, X_test, y_train, y_test = tts(
    df.Message, df.spam, test_size=0.2, random_state=42, stratify=df.spam
)

In [60]:
# Fit the vectorizer and the classifier on the training messages.
pipe.fit(X_train, y_train)

In [61]:
# Mean accuracy on the held-out raw messages.
pipe.score(X_test, y_test)

0.9847533632286996

In [62]:
# The pipeline takes raw strings directly; no separate transform step is needed.
pipe.predict(list)

array([1], dtype=int64)

In [63]:
# Same check for the ham sample, again passing raw text.
pipe.predict(list2)

array([0], dtype=int64)