### Import necessary libraries:

In [1]:
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib

### Load your dataset:

In [2]:
# Read the labelled e-mail corpus from 'emails.csv' (path is relative to the working directory).
df = pd.read_csv('emails.csv')
# Bare last expression so the notebook renders the frame; the recorded output
# shows a `text` column and a 0/1 `spam` label column.
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [3]:
# Shuffle all rows (frac=1 keeps every row) and renumber the index from 0.
# random_state pins the permutation: without it the row order changes on every
# run, so the seeded train/test split further down would still select different
# e-mails each time and the reported metrics could not be reproduced.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Re-display the frame to confirm the rows were shuffled and the index renumbered from 0.
df

Unnamed: 0,text,spam
0,Subject: contact details dear mr . kaminski ...,0
1,Subject: [ ilug - social ] we want to trade wi...,1
2,"Subject: re : "" expected tail loss "" for equit...",0
3,Subject: re : job application thank you . nu...,0
4,Subject: re : willow and pathstar evaluations ...,0
...,...,...
5723,Subject: 2000 projects in order to better und...,0
5724,"Subject: re : message 1 quentin , thanks for...",0
5725,Subject: save your money by getting an oem sof...,1
5726,Subject: re : f / u to dr . kaminski @ enron f...,0


### Define a preprocessing function to remove punctuation and stopwords:

In [5]:
def process(text, stop_words=None):
    """Tokenize a message: strip punctuation, then drop stopwords.

    Used as the ``analyzer`` callable of ``CountVectorizer``, which calls it
    with a single positional argument (the raw document).

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : container of str, optional
        Lowercase words to discard. Any object supporting ``in`` works; a
        ``set``/``frozenset`` gives constant-time lookups. When omitted, the
        NLTK English stopword list is used.

    Returns
    -------
    list of str
        Whitespace-delimited tokens in their original order and original
        case, with every character of ``string.punctuation`` removed and
        stopwords filtered out (the stopword comparison is case-insensitive).
    """
    if stop_words is None:
        # Build the lookup set once per call. Calling stopwords.words('english')
        # inside the comprehension rebuilt the whole word list and scanned it
        # linearly for every single token.
        stop_words = frozenset(stopwords.words('english'))

    # Delete every punctuation character in one pass.
    no_punct = text.translate(str.maketrans('', '', string.punctuation))

    # Compare in lowercase but return the token with its original casing.
    return [word for word in no_punct.split() if word.lower() not in stop_words]

### Split the data into training and testing sets:

In [6]:
# Hold out 20% of the rows for evaluation; random_state fixes which rows are
# selected for a given row order of df.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['spam'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Inspect the training texts; the index values shown are the row labels carried over from df.
x_train

4860    Subject: research intelligence  stinson ,  ple...
1520    Subject: re : garp 2001 convention  frank ,  i...
2083    Subject: localized software , all languages av...
527     Subject: perfect logo charset = koi 8 - r " > ...
3113    Subject: need an outstanding logo now ?  worki...
                              ...                        
3772    Subject: research meeting  all  john sherriff ...
5191    Subject: 3 - urgent - to prevent loss of infor...
5226    Subject: vince and stinson ,  i got this resum...
5390    Subject: re : brazil  fyi . this is a deal i '...
860     Subject: ola oladeji  we are in the process of...
Name: text, Length: 4582, dtype: object

### Define a Pipeline object that combines CountVectorizer and MultinomialNB:

In [8]:
# Two-stage model: bag-of-words counts built from the tokens returned by
# `process`, fed into a multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(analyzer=process)),
        ('classifier', MultinomialNB()),
    ]
)

### Train the model using the training data:

In [9]:
# Fit both pipeline steps on the training split only; the test split stays unseen.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
pipeline.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 CountVectorizer(analyzer=<function process at 0x0000023D38793550>)),
                ('classifier', MultinomialNB())])

### Test the model using the testing data:

In [10]:
# Predict labels for the held-out texts and report the fraction classified correctly.
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9904013961605584


### Use the confusion_matrix() function to calculate the confusion matrix for the predictions:

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

# scikit-learn convention: rows are the true labels, columns the predicted
# labels, so off-diagonal cells count the misclassified e-mails.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:', cm, sep='\n')

Confusion matrix:
[[873   7]
 [  4 262]]


### Save the trained model to a file:

In [12]:
# Persist the fitted pipeline to 'spam_classifier.joblib' in the working directory.
# NOTE(review): the vectorizer references the notebook-level `process` function.
# Pickle stores functions by name rather than by value, so a session that loads
# this file presumably has to define (or import) `process` first — confirm
# before shipping the artifact outside this notebook.
joblib.dump(pipeline, 'spam_classifier.joblib')

['spam_classifier.joblib']

### Load the saved model from the file:

In [13]:
# Reload the pipeline from the file written by the save step above.
# joblib files are pickle-based: only load files that come from a trusted source.
model = joblib.load('spam_classifier.joblib')

In [21]:
# Smoke-test the reloaded model on a few hand-written messages (1 = spam, 0 = not spam).
# NOTE(review): the recorded output [1 1 0] flags the routine "Hi John" request
# as spam and the prize announcement as not spam — worth investigating how well
# the model handles short messages.
example_messages = [
    'Get a free laptop today!',
    'Hi John, can you send me the report by Friday?',
    'Congratulations! You have won a free trip to Hawaii!',
]
predictions = model.predict(example_messages)
print(predictions)


[1 1 0]


In [22]:
# Alternative input kept for quick comparison (a routine work request):
# m = ['Hi John, can you send me the report by Friday?']
# Active input: a disconnection-threat message with a call-back number, wrapped
# in a list because predict() expects an iterable of documents.
m = ["Dear consumer your electricity will disconnect at 9:30 tonight Because your previous bill was not update. Immediately contact electricity Officer 7439025664"]

In [23]:
# Classify the message(s) in `m` with the reloaded pipeline and print the label
# array (1 = spam, 0 = not spam, matching the dataset's `spam` column).
predictions = model.predict(m)
print(predictions)

[0]


🙂🙂🙂🙂🙂🙂🙂