### Import necessary libraries:

In [1]:
import string

import joblib
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Load your dataset:

In [2]:
# Read the labelled e-mail corpus from the working directory; later cells use
# its `text` (message body) and `spam` (label) columns.
df = pd.read_csv('emails.csv')
# Bare expression so the notebook renders a preview of the frame.
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [3]:
# df = df.sample(frac=1).reset_index(drop=True)

### Define a preprocessing function to remove punctuation and stopwords:

In [4]:
# Cache for stopword sets so the NLTK corpus is read once, not once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def process(text, stop_words=None):
    """Split ``text`` into tokens with punctuation and stopwords removed.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in their original order
        and original casing.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    # Delete every character listed in string.punctuation in a single pass.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))
    # Compare in lower case so capitalised stopwords ("The") are dropped too.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

### Split the data into training and testing sets:

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['spam'], test_size=0.2, random_state=42)

In [6]:
# Preview the training texts; the split keeps the original row indices.
x_train

4860    Subject: re : request for payroll reclassifica...
1520    Subject: re : pending approval for ibuyit requ...
2083    Subject: congratulations !  hi vince :  i just...
527     Subject: secretly record all internet activity...
3113    Subject: interviews scheduled for monday , nov...
                              ...                        
3772    Subject: john sherriff ' s copper position  te...
5191    Subject: the national forum on corporate finan...
5226    Subject: re : my first draft  quentin ,  i for...
5390    Subject: why johan dahl and the mri energy sta...
860     Subject: perfect visual solution for your busi...
Name: text, Length: 4582, dtype: object

### Define a Pipeline object that combines CountVectorizer and MultinomialNB:

In [7]:
# Chain feature extraction and classification so both steps are fitted and
# applied together through a single object.
pipeline = Pipeline([
    # `process` is passed as the analyzer, so it is what turns each raw
    # e-mail into the tokens that get counted.
    ('vectorizer', CountVectorizer(analyzer=process)),
    # Multinomial Naive Bayes classifier over the resulting token counts.
    ('classifier', MultinomialNB())
])

### Train the model using the training data:

In [8]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(x_train, y_train)

### Test the model using the testing data:

In [9]:
# Predict labels for the held-out e-mails and compare them with the true labels.
y_pred = pipeline.predict(x_test)
# Fraction of held-out e-mails whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 0.9886561954624782


### Use the confusion_matrix() function to calculate the confusion matrix for the predictions:

In [10]:
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 first, then 1).
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:')
print(cm)

Confusion matrix:
[[850   6]
 [  7 283]]


### Save the trained model to a file:

In [11]:
# Persist the fitted pipeline to disk.
# NOTE(review): the vectorizer holds a reference to `process`, and pickle
# stores functions by name, so `process` presumably has to be defined again
# before this file can be loaded in a fresh session — confirm.
joblib.dump(pipeline, 'spam_classifier.joblib')

['spam_classifier.joblib']

### Load the saved model from the file:

In [12]:
# Reload the pipeline written by the previous cell. joblib files are
# pickle-based, so only load files from a source you trust.
model = joblib.load('spam_classifier.joblib')

In [13]:
# Score a few hand-written messages with the reloaded model.
example_messages = [
    'Get a free laptop today!',
    'Hi John, can you send me the report by Friday?',
    'Congratulations! You have won a free trip to Hawaii!',
]
predictions = model.predict(example_messages)
print(predictions)


[0 1 0]


In [14]:
# Alternative input kept for quick comparison:
# m = ['Hi John, can you send me the report by Friday?']
# One hand-written message, wrapped in a list like the other inputs passed to predict().
m = ["Dear consumer your electricity will disconnect at 9:30 tonight Because your previous bill was not update. Immediately contact electricity Officer 7439025664"]

In [15]:
# Classify the message held in `m`; in the dataset's `spam` column, 1 appears
# to mark spam and 0 legitimate mail.
predictions = model.predict(m)
print(predictions)

[0]


🙂🙂🙂🙂🙂🙂🙂

In [20]:
# Another ad-hoc check, this time with a "Subject:" prefix like the e-mails in
# the training data.
mail = ['Subject:For connection request Dear Saurabh, this is Abhishek here I want to connect with you best regard Abhisekh']
print(model.predict(mail))

[0]
