# **IMPORT LIBRARIES**

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

# **LOAD AND PREPROCESS DATA**

In [None]:
# Read the email dataset from a CSV file located relative to the working directory.
data = pd.read_csv('emails.csv')
# Show the column labels as this cell's output.
data.columns

Index(['text', 'spam'], dtype='object')

In [None]:
# Preview the first rows of the dataset as plain text (head() defaults to 5 rows).
print(data.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


In [None]:
# Count missing values in each column.
print(data.isnull().sum())

text    0
spam    0
dtype: int64


In [None]:
# Drop any rows with missing values and renumber the index from 0.
# The null counts printed in the previous cell are all zero for this file, so
# this is expected to act as a safeguard rather than remove rows.
data = data.dropna().reset_index(drop=True)

# **SPLIT DATA INTO TRAINING AND TESTING SETS**

In [None]:
# Features are the raw email text; the target is the 0/1 spam column.
X, y = data['text'], data['spam']

In [None]:
# Hold out 20% of the emails for evaluation, with a fixed seed for
# reproducibility. The split is stratified on the label so that the train and
# test partitions keep the same spam/ham ratio: the classes are imbalanced
# (the evaluation report recorded in this notebook shows 856 ham vs 290 spam
# in the test set).
# NOTE: outputs recorded in this notebook were produced without stratification;
# re-run the notebook to refresh them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# **BUILD THE MODEL PIPELINE**

In [None]:
# Two-stage text classifier: TF-IDF features (English stop words removed)
# feeding a multinomial Naive Bayes model.
#
# NOTE(review): the evaluation output recorded in this notebook was produced
# with the default smoothing (alpha=1.0). It shows spam recall of only 0.58,
# and all three demo emails at the end were labelled "Ham". TF-IDF weights are
# small fractional values, so heavy additive smoothing can swamp them and let
# the majority-class prior dominate. alpha is lowered here to counter that.
# The improvement is expected but not verified here: re-run the evaluation
# cell to confirm, and tune alpha (e.g. with cross-validation) if needed.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB(alpha=0.1)),
])

# **TRAIN THE MODEL**

In [None]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

# **EVALUATE THE MODEL**

In [None]:
# Predict labels for the held-out test emails.
y_pred = pipeline.predict(X_test)


In [None]:
# Compute every evaluation metric on the held-out split first...
accuracy = accuracy_score(y_test, y_pred)
# In the confusion matrix, rows are the true labels and columns the predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# ...then report them: overall accuracy, the confusion matrix, per-class metrics.
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Accuracy: 0.8926701570680629
Confusion Matrix:
[[856   0]
 [123 167]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93       856
           1       1.00      0.58      0.73       290

    accuracy                           0.89      1146
   macro avg       0.94      0.79      0.83      1146
weighted avg       0.91      0.89      0.88      1146



# **USING THE TRAINED MODEL FOR PREDICTION**

In [None]:
# Hand-written example messages used to try out the fitted pipeline on unseen text.
new_emails = [
    "Congratulations! You've won a free cruise. Claim your prize now!",
    "Hey, just checking in on the status of the project. Could you update me?",
    "Click this link to get a discount on your next purchase!"
]

In [None]:
# Run the raw example messages through the fitted pipeline (vectorize, then classify).
predicted_labels = pipeline.predict(new_emails)

In [None]:
# Show each example message with its predicted class (label 1 -> "Spam",
# anything else -> "Ham"), followed by a blank separator line.
for email, label in zip(new_emails, predicted_labels):
    print(
        f'Email: {email}',
        f'Predicted Label: {"Spam" if label == 1 else "Ham"}',
        '',
        sep='\n',
    )

Email: Congratulations! You've won a free cruise. Claim your prize now!
Predicted Label: Ham

Email: Hey, just checking in on the status of the project. Could you update me?
Predicted Label: Ham

Email: Click this link to get a discount on your next purchase!
Predicted Label: Ham

