# Email Spam Detection

In [64]:
import numpy as np
import pandas as pd

In [65]:
# Load the dataset
df = pd.read_csv("D://Dataset//spam.csv",encoding='latin1')

In [66]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [67]:
# Drop unnecessary columns
df = df.drop(['Unnamed: 2' ,'Unnamed: 3','Unnamed: 4'],axis=1)

In [68]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [69]:
# Rename columns
df = df.rename(columns={'v1':'Category', 'v2':'Message'})

In [70]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [71]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [72]:
df.info

<bound method DataFrame.info of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham              Will Ì_ b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [73]:
df.isna().sum()

Category    0
Message     0
dtype: int64

In [74]:
# Create a binary 'Spam' column
df['Spam']=df['Category'].apply(lambda x:1 if x=='spam' else 0)

In [54]:
df

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [75]:
# Split the data into training and testing sets

from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(df.Message,df.Spam, test_size = 0.25)

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
from sklearn.naive_bayes import MultinomialNB

In [76]:
# Build a Naive Bayes classifier pipeline
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

# Train the Model

In [77]:
# Train the classifier
clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [78]:
# Sample emails for prediction
emails = [
    'Sounds great! Are you home now?',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
]

In [79]:
# Predict the labels for the sample emails
predictions = clf.predict(emails)
print(predictions)

[0 1]


In [80]:
accuracy = clf.score(X_test,y_test)
print(f"Accuracy on the test set:{accuracy:.2%}")

Accuracy on the test set:98.49%


In [81]:
from sklearn.metrics import classification_report, confusion_matrix
# Predict the labels on the test set
y_pred = clf.predict(X_test)

# Print the classification report
print("Classification Report")
print(classification_report(y_test,y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1214
           1       0.96      0.92      0.94       179

    accuracy                           0.98      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [82]:
# Print the confusion matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,y_pred))

Confusion Matrix
[[1208    6]
 [  15  164]]


In [83]:
# Calculate and print accuracy
accuracy = clf.score(X_test,y_test)
print(f"Accuracy on the test set: {accuracy:.2%}")

Accuracy on the test set: 98.49%
