In [46]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [16]:
dataset = pd.read_csv(r"D:\Data Science\spam.csv", encoding='latin1')

In [32]:
data = dataset[['v1', 'v2']]
data.columns = ['label', 'message']

In [34]:
data.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [38]:
data.shape

(5572, 2)

In [36]:
data.isnull().sum()

label      0
message    0
dtype: int64

In [62]:
tfidf = TfidfVectorizer(stop_words='english')
x = tfidf.fit_transform(data['message'])
y = data['label']

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [70]:
# Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
nb_pred = nb.predict(x_test)

In [76]:
nb.score(x_train , y_train)*100 , nb.score(x_test , y_test)*100

(97.97435897435898, 96.77033492822966)

In [78]:
# J48 Decision Tree
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt_pred = dt.predict(x_test)

In [80]:
dt.score(x_train , y_train)*100 , dt.score(x_test , y_test)*100

(100.0, 96.83014354066985)

In [86]:
def evaluate_model(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {acc * 100:.2f}%")

print("Naive Bayes Multinomial:")
evaluate_model(y_test, nb_pred)

print("J48 Decision Tree:")
evaluate_model(y_test, dt_pred)

Naive Bayes Multinomial:
Confusion Matrix:
[[1453    0]
 [  54  165]]
Accuracy: 96.77%
J48 Decision Tree:
Confusion Matrix:
[[1432   21]
 [  32  187]]
Accuracy: 96.83%


In [94]:
# Example: Predicting for a new email

# Step 1: New email to classify
new_email = ["England v Macedonia - don't miss the goals/team news. Txt ur national team to 87077 e.g., ENGLAND to 87077 Try: WALES, SCOTLAND 4txt/Ì¼1.20 POBOXox36504W45WQ 16+"
]
# Step 2: Preprocess and transform the new email
new_email_vectorized = tfidf.transform(new_email)  # Use the same TF-IDF vectorizer

# Step 3: Predict using the trained models
nb_prediction = nb.predict(new_email_vectorized)  # Naive Bayes model
dt_prediction = dt.predict(new_email_vectorized)  # Decision Tree model

# Step 4: Interpret results
if nb_prediction[0] == 1:
    print("Naive Bayes Prediction: Spam")
else:
    print("Naive Bayes Prediction: Not Spam")

if dt_prediction[0] == 1:
    print("Decision Tree Prediction: Spam")
else:
    print("Decision Tree Prediction: Not Spam")




Naive Bayes Prediction: Not Spam
Decision Tree Prediction: Not Spam
