In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [57]:
# Read the SMS dataset, keep only the 'v1' and 'v2' columns, and give them
# descriptive names ('label' and 'text').
data = (
    pd.read_csv("spam.csv", encoding="latin-1")[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [56]:
# Set up the NLTK resources used for preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Fetch the NLTK data used by the preprocessing step. Recent NLTK releases
# load the tokenizer models from 'punkt_tab' instead of 'punkt', so both are
# requested to keep word_tokenize working across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# English stopwords held in a set for fast membership tests, plus a single
# stemmer instance shared by every call to the preprocessing function.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raksh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
def preprocess_text(text):
    """Normalize a message into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are
    dropped; the remaining tokens are stemmed with ``stemmer`` and joined
    with single spaces.
    """
    kept_stems = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation and mixed tokens (anything that is not alphanumeric).
        if not token.isalnum():
            continue
        # Skip common words that carry little signal.
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

# Store the cleaned form of every message alongside the original text.
data['processed_text'] = data['text'].map(preprocess_text)


In [48]:
# Encode the labels as integers (ham -> 0, spam -> 1) and turn the cleaned
# messages into a matrix of token counts.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split that happens later in the notebook.
y = data['label'].map({'ham': 0, 'spam': 1})
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['processed_text'])

In [53]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion.
model = MultinomialNB()
model.fit(X_train, y_train)

In [54]:
# Score the classifier on the held-out rows: overall accuracy first, then the
# per-class precision / recall / F1 report.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.968609865470852
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.88      0.89      0.88       150

    accuracy                           0.97      1115
   macro avg       0.93      0.94      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [52]:
# Classify a single hard-coded example message.
new_email = ["Congratulations! You've won a free vacation. Click here to claim your prize."]
# Apply the same cleaning and vectorization that the training data went through.
new_email_processed = preprocess_text(new_email[0])
new_email_vectorized = vectorizer.transform([new_email_processed])
prediction = model.predict(new_email_vectorized)
# Class 1 is spam, anything else is reported as ham.
if prediction[0] == 1:
    print("Prediction:", "spam")
else:
    print("Prediction:", "ham")


Prediction: spam


In [59]:
# Classify a subject line typed in by the user.
new_email = input("Enter subject of the mail: ")
# input() returns the whole line as one string, so it is passed through
# unchanged; indexing it with [0] would keep only the first character and the
# model would never see the rest of the subject.
new_email_processed = preprocess_text(new_email)
new_email_vectorized = vectorizer.transform([new_email_processed])
prediction = model.predict(new_email_vectorized)
print("Prediction:", "spam" if prediction[0] == 1 else "ham")

Enter subject of the mail: Happy to announce you have been selected for xyz
Prediction: ham
