In [None]:
import csv
import string

import joblib
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report


In [None]:
# Fetch the NLTK data packages this notebook asks for.
# 'stopwords' backs the stopwords.words('english') call in the preprocessing cell.
nltk.download('stopwords')
# NOTE(review): no visible cell calls an NLTK tokenizer (tokens come from
# str.split), so 'punkt' looks unused here — confirm before removing.
nltk.download('punkt')


In [None]:
# Load the tab-separated SMS file. It has no header row, so the two columns
# are named explicitly: the class label and the raw message text.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
    encoding="latin-1",
    # Messages are free text and may contain unbalanced double quotes. With
    # the default quoting the parser treats '"' as a field delimiter and can
    # merge several lines into one record; QUOTE_NONE reads each line as-is.
    quoting=csv.QUOTE_NONE,
)

# Encode the target as integers: ham -> 0, spam -> 1.
label_codes = df['label'].map({'ham': 0, 'spam': 1})

# Series.map yields NaN for any value missing from the mapping. Fail loudly
# here instead of letting NaN targets reach model training.
unmapped_labels = df.loc[label_codes.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values in dataset: {list(unmapped_labels)}")

df['label'] = label_codes.astype("int64")

print("Dataset Loaded Successfully")
df.head()


In [None]:
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Built once instead of once per message: maps every character in
# string.punctuation to None so that str.translate deletes it.
punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one message into a space-joined string of stemmed tokens.

    Steps, in order: cast to str and lowercase, delete ASCII punctuation,
    split on whitespace, drop tokens found in ``stopwords_set``, and stem the
    remaining tokens with ``stemmer``.

    Parameters
    ----------
    text : object
        Raw message; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if none remain).
    """
    # NOTE(review): stopwords are matched after punctuation removal, so a
    # contraction such as "don't" is compared as "dont" — confirm this is intended.
    tokens = str(text).lower().translate(punctuation_table).split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stopwords_set)


# One cleaned string per row of df['text'], in the same order.
corpus = [preprocess_text(message) for message in df['text']]


In [None]:
# Target vector: the integer-encoded labels.
y = df['label']

# Bag-of-words counts over the cleaned corpus; min_df=2 drops terms that
# occur in fewer than two documents.
vectorizer = CountVectorizer(min_df=2)
X = vectorizer.fit_transform(corpus)


In [None]:
# Split the cleaned messages rather than the already-vectorized matrix, so
# the vocabulary and the min_df filter are learned from training rows only.
# Fitting the vectorizer on the full corpus lets test messages influence the
# feature space (data leakage) and makes the test metrics optimistic.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y, test_size=0.2, random_state=42
)

# Refit on the training messages only, replacing the earlier whole-corpus fit;
# this is also the vectorizer state that gets persisted later.
X_train = vectorizer.fit_transform(corpus_train)
# The held-out messages are only transformed with the training vocabulary.
X_test = vectorizer.transform(corpus_test)


In [None]:
# Multinomial Naive Bayes classifier trained on the training split's
# token-count features.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Headline metrics, printed in a fixed order with aligned captions.
metric_rows = (
    ("Precision :", precision_score),
    ("Recall    :", recall_score),
    ("F1 Score  :", f1_score),
)
for caption, scorer in metric_rows:
    print(caption, scorer(y_test, y_pred))

# Per-class breakdown.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


In [None]:
# Persist the classifier together with the fitted vectorizer: both are
# needed to score new text later.
artifacts = (
    (model, "spam_model.pkl"),
    (vectorizer, "vectorizer.pkl"),
)
for artifact, filename in artifacts:
    joblib.dump(artifact, filename)

print("Model and Vectorizer Saved Successfully")


In [None]:
# Reload the persisted artifacts. joblib files are pickles, so only load
# files from a trusted source (these were written by this notebook).
model = joblib.load("spam_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")

email = "Congratulations! You have won a free cash prize"

# Apply the same cleaning steps used for the training messages: lowercase,
# delete punctuation, split on whitespace, drop stopwords, stem.
strip_punctuation = str.maketrans('', '', string.punctuation)
kept_stems = [
    stemmer.stem(token)
    for token in email.lower().translate(strip_punctuation).split()
    if token not in stopwords_set
]
email_text = " ".join(kept_stems)

# Vectorize the single cleaned message and classify it.
x_email = vectorizer.transform([email_text])
prediction = model.predict(x_email)

print("\nPrediction Result:")
if prediction[0] == 1:
    print("SPAM ")
else:
    print("HAM ")
