In [1]:
!pip install pandas numpy scikit-learn nltk



In [2]:
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2


--2025-02-20 17:07:41--  https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1183768 (1.1M) [application/x-bzip2]
Saving to: ‘20030228_spam.tar.bz2’


2025-02-20 17:07:42 (22.3 MB/s) - ‘20030228_spam.tar.bz2’ saved [1183768/1183768]

--2025-02-20 17:07:42--  https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1612216 (1.5M) [application/x-bzip2]
Saving to: ‘20030228_easy_ham.tar.bz2’


2025-02-20 17:07:43 (34.3 MB/s) - ‘20030228_easy_ham.tar.bz2’ saved [1612216/16

In [3]:
import tarfile

def extract_tar(file_name):
    with tarfile.open(file_name, "r:bz2") as tar:
        tar.extractall()

extract_tar("20030228_spam.tar.bz2")
extract_tar("20030228_easy_ham.tar.bz2")


In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
import re

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load spam and ham emails
spam_dir = "spam"
ham_dir = "easy_ham"

def load_emails(directory, label):
    emails = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), "r", encoding="latin-1") as f:
            emails.append((f.read(), label))
    return emails

spam_emails = load_emails(spam_dir, 1)
ham_emails = load_emails(ham_dir, 0)

# Create DataFrame
df = pd.DataFrame(spam_emails + ham_emails, columns=["text", "label"])

# Preprocess text
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

df["text"] = df["text"].apply(clean_text)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Convert text to numerical features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Train model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.86


In [6]:
def predict_spam(email_text):
    email_tfidf = vectorizer.transform([email_text])
    prediction = model.predict(email_tfidf)[0]
    return "Spam" if prediction == 1 else "Not Spam"

# Example
email = "Congratulations! You've won a free iPhone! Click here to claim."
print(predict_spam(email))


Not Spam


In [7]:
import pickle

# Save model and vectorizer
pickle.dump(model, open("spam_classifier.pkl", "wb"))
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))
