In [3]:
%pip install  nltk

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# Set NLTK data path
nltk_data_path = os.path.expanduser('~/nltk_data')
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download NLTK resources with error handling
try:
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    nltk.download('punkt_tab', download_dir=nltk_data_path, quiet=True)  # Added punkt_tab
    nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
    # Verify resources
    word_tokenize("test sentence")
    stopwords.words('english')
    print("NLTK resources loaded successfully.")
except Exception as e:
    print(f"Failed to load NLTK resources: {e}")
    print("Please ensure internet connectivity and write permissions for ~/nltk_data")
    print("Try running: nltk.download('punkt_tab', download_dir='~/nltk_data')")
    exit(1)

# Sample dataset
data = {
    'email': [
        'Win a free iPhone now!!! Click here!',
        'Meeting at 10am tomorrow, please confirm.',
        'Get rich quick! Buy our course!',
        'Lunch plans this weekend? Let me know.',
        'Limited time offer! Discount viagra pills.',
        'Project deadline is next Friday.'
    ],
    'label': ['spam', 'ham', 'spam', 'ham', 'spam', 'ham']
}
df = pd.DataFrame(data)

# Preprocess text
def preprocess_text(text):
    try:
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    except Exception as e:
        print(f"Preprocessing error: {e}")
        return text

# Apply preprocessing
df['cleaned_email'] = df['email'].apply(preprocess_text)

# Convert to features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_email'])
y = df['label'].map({'spam': 1, 'ham': 0})

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Evaluate
y_pred = nb_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# Predict on new email
new_email = "Win a free vacation! Click now!"
cleaned_new_email = preprocess_text(new_email)
new_email_vector = vectorizer.transform([cleaned_new_email])
prediction = nb_classifier.predict(new_email_vector)
print("New email prediction:", "Spam" if prediction[0] == 1 else "Ham")

NLTK resources loaded successfully.
Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

         Ham       0.50      1.00      0.67         1
        Spam       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

New email prediction: Ham


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import urllib.request
import tarfile
import os
import shutil

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Function to download and extract SpamAssassin dataset
def download_spamassassin_dataset():
    base_url = "https://spamassassin.apache.org/old/publiccorpus/"
    dataset_files = [
        "20030228_spam.tar.bz2",
        "20030228_easy_ham.tar.bz2",
        "20030228_spam_2.tar.bz2",
        "20021010_spam.tar.bz2",
        "20021010_easy_ham.tar.bz2",
        "20021010_hard_ham.tar.bz2"
    ]
    extract_path = "spamassassin"
    
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)
        print("Downloading SpamAssassin dataset...")
        for dataset_file in dataset_files:
            dataset_path = dataset_file
            try:
                urllib.request.urlretrieve(base_url + dataset_file, dataset_path)
                print(f"Extracting {dataset_file}...")
                with tarfile.open(dataset_path, "r:bz2") as tar:
                    tar.extractall(path=extract_path)
                os.remove(dataset_path)
            except Exception as e:
                print(f"Failed to download/extract {dataset_file}: {e}")
                continue
        
        # Organize into spam/ham directories
        spam_dir = os.path.join(extract_path, 'spam')
        ham_dir = os.path.join(extract_path, 'ham')
        os.makedirs(spam_dir, exist_ok=True)
        os.makedirs(ham_dir, exist_ok=True)
        
        for subdir in os.listdir(extract_path):
            subdir_path = os.path.join(extract_path, subdir)
            if os.path.isdir(subdir_path) and subdir not in ['spam', 'ham']:
                for filename in os.listdir(subdir_path):
                    src = os.path.join(subdir_path, filename)
                    if 'spam' in subdir.lower():
                        dst = os.path.join(spam_dir, f"{subdir}_{filename}")
                    else:
                        dst = os.path.join(ham_dir, f"{subdir}_{filename}")
                    shutil.move(src, dst)
                os.rmdir(subdir_path)
        
        print(f"Dataset extracted to {extract_path} with spam/ham subdirectories.")
    return extract_path

# Function to load emails from directory
def load_emails(directory):
    emails = []
    labels = []
    
    for folder in ['ham', 'spam']:
        folder_path = os.path.join(directory, folder)
        if not os.path.exists(folder_path):
            raise ValueError(f"Directory {folder_path} not found. Check extraction.")
        
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path) and not filename.startswith('.'):
                with open(file_path, 'r', encoding='latin-1', errors='ignore') as file:
                    try:
                        content = file.read()
                        # Extract body (skip headers)
                        body_start = content.find('\n\n') + 2 if '\n\n' in content else 0
                        body = content[body_start:].strip()
                        if body:  # Only append non-empty emails
                            emails.append(body)
                            labels.append(1 if folder == 'spam' else 0)
                    except Exception as e:
                        print(f"Skipping {filename}: {e}")
                        continue
    
    if not emails:
        raise ValueError("No emails loaded. Check dataset extraction.")
    
    print(f"Loaded {len(emails)} emails ({sum(labels)} spam, {len(labels) - sum(labels)} ham)")
    return emails, labels

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and short tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Main function
def main():
    # Download and load dataset
    dataset_path = download_spamassassin_dataset()
    emails, labels = load_emails(dataset_path)
    
    # Preprocess emails
    print("Preprocessing emails...")
    processed_emails = [preprocess_text(email) for email in emails if email.strip()]
    
    # Convert text to TF-IDF features
    vectorizer = TfidfVectorizer(max_features=5000, min_df=2)
    X = vectorizer.fit_transform(processed_emails)
    y = np.array(labels)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Train Naive Bayes classifier
    print("Training Naive Bayes classifier...")
    nb_classifier = MultinomialNB(alpha=0.5)
    nb_classifier.fit(X_train, y_train)
    
    # Make predictions
    y_pred = nb_classifier.predict(X_test)
    
    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Print results
    print("\nModel Performance Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    
    # Example of classifying a new email
    sample_email = """
    Subject: Get Rich Quick!
    Dear Friend, congratulations! You've won a million dollars! Click here to claim your prize now!
    """
    processed_sample = preprocess_text(sample_email)
    sample_vector = vectorizer.transform([processed_sample])
    prediction = nb_classifier.predict(sample_vector)
    print("\nSample Email Classification:")
    print("Spam" if prediction[0] == 1 else "Not Spam")

if __name__ == "__main__":
    main()

Downloading SpamAssassin dataset...
Extracting 20030228_spam.tar.bz2...
Extracting 20030228_easy_ham.tar.bz2...
Extracting 20030228_spam_2.tar.bz2...
Extracting 20021010_spam.tar.bz2...
Extracting 20021010_easy_ham.tar.bz2...
Extracting 20021010_hard_ham.tar.bz2...
Dataset extracted to spamassassin with spam/ham subdirectories.
Loaded 7702 emails (2400 spam, 5302 ham)
Preprocessing emails...
Training Naive Bayes classifier...

Model Performance Metrics:
Accuracy: 0.9689
Precision: 0.9337
Recall: 0.9688
F1 Score: 0.9509

Confusion Matrix:
[[1028   33]
 [  15  465]]

Sample Email Classification:
Spam
