In [17]:
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [18]:
def load_data(directory):
    """Loads data from a directory with individual email files."""
    if not os.path.exists(directory):
        raise FileNotFoundError(f"The directory {directory} does not exist.")

    texts, labels = [], []

    # Read all files in the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # Determine the label from the filename (assuming naming convention)
        # Example: "3-1msg1.txt" might represent spam, use your naming convention.
        label = 1 if "spam" in filename.lower() else 0

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read().strip()
                if content:  # Ensure non-empty content
                    texts.append(content)
                    labels.append(label)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    if not texts:
        raise ValueError("No valid data found in the directory.")

    return texts, labels

In [19]:
train_dir = r"C:\Users\Dell\Downloads\train_test_mails\train-mails"
test_dir = r"C:\Users\Dell\Downloads\train_test_mails\test-mails"

In [20]:
X_train, y_train = load_data(train_dir)
X_test, y_test = load_data(test_dir)


In [21]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

In [22]:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [23]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [24]:
y_pred = model.predict(X_test_tfidf)

In [25]:
print("Test Performance:")
print(classification_report(y_test, y_pred))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

Test Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       260

    accuracy                           1.00       260
   macro avg       1.00      1.00      1.00       260
weighted avg       1.00      1.00      1.00       260

Test Accuracy: 100.00%


In [26]:
joblib.dump(model, "spam_classifier.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
