In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load dataset
data = pd.read_csv('data.csv')

# Fail fast with a clear message if the CSV lacks a column that the later
# cells read (data['text'] for the features, data['label'] for the target).
missing_columns = {'text', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(
        "data.csv is missing required column(s): {}".format(sorted(missing_columns))
    )

In [3]:
# Clean text
_ENGLISH_STOP_WORDS = None  # filled lazily on first use, then reused by every call


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text):
    """Normalise raw text into a space-separated string of non-stop-word tokens.

    The text is lower-cased, every character outside ``[a-zA-Z0-9]`` is
    replaced by a space, the result is tokenised with ``word_tokenize`` and
    tokens found in NLTK's English stop-word list are dropped.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = re.sub('[^a-zA-Z0-9]', ' ', text.lower())
    # The stop-word set is loop-invariant, so it is cached rather than being
    # rebuilt from the corpus for every row this function is applied to.
    stop_words = _english_stop_words()
    return ' '.join(w for w in word_tokenize(text) if w not in stop_words)

In [4]:
# Run every message through clean_text, overwriting the raw 'text' column
data['text'] = data['text'].map(clean_text)

In [5]:
# Feature extraction: bag-of-words token counts, one row per message.
# X is kept in the sparse format that CountVectorizer returns. Calling
# .toarray() would allocate a dense (n_messages x vocabulary_size) array that
# is almost entirely zeros, and scikit-learn's train_test_split and
# MultinomialNB both accept sparse matrices directly.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])

In [6]:
# Label encoding: ham -> 0, spam -> 1
LABEL_TO_ID = {'ham': 0, 'spam': 1}

# Translate element-wise instead of using Series.replace: replace leaves the
# column as object dtype and relies on pandas implicitly downcasting it to
# integers, a behaviour that recent pandas releases deprecate. Values that are
# not keys of LABEL_TO_ID are passed through unchanged (as replace did) so
# that they can be validated below.
encoded_labels = data['label'].map(lambda label: LABEL_TO_ID.get(label, label))

# Anything other than 0/1 at this point is a typo, a missing value or an
# unknown class; stop here instead of silently training on a corrupted target.
is_valid = encoded_labels.isin([0, 1])
if not is_valid.all():
    unexpected = sorted(set(map(repr, encoded_labels[~is_valid])))
    raise ValueError(
        "Unexpected value(s) in data['label']: {}".format(', '.join(unexpected))
    )

# Explicit integer dtype so the classifier always receives a numeric target.
y = encoded_labels.astype('int64').to_numpy()

In [7]:
# Train-test split: hold out 20 % of the messages for evaluation.
# The shuffle is seeded so that the split, and therefore the accuracy reported
# below, is the same on every Restart & Run All.
from sklearn.model_selection import train_test_split
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)


In [8]:
# Fit a multinomial Naive Bayes classifier on the training split
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [9]:
# Score the classifier on the held-out split and report its accuracy
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8181818181818182


In [11]:
# Test the model on a new email
def predict_email(filepath):
    """Classify the e-mail stored in a text file and print the verdict.

    The file content goes through the same steps as the training data: it is
    cleaned with ``clean_text``, vectorised with the already-fitted
    ``vectorizer`` and passed to the trained ``model``.

    Args:
        filepath: Path of the text file containing the e-mail.

    Returns:
        None. Prints 'The email is classified as HAM.' when the predicted
        label is 0 and 'The email is classified as SPAM.' otherwise.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale default so
    # the same file is read the same way on every machine. Undecodable bytes
    # become U+FFFD instead of raising UnicodeDecodeError, so one stray byte
    # does not abort the classification.
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        email_text = f.read()

    # Preprocess the text exactly like the training messages
    email_text = clean_text(email_text)

    # Reuse the fitted vectorizer so the columns line up with the training
    # features. The sparse result is passed to the model as is.
    email_vector = vectorizer.transform([email_text])

    # Predict the label of the single document in the batch
    prediction = model.predict(email_vector)

    # Print the result
    if prediction[0] == 0:
        print('The email is classified as HAM.')
    else:
        print('The email is classified as SPAM.')

# Classify one sample e-mail stored on disk
predict_email(filepath='true_email.txt')


The email is classified as HAM.
