In [2]:
# Import Python libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
# Dataset contains SMS messages labeled as spam or ham.
# Only the label (v1) and text (v2) columns are requested, so any other
# columns in the file are skipped at parse time and `data` is a standalone
# frame rather than a column slice of a larger one. Columns are renamed by
# name, so the result does not depend on their order in the file.
data = pd.read_csv(
    "C:/Users/prave/Downloads/archive/spam.csv",
    encoding='latin-1',
    usecols=['v1', 'v2'],
).rename(columns={'v1': 'label', 'v2': 'message'})

# Convert labels to binary format (ham = 0, spam = 1).
# Series.map yields NaN for any value missing from the mapping, so check for
# that here and fail with a clear message instead of letting NaN labels reach
# the classifiers.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unmapped = label_codes.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label values in dataset (expected 'ham' or 'spam'): {unexpected}"
    )
data['label'] = label_codes

# Split the dataset into features and target
texts = data['message']
labels = data['label']

# Divide the data into training and testing sets (80% train / 20% test).
# random_state is fixed so the same rows land in each set on every run.
msg_train, msg_test, label_train, label_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

# Transform the text data into TF-IDF feature vectors.
# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(stop_words='english')
X_train = tfidf.fit_transform(msg_train)
X_test = tfidf.transform(msg_test)

# Train a Multinomial Naive Bayes model and predict the test set
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, label_train)
nb_predictions = nb_classifier.predict(X_test)

# Train a linear Support Vector Machine model and predict the test set.
# random_state is pinned (to the same seed used for the train/test split)
# because LinearSVC's solver uses a random number generator internally;
# without a seed the reported metrics can differ slightly between runs.
svm_classifier = LinearSVC(random_state=42)
svm_classifier.fit(X_train, label_train)
svm_predictions = svm_classifier.predict(X_test)

# Evaluate the performance of both models against the test labels.
# Each entry pairs the section header to print with that model's predictions;
# the same three metrics are printed for every entry, in the same order.
for section_header, predicted in (
    ("=== Naive Bayes Results ===", nb_predictions),
    ("\n=== Support Vector Machine Results ===", svm_predictions),
):
    print(section_header)

    accuracy = accuracy_score(label_test, predicted)
    print("Accuracy:", accuracy)

    matrix = confusion_matrix(label_test, predicted)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(label_test, predicted)
    print("Classification Report:\n", report)

# Classify a single hand-written message with both trained models.
sample_msg = ["URGENT: Your loan is approved. Click here to receive the amount."]
# Reuse the fitted vectorizer so the sample is encoded with the same
# features the classifiers were trained on.
sample_vector = tfidf.transform(sample_msg)

print("\n=== Sample Email Classification ===")
print("Message:", sample_msg[0])
for prefix, model in (
    ("Naive Bayes Prediction:", nb_classifier),
    ("SVM Prediction:", svm_classifier),
):
    # A truthy predicted label is reported as Spam, a falsy one as Ham.
    predicted_label = model.predict(sample_vector)[0]
    if predicted_label:
        verdict = "Spam"
    else:
        verdict = "Ham"
    print(prefix, verdict)

=== Naive Bayes Results ===
Accuracy: 0.9668161434977578
Confusion Matrix:
 [[965   0]
 [ 37 113]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


=== Support Vector Machine Results ===
Accuracy: 0.97847533632287
Confusion Matrix:
 [[960   5]
 [ 19 131]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.96      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115


=== Sample Email Classification ===
Message: URGENT: Your loan is approved. Click here to