In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Load dataset
df = pd.read_csv(r"C:\Users\Vignesh\Downloads\emails.csv")
print(df.head())
print()
print(df.info())

# Check missing values
print(df.isnull().sum())

# Check class distribution
print(df['spam'].value_counts())

# Split features and labels
X = df['text']
y = df['spam']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)
y_pred_tfidf = mnb.predict(X_test_tfidf)

print("Accuracy :", accuracy_score(y_test, y_pred_tfidf))
print("Precision :", precision_score(y_test, y_pred_tfidf))
print("Recall :", recall_score(y_test, y_pred_tfidf))
print("F1 Score :", f1_score(y_test, y_pred_tfidf))

# Gaussian Naive Bayes
gnb = GaussianNB()
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

gnb.fit(X_train_dense, y_train)
y_pred_gnb = gnb.predict(X_test_dense)

print("Accuracy :", accuracy_score(y_test, y_pred_gnb))
print("Precision :", precision_score(y_test, y_pred_gnb))
print("Recall :", recall_score(y_test, y_pred_gnb))
print("F1 Score :", f1_score(y_test, y_pred_gnb))

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB
None
text    0
spam    0
dtype: int64
spam
0    4360
1    1368
Name: count, dtype: int64
Accuracy : 0.9781849912739965
Precision : 0.9853479853479854
Recall : 0.9275862068965517
F1 Score : 0.955595026642984
Accuracy : 0.9511343804537522
Precision : 0.968
Recall : 0.8344827586206897
F1 Score : 0.8962962962962963
