In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset; 'v1' is used as the label column and 'v2' as the message text.
raw_data = pd.read_csv("DataSet/HamOrSpam.csv")

# Preprocess the text data
# (You may need additional preprocessing steps based on your dataset)
# Rows with a missing label or message are dropped up front: a NaN survives
# `.str.lower()` and would otherwise make the vectorizer / classifier fail
# later with an error that does not point back at the data.
# The cleaned frame is built with assign() so the loaded frame is not mutated.
data = (
    raw_data
    .dropna(subset=['v1', 'v2'])
    .assign(v2=lambda frame: frame['v2'].str.lower())
)

# Make any dropped rows visible instead of discarding them silently.
dropped_rows = len(raw_data) - len(data)
if dropped_rows:
    print("Dropped", dropped_rows, "rows with a missing label or message")

# Split into features and labels
X = data['v2']
y = data['v1']

# Split into training and testing sets (80% / 20%, fixed seed for reproducibility)
# NOTE(review): the split is not stratified; if the label distribution is
# skewed, consider passing stratify=y (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data
# The vocabulary and IDF weights are learned from the training split only and
# then applied to the test split, so no test information leaks into training.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a classifier model
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Evaluate the model on the held-out test split
y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy*100,"%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate percentage of ham and spam emails over the whole (cleaned) dataset
# Count the labels once; .get(..., 0) reports 0% instead of raising KeyError
# when one of the classes does not occur in the data.
label_counts = y.value_counts()
total_messages = len(y)
ham_percentage = (label_counts.get('ham', 0) / total_messages) * 100
spam_percentage = (label_counts.get('spam', 0) / total_messages) * 100

print("Percentage of Ham Emails:", ham_percentage)
print("Percentage of Spam Emails:", spam_percentage)

Accuracy: 95.92760180995475 %
Classification Report:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98       954
        spam       1.00      0.70      0.82       151

    accuracy                           0.96      1105
   macro avg       0.98      0.85      0.90      1105
weighted avg       0.96      0.96      0.96      1105

Percentage of Ham Emails: 86.47719044170891
Percentage of Spam Emails: 13.522809558291094
