In [None]:
%pip install pandas scikit-learn

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Load dataset
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# Convert labels to binary (ham = 0, spam = 1)
df['label_num'] = df.label.map({'ham':0, 'spam':1})

# Features and labels
X = df['message']
y = df['label_num']

# Convert text to numerical vectors
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.3, random_state=42)

# Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1432   16]
 [   9  215]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1448
           1       0.93      0.96      0.95       224

    accuracy                           0.99      1672
   macro avg       0.96      0.97      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [10]:
myemail = ["you have a meeting at 10am tomorrow"]

c_count = vectorizer.transform(myemail)
my_prediction = model.predict(c_count)
label = 'spam' if my_prediction[0] == 1 else 'ham'
print(f"\nThe email is classified as: {label}")

print(my_prediction)


The email is classified as: ham
[0]
