In [1]:
# Step 1: Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd

# Step 2: Load the dataset
# The file is read as tab-separated with no header row, so the two column
# names are supplied here: 'label' (spam/ham) and 'message' (the message
# text; the source file is named sms.tsv).
DATA_URL = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
df = pd.read_csv(DATA_URL, sep='\t', header=None, names=['label', 'message'])

# Step 3: Encode the string labels as integers (ham -> 0, spam -> 1)
# Series.map leaves NaN for any value that is not a key of the mapping.
LABEL_CODES = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(LABEL_CODES)

# Step 4: Hold out part of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,     # 20% of the rows go to the test set
    random_state=42,   # fixed seed so the split is the same on every run
)

# Step 5: Bag-of-words features via CountVectorizer
# The vocabulary is learned from the training messages only; the test
# messages are then encoded against that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 6: Fit the Naive Bayes model (fit() returns the fitted estimator)
model = MultinomialNB().fit(X_train_vec, y_train)

# Step 7: Predict labels for the held-out messages
y_pred = model.predict(X_test_vec)

# Step 8: Report accuracy on the held-out messages
acc = accuracy_score(y_test, y_pred)
print("Accuracy of Spam Classifier:", acc)

# Step 9: Classify a hand-written message
# It must go through the already-fitted vectorizer so that its columns
# line up with the features the model was trained on.
sample = ["Win ₹10,000 cash now! Click here to claim."]
sample_vec = vectorizer.transform(sample)
sample_pred = model.predict(sample_vec)[0]
print("Prediction (1 = Spam, 0 = Not Spam):", sample_pred)



Accuracy of Spam Classifier: 0.9919282511210762
Prediction (1 = Spam, 0 = Not Spam): 1
