In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load the dataset.
# The dataset is expected to be a tab-separated file named 'SMSSpamCollection'
# located in the current working directory. It is read without a header row:
# the first column is stored as 'label' and the second as 'text'.
# NOTE(review): read_csv's default quoting treats '"' as a quote character;
# if any message contains an unbalanced double quote, adjacent rows may be
# merged. Consider quoting=csv.QUOTE_NONE and confirm against the row count.
try:
    df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'text'])
except FileNotFoundError:
    print("Error: 'SMSSpamCollection' not found.")
    print("Please make sure the dataset file is in the same directory as the script.")
    # Abort with a non-zero status. The `exit()` helper is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # in every interpreter; called without arguments it also reports
    # success (status 0) even though loading failed.
    raise SystemExit(1)

# Map labels to numerical values (ham -> 0, spam -> 1); any other label
# value becomes NaN.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training messages only, then reuse the
# fitted vectorizer to encode the held-out messages.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so construction and fitting chain).
model = MultinomialNB().fit(X_train_vectorized, y_train)

# Score the classifier on the held-out messages and report the accuracy.
y_pred = model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy}')

# Helper for classifying a single message with the trained model
def predict_sms(text):
    """Classify one SMS message as "spam" or "ham".

    The message is encoded with the module-level ``vectorizer`` and passed
    to the module-level ``model``; both must already be fitted.

    Args:
        text: The message to classify.

    Returns:
        "spam" if the model predicts label 1, otherwise "ham".
    """
    # The vectorizer expects an iterable of documents, so wrap the message.
    features = vectorizer.transform([text])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "spam"
    return "ham"

# Try the classifier on one spam-like and one ordinary message
example_spam = "Congratulations! You've won a free cruise to the Bahamas. Click here to claim your prize."
example_ham = "Hey, are we still on for dinner tonight?"

for sample in (example_spam, example_ham):
    print(f"Prediction for '{sample}': {predict_sms(sample)}")


Model Accuracy: 0.9668161434977578
Prediction for 'Congratulations! You've won a free cruise to the Bahamas. Click here to claim your prize.': spam
Prediction for 'Hey, are we still on for dinner tonight?': ham
