In [1]:
# Import necessary libraries
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Step 1: Fetch the SMS Spam Collection corpus
# The remote file is tab-separated and is read without a header row, so the
# two column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
data = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=['label', 'message'],
)

# Quick look at what was loaded: dimensions plus the leading rows.
print(f"Dataset Shape: {data.shape}")
print(f"First 5 rows of the dataset:\n{data.head()}")

# Step 2: Data Preprocessing
# Encode the text labels numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(label_codes)

# Count missing values per column. This runs after the mapping on purpose:
# a label outside the mapping would show up here as a missing value.
null_counts = data.isnull().sum()
print(f"Null values:\n{null_counts}")

# Step 3: Hold out 20% of the messages for testing
# The fixed random_state makes the split repeatable between runs.
X, y = data['message'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Turn the raw text into TF-IDF features
# The vocabulary (capped at 5000 terms, English stop words removed) is
# learned from the training messages only; the test messages are merely
# projected onto it.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 5: Fit a multinomial Naive Bayes classifier on the training features
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Step 6: Score the classifier on the held-out messages
y_pred = model.predict(X_test_tfidf)

# Compute every metric up front, then print them in a fixed order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Overall fraction of correctly classified messages.
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(report)

# Rows are the true classes, columns the predicted classes.
print("\nConfusion Matrix:")
print(matrix)

# Step 7: Try the classifier on a few hand-written messages
sample_messages = [
    "Congratulations! You've won a $1000 gift card. Call now to claim your prize!",
    "Hi, are we still on for lunch tomorrow?",
    "Exclusive offer! Buy one get one free. Limited time only.",
    "Can you send me the meeting notes from today?"
]

# Reuse the already-fitted vectorizer (transform only, no refit) so the
# samples are encoded with the same vocabulary the model was trained on.
sample_tfidf = tfidf.transform(sample_messages)

# Predicted class per message: 1 means spam, anything else is shown as ham.
predictions = model.predict(sample_tfidf)

print("\nSample Message Predictions:")
for msg, pred in zip(sample_messages, predictions):
    if pred == 1:
        verdict = 'Spam'
    else:
        verdict = 'Ham'
    print(f"Message: \"{msg}\" --> {verdict}")

# Step 8: Save the Model and Vectorizer
# Both objects are written out because the classifier alone cannot score raw
# text: new messages must first pass through the same fitted vectorizer.
# `pickle` is imported with the other modules at the top of the file.
# NOTE: unpickling can execute arbitrary code, so only load these files
# from a trusted location.
with open("spam_classifier.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

print("\nModel and vectorizer saved successfully!")



Dataset Shape: (5572, 2)
First 5 rows of the dataset:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Null values:
label      0
message    0
dtype: int64
Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[965   1]
 [ 22 127]]

Sample Message Predictions:
Message: "Congratulations! You've won a $1000 gift card. Call now to claim your prize!" --> Spam
Messag