# 📧 Spam SMS Detection using Machine Learning

This notebook demonstrates how to classify SMS messages as **spam** or **ham (legitimate)** using **Natural Language Processing (NLP)** techniques like **TF-IDF Vectorization** and the **Multinomial Naive Bayes** algorithm.

In [None]:
# Step 1: Import libraries
import csv
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Step 2: Load dataset (replace with your path if needed)
# quoting=csv.QUOTE_NONE tells pandas to treat double quotes as ordinary
# characters. With the default quote handling, a message that starts with a
# `"` is read as a quoted field that can run across line breaks, silently
# merging several SMS rows into one.
# NOTE(review): assumes the file is the raw tab-separated SMS Spam Collection
# (no CSV-style quoting) — confirm if you load a re-exported copy.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
df.head()

In [None]:
# Step 3: Preprocess the text data
# Cache for stop-word sets so the NLTK corpus is read once, not once per word.
_STOP_WORDS_CACHE = {}

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalize one message for vectorization.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace, and stop words are dropped.

    Parameters
    ----------
    text : str
        The raw message.
    stop_words : container of str, optional
        Words to remove (anything supporting ``in``). When omitted, NLTK's
        English stop-word list is used; it is loaded once and cached.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if nothing remains).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    # NOTE(review): punctuation is stripped before the stop-word lookup, so any
    # stop-word entry that itself contains punctuation (e.g. an apostrophe)
    # cannot match. This mirrors the original behavior — confirm it is intended.
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)

# Run the cleaner over every message and store the result in a new column,
# then preview the first rows to confirm the cleaning looks sensible.
cleaned_messages = df['message'].apply(clean_text)
df['cleaned'] = cleaned_messages
df.head()

In [None]:
# Step 4: Convert labels to binary (ham -> 0, spam -> 1)
label_to_num = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_to_num)

# Series.map leaves NaN for any value missing from the mapping. Fail here with
# a clear message rather than handing NaN targets to the model later on.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values (expected 'ham' or 'spam'): {list(unmapped_labels)}"
    )

In [None]:
# Steps 5 & 6: Split into training and test sets, then vectorize with TF-IDF.
# The split is done on the cleaned text *before* fitting the vectorizer so that
# the vocabulary and IDF weights are learned from the training messages only.
# Fitting TF-IDF on the full dataset would leak test-set statistics into the
# features and make the evaluation optimistic.
y = df['label_num']
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned'], y, test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(text_train)  # learn vocabulary + IDF on train only
X_test = tfidf.transform(text_test)        # reuse the train-fitted vocabulary

# Feature matrix for all rows, kept so any later cell that expects `X` still
# works; it is produced by the train-fitted vectorizer, not a new fit.
X = tfidf.transform(df['cleaned'])

In [None]:
# Step 7: Fit a Multinomial Naive Bayes classifier on the training split,
# then predict labels for the held-out test messages.
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Step 8: Evaluate the model on the held-out test set
# labels=[0, 1] fixes the class order (0 = ham, 1 = spam, per the Step 4
# mapping) so the matrix is always 2x2 and the report rows are named.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
report = classification_report(
    y_test, y_pred, labels=[0, 1], target_names=['ham', 'spam']
)

print("Accuracy:", accuracy)
# sep="" stops print() from inserting a space after the newline, which would
# shift the first row of the matrix / report out of alignment.
# Confusion matrix rows are actual classes, columns are predicted classes.
print("\nConfusion Matrix:\n", conf_matrix, sep="")
print("\nClassification Report:\n", report, sep="")