# Spam Detection Model

This notebook contains the complete workflow for building a spam detection model using the SMS Spam Collection Dataset.

In [None]:
# Step 1: Install Required Libraries
!pip install pandas scikit-learn nltk

In [None]:
# Step 2: Load the Dataset
import pandas as pd

# Read the CSV from a path relative to the notebook's working directory.
# The file is decoded as latin-1 instead of pandas' default UTF-8.
data = pd.read_csv('data/spam.csv', encoding='latin-1')

# Show the first rows as the cell output to inspect the raw columns
data.head()

In [None]:
# Step 3: Data Cleaning
# Keep only the label and message columns, under descriptive names.
# - rename() ignores mapping keys that are not present, so re-running this cell
#   after the columns were already renamed does not raise a KeyError.
# - .copy() makes the result an independent frame, so adding columns to it in
#   later cells does not trigger pandas' SettingWithCopyWarning.
data = data.rename(columns={'v1': 'label', 'v2': 'message'})[['label', 'message']].copy()

# Display the cleaned dataset
data.head()

In [None]:
# Step 4: Preprocess the Data
import nltk
from nltk.corpus import stopwords
import re

# Download stopwords
nltk.download('stopwords')

# Build the stopword lookup once, after the corpus has been downloaded.
# Calling stopwords.words('english') for every token rebuilds the whole word
# list and then scans it linearly; a frozenset built a single time gives
# constant-time membership tests instead.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize a raw message for vectorization.

    Steps, in order:
      1. Delete every character that is not an ASCII letter or whitespace
         (digits and punctuation are removed, not replaced by a space).
      2. Lowercase the remaining text.
      3. Split on whitespace and drop English stopwords.

    Args:
        text: The raw message as a string.

    Returns:
        The remaining tokens joined by single spaces; an empty string if no
        tokens survive.
    """
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase (must happen before the stopword check, since the
    # stopword list is lowercase)
    text = text.lower()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in ENGLISH_STOPWORDS)


# Add the normalized text as a new column next to the original message
data['cleaned_message'] = data['message'].apply(preprocess_text)

In [None]:
# Step 5: Encode Labels
from sklearn.preprocessing import LabelEncoder

# Overwrite the 'label' column with integer codes; the fitted encoder stays
# available as `label_encoder`.
# NOTE(review): the 0/1 meaning below presumes the only labels are 'ham' and
# 'spam' (LabelEncoder numbers classes in sorted order) - confirm against the data.
# NOTE(review): re-running this cell refits the encoder on the already-encoded
# integers, so the original string classes are no longer stored in it.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])  # 0 for ham, 1 for spam

In [None]:
# Step 6: Split the Data
from sklearn.model_selection import train_test_split

# Features are the cleaned message text; targets are the encoded labels
X, y = data['cleaned_message'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 7: Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training messages only, then reuse the
# fitted vectorizer to transform the test messages, so nothing is learned
# from the held-out set.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Step 8: Train the Model
from sklearn.naive_bayes import MultinomialNB

# Construct and fit the classifier in one expression; fit() returns the
# estimator itself, so `model` is the trained instance.
model = MultinomialNB().fit(X_train_vectorized, y_train)
# Show the fitted estimator as the cell output
model

In [None]:
# Step 9: Make Predictions
# Predict a label for every vectorized test message
y_pred = model.predict(X_test_vectorized)

In [None]:
# Step 10: Evaluate the Model
from sklearn.metrics import accuracy_score, classification_report

# Compare the test-set predictions with the true labels
accuracy = accuracy_score(y_test, y_pred)

# Emit the accuracy line followed by the per-class report, one per line
print(
    f'Accuracy: {accuracy}',
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
# Step 11: Save the Model (Optional)
import joblib

# Persist the trained classifier to the current working directory
joblib.dump(model, 'spam_detection_model.pkl')
# Persist the fitted vectorizer as well: new text has to be transformed with
# this same vectorizer before it is passed to the model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

In [None]:
# Step 12: Load the Model (Optional)
# Restore the persisted vectorizer and classifier from disk
vectorizer = joblib.load('tfidf_vectorizer.pkl')
model = joblib.load('spam_detection_model.pkl')

# Example usage: classify a single unseen message
new_message = "Congratulations! You've won a $1,000 Walmart gift card. Click here to claim."
# Apply the same cleaning and vectorization used for the training data
new_message_vectorized = vectorizer.transform([preprocess_text(new_message)])
prediction = model.predict(new_message_vectorized)
# The encoded label 1 is reported as spam; anything else as ham
if prediction[0] == 1:
    print("Spam")
else:
    print("Ham")