In [None]:
# Detecting spam or ham (non-spam) messages is a common Natural Language
# Processing (NLP) problem. It involves classifying text messages or emails into one of these two
# categories. Here's a step-by-step guide on how to build a basic spam detection model using Python
# and common NLP libraries like scikit-learn and NLTK.

In [None]:
# Import necessary libraries
import csv
import re

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Make sure the NLTK stopword corpus is available.
# Look for a local copy first so that re-running the notebook does not hit the
# network every time; download only when the lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Load the tab-separated file; it has no header row, so column names are given here.
# QUOTE_NONE makes the parser treat '"' as an ordinary character. The messages
# are free text, and with the default quote handling an unbalanced double quote
# can cause several lines to be read as a single field.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Display the first few rows.
# A bare last expression lets the notebook render the frame with its rich
# (table) representation instead of the plain-text output of print().
df.head()

In [None]:
# Build a set of English stopwords for fast membership tests
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
# Function to preprocess the text
def preprocess_text(text, stopword_set=None):
    """Normalise a raw message into a space-separated string of content words.

    Steps: lowercase the text, replace every non-alphabetic character
    (punctuation, digits, underscores, whitespace) with a space, split into
    tokens, drop single-character tokens and stopwords, and join what is left
    with single spaces.

    Args:
        text: The raw message. Any value that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell) yields ``''``.
        stopword_set: Optional collection of lowercase words to discard.
            When omitted, the notebook-level ``stop_words`` is used.

    Returns:
        The cleaned text, or ``''`` when no token survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    # Compare against None explicitly so an empty collection is honoured.
    if stopword_set is None:
        stopword_set = stop_words

    lowered = text.lower()
    # str.isalpha() keeps letters only, so digits and '_' are blanked out too
    # (a plain \W pattern would let underscores through).
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in lowered)

    # Filtering on tokens removes single characters wherever they occur:
    # at the start, at the end, or several in a row.
    kept_tokens = [
        token
        for token in letters_only.split()
        if len(token) > 1 and token not in stopword_set
    ]
    return ' '.join(kept_tokens)

In [None]:
# Clean every message with preprocess_text, element by element
cleaned_messages = df['message'].map(preprocess_text)
df['message'] = cleaned_messages

In [None]:
# Display some processed messages.
# A bare last expression lets the notebook render the frame with its rich
# (table) representation instead of the plain-text output of print().
df.head()

In [None]:
# Convert labels to binary (1 for spam, 0 for ham)
label_codes = {'spam': 1, 'ham': 0}
# Values that are already encoded fall through unchanged, so running this cell
# a second time is a no-op instead of turning every label into NaN.
encoded_labels = df['label'].map(lambda raw: label_codes.get(raw, raw))
# Fail here with a clear message rather than letting an unknown or missing
# label reach the model as NaN.
unexpected_labels = set(encoded_labels.unique()) - set(label_codes.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(unexpected_labels, key=str)}"
    )
df['label'] = encoded_labels.astype(int)

In [None]:
# Train-test split followed by TF-IDF feature extraction.
# The split happens on the text first so that the vectorizer's vocabulary and
# IDF weights are learned from the training messages only; fitting on the full
# corpus would leak information from the test set into the features.
y = df['label'].values
messages_train, messages_test, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.3, random_state=42
)

tfidf = TfidfVectorizer(max_features=3000)
# The sparse matrices are used as they are (no .toarray()): MultinomialNB
# accepts sparse input, and a dense copy only costs memory.
X_train = tfidf.fit_transform(messages_train)
X_test = tfidf.transform(messages_test)

# Whole-corpus features under the original name, in case a later cell uses X.
X = tfidf.transform(df['message'])

In [None]:
# Create a multinomial Naive Bayes classifier and fit it on the training data
# (fit() returns the estimator itself, so it can be assigned directly)
model = MultinomialNB().fit(X_train, y_train)
model

In [None]:
# Predict a label for every row of the held-out test features
y_pred = model.predict(X=X_test)

In [None]:
# Evaluate the model.
# `labels` pins the class order so each entry of `target_names` is matched to
# its class explicitly (0 -> ham, 1 -> spam) instead of relying on the sorted
# order of whichever labels happen to be present in y_test / y_pred.
class_ids = [0, 1]
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test, y_pred, labels=class_ids, target_names=['ham', 'spam']
)

In [None]:
# Show the overall accuracy, then the per-class report
print(
    f"Accuracy: {accuracy:.2f}",
    f"\nClassification Report:\n{report}",
    sep="\n",
)