# 13. Assignment: SMS Spam Text Preprocessing 📝

This notebook focuses on the complete process of cleaning and preparing text data from the `SMSSpamCollection` dataset.

In [None]:
import csv
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch every NLTK data package the cells below rely on, so the notebook
# survives Restart Kernel -> Run All on a machine with an empty nltk_data dir.
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize; recent
#     NLTK releases look up 'punkt_tab', older ones 'punkt', so request both.
#   - 'stopwords': the English stopword list.
#   - 'wordnet' / 'omw-1.4': data behind WordNetLemmatizer ('omw-1.4' is
#     required by some NLTK releases when WordNet is first loaded).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

### 1. Load Data

In [None]:
# The file is tab-separated and has no header row, so column names are
# supplied explicitly. Quoting is disabled because the messages are free-form
# text: with the default quoting rules a stray double quote inside a message
# is treated as the start of a quoted field and the parser silently merges
# the following rows into it.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
display(df.head())

### 2. Create Preprocessing Function

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Built once instead of on every call. Each punctuation character maps to a
# space (not to nothing) so neighbouring words are not glued together:
# "free!!!call" -> "free   call" rather than "freecall", and "don't" ->
# "don t" rather than "dont", which would never match the stopword list.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def preprocess_text(text) -> str:
    """Normalize one SMS message into a space-separated string of lemmas.

    Pipeline: lowercase -> replace punctuation with spaces -> tokenize ->
    drop English stopwords and every token that is not purely alphabetic
    (numbers, mixed alphanumerics) -> lemmatize.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (for example NaN from a
        missing field) is treated as an empty message.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives the filters.
    """
    # Guard against missing values so Series.apply does not crash on NaN.
    if not isinstance(text, str):
        return ""
    normalized = text.lower().translate(_PUNCT_TO_SPACE)
    # lemmatize() is called without a part-of-speech tag, so NLTK's default
    # part of speech is used for every token.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(normalized)
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(tokens)

### 3. Apply Function and View Results

In [None]:
# Run the cleaning function over every raw message and keep the result in a
# new column next to the original text.
df['processed_message'] = df['message'].map(preprocess_text)
print("SMS preprocessing complete!")
# Show label, raw text and cleaned text side by side as a quick sanity check.
display(df.loc[:, ['label', 'message', 'processed_message']].head())